fix: prevent the fmt command from modifying non-YARA files.
The `fmt` command didn't check whether its input was actually a YARA source file; it attempted to format whatever it was given, modifying binary files or anything else passed to it. Now `fmt` fails when the input file contains invalid UTF-8 sequences, and it leaves the file unmodified in that case.

Closes #286.
plusvic committed Jan 27, 2025
1 parent b4b289d commit 32b897d
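
In practice the change looks like this: a minimal sketch of the new behavior, assuming the formatter crate is imported as `yara_x_fmt`, and assuming `Span` implements `Display` as the `#[error("invalid UTF-8 at {0}")]` attribute in the diff below suggests. The `Error::InvalidUTF8` variant and the error message come from the diff and tests below.

```rust
use std::io::Cursor;

use yara_x_fmt::{Error, Formatter}; // assumed crate path

fn main() {
    let mut output = Cursor::new(Vec::new());
    // Binary input that is not valid UTF-8. Instead of formatting it anyway,
    // `format` now returns `Error::InvalidUTF8` with the span of the first
    // invalid sequence, before anything is written to `output`.
    match Formatter::new().format(b"\xFF\xFF".as_slice(), &mut output) {
        Err(Error::InvalidUTF8(span)) => eprintln!("error: invalid UTF-8 at {span}"),
        _ => panic!("expected an invalid UTF-8 error"),
    }
}
```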
Showing 7 changed files with 120 additions and 34 deletions.
cli/src/commands/fmt.rs (14 additions, 6 deletions)

@@ -1,4 +1,5 @@
use std::fs::File;
+use std::io::{Cursor, Seek, SeekFrom};
use std::path::PathBuf;
use std::{fs, io, process};

@@ -54,19 +55,26 @@ pub fn exec_fmt(
        config.rule.empty_line_after_section_header,
    );

-    let mut changed = false;
+    let mut modified = false;

    for file in files {
        let input = fs::read(file.as_path())?;
-        changed = if check {
+        modified = if check {
            formatter.format(input.as_slice(), io::sink())?
        } else {
-            let output_file = File::create(file.as_path())?;
-            formatter.format(input.as_slice(), output_file)?
-        } || changed;
+            let mut formatted = Cursor::new(Vec::with_capacity(input.len()));
+            if formatter.format(input.as_slice(), &mut formatted)? {
+                formatted.seek(SeekFrom::Start(0))?;
+                let mut output_file = File::create(file.as_path())?;
+                io::copy(&mut formatted, &mut output_file)?;
+                true
+            } else {
+                false
+            }
+        } || modified;
    }

-    if changed {
+    if modified {
        process::exit(1)
    }

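Worth noting about the hunk above: the old code passed `File::create(...)` straight to `format`, and `File::create` truncates the file before formatting starts, so a formatting failure could leave the file empty or half-written. The new code formats into an in-memory `Cursor` and rewrites the file only after `format` succeeds. A standalone sketch of that pattern, with a hypothetical `transform` callback standing in for the formatter:

```rust
use std::fs::{self, File};
use std::io::{self, Cursor, Seek, SeekFrom};
use std::path::Path;

// Hypothetical helper (not from the commit): run a fallible transform into
// memory first; the file is truncated and rewritten only on success.
fn rewrite_if_changed<F>(path: &Path, transform: F) -> io::Result<bool>
where
    F: FnOnce(&[u8], &mut Cursor<Vec<u8>>) -> io::Result<bool>,
{
    let input = fs::read(path)?;
    let mut formatted = Cursor::new(Vec::with_capacity(input.len()));

    // If `transform` fails we return here, before touching the file on disk.
    let changed = transform(&input, &mut formatted)?;

    if changed {
        formatted.seek(SeekFrom::Start(0))?;
        let mut output_file = File::create(path)?; // truncation happens only now
        io::copy(&mut formatted, &mut output_file)?;
    }

    Ok(changed)
}
```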
cli/src/tests/mod.rs (38 additions, 0 deletions)

@@ -339,6 +339,44 @@ fn cli_issue_280() {
        .success();
}

+#[test]
+fn cli_fmt() {
+    let temp_dir = TempDir::new().unwrap();
+    let input_file = temp_dir.child("rule.yar");
+
+    input_file.write_str("rule test { condition: true }").unwrap();
+
+    Command::cargo_bin("yr")
+        .unwrap()
+        .arg("fmt")
+        .arg(input_file.path())
+        .assert()
+        .code(1); // Exit code 1 indicates that the file was modified.
+
+    Command::cargo_bin("yr")
+        .unwrap()
+        .arg("fmt")
+        .arg(input_file.path())
+        .assert()
+        .code(0); // A second run on the already-formatted file makes no changes.
+}
+
+#[test]
+fn cli_fmt_utf8_error() {
+    let temp_dir = TempDir::new().unwrap();
+    let input_file = temp_dir.child("rule.yar");
+
+    input_file.write_binary(&[0xff, 0xff]).unwrap();
+
+    Command::cargo_bin("yr")
+        .unwrap()
+        .arg("fmt")
+        .arg(input_file.path())
+        .assert()
+        .stderr("error: invalid UTF-8 at [0..1]\n")
+        .code(1);
+}
+
#[cfg(feature = "debug-cmd")]
#[test]
fn cli_debug_ast() {
fmt/src/lib.rs (23 additions, 12 deletions)

@@ -22,7 +22,7 @@ use thiserror::Error;

use tokens::Token::*;
use tokens::TokenStream;
-use yara_x_parser::cst::SyntaxKind;
+use yara_x_parser::cst::{Event, SyntaxKind};
use yara_x_parser::{Parser, Span};

use crate::align::Align;
@@ -49,16 +49,16 @@ mod tests;
#[allow(clippy::large_enum_variant)]
pub enum Error {
    /// Error while reading from input.
-    #[error("Read error")]
+    #[error("read error")]
    ReadError(io::Error),

    /// Error while writing to output.
-    #[error("Write error")]
+    #[error("write error")]
    WriteError(io::Error),

-    /// Error while parsing the input.
-    #[error("Parse error")]
-    ParseError { message: String, span: Span },
+    /// The input file contained invalid UTF-8.
+    #[error("invalid UTF-8 at {0}")]
+    InvalidUTF8(Span),
}

/// Formats YARA source code automatically.
@@ -328,10 +328,7 @@ impl Formatter {
    /// Returns `true` if the output differs from the input.
    ///
    /// This function will fail if it can't read from the input, write to the
-    /// output, or when the input doesn't contain syntactically valid YARA
-    /// rules.
-    ///
-    /// TODO: syntactically invalid rules may be formatted
+    /// output, or when the input contains invalid UTF-8 characters.
    pub fn format<R, W>(
        &self,
        mut input: R,
@@ -341,19 +338,33 @@
        R: io::Read,
        W: io::Write,
    {
+        let mut invalid_utf8 = Option::None;
        let mut in_buf = Vec::new();

        input.read_to_end(&mut in_buf).map_err(Error::ReadError)?;

-        let stream = Parser::new(in_buf.as_slice()).into_cst_stream();
-        let tokens = Tokens::new(stream);
+        let cst_stream = Parser::new(in_buf.as_slice()).into_cst_stream();
+
+        // Inspect the CST stream looking for events indicating the presence of
+        // invalid UTF-8 sequences.
+        let events = cst_stream.into_iter().inspect(|evt| {
+            if let Event::Token { kind: SyntaxKind::INVALID_UTF8, span } = evt
+            {
+                invalid_utf8.get_or_insert(span.clone());
+            }
+        });
+
+        let tokens = Tokens::new(in_buf.as_slice(), events);
        let mut out_buf = Cursor::new(Vec::new());

        self.format_impl(tokens)
            .write_to(&mut out_buf)
            .map_err(Error::WriteError)?;

+        if let Some(span) = invalid_utf8 {
+            return Err(Error::InvalidUTF8(span));
+        }
+
        let modified = in_buf.ne(out_buf.get_ref());

        output.write_all(out_buf.get_ref()).map_err(Error::WriteError)?;
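One subtlety in the hunk above: `inspect` is lazy, so the closure that records the `INVALID_UTF8` span only runs while `format_impl` drives the token stream. That is why the `invalid_utf8` check sits after formatting rather than before it. A self-contained sketch of the same side-channel pattern, using only the standard library:

```rust
fn main() {
    // Record the position of the first "bad" item while a lazy iterator
    // pipeline is consumed elsewhere, mirroring how `format` records the
    // first INVALID_UTF8 span during formatting.
    let mut first_bad: Option<usize> = None;
    let items = [1, 2, -3, 4, -5];

    let sum: i32 = items
        .iter()
        .enumerate()
        .inspect(|(i, &v)| {
            if v < 0 {
                // `get_or_insert` keeps only the first occurrence.
                first_bad.get_or_insert(*i);
            }
        })
        .map(|(_, &v)| v)
        .sum();

    // The side channel is only populated once the iterator has been driven.
    assert_eq!(first_bad, Some(2));
    assert_eq!(sum, -1);
}
```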
fmt/src/processor/tests.rs (2 additions, 1 deletion)

@@ -10,7 +10,8 @@ use crate::tokens::{categories, Token};
fn tokenize(source: &str) -> Vec<Token> {
    let events =
        Parser::new(source.as_bytes()).into_cst_stream().whitespaces(false);
-    tokens::Tokens::new(events).collect()
+
+    tokens::Tokens::new(source.as_bytes(), events).collect()
}

#[test]
fmt/src/tests.rs (21 additions, 3 deletions)

@@ -1,12 +1,15 @@
+use std::io::Cursor;
use std::path::PathBuf;
use std::{fs, str};

use bstr::ByteSlice;
use pretty_assertions::assert_eq;
use rayon::prelude::*;
-use yara_x_parser::Parser;
+
+use yara_x_parser::{Parser, Span};

use crate::tokens::{TokenStream, Tokens};
-use crate::Formatter;
+use crate::{Error, Formatter};

#[test]
fn spacer() {
@@ -31,13 +34,28 @@ fn spacer() {
        let mut output = Vec::new();
        let rules = t.0.as_bytes();
        let events = Parser::new(rules).into_cst_stream().whitespaces(false);
-        let tokens = Tokens::new(events);
+        let tokens = Tokens::new(rules, events);

        Formatter::add_spacing(tokens).write_to(&mut output).unwrap();
        assert_eq!(str::from_utf8(&output).unwrap(), t.1);
    }
}

+#[test]
+fn invalid_utf8() {
+    let mut output = Cursor::new(Vec::new());
+
+    match Formatter::new()
+        .format(b"\xFF\xFF".as_bytes(), &mut output)
+        .expect_err("expected UTF-8 error")
+    {
+        Error::InvalidUTF8(span) => {
+            assert_eq!(span, Span(0..1))
+        }
+        _ => panic!(),
+    }
+}
+
#[test]
fn format() {
    let files: Vec<_> =
fmt/src/tokens/mod.rs (20 additions, 10 deletions)

@@ -2,7 +2,7 @@ use lazy_static::lazy_static;
use std::collections::VecDeque;
use std::str::from_utf8_unchecked;

-use yara_x_parser::cst::{CSTStream, Event, SyntaxKind};
+use yara_x_parser::cst::{Event, SyntaxKind};

#[cfg(test)]
mod tests;
@@ -481,20 +481,30 @@ pub(crate) trait TokenStream<'a>: Iterator<Item = Token<'a>> {
// implements the TokenStream trait.
impl<'a, T> TokenStream<'a> for T where T: Iterator<Item = Token<'a>> {}

-/// An iterator that takes a [`CSTStream`] generated by the parser and produces
-/// a sequence of tokens.
-pub(crate) struct Tokens<'src> {
-    events: CSTStream<'src>,
+/// An iterator that takes a sequence of events generated by the parser and
+/// produces a sequence of tokens which constitute the input to the formatter.
+pub(crate) struct Tokens<'src, E>
+where
+    E: Iterator<Item = Event>,
+{
+    source: &'src [u8],
+    events: E,
    buffer: VecDeque<Token<'src>>,
}

-impl<'src> Tokens<'src> {
-    pub fn new(stream: CSTStream<'src>) -> Self {
-        Self { events: stream, buffer: VecDeque::new() }
+impl<'src, E> Tokens<'src, E>
+where
+    E: Iterator<Item = Event>,
+{
+    pub fn new(source: &'src [u8], events: E) -> Self {
+        Self { source, events, buffer: VecDeque::new() }
    }
}

-impl<'src> Iterator for Tokens<'src> {
+impl<'src, E> Iterator for Tokens<'src, E>
+where
+    E: Iterator<Item = Event>,
+{
    type Item = Token<'src>;

    fn next(&mut self) -> Option<Self::Item> {
@@ -507,7 +517,7 @@ impl<'src> Iterator for Tokens<'src> {
            Event::Begin(kind) => return Some(Token::Begin(kind)),
            Event::End(kind) => return Some(Token::End(kind)),
            Event::Token { kind, span } => {
-                let token_bytes = &self.events.source()[span.range()];
+                let token_bytes = &self.source[span.range()];
                // The whitespace token has a different treatment because
                // the parser returns a single whitespace token when
                // multiple whitespaces appear together. Here we separate
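This refactor is what allows `format` in `fmt/src/lib.rs` to wrap the event stream with `inspect` before tokenizing: `Tokens` no longer owns a concrete `CSTStream`, it borrows the source bytes and accepts any `Iterator<Item = Event>`. A stripped-down sketch of the same borrow-plus-generic-iterator shape, with hypothetical types (not from the crate):

```rust
// A struct that borrows its source and adapts any upstream iterator,
// resolving each upstream index against the borrowed bytes the same way
// `Tokens` slices `&self.source[span.range()]`.
struct Resolver<'src, E>
where
    E: Iterator<Item = usize>,
{
    source: &'src [u8],
    indices: E,
}

impl<'src, E> Iterator for Resolver<'src, E>
where
    E: Iterator<Item = usize>,
{
    type Item = u8;

    fn next(&mut self) -> Option<Self::Item> {
        self.indices.next().map(|i| self.source[i])
    }
}

fn main() {
    let source: &[u8] = b"abcdef";
    // Any iterator works upstream, including adapted ones such as `inspect`.
    let indices = [0usize, 2, 4].into_iter().inspect(|i| println!("visiting {i}"));
    let picked: Vec<u8> = Resolver { source, indices }.collect();
    assert_eq!(picked, vec![b'a', b'c', b'e']);
}
```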
fmt/src/tokens/tests.rs (2 additions, 2 deletions)

@@ -21,7 +21,7 @@ fn token_generation() {
        .whitespaces(false)
        .newlines(false);

-    let tokens: Vec<Token> = Tokens::new(events).collect();
+    let tokens: Vec<Token> = Tokens::new(rule.as_bytes(), events).collect();

    assert_eq!(
        tokens,
@@ -112,7 +112,7 @@ fn whitespaces() {
}"#;

let events = Parser::new(rule.as_bytes()).into_cst_stream();
let tokens: Vec<Token> = Tokens::new(events).collect();
let tokens: Vec<Token> = Tokens::new(rule.as_bytes(), events).collect();

assert_eq!(
tokens,
