fix: prevent the fmt command from modifying non-YARA files.
The `fmt` command didn't check whether its input was actually a YARA source file; it attempted to format whatever it was given, modifying binary files or anything else passed to it. Now `fmt` fails when the input file contains invalid UTF-8 sequences, and it leaves the file unmodified in that case.

Closes #286.
plusvic committed Jan 27, 2025
1 parent b4b289d commit 32b897d
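
In practice the change looks like this: a minimal sketch of the new behavior, assuming the formatter crate is imported as `yara_x_fmt`, and assuming `Span` implements `Display` as the `#[error("invalid UTF-8 at {0}")]` attribute in the diff below suggests. The `Error::InvalidUTF8` variant and the error message come from the diff and tests below.

```rust
use std::io::Cursor;

use yara_x_fmt::{Error, Formatter}; // assumed crate path

fn main() {
    let mut output = Cursor::new(Vec::new());
    // Binary input that is not valid UTF-8. Instead of formatting it anyway,
    // `format` now returns `Error::InvalidUTF8` with the span of the first
    // invalid sequence, before anything is written to `output`.
    match Formatter::new().format(b"\xFF\xFF".as_slice(), &mut output) {
        Err(Error::InvalidUTF8(span)) => eprintln!("error: invalid UTF-8 at {span}"),
        _ => panic!("expected an invalid UTF-8 error"),
    }
}
```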
Showing 7 changed files with 120 additions and 34 deletions.
cli/src/commands/fmt.rs (14 additions, 6 deletions)

@@ -1,4 +1,5 @@
use std::fs::File;
+use std::io::{Cursor, Seek, SeekFrom};
use std::path::PathBuf;
use std::{fs, io, process};

@@ -54,19 +55,26 @@ pub fn exec_fmt(
        config.rule.empty_line_after_section_header,
    );

-    let mut changed = false;
+    let mut modified = false;

    for file in files {
        let input = fs::read(file.as_path())?;
-        changed = if check {
+        modified = if check {
            formatter.format(input.as_slice(), io::sink())?
        } else {
-            let output_file = File::create(file.as_path())?;
-            formatter.format(input.as_slice(), output_file)?
-        } || changed;
+            let mut formatted = Cursor::new(Vec::with_capacity(input.len()));
+            if formatter.format(input.as_slice(), &mut formatted)? {
+                formatted.seek(SeekFrom::Start(0))?;
+                let mut output_file = File::create(file.as_path())?;
+                io::copy(&mut formatted, &mut output_file)?;
+                true
+            } else {
+                false
+            }
+        } || modified;
    }

-    if changed {
+    if modified {
        process::exit(1)
    }

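Worth noting about the hunk above: the old code passed `File::create(...)` straight to `format`, and `File::create` truncates the file before formatting starts, so a formatting failure could leave the file empty or half-written. The new code formats into an in-memory `Cursor` and rewrites the file only after `format` succeeds. A standalone sketch of that pattern, with a hypothetical `transform` callback standing in for the formatter:

```rust
use std::fs::{self, File};
use std::io::{self, Cursor, Seek, SeekFrom};
use std::path::Path;

// Hypothetical helper (not from the commit): run a fallible transform into
// memory first; the file is truncated and rewritten only on success.
fn rewrite_if_changed<F>(path: &Path, transform: F) -> io::Result<bool>
where
    F: FnOnce(&[u8], &mut Cursor<Vec<u8>>) -> io::Result<bool>,
{
    let input = fs::read(path)?;
    let mut formatted = Cursor::new(Vec::with_capacity(input.len()));

    // If `transform` fails we return here, before touching the file on disk.
    let changed = transform(&input, &mut formatted)?;

    if changed {
        formatted.seek(SeekFrom::Start(0))?;
        let mut output_file = File::create(path)?; // truncation happens only now
        io::copy(&mut formatted, &mut output_file)?;
    }

    Ok(changed)
}
```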
cli/src/tests/mod.rs (38 additions, 0 deletions)

@@ -339,6 +339,44 @@ fn cli_issue_280() {
        .success();
}

+#[test]
+fn cli_fmt() {
+    let temp_dir = TempDir::new().unwrap();
+    let input_file = temp_dir.child("rule.yar");
+
+    input_file.write_str("rule test { condition: true }").unwrap();
+
+    Command::cargo_bin("yr")
+        .unwrap()
+        .arg("fmt")
+        .arg(input_file.path())
+        .assert()
+        .code(1); // Exit code 1 indicates that the file was modified.
+
+    Command::cargo_bin("yr")
+        .unwrap()
+        .arg("fmt")
+        .arg(input_file.path())
+        .assert()
+        .code(0); // A second run on the already-formatted file makes no changes.
+}
+
+#[test]
+fn cli_fmt_utf8_error() {
+    let temp_dir = TempDir::new().unwrap();
+    let input_file = temp_dir.child("rule.yar");
+
+    input_file.write_binary(&[0xff, 0xff]).unwrap();
+
+    Command::cargo_bin("yr")
+        .unwrap()
+        .arg("fmt")
+        .arg(input_file.path())
+        .assert()
+        .stderr("error: invalid UTF-8 at [0..1]\n")
+        .code(1);
+}
+
#[cfg(feature = "debug-cmd")]
#[test]
fn cli_debug_ast() {
fmt/src/lib.rs (23 additions, 12 deletions)

@@ -22,7 +22,7 @@ use thiserror::Error;

use tokens::Token::*;
use tokens::TokenStream;
-use yara_x_parser::cst::SyntaxKind;
+use yara_x_parser::cst::{Event, SyntaxKind};
use yara_x_parser::{Parser, Span};

use crate::align::Align;
@@ -49,16 +49,16 @@ mod tests;
#[allow(clippy::large_enum_variant)]
pub enum Error {
    /// Error while reading from input.
-    #[error("Read error")]
+    #[error("read error")]
    ReadError(io::Error),

    /// Error while writing to output.
-    #[error("Write error")]
+    #[error("write error")]
    WriteError(io::Error),

-    /// Error while parsing the input.
-    #[error("Parse error")]
-    ParseError { message: String, span: Span },
+    /// The input file contained invalid UTF-8.
+    #[error("invalid UTF-8 at {0}")]
+    InvalidUTF8(Span),
}

/// Formats YARA source code automatically.
@@ -328,10 +328,7 @@ impl Formatter {
    /// Returns `true` if the output differs from the input.
    ///
    /// This function will fail if it can't read from the input, write to the
-    /// output, or when the input doesn't contain syntactically valid YARA
-    /// rules.
-    ///
-    /// TODO: syntactically invalid rules may be formatted
+    /// output, or when the input contains invalid UTF-8 characters.
    pub fn format<R, W>(
        &self,
        mut input: R,
@@ -341,19 +338,33 @@
        R: io::Read,
        W: io::Write,
    {
+        let mut invalid_utf8 = Option::None;
        let mut in_buf = Vec::new();

        input.read_to_end(&mut in_buf).map_err(Error::ReadError)?;

-        let stream = Parser::new(in_buf.as_slice()).into_cst_stream();
-        let tokens = Tokens::new(stream);
+        let cst_stream = Parser::new(in_buf.as_slice()).into_cst_stream();
+
+        // Inspect the CST stream looking for events indicating the presence of
+        // invalid UTF-8 sequences.
+        let events = cst_stream.into_iter().inspect(|evt| {
+            if let Event::Token { kind: SyntaxKind::INVALID_UTF8, span } = evt
+            {
+                invalid_utf8.get_or_insert(span.clone());
+            }
+        });
+
+        let tokens = Tokens::new(in_buf.as_slice(), events);
        let mut out_buf = Cursor::new(Vec::new());

        self.format_impl(tokens)
            .write_to(&mut out_buf)
            .map_err(Error::WriteError)?;

+        if let Some(span) = invalid_utf8 {
+            return Err(Error::InvalidUTF8(span));
+        }
+
        let modified = in_buf.ne(out_buf.get_ref());

        output.write_all(out_buf.get_ref()).map_err(Error::WriteError)?;
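One subtlety in the hunk above: `inspect` is lazy, so the closure that records the `INVALID_UTF8` span only runs while `format_impl` drives the token stream. That is why the `invalid_utf8` check sits after formatting rather than before it. A self-contained sketch of the same side-channel pattern, using only the standard library:

```rust
fn main() {
    // Record the position of the first "bad" item while a lazy iterator
    // pipeline is consumed elsewhere, mirroring how `format` records the
    // first INVALID_UTF8 span during formatting.
    let mut first_bad: Option<usize> = None;
    let items = [1, 2, -3, 4, -5];

    let sum: i32 = items
        .iter()
        .enumerate()
        .inspect(|(i, &v)| {
            if v < 0 {
                // `get_or_insert` keeps only the first occurrence.
                first_bad.get_or_insert(*i);
            }
        })
        .map(|(_, &v)| v)
        .sum();

    // The side channel is only populated once the iterator has been driven.
    assert_eq!(first_bad, Some(2));
    assert_eq!(sum, -1);
}
```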
fmt/src/processor/tests.rs (2 additions, 1 deletion)

@@ -10,7 +10,8 @@ use crate::tokens::{categories, Token};
fn tokenize(source: &str) -> Vec<Token> {
    let events =
        Parser::new(source.as_bytes()).into_cst_stream().whitespaces(false);
-    tokens::Tokens::new(events).collect()
+
+    tokens::Tokens::new(source.as_bytes(), events).collect()
}

#[test]
fmt/src/tests.rs (21 additions, 3 deletions)

@@ -1,12 +1,15 @@
+use std::io::Cursor;
use std::path::PathBuf;
use std::{fs, str};

use bstr::ByteSlice;
use pretty_assertions::assert_eq;
use rayon::prelude::*;
-use yara_x_parser::Parser;
+
+use yara_x_parser::{Parser, Span};

use crate::tokens::{TokenStream, Tokens};
-use crate::Formatter;
+use crate::{Error, Formatter};

#[test]
fn spacer() {
@@ -31,13 +34,28 @@ fn spacer() {
        let mut output = Vec::new();
        let rules = t.0.as_bytes();
        let events = Parser::new(rules).into_cst_stream().whitespaces(false);
-        let tokens = Tokens::new(events);
+        let tokens = Tokens::new(rules, events);

        Formatter::add_spacing(tokens).write_to(&mut output).unwrap();
        assert_eq!(str::from_utf8(&output).unwrap(), t.1);
    }
}

+#[test]
+fn invalid_utf8() {
+    let mut output = Cursor::new(Vec::new());
+
+    match Formatter::new()
+        .format(b"\xFF\xFF".as_bytes(), &mut output)
+        .expect_err("expected UTF-8 error")
+    {
+        Error::InvalidUTF8(span) => {
+            assert_eq!(span, Span(0..1))
+        }
+        _ => panic!(),
+    }
+}
+
#[test]
fn format() {
    let files: Vec<_> =
fmt/src/tokens/mod.rs (20 additions, 10 deletions)

@@ -2,7 +2,7 @@ use lazy_static::lazy_static;
use std::collections::VecDeque;
use std::str::from_utf8_unchecked;

-use yara_x_parser::cst::{CSTStream, Event, SyntaxKind};
+use yara_x_parser::cst::{Event, SyntaxKind};

#[cfg(test)]
mod tests;
@@ -481,20 +481,30 @@ pub(crate) trait TokenStream<'a>: Iterator<Item = Token<'a>> {
// implements the TokenStream trait.
impl<'a, T> TokenStream<'a> for T where T: Iterator<Item = Token<'a>> {}

-/// An iterator that takes a [`CSTStream`] generated by the parser and produces
-/// a sequence of tokens.
-pub(crate) struct Tokens<'src> {
-    events: CSTStream<'src>,
+/// An iterator that takes a sequence of events generated by the parser and
+/// produces a sequence of tokens which constitute the input to the formatter.
+pub(crate) struct Tokens<'src, E>
+where
+    E: Iterator<Item = Event>,
+{
+    source: &'src [u8],
+    events: E,
    buffer: VecDeque<Token<'src>>,
}

-impl<'src> Tokens<'src> {
-    pub fn new(stream: CSTStream<'src>) -> Self {
-        Self { events: stream, buffer: VecDeque::new() }
+impl<'src, E> Tokens<'src, E>
+where
+    E: Iterator<Item = Event>,
+{
+    pub fn new(source: &'src [u8], events: E) -> Self {
+        Self { source, events, buffer: VecDeque::new() }
    }
}

-impl<'src> Iterator for Tokens<'src> {
+impl<'src, E> Iterator for Tokens<'src, E>
+where
+    E: Iterator<Item = Event>,
+{
    type Item = Token<'src>;

    fn next(&mut self) -> Option<Self::Item> {
@@ -507,7 +517,7 @@ impl<'src> Iterator for Tokens<'src> {
            Event::Begin(kind) => return Some(Token::Begin(kind)),
            Event::End(kind) => return Some(Token::End(kind)),
            Event::Token { kind, span } => {
-                let token_bytes = &self.events.source()[span.range()];
+                let token_bytes = &self.source[span.range()];
                // The whitespace token has a different treatment because
                // the parser returns a single whitespace token when
                // multiple whitespaces appear together. Here we separate
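This refactor is what allows `format` in `fmt/src/lib.rs` to wrap the event stream with `inspect` before tokenizing: `Tokens` no longer owns a concrete `CSTStream`, it borrows the source bytes and accepts any `Iterator<Item = Event>`. A stripped-down sketch of the same borrow-plus-generic-iterator shape, with hypothetical types (not from the crate):

```rust
// A struct that borrows its source and adapts any upstream iterator,
// resolving each upstream index against the borrowed bytes the same way
// `Tokens` slices `&self.source[span.range()]`.
struct Resolver<'src, E>
where
    E: Iterator<Item = usize>,
{
    source: &'src [u8],
    indices: E,
}

impl<'src, E> Iterator for Resolver<'src, E>
where
    E: Iterator<Item = usize>,
{
    type Item = u8;

    fn next(&mut self) -> Option<Self::Item> {
        self.indices.next().map(|i| self.source[i])
    }
}

fn main() {
    let source: &[u8] = b"abcdef";
    // Any iterator works upstream, including adapted ones such as `inspect`.
    let indices = [0usize, 2, 4].into_iter().inspect(|i| println!("visiting {i}"));
    let picked: Vec<u8> = Resolver { source, indices }.collect();
    assert_eq!(picked, vec![b'a', b'c', b'e']);
}
```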
fmt/src/tokens/tests.rs (2 additions, 2 deletions)

@@ -21,7 +21,7 @@ fn token_generation() {
        .whitespaces(false)
        .newlines(false);

-    let tokens: Vec<Token> = Tokens::new(events).collect();
+    let tokens: Vec<Token> = Tokens::new(rule.as_bytes(), events).collect();

    assert_eq!(
        tokens,
@@ -112,7 +112,7 @@ fn whitespaces() {
}"#;

let events = Parser::new(rule.as_bytes()).into_cst_stream();
let tokens: Vec<Token> = Tokens::new(events).collect();
let tokens: Vec<Token> = Tokens::new(rule.as_bytes(), events).collect();

assert_eq!(
tokens,
