From 17a0dc514bfc39fef32ff7f0bddabc63eb17a13a Mon Sep 17 00:00:00 2001
From: "Victor M. Alvarez" <vmalvarez@virustotal.com>
Date: Fri, 17 Jan 2025 16:21:11 +0100
Subject: [PATCH] refactor: use `Utf8Chunks` in `unexpected_token` instead of
 `from_utf8`.

This is not actually faster (if the string is valid UTF-8, it is still
scanned all the way to the end), but at least we no longer need unsafe code.
---
 parser/src/tokenizer/mod.rs | 44 ++++++++++++++-----------------------
 1 file changed, 16 insertions(+), 28 deletions(-)

diff --git a/parser/src/tokenizer/mod.rs b/parser/src/tokenizer/mod.rs
index 5ab1684e..c62757fe 100644
--- a/parser/src/tokenizer/mod.rs
+++ b/parser/src/tokenizer/mod.rs
@@ -5,7 +5,6 @@ YARA source code and produces a sequence of tokens that is later processed by
 the parser. Each token is represented by a variant of the [`Token`] type.
 */
 
-use std::str;
 use std::str::from_utf8;
 
 use logos::Logos;
@@ -204,33 +203,22 @@ impl Tokenizer<'_> {
         let start = lexer.span().start;
         let end = lexer.source().len();
         let unexpected = lexer.source().get(start..end).unwrap();
-        // Make sure that `unexpected` contains a valid UTF-8 string, or take the
-        // first few bytes that are valid and ignore the rest.
-        // TODO: This could be implemented more efficiently using Utf8Chunks, but
-        // it was introduced in Rust 1.79. With Utf8Chunks we can iterate over the
-        // byte slice until finding an invalid UTF-8 character or a whitespace,
-        // whatever comes first. We don't need to use `str::from_utf8`, which
-        // validates the whole string until the end.
-        // https://doc.rust-lang.org/std/str/struct.Utf8Chunks.html
-        let unexpected = match from_utf8(unexpected) {
-            Ok(unexpected) => unexpected,
-            Err(err) => {
-                if err.valid_up_to() == 0 {
-                    return Token::INVALID_UTF8(
-                        Span(start as u32..(start + 1) as u32)
-                            .offset(self.lexer_starting_pos),
-                    );
-                } else {
-                    // unexpected[0..err.valid_up_to()] is guaranteed to be valid
-                    // UTF-8.
-                    unsafe {
-                        str::from_utf8_unchecked(
-                            &unexpected[0..err.valid_up_to()],
-                        )
-                    }
-                }
-            }
-        };
+
+        // Make sure that `unexpected` contains a valid UTF-8 string, or take
+        // the first few bytes that are valid and ignore the rest. Calling
+        // .unwrap() is safe as long as `unexpected` is not empty: a non-empty
+        // slice always yields at least one UTF-8 chunk, valid or invalid.
+        let chunk = unexpected.utf8_chunks().next().unwrap();
+
+        if chunk.valid().is_empty() {
+            return Token::INVALID_UTF8(
+                Span(start as u32..(start + 1) as u32)
+                    .offset(self.lexer_starting_pos),
+            );
+        }
+
+        // `unexpected` is the valid UTF-8 prefix.
+        let unexpected = chunk.valid();
 
         // Truncate `unexpected` at the first whitespace if any.
         let unexpected = unexpected.split(char::is_whitespace).next().unwrap();