Fix: parsing ident starting with underscore in certain dialects

MohamedAbdeen21 · MohamedAbdeen21 · commit cba873081280 · 2025-05-05T20:55:20.000+01:00
The dialects that support underscore as a separator in numeric literals
used to parse ._123 as a number, meaning that an identifier like
._abc would be parsed as Number `._` and word `abc`, which is
obv wrong.

This PR splits the tokenizer branch for numbers and periods into
two branches to make things easier, fixes the issue mentioned above
and adds tests.
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
@@ -1189,8 +1189,8 @@ impl<'a> Tokenizer<'a> {
 
                     Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
                 }
-                // numbers and period
-                '0'..='9' | '.' => {
+                // Numbers
+                '0'..='9' => {
                     // Some dialects support underscore as number separator
                     // There can only be one at a time and it must be followed by another digit
                     let is_number_separator = |ch: char, next_char: Option<char>| {
@@ -1199,11 +1199,12 @@ impl<'a> Tokenizer<'a> {
                             && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
                     };
 
+                    // Start with number or potential separator
                     let mut s = peeking_next_take_while(chars, |ch, next_ch| {
                         ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                     });
 
-                    // match binary literal that starts with 0x
+                    // Match binary literal that starts with 0x
                     if s == "0" && chars.peek() == Some(&'x') {
                         chars.next();
                         let s2 = peeking_next_take_while(chars, |ch, next_ch| {
@@ -1212,60 +1213,41 @@ impl<'a> Tokenizer<'a> {
                         return Ok(Some(Token::HexStringLiteral(s2)));
                     }
 
-                    // match one period
+                    // Match fractional part after a dot
                     if let Some('.') = chars.peek() {
                         s.push('.');
                         chars.next();
                     }
 
-                    // If the dialect supports identifiers that start with a numeric prefix
-                    // and we have now consumed a dot, check if the previous token was a Word.
-                    // If so, what follows is definitely not part of a decimal number and
-                    // we should yield the dot as a dedicated token so compound identifiers
-                    // starting with digits can be parsed correctly.
-                    if s == "." && self.dialect.supports_numeric_prefix() {
-                        if let Some(Token::Word(_)) = prev_token {
-                            return Ok(Some(Token::Period));
-                        }
-                    }
-
-                    // Consume fractional digits.
+                    // Consume fractional digits
                     s += &peeking_next_take_while(chars, |ch, next_ch| {
                         ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                     });
 
-                    // No fraction -> Token::Period
-                    if s == "." {
-                        return Ok(Some(Token::Period));
-                    }
-
-                    // Parse exponent as number
+                    // Parse exponent part (e.g., e+10 or E-5)
                     let mut exponent_part = String::new();
                     if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                         let mut char_clone = chars.peekable.clone();
-                        exponent_part.push(char_clone.next().unwrap());
+                        exponent_part.push(char_clone.next().unwrap()); // consume 'e' or 'E'
 
                         // Optional sign
-                        match char_clone.peek() {
-                            Some(&c) if matches!(c, '+' | '-') => {
+                        if let Some(&c) = char_clone.peek() {
+                            if c == '+' || c == '-' {
                                 exponent_part.push(c);
                                 char_clone.next();
                             }
-                            _ => (),
                         }
 
-                        match char_clone.peek() {
-                            // Definitely an exponent, get original iterator up to speed and use it
-                            Some(&c) if c.is_ascii_digit() => {
+                        // Parse digits after the exponent
+                        if let Some(&c) = char_clone.peek() {
+                            if c.is_ascii_digit() {
                                 for _ in 0..exponent_part.len() {
                                     chars.next();
                                 }
                                 exponent_part +=
                                     &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                 s += exponent_part.as_str();
                             }
-                            // Not an exponent, discard the work done
-                            _ => (),
                         }
                     }
 
@@ -1274,8 +1256,7 @@ impl<'a> Tokenizer<'a> {
                     // be tokenized as a word.
                     if self.dialect.supports_numeric_prefix() {
                         if exponent_part.is_empty() {
-                            // If it is not a number with an exponent, it may be
-                            // an identifier starting with digits.
+                            // Handle as potential word if no exponent part
                             let word =
                                 peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
 
@@ -1284,20 +1265,84 @@ impl<'a> Tokenizer<'a> {
                                 return Ok(Some(Token::make_word(s.as_str(), None)));
                             }
                         } else if prev_token == Some(&Token::Period) {
-                            // If the previous token was a period, thus not belonging to a number,
-                            // the value we have is part of an identifier.
+                            // Handle as word if it follows a period
                             return Ok(Some(Token::make_word(s.as_str(), None)));
                         }
                     }
 
+                    // Handle "L" suffix for long numbers
                     let long = if chars.peek() == Some(&'L') {
                         chars.next();
                         true
                     } else {
                         false
                     };
+
+                    // Return the final token for the number
                     Ok(Some(Token::Number(s, long)))
                 }
+
+                // Period (`.`) handling
+                '.' => {
+                    chars.next(); // consume the dot
+
+                    match chars.peek() {
+                        Some('_') => {
+                            // Handle "._" case as a period (special token) followed by identifier
+                            Ok(Some(Token::Period))
+                        }
+                        Some(ch)
+                            //  Hive and mysql dialects allow numeric prefixes for identifers
+                            if ch.is_ascii_digit()
+                                && self.dialect.supports_numeric_prefix()
+                                && matches!(prev_token, Some(Token::Word(_))) =>
+                        {
+                            Ok(Some(Token::Period))
+                        }
+                        Some(ch) if ch.is_ascii_digit() => {
+                            // Handle numbers starting with a dot (e.g., ".123")
+                            let mut s = String::from(".");
+                            let is_number_separator = |ch: char, next_char: Option<char>| {
+                                self.dialect.supports_numeric_literal_underscores()
+                                    && ch == '_'
+                                    && next_char.is_some_and(|c| c.is_ascii_digit())
+                            };
+
+                            s += &peeking_next_take_while(chars, |ch, next_ch| {
+                                ch.is_ascii_digit() || is_number_separator(ch, next_ch)
+                            });
+
+                            // Handle exponent part
+                            if matches!(chars.peek(), Some('e' | 'E')) {
+                                let mut exp = String::new();
+                                exp.push(chars.next().unwrap());
+
+                                if matches!(chars.peek(), Some('+' | '-')) {
+                                    exp.push(chars.next().unwrap());
+                                }
+
+                                if matches!(chars.peek(), Some(c) if c.is_ascii_digit()) {
+                                    exp += &peeking_take_while(chars, |c| c.is_ascii_digit());
+                                    s += &exp;
+                                }
+                            }
+
+                            // Handle "L" suffix for long numbers
+                            let long = if chars.peek() == Some(&'L') {
+                                chars.next();
+                                true
+                            } else {
+                                false
+                            };
+
+                            Ok(Some(Token::Number(s, long)))
+                        }
+                        _ => {
+                            // Just a plain period
+                            Ok(Some(Token::Period))
+                        }
+                    }
+                }
                 // punctuation
                 '(' => self.consume_and_return(chars, Token::LParen),
                 ')' => self.consume_and_return(chars, Token::RParen),
@@ -2435,6 +2480,32 @@ mod tests {
         compare(expected, tokens);
     }
 
+    #[test]
+    fn tokenize_period_underscore() {
+        let sql = String::from("SELECT table._col");
+        // a dialect that supports underscores in numeric literals
+        let dialect = PostgreSqlDialect {};
+        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
+
+        let expected = vec![
+            Token::make_keyword("SELECT"),
+            Token::Whitespace(Whitespace::Space),
+            Token::Word(Word {
+                value: "table".to_string(),
+                quote_style: None,
+                keyword: Keyword::TABLE,
+            }),
+            Token::Period,
+            Token::Word(Word {
+                value: "_col".to_string(),
+                quote_style: None,
+                keyword: Keyword::NoKeyword,
+            }),
+        ];
+
+        compare(expected, tokens);
+    }
+
     #[test]
     fn tokenize_select_float() {
         let sql = String::from("SELECT .1");