Fix tokenization of qualified identifiers with numeric prefix.

Roman Borschel · Roman Borschel · commit 02798831d40f · 2025-04-09T21:13:59.000+02:00
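Queries with qualified identifiers having numeric prefixes currently
fail to parse due to incorrect tokenization.

For example, "t.123abc" tokenizes as "t" (Word) followed by ".123abc"
(Number). With this change, for dialects that support numeric-prefixed
identifiers, a "." that directly follows a Word is emitted as its own
Period token, so the same input tokenizes as "t" (Word), "." (Period),
"123abc" (Word).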
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
@@ -895,7 +895,7 @@ impl<'a> Tokenizer<'a> {
         };
 
         let mut location = state.location();
-        while let Some(token) = self.next_token(&mut state)? {
+        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
             let span = location.span_to(state.location());
 
             buf.push(TokenWithSpan { token, span });
@@ -932,7 +932,11 @@ impl<'a> Tokenizer<'a> {
     }
 
     /// Get the next token or return None
-    fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
+    fn next_token(
+        &self,
+        chars: &mut State,
+        prev_token: Option<&Token>,
+    ) -> Result<Option<Token>, TokenizerError> {
         match chars.peek() {
             Some(&ch) => match ch {
                 ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
@@ -1211,17 +1215,29 @@ impl<'a> Tokenizer<'a> {
                         chars.next();
                     }
 
+                    // If the dialect supports identifiers that start with a numeric prefix
+                    // and we have now consumed a dot, check if the previous token was a Word.
+                    // If so, what follows is definitely not part of a decimal number and
+                    // we should yield the dot as a dedicated token so compound identifiers
+                    // starting with digits can be parsed correctly.
+                    if s == "." && self.dialect.supports_numeric_prefix() {
+                        if let Some(Token::Word(_)) = prev_token {
+                            return Ok(Some(Token::Period));
+                        }
+                    }
+
+                    // Consume fractional digits.
                     s += &peeking_next_take_while(chars, |ch, next_ch| {
                         ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                     });
 
-                    // No number -> Token::Period
+                    // No fraction -> Token::Period
                     if s == "." {
                         return Ok(Some(Token::Period));
                     }
 
-                    let mut exponent_part = String::new();
                     // Parse exponent as number
+                    let mut exponent_part = String::new();
                     if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                         let mut char_clone = chars.peekable.clone();
                         exponent_part.push(char_clone.next().unwrap());
@@ -1250,14 +1266,23 @@ impl<'a> Tokenizer<'a> {
                         }
                     }
 
-                    // mysql dialect supports identifiers that start with a numeric prefix,
-                    // as long as they aren't an exponent number.
-                    if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() {
-                        let word =
-                            peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
-
-                        if !word.is_empty() {
-                            s += word.as_str();
+                    // If the dialect supports identifiers that start with a numeric prefix,
+                    // we need to check if the value is in fact an identifier and must thus
+                    // be tokenized as a word.
+                    if self.dialect.supports_numeric_prefix() {
+                        if exponent_part.is_empty() {
+                            // If it is not a number with an exponent, it may be
+                            // an unqualified identifier starting with digits.
+                            let word =
+                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
+
+                            if !word.is_empty() {
+                                s += word.as_str();
+                                return Ok(Some(Token::make_word(s.as_str(), None)));
+                            }
+                        } else if prev_token == Some(&Token::Period) {
+                            // If the previous token was a period, thus not belonging to a number,
+                            // the value we have is part of an identifier.
                             return Ok(Some(Token::make_word(s.as_str(), None)));
                         }
                     }
@@ -3960,4 +3985,31 @@ mod tests {
                 ],
             );
     }
+
+    /// Tokenizer-level checks for dialects that allow identifiers to start with digits.
+    #[test]
+    fn test_tokenize_identifiers_numeric_prefix() {
+        // Each entry pairs an input with the exact token sequence it must produce.
+        let cases = vec![
+            // Digits followed by identifier characters form a single word.
+            ("123abc", vec![Token::make_word("123abc", None)]),
+            // On its own, a digit run with an exponent stays a number.
+            ("12e34", vec![Token::Number("12e34".to_string(), false)]),
+            // Behind a qualifier and a period, the same text is a word instead.
+            (
+                "t.12e34",
+                vec![
+                    Token::make_word("t", None),
+                    Token::Period,
+                    Token::make_word("12e34", None),
+                ],
+            ),
+            // Qualified identifier whose last part starts with a digit.
+            (
+                "t.1two3",
+                vec![
+                    Token::make_word("t", None),
+                    Token::Period,
+                    Token::make_word("1two3", None),
+                ],
+            ),
+        ];
+
+        for (sql, expected) in cases {
+            all_dialects_where(|dialect| dialect.supports_numeric_prefix())
+                .tokenizes_to(sql, expected);
+        }
+    }
 }
diff --git a/tests/sqlparser_mysql.rs b/tests/sqlparser_mysql.rs
@@ -1926,6 +1926,106 @@ fn parse_select_with_numeric_prefix_column_name() {
     }
 }
 
+/// Checks how MySQL parses identifiers that start with digits when they are
+/// qualified (`t.15to29`, `t.15e29`), unqualified (`15e29`), and backtick-quoted.
+///
+/// Each case binds its SQL text once so that the round-trip check and the
+/// structural check always run against the same statement.
+#[test]
+fn parse_qualified_identifiers_with_numeric_prefix() {
+    // Case 1: Qualified column name that starts with digits.
+    let sql = "SELECT t.15to29 FROM my_table AS t";
+    mysql().verified_stmt(sql);
+    match mysql().parse_sql_statements(sql).unwrap().pop() {
+        Some(Statement::Query(q)) => match *q.body {
+            SetExpr::Select(s) => match s.projection.last() {
+                Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
+                    assert_eq!(&[Ident::new("t"), Ident::new("15to29")], &parts[..]);
+                }
+                proj => panic!("Unexpected projection: {:?}", proj),
+            },
+            body => panic!("Unexpected statement body: {:?}", body),
+        },
+        stmt => panic!("Unexpected statement: {:?}", stmt),
+    }
+
+    // Case 2: Qualified column name that starts with digits and on its own represents a number.
+    let sql = "SELECT t.15e29 FROM my_table AS t";
+    mysql().verified_stmt(sql);
+    match mysql().parse_sql_statements(sql).unwrap().pop() {
+        Some(Statement::Query(q)) => match *q.body {
+            SetExpr::Select(s) => match s.projection.last() {
+                Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
+                    assert_eq!(&[Ident::new("t"), Ident::new("15e29")], &parts[..]);
+                }
+                proj => panic!("Unexpected projection: {:?}", proj),
+            },
+            body => panic!("Unexpected statement body: {:?}", body),
+        },
+        stmt => panic!("Unexpected statement: {:?}", stmt),
+    }
+
+    // Case 3: Unqualified, the same token is parsed as a number.
+    // Only the parsed structure is checked here; no round-trip check is made.
+    let sql = "SELECT 15e29 FROM my_table";
+    match mysql().parse_sql_statements(sql).unwrap().pop() {
+        Some(Statement::Query(q)) => match *q.body {
+            SetExpr::Select(s) => match s.projection.last() {
+                Some(SelectItem::UnnamedExpr(Expr::Value(ValueWithSpan { value, .. }))) => {
+                    assert_eq!(&number("15e29"), value);
+                }
+                proj => panic!("Unexpected projection: {:?}", proj),
+            },
+            body => panic!("Unexpected statement body: {:?}", body),
+        },
+        stmt => panic!("Unexpected statement: {:?}", stmt),
+    }
+
+    // Case 4: Quoted simple identifier.
+    let sql = "SELECT `15e29` FROM my_table";
+    mysql().verified_stmt(sql);
+    match mysql().parse_sql_statements(sql).unwrap().pop() {
+        Some(Statement::Query(q)) => match *q.body {
+            SetExpr::Select(s) => match s.projection.last() {
+                Some(SelectItem::UnnamedExpr(Expr::Identifier(name))) => {
+                    assert_eq!(&Ident::with_quote('`', "15e29"), name);
+                }
+                proj => panic!("Unexpected projection: {:?}", proj),
+            },
+            body => panic!("Unexpected statement body: {:?}", body),
+        },
+        stmt => panic!("Unexpected statement: {:?}", stmt),
+    }
+
+    // Case 5: Quoted compound identifier.
+    // The round-trip check now uses the aliased statement too, matching the parse below.
+    let sql = "SELECT t.`15e29` FROM my_table AS t";
+    mysql().verified_stmt(sql);
+    match mysql().parse_sql_statements(sql).unwrap().pop() {
+        Some(Statement::Query(q)) => match *q.body {
+            SetExpr::Select(s) => match s.projection.last() {
+                Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
+                    assert_eq!(
+                        &[Ident::new("t"), Ident::with_quote('`', "15e29")],
+                        &parts[..]
+                    );
+                }
+                proj => panic!("Unexpected projection: {:?}", proj),
+            },
+            body => panic!("Unexpected statement body: {:?}", body),
+        },
+        stmt => panic!("Unexpected statement: {:?}", stmt),
+    }
+}
+
 // Don't run with bigdecimal as it fails like this on rust beta:
 //
 // 'parse_select_with_concatenation_of_exp_number_and_numeric_prefix_column'