From cba8730812800ada0c2b6bf260d8334610a5d3b9 Mon Sep 17 00:00:00 2001 From: MohamedAbdeen21 Date: Wed, 30 Apr 2025 23:29:24 +0100 Subject: [PATCH 1/3] Fix: parsing ident starting with underscore in certain dialects The dialects that support underscore as a separator in numeric literals used to parse ._123 as a number, meaning that an identifier like ._abc would be parsed as Number `._` and word `abc`, which is obv wrong. This PR splits the tokenizer branch for numbers and periods into two branches to make things easier, fixes the issue mentioned above and adds tests. --- src/tokenizer.rs | 141 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 106 insertions(+), 35 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 4fad54624..8bf7ceddd 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1189,8 +1189,8 @@ impl<'a> Tokenizer<'a> { Ok(Some(Token::make_word(&word.concat(), Some(quote_start)))) } - // numbers and period - '0'..='9' | '.' => { + // Numbers + '0'..='9' => { // Some dialects support underscore as number separator // There can only be one at a time and it must be followed by another digit let is_number_separator = |ch: char, next_char: Option| { @@ -1199,11 +1199,12 @@ impl<'a> Tokenizer<'a> { && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit()) }; + // Start with number or potential separator let mut s = peeking_next_take_while(chars, |ch, next_ch| { ch.is_ascii_digit() || is_number_separator(ch, next_ch) }); - // match binary literal that starts with 0x + // Match binary literal that starts with 0x if s == "0" && chars.peek() == Some(&'x') { chars.next(); let s2 = peeking_next_take_while(chars, |ch, next_ch| { @@ -1212,51 +1213,34 @@ impl<'a> Tokenizer<'a> { return Ok(Some(Token::HexStringLiteral(s2))); } - // match one period + // Match fractional part after a dot if let Some('.') = chars.peek() { s.push('.'); chars.next(); } - // If the dialect supports identifiers that start with a numeric prefix - // and we have now consumed a dot, check if the previous token was a Word. - // If so, what follows is definitely not part of a decimal number and - // we should yield the dot as a dedicated token so compound identifiers - // starting with digits can be parsed correctly. - if s == "." && self.dialect.supports_numeric_prefix() { - if let Some(Token::Word(_)) = prev_token { - return Ok(Some(Token::Period)); - } - } - - // Consume fractional digits. + // Consume fractional digits s += &peeking_next_take_while(chars, |ch, next_ch| { ch.is_ascii_digit() || is_number_separator(ch, next_ch) }); - // No fraction -> Token::Period - if s == "." { - return Ok(Some(Token::Period)); - } - - // Parse exponent as number + // Parse exponent part (e.g., e+10 or E-5) let mut exponent_part = String::new(); if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') { let mut char_clone = chars.peekable.clone(); - exponent_part.push(char_clone.next().unwrap()); + exponent_part.push(char_clone.next().unwrap()); // consume 'e' or 'E' // Optional sign - match char_clone.peek() { - Some(&c) if matches!(c, '+' | '-') => { + if let Some(&c) = char_clone.peek() { + if c == '+' || c == '-' { exponent_part.push(c); char_clone.next(); } - _ => (), } - match char_clone.peek() { - // Definitely an exponent, get original iterator up to speed and use it - Some(&c) if c.is_ascii_digit() => { + // Parse digits after the exponent + if let Some(&c) = char_clone.peek() { + if c.is_ascii_digit() { for _ in 0..exponent_part.len() { chars.next(); } @@ -1264,8 +1248,6 @@ impl<'a> Tokenizer<'a> { &peeking_take_while(chars, |ch| ch.is_ascii_digit()); s += exponent_part.as_str(); } - // Not an exponent, discard the work done - _ => (), } } @@ -1274,8 +1256,7 @@ impl<'a> Tokenizer<'a> { // be tokenized as a word. if self.dialect.supports_numeric_prefix() { if exponent_part.is_empty() { - // If it is not a number with an exponent, it may be - // an identifier starting with digits. + // Handle as potential word if no exponent part let word = peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch)); @@ -1284,20 +1265,84 @@ impl<'a> Tokenizer<'a> { return Ok(Some(Token::make_word(s.as_str(), None))); } } else if prev_token == Some(&Token::Period) { - // If the previous token was a period, thus not belonging to a number, - // the value we have is part of an identifier. + // Handle as word if it follows a period return Ok(Some(Token::make_word(s.as_str(), None))); } } + // Handle "L" suffix for long numbers let long = if chars.peek() == Some(&'L') { chars.next(); true } else { false }; + + // Return the final token for the number Ok(Some(Token::Number(s, long))) } + + // Period (`.`) handling + '.' => { + chars.next(); // consume the dot + + match chars.peek() { + Some('_') => { + // Handle "._" case as a period (special token) followed by identifier + Ok(Some(Token::Period)) + } + Some(ch) + // Hive and mysql dialects allow numeric prefixes for identifers + if ch.is_ascii_digit() + && self.dialect.supports_numeric_prefix() + && matches!(prev_token, Some(Token::Word(_))) => + { + Ok(Some(Token::Period)) + } + Some(ch) if ch.is_ascii_digit() => { + // Handle numbers starting with a dot (e.g., ".123") + let mut s = String::from("."); + let is_number_separator = |ch: char, next_char: Option| { + self.dialect.supports_numeric_literal_underscores() + && ch == '_' + && next_char.is_some_and(|c| c.is_ascii_digit()) + }; + + s += &peeking_next_take_while(chars, |ch, next_ch| { + ch.is_ascii_digit() || is_number_separator(ch, next_ch) + }); + + // Handle exponent part + if matches!(chars.peek(), Some('e' | 'E')) { + let mut exp = String::new(); + exp.push(chars.next().unwrap()); + + if matches!(chars.peek(), Some('+' | '-')) { + exp.push(chars.next().unwrap()); + } + + if matches!(chars.peek(), Some(c) if c.is_ascii_digit()) { + exp += &peeking_take_while(chars, |c| c.is_ascii_digit()); + s += &exp; + } + } + + // Handle "L" suffix for long numbers + let long = if chars.peek() == Some(&'L') { + chars.next(); + true + } else { + false + }; + + Ok(Some(Token::Number(s, long))) + } + _ => { + // Just a plain period + Ok(Some(Token::Period)) + } + } + } // punctuation '(' => self.consume_and_return(chars, Token::LParen), ')' => self.consume_and_return(chars, Token::RParen), @@ -2435,6 +2480,32 @@ mod tests { compare(expected, tokens); } + #[test] + fn tokenize_period_underscore() { + let sql = String::from("SELECT table._col"); + // a dialect that supports underscores in numeric literals + let dialect = PostgreSqlDialect {}; + let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); + + let expected = vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::Word(Word { + value: "table".to_string(), + quote_style: None, + keyword: Keyword::TABLE, + }), + Token::Period, + Token::Word(Word { + value: "_col".to_string(), + quote_style: None, + keyword: Keyword::NoKeyword, + }), + ]; + + compare(expected, tokens); + } + #[test] fn tokenize_select_float() { let sql = String::from("SELECT .1"); From 5c5b5e19e7986493d694776c78b5aeb56579fa60 Mon Sep 17 00:00:00 2001 From: MohamedAbdeen21 Date: Thu, 1 May 2025 21:42:57 +0100 Subject: [PATCH 2/3] Error on numeric literals and unqualified identifers starting with ._ --- src/tokenizer.rs | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 8bf7ceddd..7c4fbf9cc 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1287,10 +1287,17 @@ impl<'a> Tokenizer<'a> { chars.next(); // consume the dot match chars.peek() { - Some('_') => { - // Handle "._" case as a period (special token) followed by identifier + // Handle "._" case as a period followed by identifier + // if the last token was a word + Some('_') if matches!(prev_token, Some(Token::Word(_))) => { Ok(Some(Token::Period)) } + Some('_') => { + self.tokenizer_error( + chars.location(), + "Unexpected an underscore here".to_string(), + ) + } Some(ch) // Hive and mysql dialects allow numeric prefixes for identifers if ch.is_ascii_digit() @@ -2504,6 +2511,16 @@ mod tests { ]; compare(expected, tokens); + + let sql = String::from("SELECT ._123"); + if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() { + panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); + } + + let sql = String::from("SELECT ._abc"); + if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() { + panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); + } } #[test] From c1a32205e962e3c4c76220706695257f139b4e33 Mon Sep 17 00:00:00 2001 From: MohamedAbdeen21 Date: Thu, 1 May 2025 21:46:33 +0100 Subject: [PATCH 3/3] Update error message --- src/tokenizer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 7c4fbf9cc..10e810fbc 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1295,7 +1295,7 @@ impl<'a> Tokenizer<'a> { Some('_') => { self.tokenizer_error( chars.location(), - "Unexpected an underscore here".to_string(), + "Unexpected underscore here".to_string(), ) } Some(ch)