-
Notifications
You must be signed in to change notification settings - Fork 601
Fix: parsing ident starting with underscore in certain dialects #1835
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1186,8 +1186,8 @@ impl<'a> Tokenizer<'a> { | |
|
||
Ok(Some(Token::make_word(&word.concat(), Some(quote_start)))) | ||
} | ||
// numbers and period | ||
'0'..='9' | '.' => { | ||
// Numbers | ||
'0'..='9' => { | ||
// Some dialects support underscore as number separator | ||
// There can only be one at a time and it must be followed by another digit | ||
let is_number_separator = |ch: char, next_char: Option<char>| { | ||
|
@@ -1196,11 +1196,12 @@ impl<'a> Tokenizer<'a> { | |
&& next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit()) | ||
}; | ||
|
||
// Start with number or potential separator | ||
let mut s = peeking_next_take_while(chars, |ch, next_ch| { | ||
ch.is_ascii_digit() || is_number_separator(ch, next_ch) | ||
}); | ||
|
||
// match binary literal that starts with 0x | ||
// Match binary literal that starts with 0x | ||
if s == "0" && chars.peek() == Some(&'x') { | ||
chars.next(); | ||
let s2 = peeking_next_take_while(chars, |ch, next_ch| { | ||
|
@@ -1209,60 +1210,41 @@ impl<'a> Tokenizer<'a> { | |
return Ok(Some(Token::HexStringLiteral(s2))); | ||
} | ||
|
||
// match one period | ||
// Match fractional part after a dot | ||
if let Some('.') = chars.peek() { | ||
s.push('.'); | ||
chars.next(); | ||
} | ||
|
||
// If the dialect supports identifiers that start with a numeric prefix | ||
// and we have now consumed a dot, check if the previous token was a Word. | ||
// If so, what follows is definitely not part of a decimal number and | ||
// we should yield the dot as a dedicated token so compound identifiers | ||
// starting with digits can be parsed correctly. | ||
if s == "." && self.dialect.supports_numeric_prefix() { | ||
if let Some(Token::Word(_)) = prev_token { | ||
return Ok(Some(Token::Period)); | ||
} | ||
} | ||
|
||
// Consume fractional digits. | ||
// Consume fractional digits | ||
s += &peeking_next_take_while(chars, |ch, next_ch| { | ||
ch.is_ascii_digit() || is_number_separator(ch, next_ch) | ||
}); | ||
|
||
// No fraction -> Token::Period | ||
if s == "." { | ||
return Ok(Some(Token::Period)); | ||
} | ||
|
||
// Parse exponent as number | ||
// Parse exponent part (e.g., e+10 or E-5) | ||
let mut exponent_part = String::new(); | ||
if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') { | ||
let mut char_clone = chars.peekable.clone(); | ||
exponent_part.push(char_clone.next().unwrap()); | ||
exponent_part.push(char_clone.next().unwrap()); // consume 'e' or 'E' | ||
|
||
// Optional sign | ||
match char_clone.peek() { | ||
Some(&c) if matches!(c, '+' | '-') => { | ||
if let Some(&c) = char_clone.peek() { | ||
if c == '+' || c == '-' { | ||
exponent_part.push(c); | ||
char_clone.next(); | ||
} | ||
_ => (), | ||
} | ||
|
||
match char_clone.peek() { | ||
// Definitely an exponent, get original iterator up to speed and use it | ||
Some(&c) if c.is_ascii_digit() => { | ||
// Parse digits after the exponent | ||
if let Some(&c) = char_clone.peek() { | ||
if c.is_ascii_digit() { | ||
for _ in 0..exponent_part.len() { | ||
chars.next(); | ||
} | ||
exponent_part += | ||
&peeking_take_while(chars, |ch| ch.is_ascii_digit()); | ||
s += exponent_part.as_str(); | ||
} | ||
// Not an exponent, discard the work done | ||
_ => (), | ||
} | ||
} | ||
|
||
|
@@ -1271,8 +1253,7 @@ impl<'a> Tokenizer<'a> { | |
// be tokenized as a word. | ||
if self.dialect.supports_numeric_prefix() { | ||
if exponent_part.is_empty() { | ||
// If it is not a number with an exponent, it may be | ||
// an identifier starting with digits. | ||
// Handle as potential word if no exponent part | ||
let word = | ||
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch)); | ||
|
||
|
@@ -1281,20 +1262,91 @@ impl<'a> Tokenizer<'a> { | |
return Ok(Some(Token::make_word(s.as_str(), None))); | ||
} | ||
} else if prev_token == Some(&Token::Period) { | ||
// If the previous token was a period, thus not belonging to a number, | ||
// the value we have is part of an identifier. | ||
// Handle as word if it follows a period | ||
return Ok(Some(Token::make_word(s.as_str(), None))); | ||
} | ||
} | ||
|
||
// Handle "L" suffix for long numbers | ||
let long = if chars.peek() == Some(&'L') { | ||
chars.next(); | ||
true | ||
} else { | ||
false | ||
}; | ||
|
||
// Return the final token for the number | ||
Ok(Some(Token::Number(s, long))) | ||
} | ||
|
||
// Period (`.`) handling | ||
'.' => { | ||
chars.next(); // consume the dot | ||
|
||
match chars.peek() { | ||
// Handle "._" case as a period followed by identifier | ||
// if the last token was a word | ||
Some('_') if matches!(prev_token, Some(Token::Word(_))) => { | ||
Ok(Some(Token::Period)) | ||
} | ||
Some('_') => { | ||
self.tokenizer_error( | ||
chars.location(), | ||
"Unexpected underscore here".to_string(), | ||
) | ||
} | ||
Some(ch) | ||
// Hive and mysql dialects allow numeric prefixes for identifers | ||
if ch.is_ascii_digit() | ||
&& self.dialect.supports_numeric_prefix() | ||
&& matches!(prev_token, Some(Token::Word(_))) => | ||
{ | ||
Ok(Some(Token::Period)) | ||
} | ||
Some(ch) if ch.is_ascii_digit() => { | ||
// Handle numbers starting with a dot (e.g., ".123") | ||
let mut s = String::from("."); | ||
let is_number_separator = |ch: char, next_char: Option<char>| { | ||
self.dialect.supports_numeric_literal_underscores() | ||
&& ch == '_' | ||
&& next_char.is_some_and(|c| c.is_ascii_digit()) | ||
}; | ||
|
||
s += &peeking_next_take_while(chars, |ch, next_ch| { | ||
ch.is_ascii_digit() || is_number_separator(ch, next_ch) | ||
}); | ||
|
||
// Handle exponent part | ||
if matches!(chars.peek(), Some('e' | 'E')) { | ||
let mut exp = String::new(); | ||
exp.push(chars.next().unwrap()); | ||
|
||
if matches!(chars.peek(), Some('+' | '-')) { | ||
exp.push(chars.next().unwrap()); | ||
} | ||
|
||
if matches!(chars.peek(), Some(c) if c.is_ascii_digit()) { | ||
exp += &peeking_take_while(chars, |c| c.is_ascii_digit()); | ||
s += &exp; | ||
} | ||
} | ||
|
||
// Handle "L" suffix for long numbers | ||
let long = if chars.peek() == Some(&'L') { | ||
chars.next(); | ||
true | ||
Comment on lines
+1309
to
+1337
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm most of this logic looks to already be duplicated on the number parsing code path, so that that side effect would be undesirable I think. If I understood the issue being solved for, its only the case of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I know. I tried to fix it without splitting the branches, but I couldn't. Feel free to take a go at it if possible There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The problem is the match happens on a peek and you need to consume the dot in order to peek the underscore. What if the second peek wasn't an underscore? You need to un-consume the dot for it to be parsed as part of the number. |
||
} else { | ||
false | ||
}; | ||
|
||
Ok(Some(Token::Number(s, long))) | ||
} | ||
_ => { | ||
// Just a plain period | ||
Ok(Some(Token::Period)) | ||
} | ||
} | ||
} | ||
// punctuation | ||
'(' => self.consume_and_return(chars, Token::LParen), | ||
')' => self.consume_and_return(chars, Token::RParen), | ||
|
@@ -2429,6 +2481,42 @@ mod tests { | |
compare(expected, tokens); | ||
} | ||
|
||
#[test] | ||
fn tokenize_period_underscore() { | ||
let sql = String::from("SELECT table._col"); | ||
// a dialect that supports underscores in numeric literals | ||
let dialect = PostgreSqlDialect {}; | ||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); | ||
|
||
let expected = vec![ | ||
Token::make_keyword("SELECT"), | ||
Token::Whitespace(Whitespace::Space), | ||
Token::Word(Word { | ||
value: "table".to_string(), | ||
quote_style: None, | ||
keyword: Keyword::TABLE, | ||
}), | ||
Token::Period, | ||
Token::Word(Word { | ||
value: "_col".to_string(), | ||
quote_style: None, | ||
keyword: Keyword::NoKeyword, | ||
}), | ||
]; | ||
|
||
compare(expected, tokens); | ||
|
||
let sql = String::from("SELECT ._123"); | ||
if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() { | ||
panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); | ||
} | ||
|
||
let sql = String::from("SELECT ._abc"); | ||
if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() { | ||
panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); | ||
} | ||
} | ||
|
||
#[test] | ||
fn tokenize_select_float() { | ||
let sql = String::from("SELECT .1"); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if its worth returning an error here or whether we lose anything by allowing the tokenizer continue? I'm guessing its still possible for the tokenizer to return
Token::Period
here as well?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
AFAIK underscore after a dot can only be an identifier, so it makes sense to error here