Skip to content

Fix: parsing ident starting with underscore in certain dialects #1835

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 123 additions & 35 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1186,8 +1186,8 @@ impl<'a> Tokenizer<'a> {

Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
}
// numbers and period
'0'..='9' | '.' => {
// Numbers
'0'..='9' => {
// Some dialects support underscore as number separator
// There can only be one at a time and it must be followed by another digit
let is_number_separator = |ch: char, next_char: Option<char>| {
Expand All @@ -1196,11 +1196,12 @@ impl<'a> Tokenizer<'a> {
&& next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
};

// Start with number or potential separator
let mut s = peeking_next_take_while(chars, |ch, next_ch| {
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
});

// match binary literal that starts with 0x
// Match hex literal that starts with 0x
if s == "0" && chars.peek() == Some(&'x') {
chars.next();
let s2 = peeking_next_take_while(chars, |ch, next_ch| {
Expand All @@ -1209,60 +1210,41 @@ impl<'a> Tokenizer<'a> {
return Ok(Some(Token::HexStringLiteral(s2)));
}

// match one period
// Match fractional part after a dot
if let Some('.') = chars.peek() {
s.push('.');
chars.next();
}

// If the dialect supports identifiers that start with a numeric prefix
// and we have now consumed a dot, check if the previous token was a Word.
// If so, what follows is definitely not part of a decimal number and
// we should yield the dot as a dedicated token so compound identifiers
// starting with digits can be parsed correctly.
if s == "." && self.dialect.supports_numeric_prefix() {
if let Some(Token::Word(_)) = prev_token {
return Ok(Some(Token::Period));
}
}

// Consume fractional digits.
// Consume fractional digits
s += &peeking_next_take_while(chars, |ch, next_ch| {
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
});

// No fraction -> Token::Period
if s == "." {
return Ok(Some(Token::Period));
}

// Parse exponent as number
// Parse exponent part (e.g., e+10 or E-5)
let mut exponent_part = String::new();
if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
let mut char_clone = chars.peekable.clone();
exponent_part.push(char_clone.next().unwrap());
exponent_part.push(char_clone.next().unwrap()); // consume 'e' or 'E'

// Optional sign
match char_clone.peek() {
Some(&c) if matches!(c, '+' | '-') => {
if let Some(&c) = char_clone.peek() {
if c == '+' || c == '-' {
exponent_part.push(c);
char_clone.next();
}
_ => (),
}

match char_clone.peek() {
// Definitely an exponent, get original iterator up to speed and use it
Some(&c) if c.is_ascii_digit() => {
// Parse digits after the exponent
if let Some(&c) = char_clone.peek() {
if c.is_ascii_digit() {
for _ in 0..exponent_part.len() {
chars.next();
}
exponent_part +=
&peeking_take_while(chars, |ch| ch.is_ascii_digit());
s += exponent_part.as_str();
}
// Not an exponent, discard the work done
_ => (),
}
}

Expand All @@ -1271,8 +1253,7 @@ impl<'a> Tokenizer<'a> {
// be tokenized as a word.
if self.dialect.supports_numeric_prefix() {
if exponent_part.is_empty() {
// If it is not a number with an exponent, it may be
// an identifier starting with digits.
// Handle as potential word if no exponent part
let word =
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

Expand All @@ -1281,20 +1262,91 @@ impl<'a> Tokenizer<'a> {
return Ok(Some(Token::make_word(s.as_str(), None)));
}
} else if prev_token == Some(&Token::Period) {
// If the previous token was a period, thus not belonging to a number,
// the value we have is part of an identifier.
// Handle as word if it follows a period
return Ok(Some(Token::make_word(s.as_str(), None)));
}
}

// Handle "L" suffix for long numbers
let long = if chars.peek() == Some(&'L') {
chars.next();
true
} else {
false
};

// Return the final token for the number
Ok(Some(Token::Number(s, long)))
}

// Period (`.`) handling
'.' => {
chars.next(); // consume the dot

match chars.peek() {
// Handle "._" case as a period followed by identifier
// if the last token was a word
Some('_') if matches!(prev_token, Some(Token::Word(_))) => {
Ok(Some(Token::Period))
}
Some('_') => {
self.tokenizer_error(
chars.location(),
"Unexpected underscore here".to_string(),
)
}
Comment on lines +1292 to +1297
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if it's worth returning an error here or whether we lose anything by allowing the tokenizer to continue? I'm guessing it's still possible for the tokenizer to return Token::Period here as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK underscore after a dot can only be an identifier, so it makes sense to error here

Some(ch)
// Hive and mysql dialects allow numeric prefixes for identifiers
if ch.is_ascii_digit()
&& self.dialect.supports_numeric_prefix()
&& matches!(prev_token, Some(Token::Word(_))) =>
{
Ok(Some(Token::Period))
}
Some(ch) if ch.is_ascii_digit() => {
// Handle numbers starting with a dot (e.g., ".123")
let mut s = String::from(".");
let is_number_separator = |ch: char, next_char: Option<char>| {
self.dialect.supports_numeric_literal_underscores()
&& ch == '_'
&& next_char.is_some_and(|c| c.is_ascii_digit())
};

s += &peeking_next_take_while(chars, |ch, next_ch| {
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
});

// Handle exponent part
if matches!(chars.peek(), Some('e' | 'E')) {
let mut exp = String::new();
exp.push(chars.next().unwrap());

if matches!(chars.peek(), Some('+' | '-')) {
exp.push(chars.next().unwrap());
}

if matches!(chars.peek(), Some(c) if c.is_ascii_digit()) {
exp += &peeking_take_while(chars, |c| c.is_ascii_digit());
s += &exp;
}
}

// Handle "L" suffix for long numbers
let long = if chars.peek() == Some(&'L') {
chars.next();
true
Comment on lines +1309 to +1337
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, most of this logic looks to already be duplicated on the number parsing code path, so that side effect would be undesirable, I think.

If I understood the issue being solved for, it's only the case of ._ being parsed as a number — would it be possible/more desirable to only update the existing logic to properly detect and handle that case, or is the current logic not well equipped to handle that sanely?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know. I tried to fix it without splitting the branches, but I couldn't.

Feel free to take a go at it if possible

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is the match happens on a peek and you need to consume the dot in order to peek the underscore.

What if the second peek wasn't an underscore? You need to un-consume the dot for it to be parsed as part of the number.

} else {
false
};

Ok(Some(Token::Number(s, long)))
}
_ => {
// Just a plain period
Ok(Some(Token::Period))
}
}
}
// punctuation
'(' => self.consume_and_return(chars, Token::LParen),
')' => self.consume_and_return(chars, Token::RParen),
Expand Down Expand Up @@ -2429,6 +2481,42 @@ mod tests {
compare(expected, tokens);
}

#[test]
fn tokenize_period_underscore() {
    // A dialect that supports underscores in numeric literals.
    let dialect = PostgreSqlDialect {};

    // An identifier beginning with `_` after a period must tokenize as a
    // Period token followed by a Word, not be swallowed into a number.
    let sql = String::from("SELECT table._col");
    let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

    let expected = vec![
        Token::make_keyword("SELECT"),
        Token::Whitespace(Whitespace::Space),
        Token::Word(Word {
            value: "table".to_string(),
            quote_style: None,
            keyword: Keyword::TABLE,
        }),
        Token::Period,
        Token::Word(Word {
            value: "_col".to_string(),
            quote_style: None,
            keyword: Keyword::NoKeyword,
        }),
    ];
    compare(expected, tokens);

    // A `._` with no preceding word must be rejected by the tokenizer.
    for sql in [String::from("SELECT ._123"), String::from("SELECT ._abc")] {
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }
    }
}

#[test]
fn tokenize_select_float() {
let sql = String::from("SELECT .1");
Expand Down