Skip to content

Commit cba8730

Browse files
Fix: parsing ident starting with underscore in certain dialects
Dialects that support underscore as a separator in numeric literals used to parse `._123` as a number, meaning that an identifier like `._abc` would be tokenized as the number `._` followed by the word `abc`, which is obviously wrong. This PR splits the tokenizer branch for numbers and periods into two separate branches to simplify the logic, fixes the issue described above, and adds tests.
1 parent ac1c339 commit cba8730

File tree

1 file changed

+106
-35
lines changed

1 file changed

+106
-35
lines changed

src/tokenizer.rs

Lines changed: 106 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1189,8 +1189,8 @@ impl<'a> Tokenizer<'a> {
11891189

11901190
Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
11911191
}
1192-
// numbers and period
1193-
'0'..='9' | '.' => {
1192+
// Numbers
1193+
'0'..='9' => {
11941194
// Some dialects support underscore as number separator
11951195
// There can only be one at a time and it must be followed by another digit
11961196
let is_number_separator = |ch: char, next_char: Option<char>| {
@@ -1199,11 +1199,12 @@ impl<'a> Tokenizer<'a> {
11991199
&& next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
12001200
};
12011201

1202+
// Start with number or potential separator
12021203
let mut s = peeking_next_take_while(chars, |ch, next_ch| {
12031204
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
12041205
});
12051206

1206-
// match binary literal that starts with 0x
1207+
// Match binary literal that starts with 0x
12071208
if s == "0" && chars.peek() == Some(&'x') {
12081209
chars.next();
12091210
let s2 = peeking_next_take_while(chars, |ch, next_ch| {
@@ -1212,60 +1213,41 @@ impl<'a> Tokenizer<'a> {
12121213
return Ok(Some(Token::HexStringLiteral(s2)));
12131214
}
12141215

1215-
// match one period
1216+
// Match fractional part after a dot
12161217
if let Some('.') = chars.peek() {
12171218
s.push('.');
12181219
chars.next();
12191220
}
12201221

1221-
// If the dialect supports identifiers that start with a numeric prefix
1222-
// and we have now consumed a dot, check if the previous token was a Word.
1223-
// If so, what follows is definitely not part of a decimal number and
1224-
// we should yield the dot as a dedicated token so compound identifiers
1225-
// starting with digits can be parsed correctly.
1226-
if s == "." && self.dialect.supports_numeric_prefix() {
1227-
if let Some(Token::Word(_)) = prev_token {
1228-
return Ok(Some(Token::Period));
1229-
}
1230-
}
1231-
1232-
// Consume fractional digits.
1222+
// Consume fractional digits
12331223
s += &peeking_next_take_while(chars, |ch, next_ch| {
12341224
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
12351225
});
12361226

1237-
// No fraction -> Token::Period
1238-
if s == "." {
1239-
return Ok(Some(Token::Period));
1240-
}
1241-
1242-
// Parse exponent as number
1227+
// Parse exponent part (e.g., e+10 or E-5)
12431228
let mut exponent_part = String::new();
12441229
if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
12451230
let mut char_clone = chars.peekable.clone();
1246-
exponent_part.push(char_clone.next().unwrap());
1231+
exponent_part.push(char_clone.next().unwrap()); // consume 'e' or 'E'
12471232

12481233
// Optional sign
1249-
match char_clone.peek() {
1250-
Some(&c) if matches!(c, '+' | '-') => {
1234+
if let Some(&c) = char_clone.peek() {
1235+
if c == '+' || c == '-' {
12511236
exponent_part.push(c);
12521237
char_clone.next();
12531238
}
1254-
_ => (),
12551239
}
12561240

1257-
match char_clone.peek() {
1258-
// Definitely an exponent, get original iterator up to speed and use it
1259-
Some(&c) if c.is_ascii_digit() => {
1241+
// Parse digits after the exponent
1242+
if let Some(&c) = char_clone.peek() {
1243+
if c.is_ascii_digit() {
12601244
for _ in 0..exponent_part.len() {
12611245
chars.next();
12621246
}
12631247
exponent_part +=
12641248
&peeking_take_while(chars, |ch| ch.is_ascii_digit());
12651249
s += exponent_part.as_str();
12661250
}
1267-
// Not an exponent, discard the work done
1268-
_ => (),
12691251
}
12701252
}
12711253

@@ -1274,8 +1256,7 @@ impl<'a> Tokenizer<'a> {
12741256
// be tokenized as a word.
12751257
if self.dialect.supports_numeric_prefix() {
12761258
if exponent_part.is_empty() {
1277-
// If it is not a number with an exponent, it may be
1278-
// an identifier starting with digits.
1259+
// Handle as potential word if no exponent part
12791260
let word =
12801261
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
12811262

@@ -1284,20 +1265,84 @@ impl<'a> Tokenizer<'a> {
12841265
return Ok(Some(Token::make_word(s.as_str(), None)));
12851266
}
12861267
} else if prev_token == Some(&Token::Period) {
1287-
// If the previous token was a period, thus not belonging to a number,
1288-
// the value we have is part of an identifier.
1268+
// Handle as word if it follows a period
12891269
return Ok(Some(Token::make_word(s.as_str(), None)));
12901270
}
12911271
}
12921272

1273+
// Handle "L" suffix for long numbers
12931274
let long = if chars.peek() == Some(&'L') {
12941275
chars.next();
12951276
true
12961277
} else {
12971278
false
12981279
};
1280+
1281+
// Return the final token for the number
12991282
Ok(Some(Token::Number(s, long)))
13001283
}
1284+
1285+
// Period (`.`) handling
1286+
'.' => {
1287+
chars.next(); // consume the dot
1288+
1289+
match chars.peek() {
1290+
Some('_') => {
1291+
// Handle "._" case as a period (special token) followed by identifier
1292+
Ok(Some(Token::Period))
1293+
}
1294+
Some(ch)
1295+
// Hive and MySQL dialects allow numeric prefixes for identifiers
1296+
if ch.is_ascii_digit()
1297+
&& self.dialect.supports_numeric_prefix()
1298+
&& matches!(prev_token, Some(Token::Word(_))) =>
1299+
{
1300+
Ok(Some(Token::Period))
1301+
}
1302+
Some(ch) if ch.is_ascii_digit() => {
1303+
// Handle numbers starting with a dot (e.g., ".123")
1304+
let mut s = String::from(".");
1305+
let is_number_separator = |ch: char, next_char: Option<char>| {
1306+
self.dialect.supports_numeric_literal_underscores()
1307+
&& ch == '_'
1308+
&& next_char.is_some_and(|c| c.is_ascii_digit())
1309+
};
1310+
1311+
s += &peeking_next_take_while(chars, |ch, next_ch| {
1312+
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1313+
});
1314+
1315+
// Handle exponent part
1316+
if matches!(chars.peek(), Some('e' | 'E')) {
1317+
let mut exp = String::new();
1318+
exp.push(chars.next().unwrap());
1319+
1320+
if matches!(chars.peek(), Some('+' | '-')) {
1321+
exp.push(chars.next().unwrap());
1322+
}
1323+
1324+
if matches!(chars.peek(), Some(c) if c.is_ascii_digit()) {
1325+
exp += &peeking_take_while(chars, |c| c.is_ascii_digit());
1326+
s += &exp;
1327+
}
1328+
}
1329+
1330+
// Handle "L" suffix for long numbers
1331+
let long = if chars.peek() == Some(&'L') {
1332+
chars.next();
1333+
true
1334+
} else {
1335+
false
1336+
};
1337+
1338+
Ok(Some(Token::Number(s, long)))
1339+
}
1340+
_ => {
1341+
// Just a plain period
1342+
Ok(Some(Token::Period))
1343+
}
1344+
}
1345+
}
13011346
// punctuation
13021347
'(' => self.consume_and_return(chars, Token::LParen),
13031348
')' => self.consume_and_return(chars, Token::RParen),
@@ -2435,6 +2480,32 @@ mod tests {
24352480
compare(expected, tokens);
24362481
}
24372482

2483+
#[test]
2484+
fn tokenize_period_underscore() {
2485+
let sql = String::from("SELECT table._col");
2486+
// a dialect that supports underscores in numeric literals
2487+
let dialect = PostgreSqlDialect {};
2488+
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2489+
2490+
let expected = vec![
2491+
Token::make_keyword("SELECT"),
2492+
Token::Whitespace(Whitespace::Space),
2493+
Token::Word(Word {
2494+
value: "table".to_string(),
2495+
quote_style: None,
2496+
keyword: Keyword::TABLE,
2497+
}),
2498+
Token::Period,
2499+
Token::Word(Word {
2500+
value: "_col".to_string(),
2501+
quote_style: None,
2502+
keyword: Keyword::NoKeyword,
2503+
}),
2504+
];
2505+
2506+
compare(expected, tokens);
2507+
}
2508+
24382509
#[test]
24392510
fn tokenize_select_float() {
24402511
let sql = String::from("SELECT .1");

0 commit comments

Comments
 (0)