@@ -1189,8 +1189,8 @@ impl<'a> Tokenizer<'a> {
1189
1189
1190
1190
Ok ( Some ( Token :: make_word ( & word. concat ( ) , Some ( quote_start) ) ) )
1191
1191
}
1192
- // numbers and period
1193
- '0' ..='9' | '.' => {
1192
+ // Numbers
1193
+ '0' ..='9' => {
1194
1194
// Some dialects support underscore as number separator
1195
1195
// There can only be one at a time and it must be followed by another digit
1196
1196
let is_number_separator = |ch : char , next_char : Option < char > | {
@@ -1199,11 +1199,12 @@ impl<'a> Tokenizer<'a> {
1199
1199
&& next_char. is_some_and ( |next_ch| next_ch. is_ascii_hexdigit ( ) )
1200
1200
} ;
1201
1201
1202
+ // Start with number or potential separator
1202
1203
let mut s = peeking_next_take_while ( chars, |ch, next_ch| {
1203
1204
ch. is_ascii_digit ( ) || is_number_separator ( ch, next_ch)
1204
1205
} ) ;
1205
1206
1206
- // match binary literal that starts with 0x
1207
+ // Match hex string literal that starts with 0x
1207
1208
if s == "0" && chars. peek ( ) == Some ( & 'x' ) {
1208
1209
chars. next ( ) ;
1209
1210
let s2 = peeking_next_take_while ( chars, |ch, next_ch| {
@@ -1212,60 +1213,41 @@ impl<'a> Tokenizer<'a> {
1212
1213
return Ok ( Some ( Token :: HexStringLiteral ( s2) ) ) ;
1213
1214
}
1214
1215
1215
- // match one period
1216
+ // Match fractional part after a dot
1216
1217
if let Some ( '.' ) = chars. peek ( ) {
1217
1218
s. push ( '.' ) ;
1218
1219
chars. next ( ) ;
1219
1220
}
1220
1221
1221
- // If the dialect supports identifiers that start with a numeric prefix
1222
- // and we have now consumed a dot, check if the previous token was a Word.
1223
- // If so, what follows is definitely not part of a decimal number and
1224
- // we should yield the dot as a dedicated token so compound identifiers
1225
- // starting with digits can be parsed correctly.
1226
- if s == "." && self . dialect . supports_numeric_prefix ( ) {
1227
- if let Some ( Token :: Word ( _) ) = prev_token {
1228
- return Ok ( Some ( Token :: Period ) ) ;
1229
- }
1230
- }
1231
-
1232
- // Consume fractional digits.
1222
+ // Consume fractional digits
1233
1223
s += & peeking_next_take_while ( chars, |ch, next_ch| {
1234
1224
ch. is_ascii_digit ( ) || is_number_separator ( ch, next_ch)
1235
1225
} ) ;
1236
1226
1237
- // No fraction -> Token::Period
1238
- if s == "." {
1239
- return Ok ( Some ( Token :: Period ) ) ;
1240
- }
1241
-
1242
- // Parse exponent as number
1227
+ // Parse exponent part (e.g., e+10 or E-5)
1243
1228
let mut exponent_part = String :: new ( ) ;
1244
1229
if chars. peek ( ) == Some ( & 'e' ) || chars. peek ( ) == Some ( & 'E' ) {
1245
1230
let mut char_clone = chars. peekable . clone ( ) ;
1246
- exponent_part. push ( char_clone. next ( ) . unwrap ( ) ) ;
1231
+ exponent_part. push ( char_clone. next ( ) . unwrap ( ) ) ; // consume 'e' or 'E'
1247
1232
1248
1233
// Optional sign
1249
- match char_clone. peek ( ) {
1250
- Some ( & c ) if matches ! ( c , '+' | '-' ) => {
1234
+ if let Some ( & c ) = char_clone. peek ( ) {
1235
+ if c == '+' || c == '-' {
1251
1236
exponent_part. push ( c) ;
1252
1237
char_clone. next ( ) ;
1253
1238
}
1254
- _ => ( ) ,
1255
1239
}
1256
1240
1257
- match char_clone . peek ( ) {
1258
- // Definitely an exponent, get original iterator up to speed and use it
1259
- Some ( & c ) if c. is_ascii_digit ( ) => {
1241
+ // Parse digits after the exponent
1242
+ if let Some ( & c ) = char_clone . peek ( ) {
1243
+ if c. is_ascii_digit ( ) {
1260
1244
for _ in 0 ..exponent_part. len ( ) {
1261
1245
chars. next ( ) ;
1262
1246
}
1263
1247
exponent_part +=
1264
1248
& peeking_take_while ( chars, |ch| ch. is_ascii_digit ( ) ) ;
1265
1249
s += exponent_part. as_str ( ) ;
1266
1250
}
1267
- // Not an exponent, discard the work done
1268
- _ => ( ) ,
1269
1251
}
1270
1252
}
1271
1253
@@ -1274,8 +1256,7 @@ impl<'a> Tokenizer<'a> {
1274
1256
// be tokenized as a word.
1275
1257
if self . dialect . supports_numeric_prefix ( ) {
1276
1258
if exponent_part. is_empty ( ) {
1277
- // If it is not a number with an exponent, it may be
1278
- // an identifier starting with digits.
1259
+ // Handle as potential word if no exponent part
1279
1260
let word =
1280
1261
peeking_take_while ( chars, |ch| self . dialect . is_identifier_part ( ch) ) ;
1281
1262
@@ -1284,20 +1265,84 @@ impl<'a> Tokenizer<'a> {
1284
1265
return Ok ( Some ( Token :: make_word ( s. as_str ( ) , None ) ) ) ;
1285
1266
}
1286
1267
} else if prev_token == Some ( & Token :: Period ) {
1287
- // If the previous token was a period, thus not belonging to a number,
1288
- // the value we have is part of an identifier.
1268
+ // Handle as word if it follows a period
1289
1269
return Ok ( Some ( Token :: make_word ( s. as_str ( ) , None ) ) ) ;
1290
1270
}
1291
1271
}
1292
1272
1273
+ // Handle "L" suffix for long numbers
1293
1274
let long = if chars. peek ( ) == Some ( & 'L' ) {
1294
1275
chars. next ( ) ;
1295
1276
true
1296
1277
} else {
1297
1278
false
1298
1279
} ;
1280
+
1281
+ // Return the final token for the number
1299
1282
Ok ( Some ( Token :: Number ( s, long) ) )
1300
1283
}
1284
+
1285
+ // Period (`.`) handling
1286
+ '.' => {
1287
+ chars. next ( ) ; // consume the dot
1288
+
1289
+ match chars. peek ( ) {
1290
+ Some ( '_' ) => {
1291
+ // Handle "._" case as a period (special token) followed by identifier
1292
+ Ok ( Some ( Token :: Period ) )
1293
+ }
1294
+ Some ( ch)
1295
+ // Hive and mysql dialects allow numeric prefixes for identifers
1296
+ if ch. is_ascii_digit ( )
1297
+ && self . dialect . supports_numeric_prefix ( )
1298
+ && matches ! ( prev_token, Some ( Token :: Word ( _) ) ) =>
1299
+ {
1300
+ Ok ( Some ( Token :: Period ) )
1301
+ }
1302
+ Some ( ch) if ch. is_ascii_digit ( ) => {
1303
+ // Handle numbers starting with a dot (e.g., ".123")
1304
+ let mut s = String :: from ( "." ) ;
1305
+ let is_number_separator = |ch : char , next_char : Option < char > | {
1306
+ self . dialect . supports_numeric_literal_underscores ( )
1307
+ && ch == '_'
1308
+ && next_char. is_some_and ( |c| c. is_ascii_digit ( ) )
1309
+ } ;
1310
+
1311
+ s += & peeking_next_take_while ( chars, |ch, next_ch| {
1312
+ ch. is_ascii_digit ( ) || is_number_separator ( ch, next_ch)
1313
+ } ) ;
1314
+
1315
+ // Handle exponent part
1316
+ if matches ! ( chars. peek( ) , Some ( 'e' | 'E' ) ) {
1317
+ let mut exp = String :: new ( ) ;
1318
+ exp. push ( chars. next ( ) . unwrap ( ) ) ;
1319
+
1320
+ if matches ! ( chars. peek( ) , Some ( '+' | '-' ) ) {
1321
+ exp. push ( chars. next ( ) . unwrap ( ) ) ;
1322
+ }
1323
+
1324
+ if matches ! ( chars. peek( ) , Some ( c) if c. is_ascii_digit( ) ) {
1325
+ exp += & peeking_take_while ( chars, |c| c. is_ascii_digit ( ) ) ;
1326
+ s += & exp;
1327
+ }
1328
+ }
1329
+
1330
+ // Handle "L" suffix for long numbers
1331
+ let long = if chars. peek ( ) == Some ( & 'L' ) {
1332
+ chars. next ( ) ;
1333
+ true
1334
+ } else {
1335
+ false
1336
+ } ;
1337
+
1338
+ Ok ( Some ( Token :: Number ( s, long) ) )
1339
+ }
1340
+ _ => {
1341
+ // Just a plain period
1342
+ Ok ( Some ( Token :: Period ) )
1343
+ }
1344
+ }
1345
+ }
1301
1346
// punctuation
1302
1347
'(' => self . consume_and_return ( chars, Token :: LParen ) ,
1303
1348
')' => self . consume_and_return ( chars, Token :: RParen ) ,
@@ -2435,6 +2480,32 @@ mod tests {
2435
2480
compare ( expected, tokens) ;
2436
2481
}
2437
2482
2483
#[test]
fn tokenize_period_underscore() {
    // PostgreSQL is used as a dialect that supports underscores in numeric
    // literals: the underscore after the dot must still start an identifier
    // and must not be treated as part of a number.
    let dialect = PostgreSqlDialect {};
    let sql = "SELECT table._col".to_string();
    let actual = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

    let table_word = Token::Word(Word {
        value: String::from("table"),
        quote_style: None,
        keyword: Keyword::TABLE,
    });
    let column_word = Token::Word(Word {
        value: String::from("_col"),
        quote_style: None,
        keyword: Keyword::NoKeyword,
    });

    let expected = vec![
        Token::make_keyword("SELECT"),
        Token::Whitespace(Whitespace::Space),
        table_word,
        Token::Period,
        column_word,
    ];

    compare(expected, actual);
}
2508
+
2438
2509
#[ test]
2439
2510
fn tokenize_select_float ( ) {
2440
2511
let sql = String :: from ( "SELECT .1" ) ;
0 commit comments