@@ -895,7 +895,7 @@ impl<'a> Tokenizer<'a> {
         };
 
         let mut location = state.location();
-        while let Some(token) = self.next_token(&mut state)? {
+        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
             let span = location.span_to(state.location());
 
             buf.push(TokenWithSpan { token, span });
@@ -932,7 +932,11 @@ impl<'a> Tokenizer<'a> {
     }
 
     /// Get the next token or return None
-    fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
+    fn next_token(
+        &self,
+        chars: &mut State,
+        prev_token: Option<&Token>,
+    ) -> Result<Option<Token>, TokenizerError> {
         match chars.peek() {
             Some(&ch) => match ch {
                 ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
@@ -1211,17 +1215,29 @@ impl<'a> Tokenizer<'a> {
                         chars.next();
                     }
 
+                    // If the dialect supports identifiers that start with a numeric prefix
+                    // and we have now consumed a dot, check if the previous token was a Word.
+                    // If so, what follows is definitely not part of a decimal number and
+                    // we should yield the dot as a dedicated token so compound identifiers
+                    // starting with digits can be parsed correctly.
+                    if s == "." && self.dialect.supports_numeric_prefix() {
+                        if let Some(Token::Word(_)) = prev_token {
+                            return Ok(Some(Token::Period));
+                        }
+                    }
+
+                    // Consume fractional digits.
                     s += &peeking_next_take_while(chars, |ch, next_ch| {
                         ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                     });
 
-                    // No number -> Token::Period
+                    // No fraction -> Token::Period
                     if s == "." {
                         return Ok(Some(Token::Period));
                     }
 
-                    let mut exponent_part = String::new();
                     // Parse exponent as number
+                    let mut exponent_part = String::new();
                     if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                         let mut char_clone = chars.peekable.clone();
                         exponent_part.push(char_clone.next().unwrap());
@@ -1250,14 +1266,23 @@ impl<'a> Tokenizer<'a> {
                         }
                     }
 
-                    // mysql dialect supports identifiers that start with a numeric prefix,
-                    // as long as they aren't an exponent number.
-                    if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() {
-                        let word =
-                            peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
-
-                        if !word.is_empty() {
-                            s += word.as_str();
+                    // If the dialect supports identifiers that start with a numeric prefix,
+                    // we need to check if the value is in fact an identifier and must thus
+                    // be tokenized as a word.
+                    if self.dialect.supports_numeric_prefix() {
+                        if exponent_part.is_empty() {
+                            // If it is not a number with an exponent, it may be
+                            // an unqualified identifier starting with digits.
+                            let word =
+                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
+
+                            if !word.is_empty() {
+                                s += word.as_str();
+                                return Ok(Some(Token::make_word(s.as_str(), None)));
+                            }
+                        } else if prev_token == Some(&Token::Period) {
+                            // If the previous token was a period, thus not belonging to a number,
+                            // the value we have is part of an identifier.
                             return Ok(Some(Token::make_word(s.as_str(), None)));
                         }
                     }
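
For illustration (not part of the diff): under a dialect that enables supports_numeric_prefix(), the branch above decides between identifier and number by checking for an exponent. A minimal sketch against the crate's public tokenizer API, assuming MySqlDialect as the numeric-prefix dialect (the removed comment names MySQL as such a dialect):

use sqlparser::dialect::MySqlDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn main() {
    let dialect = MySqlDialect {};

    // No exponent: the trailing identifier characters are appended and the
    // whole value is returned as a single Word token.
    let mut tokenizer = Tokenizer::new(&dialect, "123abc");
    assert_eq!(
        tokenizer.tokenize().unwrap(),
        vec![Token::make_word("123abc", None)]
    );

    // A valid exponent and no preceding period: it stays a Number token.
    let mut tokenizer = Tokenizer::new(&dialect, "12e34");
    assert_eq!(
        tokenizer.tokenize().unwrap(),
        vec![Token::Number("12e34".to_string(), false)]
    );
}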
@@ -3960,4 +3985,31 @@ mod tests {
             ],
         );
     }
+
+    #[test]
+    fn test_tokenize_identifiers_numeric_prefix() {
+        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
+            .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);
+
+        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
+            .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);
+
+        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
+            "t.12e34",
+            vec![
+                Token::make_word("t", None),
+                Token::Period,
+                Token::make_word("12e34", None),
+            ],
+        );
+
+        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
+            "t.1two3",
+            vec![
+                Token::make_word("t", None),
+                Token::Period,
+                Token::make_word("1two3", None),
+            ],
+        );
+    }
 }
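
The qualified cases in the new test follow from the prev_token threading: after a Word and a Period, a digit-led segment is kept as an identifier even when it would otherwise scan as an exponent number. A sketch of the same behaviour through the public API, again assuming MySqlDialect:

use sqlparser::dialect::MySqlDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn main() {
    let dialect = MySqlDialect {};

    // The dot after the word `t` is emitted as a dedicated Token::Period, and
    // "12e34" is then tokenized as a Word because the previous token was a
    // Period rather than the start of a number.
    let mut tokenizer = Tokenizer::new(&dialect, "t.12e34");
    assert_eq!(
        tokenizer.tokenize().unwrap(),
        vec![
            Token::make_word("t", None),
            Token::Period,
            Token::make_word("12e34", None),
        ]
    );
}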