Skip to content

Commit 0279883

Browse files
author
Roman Borschel
committed
Fix tokenization of qualified identifiers with numeric prefix.
Queries with qualified identifiers that have a numeric prefix currently fail to parse because of incorrect tokenization. For example, "t.123abc" is tokenized as "t" (Word) followed by ".123abc" (Number), instead of the intended "t" (Word), "." (Period), "123abc" (Word).
1 parent 0d2976d commit 0279883

File tree

2 files changed

+164
-12
lines changed

2 files changed

+164
-12
lines changed

src/tokenizer.rs

+64 -12
Original file line number | Diff line number | Diff line change
@@ -895,7 +895,7 @@ impl<'a> Tokenizer<'a> {
895895
};
896896

897897
let mut location = state.location();
898-
while let Some(token) = self.next_token(&mut state)? {
898+
while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
899899
let span = location.span_to(state.location());
900900

901901
buf.push(TokenWithSpan { token, span });
@@ -932,7 +932,11 @@ impl<'a> Tokenizer<'a> {
932932
}
933933

934934
/// Get the next token or return None
935-
fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
935+
fn next_token(
936+
&self,
937+
chars: &mut State,
938+
prev_token: Option<&Token>,
939+
) -> Result<Option<Token>, TokenizerError> {
936940
match chars.peek() {
937941
Some(&ch) => match ch {
938942
' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
@@ -1211,17 +1215,29 @@ impl<'a> Tokenizer<'a> {
12111215
chars.next();
12121216
}
12131217

1218+
// If the dialect supports identifiers that start with a numeric prefix
1219+
// and we have now consumed a dot, check if the previous token was a Word.
1220+
// If so, what follows is definitely not part of a decimal number and
1221+
// we should yield the dot as a dedicated token so compound identifiers
1222+
// starting with digits can be parsed correctly.
1223+
if s == "." && self.dialect.supports_numeric_prefix() {
1224+
if let Some(Token::Word(_)) = prev_token {
1225+
return Ok(Some(Token::Period));
1226+
}
1227+
}
1228+
1229+
// Consume fractional digits.
12141230
s += &peeking_next_take_while(chars, |ch, next_ch| {
12151231
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
12161232
});
12171233

1218-
// No number -> Token::Period
1234+
// No fraction -> Token::Period
12191235
if s == "." {
12201236
return Ok(Some(Token::Period));
12211237
}
12221238

1223-
let mut exponent_part = String::new();
12241239
// Parse exponent as number
1240+
let mut exponent_part = String::new();
12251241
if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
12261242
let mut char_clone = chars.peekable.clone();
12271243
exponent_part.push(char_clone.next().unwrap());
@@ -1250,14 +1266,23 @@ impl<'a> Tokenizer<'a> {
12501266
}
12511267
}
12521268

1253-
// mysql dialect supports identifiers that start with a numeric prefix,
1254-
// as long as they aren't an exponent number.
1255-
if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() {
1256-
let word =
1257-
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
1258-
1259-
if !word.is_empty() {
1260-
s += word.as_str();
1269+
// If the dialect supports identifiers that start with a numeric prefix,
1270+
// we need to check if the value is in fact an identifier and must thus
1271+
// be tokenized as a word.
1272+
if self.dialect.supports_numeric_prefix() {
1273+
if exponent_part.is_empty() {
1274+
// If it is not a number with an exponent, it may be
1275+
// an unqualified identifier starting with digits.
1276+
let word =
1277+
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
1278+
1279+
if !word.is_empty() {
1280+
s += word.as_str();
1281+
return Ok(Some(Token::make_word(s.as_str(), None)));
1282+
}
1283+
} else if prev_token == Some(&Token::Period) {
1284+
// If the previous token was a period, thus not belonging to a number,
1285+
// the value we have is part of an identifier.
12611286
return Ok(Some(Token::make_word(s.as_str(), None)));
12621287
}
12631288
}
@@ -3960,4 +3985,31 @@ mod tests {
39603985
],
39613986
);
39623987
}
3988+
3989+
#[test]
3990+
fn test_tokenize_identifiers_numeric_prefix() {
3991+
all_dialects_where(|dialect| dialect.supports_numeric_prefix())
3992+
.tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);
3993+
3994+
all_dialects_where(|dialect| dialect.supports_numeric_prefix())
3995+
.tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);
3996+
3997+
all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
3998+
"t.12e34",
3999+
vec![
4000+
Token::make_word("t", None),
4001+
Token::Period,
4002+
Token::make_word("12e34", None),
4003+
],
4004+
);
4005+
4006+
all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
4007+
"t.1two3",
4008+
vec![
4009+
Token::make_word("t", None),
4010+
Token::Period,
4011+
Token::make_word("1two3", None),
4012+
],
4013+
);
4014+
}
39634015
}

tests/sqlparser_mysql.rs

+100
Original file line number | Diff line number | Diff line change
@@ -1926,6 +1926,106 @@ fn parse_select_with_numeric_prefix_column_name() {
19261926
}
19271927
}
19281928

1929+
#[test]
1930+
fn parse_qualified_identifiers_with_numeric_prefix() {
1931+
// Case 1: Qualified column name that starts with digits.
1932+
mysql().verified_stmt("SELECT t.15to29 FROM my_table AS t");
1933+
match mysql()
1934+
.parse_sql_statements("SELECT t.15to29 FROM my_table AS t")
1935+
.unwrap()
1936+
.pop()
1937+
{
1938+
Some(Statement::Query(q)) => match *q.body {
1939+
SetExpr::Select(s) => match s.projection.last() {
1940+
Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
1941+
assert_eq!(&[Ident::new("t"), Ident::new("15to29")], &parts[..]);
1942+
}
1943+
proj => panic!("Unexpected projection: {:?}", proj),
1944+
},
1945+
body => panic!("Unexpected statement body: {:?}", body),
1946+
},
1947+
stmt => panic!("Unexpected statement: {:?}", stmt),
1948+
}
1949+
1950+
// Case 2: Qualified column name that starts with digits and on its own represents a number.
1951+
mysql().verified_stmt("SELECT t.15e29 FROM my_table AS t");
1952+
match mysql()
1953+
.parse_sql_statements("SELECT t.15e29 FROM my_table AS t")
1954+
.unwrap()
1955+
.pop()
1956+
{
1957+
Some(Statement::Query(q)) => match *q.body {
1958+
SetExpr::Select(s) => match s.projection.last() {
1959+
Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
1960+
assert_eq!(&[Ident::new("t"), Ident::new("15e29")], &parts[..]);
1961+
}
1962+
proj => panic!("Unexpected projection: {:?}", proj),
1963+
},
1964+
body => panic!("Unexpected statement body: {:?}", body),
1965+
},
1966+
stmt => panic!("Unexpected statement: {:?}", stmt),
1967+
}
1968+
1969+
// Case 3: Unqualified, the same token is parsed as a number.
1970+
match mysql()
1971+
.parse_sql_statements("SELECT 15e29 FROM my_table")
1972+
.unwrap()
1973+
.pop()
1974+
{
1975+
Some(Statement::Query(q)) => match *q.body {
1976+
SetExpr::Select(s) => match s.projection.last() {
1977+
Some(SelectItem::UnnamedExpr(Expr::Value(ValueWithSpan { value, .. }))) => {
1978+
assert_eq!(&number("15e29"), value);
1979+
}
1980+
proj => panic!("Unexpected projection: {:?}", proj),
1981+
},
1982+
body => panic!("Unexpected statement body: {:?}", body),
1983+
},
1984+
stmt => panic!("Unexpected statement: {:?}", stmt),
1985+
}
1986+
1987+
// Case 4: Quoted simple identifier.
1988+
mysql().verified_stmt("SELECT `15e29` FROM my_table");
1989+
match mysql()
1990+
.parse_sql_statements("SELECT `15e29` FROM my_table")
1991+
.unwrap()
1992+
.pop()
1993+
{
1994+
Some(Statement::Query(q)) => match *q.body {
1995+
SetExpr::Select(s) => match s.projection.last() {
1996+
Some(SelectItem::UnnamedExpr(Expr::Identifier(name))) => {
1997+
assert_eq!(&Ident::with_quote('`', "15e29"), name);
1998+
}
1999+
proj => panic!("Unexpected projection: {:?}", proj),
2000+
},
2001+
body => panic!("Unexpected statement body: {:?}", body),
2002+
},
2003+
stmt => panic!("Unexpected statement: {:?}", stmt),
2004+
}
2005+
2006+
// Case 5: Quoted compound identifier.
2007+
mysql().verified_stmt("SELECT t.`15e29` FROM my_table");
2008+
match mysql()
2009+
.parse_sql_statements("SELECT t.`15e29` FROM my_table AS t")
2010+
.unwrap()
2011+
.pop()
2012+
{
2013+
Some(Statement::Query(q)) => match *q.body {
2014+
SetExpr::Select(s) => match s.projection.last() {
2015+
Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
2016+
assert_eq!(
2017+
&[Ident::new("t"), Ident::with_quote('`', "15e29")],
2018+
&parts[..]
2019+
);
2020+
}
2021+
proj => panic!("Unexpected projection: {:?}", proj),
2022+
},
2023+
body => panic!("Unexpected statement body: {:?}", body),
2024+
},
2025+
stmt => panic!("Unexpected statement: {:?}", stmt),
2026+
}
2027+
}
2028+
19292029
// Don't run with bigdecimal as it fails like this on rust beta:
19302030
//
19312031
// 'parse_select_with_concatenation_of_exp_number_and_numeric_prefix_column'

0 commit comments

Comments
 (0)