Skip to content

Commit cd95b20

Browse files
authored
Improve Syntax Check by New ANTLR Parser (#179)
* Generate new parser by ANTLR * Integrate ANTLR parser to existing code * Fix string similarity and add visitor generated * Use visitor to change default visiting order * Add function arg type check * Add compatibility check for basic operator * Refactor inner classes to different upper class and subpackage * Change default MySQL grammar file for our case * Change default MySQL grammar file for our case * Change default MySQL grammar file for our case * Remove ANTLR plugin dependencies from runtime * Initial commit for syntax analysis by new ANTLR parser * Add missing function to grammar for unit test * Add support for index name with - or /type * Add more ES special functions to pass the unit test * Add more ES special syntax to pass IT * Add more syntax for MINUS to pass all IT * Fix checkstyle violation * Remove unsupported statements * Remove SELECT INTO * Remove more unused syntax * Remove unused function * Remove unused tokens * Remove unused tokens * Remove unused interval and charset syntax * Remove unused interval and charset syntax * Remove more unused syntax * Add setting for enabling new parser * Improve offending symbol location * Improve offending symbol location * Update 3rd party attribution with ANTLR * Add supported functions missing in existing test code * Add integration test * Change grammar for new merged code from master * Add integration test * Add more test cases * Add more test cases, enable/disable setting * Add more test cases * Add more test cases * Simplify exception for now * Move generated source back to build folder * Rename analyze method and assert error message * Fix typo * Address more comments and fix broken tests
1 parent 3e0d611 commit cd95b20

23 files changed

+2018
-512
lines changed

THIRD-PARTY

+638-474
Large diffs are not rendered by default.

build.gradle

+24-1
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ version = "${opendistroVersion}.0"
6363
apply plugin: 'elasticsearch.esplugin'
6464
apply plugin: 'jacoco'
6565
apply from: 'build-tools/sqlplugin-coverage.gradle'
66+
apply plugin: 'antlr'
6667

6768
jacoco {
6869
toolVersion = "0.8.3"
@@ -73,7 +74,12 @@ ext {
7374
licenseFile = rootProject.file('LICENSE.TXT')
7475
noticeFile = rootProject.file('NOTICE')
7576
}
76-
licenseHeaders.enabled = true
77+
78+
// The ANTLR-generated parser file is too large to be checked, which caused the licenseHeaders task to get stuck.
79+
licenseHeaders {
80+
enabled = true
81+
excludes = ['com/amazon/opendistroforelasticsearch/sql/antlr/parser/**']
82+
}
7783

7884
// TODO: need to fix java doc to enable JavaDoc
7985
javadoc.enabled = false
@@ -131,6 +137,19 @@ integTestCluster {
131137
distribution = "oss-zip"
132138
}
133139

140+
generateGrammarSource {
141+
arguments += ['-visitor', '-package', 'com.amazon.opendistroforelasticsearch.sql.antlr.parser']
142+
source = sourceSets.main.antlr
143+
outputDirectory = file("build/generated-src/antlr/main/com/amazon/opendistroforelasticsearch/sql/antlr/parser")
144+
}
145+
146+
// Remove the ANTLR plugin jars, as the plugin declares them as a 'compile' dependency internally
147+
configurations {
148+
compile {
149+
extendsFrom = extendsFrom.findAll { it != configurations.antlr }
150+
}
151+
}
152+
134153
check.dependsOn jacocoTestReport
135154

136155
// TODO: fix code style in main and test source code
@@ -175,6 +194,10 @@ dependencies {
175194
compile group: 'com.google.guava', name: 'guava', version:'15.0'
176195
compile group: 'org.json', name: 'json', version:'20180813'
177196

197+
// ANTLR gradle plugin and runtime dependency
198+
antlr "org.antlr:antlr4:4.7.1"
199+
compile "org.antlr:antlr4-runtime:4.7.1"
200+
178201
//compileOnly group: 'org.locationtech.jts', name: 'jts-core', version:'1.15.0'
179202
// compileOnly group: 'org.elasticsearch', name: 'elasticsearch', version:'6.5.3'
180203
// compileOnly group: 'com.unboundid', name: 'unboundid-ldapsdk', version:'3.2.0'

config/checkstyle/checkstyle.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
<property name="charset" value="UTF-8" />
88

99
<module name="SuppressionFilter">
10-
<property name="file" value="${suppressions}" />
10+
<property name="file" value="${config_loc}/suppressions.xml" />
1111
</module>
1212

1313
<!-- Checks Java files and forbids empty Javadoc comments -->

config/checkstyle/suppressions.xml

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
<?xml version="1.0"?>
2+
<!DOCTYPE suppressions PUBLIC
3+
"-//Checkstyle//DTD SuppressionFilter Configuration 1.2//EN"
4+
"https://checkstyle.org/dtds/suppressions_1_2.dtd">
5+
6+
<suppressions>
7+
8+
<suppress files="com[\\/]amazon[\\/]opendistroforelasticsearch[\\/]sql[\\/]antlr[\\/]parser" checks=".*"/>
9+
10+
</suppressions>

src/main/antlr/OpenDistroSqlLexer.g4

+310
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,310 @@
1+
/*
2+
MySQL (Positive Technologies) grammar
3+
The MIT License (MIT).
4+
Copyright (c) 2015-2017, Ivan Kochurkin ([email protected]), Positive Technologies.
5+
Copyright (c) 2017, Ivan Khudyashev ([email protected])
6+
7+
Permission is hereby granted, free of charge, to any person obtaining a copy
8+
of this software and associated documentation files (the "Software"), to deal
9+
in the Software without restriction, including without limitation the rights
10+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11+
copies of the Software, and to permit persons to whom the Software is
12+
furnished to do so, subject to the following conditions:
13+
14+
The above copyright notice and this permission notice shall be included in
15+
all copies or substantial portions of the Software.
16+
17+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23+
THE SOFTWARE.
24+
*/
25+
26+
lexer grammar OpenDistroSqlLexer;
27+
28+
channels { SQLCOMMENT, ERRORCHANNEL }
29+
30+
31+
// SKIP
32+
33+
SPACE: [ \t\r\n]+ -> channel(HIDDEN);
34+
SPEC_SQL_COMMENT: '/*!' .+? '*/' -> channel(SQLCOMMENT);
35+
COMMENT_INPUT: '/*' .*? '*/' -> channel(HIDDEN);
36+
LINE_COMMENT: (
37+
('-- ' | '#') ~[\r\n]* ('\r'? '\n' | EOF)
38+
| '--' ('\r'? '\n' | EOF)
39+
) -> channel(HIDDEN);
40+
41+
42+
// Keywords
43+
// Common Keywords
44+
45+
ALL: 'ALL';
46+
AND: 'AND';
47+
AS: 'AS';
48+
ASC: 'ASC';
49+
BETWEEN: 'BETWEEN';
50+
BY: 'BY';
51+
CASE: 'CASE';
52+
CROSS: 'CROSS';
53+
DELETE: 'DELETE';
54+
DESC: 'DESC';
55+
DESCRIBE: 'DESCRIBE';
56+
DISTINCT: 'DISTINCT';
57+
ELSE: 'ELSE';
58+
EXISTS: 'EXISTS';
59+
FALSE: 'FALSE';
60+
FROM: 'FROM';
61+
GROUP: 'GROUP';
62+
HAVING: 'HAVING';
63+
IN: 'IN';
64+
INNER: 'INNER';
65+
IS: 'IS';
66+
JOIN: 'JOIN';
67+
LEFT: 'LEFT';
68+
LIKE: 'LIKE';
69+
LIMIT: 'LIMIT';
70+
MATCH: 'MATCH';
71+
NATURAL: 'NATURAL';
72+
NOT: 'NOT';
73+
NULL_LITERAL: 'NULL';
74+
ON: 'ON';
75+
OR: 'OR';
76+
ORDER: 'ORDER';
77+
OUTER: 'OUTER';
78+
REGEXP: 'REGEXP';
79+
RIGHT: 'RIGHT';
80+
SELECT: 'SELECT';
81+
SHOW: 'SHOW';
82+
THEN: 'THEN';
83+
TRUE: 'TRUE';
84+
UNION: 'UNION';
85+
USING: 'USING';
86+
WHEN: 'WHEN';
87+
WHERE: 'WHERE';
88+
89+
90+
// OD SQL special keyword
91+
MISSING: 'MISSING';
92+
EXCEPT: 'MINUS'; // NOTE: the token is named EXCEPT but the keyword text it matches is 'MINUS' (distinct from the '-' operator token, which is also named MINUS).
93+
94+
95+
// Group function Keywords
96+
97+
AVG: 'AVG';
98+
COUNT: 'COUNT';
99+
MAX: 'MAX';
100+
MIN: 'MIN';
101+
SUM: 'SUM';
102+
103+
104+
// Common function Keywords
105+
106+
SUBSTRING: 'SUBSTRING';
107+
TRIM: 'TRIM';
108+
YEAR: 'YEAR';
109+
110+
111+
// Keywords, but can be ID
112+
// Common Keywords, but can be ID
113+
114+
END: 'END';
115+
FULL: 'FULL';
116+
OFFSET: 'OFFSET';
117+
118+
119+
// PRIVILEGES
120+
121+
TABLES: 'TABLES';
122+
123+
124+
// Common function names
125+
126+
ABS: 'ABS';
127+
ACOS: 'ACOS';
128+
ASIN: 'ASIN';
129+
ATAN: 'ATAN';
130+
ATAN2: 'ATAN2';
131+
CBRT: 'CBRT';
132+
CEIL: 'CEIL';
133+
CONCAT: 'CONCAT';
134+
CONCAT_WS: 'CONCAT_WS';
135+
COS: 'COS';
136+
COSH: 'COSH';
137+
DATE_FORMAT: 'DATE_FORMAT';
138+
DEGREES: 'DEGREES';
139+
E: 'E';
140+
EXP: 'EXP';
141+
EXPM1: 'EXPM1';
142+
FLOOR: 'FLOOR';
143+
LOG: 'LOG';
144+
LOG10: 'LOG10';
145+
LOG2: 'LOG2';
146+
LOWER: 'LOWER';
147+
PI: 'PI';
148+
POW: 'POW';
149+
RADIANS: 'RADIANS';
150+
RANDOM: 'RANDOM';
151+
RINT: 'RINT';
152+
ROUND: 'ROUND';
153+
SIN: 'SIN';
154+
SINH: 'SINH';
155+
SQRT: 'SQRT';
156+
TAN: 'TAN';
157+
UPPER: 'UPPER';
158+
159+
D: 'D';
160+
T: 'T';
161+
TS: 'TS';
162+
LEFT_BRACE: '{';
163+
RIGHT_BRACE: '}';
164+
165+
166+
// OD SQL special functions
167+
DATE_HISTOGRAM: 'DATE_HISTOGRAM';
168+
DAY_OF_MONTH: 'DAY_OF_MONTH';
169+
DAY_OF_YEAR: 'DAY_OF_YEAR';
170+
DAY_OF_WEEK: 'DAY_OF_WEEK';
171+
EXCLUDE: 'EXCLUDE';
172+
EXTENDED_STATS: 'EXTENDED_STATS';
173+
FIELD: 'FIELD';
174+
FILTER: 'FILTER';
175+
GEO_BOUNDING_BOX: 'GEO_BOUNDING_BOX';
176+
GEO_DISTANCE: 'GEO_DISTANCE';
177+
GEO_INTERSECTS: 'GEO_INTERSECTS';
178+
GEO_POLYGON: 'GEO_POLYGON';
179+
HISTOGRAM: 'HISTOGRAM';
180+
HOUR_OF_DAY: 'HOUR_OF_DAY';
181+
INCLUDE: 'INCLUDE';
182+
IN_TERMS: 'IN_TERMS';
183+
MATCHPHRASE: 'MATCHPHRASE';
184+
MATCH_PHRASE: 'MATCH_PHRASE';
185+
MATCHQUERY: 'MATCHQUERY';
186+
MATCH_QUERY: 'MATCH_QUERY';
187+
MINUTE_OF_DAY: 'MINUTE_OF_DAY';
188+
MINUTE_OF_HOUR: 'MINUTE_OF_HOUR';
189+
MONTH_OF_YEAR: 'MONTH_OF_YEAR';
190+
MULTIMATCH: 'MULTIMATCH';
191+
MULTI_MATCH: 'MULTI_MATCH';
192+
NESTED: 'NESTED';
193+
PERCENTILES: 'PERCENTILES';
194+
REGEXP_QUERY: 'REGEXP_QUERY';
195+
REVERSE_NESTED: 'REVERSE_NESTED';
196+
QUERY: 'QUERY';
197+
RANGE: 'RANGE';
198+
SCORE: 'SCORE';
199+
SECOND_OF_MINUTE: 'SECOND_OF_MINUTE';
200+
STATS: 'STATS';
201+
TERM: 'TERM';
202+
TERMS: 'TERMS';
203+
TOPHITS: 'TOPHITS';
204+
WEEK_OF_YEAR: 'WEEK_OF_YEAR';
205+
WILDCARDQUERY: 'WILDCARDQUERY';
206+
WILDCARD_QUERY: 'WILDCARD_QUERY';
207+
208+
209+
// Operators
210+
211+
// Operators. Arithmetic
212+
213+
STAR: '*';
214+
DIVIDE: '/';
215+
MODULE: '%';
216+
PLUS: '+';
217+
MINUS: '-';
218+
DIV: 'DIV';
219+
MOD: 'MOD';
220+
221+
222+
// Operators. Comparison
223+
224+
EQUAL_SYMBOL: '=';
225+
GREATER_SYMBOL: '>';
226+
LESS_SYMBOL: '<';
227+
EXCLAMATION_SYMBOL: '!';
228+
229+
230+
// Operators. Bit
231+
232+
BIT_NOT_OP: '~';
233+
BIT_OR_OP: '|';
234+
BIT_AND_OP: '&';
235+
BIT_XOR_OP: '^';
236+
237+
238+
// Constructor symbols
239+
240+
DOT: '.';
241+
LR_BRACKET: '(';
242+
RR_BRACKET: ')';
243+
COMMA: ',';
244+
SEMI: ';';
245+
AT_SIGN: '@';
246+
ZERO_DECIMAL: '0';
247+
ONE_DECIMAL: '1';
248+
TWO_DECIMAL: '2';
249+
SINGLE_QUOTE_SYMB: '\'';
250+
DOUBLE_QUOTE_SYMB: '"';
251+
REVERSE_QUOTE_SYMB: '`';
252+
COLON_SYMB: ':';
253+
254+
255+
// Literal Primitives
256+
257+
START_NATIONAL_STRING_LITERAL: 'N' SQUOTA_STRING;
258+
STRING_LITERAL: DQUOTA_STRING | SQUOTA_STRING | BQUOTA_STRING;
259+
DECIMAL_LITERAL: DEC_DIGIT+;
260+
HEXADECIMAL_LITERAL: 'X' '\'' (HEX_DIGIT HEX_DIGIT)+ '\''
261+
| '0X' HEX_DIGIT+;
262+
263+
REAL_LITERAL: (DEC_DIGIT+)? '.' DEC_DIGIT+
264+
| DEC_DIGIT+ '.' EXPONENT_NUM_PART
265+
| (DEC_DIGIT+)? '.' (DEC_DIGIT+ EXPONENT_NUM_PART)
266+
| DEC_DIGIT+ EXPONENT_NUM_PART;
267+
NULL_SPEC_LITERAL: '\\' 'N';
268+
BIT_STRING: BIT_STRING_L;
269+
270+
271+
272+
// Hack for dotID
273+
// Prevent recognizing the string .123somelatin as ((.123), FLOAT_LITERAL), ((somelatin), ID);
274+
// it must be recognized as: .123somelatin AS ((.), DOT), (123somelatin, ID)
275+
276+
DOT_ID: '.' ID_LITERAL;
277+
278+
279+
280+
// Identifiers
281+
282+
ID: ID_LITERAL;
283+
// DOUBLE_QUOTE_ID: '"' ~'"'+ '"';
284+
REVERSE_QUOTE_ID: '`' ~'`'+ '`';
285+
STRING_USER_NAME: (
286+
SQUOTA_STRING | DQUOTA_STRING
287+
| BQUOTA_STRING | ID_LITERAL
288+
) '@'
289+
(
290+
SQUOTA_STRING | DQUOTA_STRING
291+
| BQUOTA_STRING | ID_LITERAL
292+
);
293+
294+
295+
// Fragments for Literal primitives
296+
297+
fragment EXPONENT_NUM_PART: 'E' [-+]? DEC_DIGIT+;
298+
fragment ID_LITERAL: [A-Z_$0-9]*?[A-Z_$\-]+?[A-Z_$\-0-9]*; // Optional leading run of letters/digits/_/$, then at least one of A-Z, _, $ or '-', then any mix including digits and '-'; a digits-only string therefore does not match.
299+
fragment DQUOTA_STRING: '"' ( '\\'. | '""' | ~('"'| '\\') )* '"';
300+
fragment SQUOTA_STRING: '\'' ('\\'. | '\'\'' | ~('\'' | '\\'))* '\'';
301+
fragment BQUOTA_STRING: '`' ( '\\'. | '``' | ~('`'|'\\'))* '`';
302+
fragment HEX_DIGIT: [0-9A-F];
303+
fragment DEC_DIGIT: [0-9];
304+
fragment BIT_STRING_L: 'B' '\'' [01]+ '\'';
305+
306+
307+
308+
// Last tokens must generate Errors
309+
310+
ERROR_RECOGNITION: . -> channel(ERRORCHANNEL);

0 commit comments

Comments
 (0)