Skip to content

Commit 522414c

Browse files
branch-4.0: [feature](search) add exact dsl for search function #56710 (#56711)
Cherry-picked from #56710 Co-authored-by: Jack <[email protected]>
1 parent 6e075ee commit 522414c

File tree

12 files changed

+483
-5
lines changed

12 files changed

+483
-5
lines changed

be/src/vec/functions/function_search.cpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,8 @@ FunctionSearch::ClauseTypeCategory FunctionSearch::get_clause_type_category(
279279
if (clause_type == "AND" || clause_type == "OR" || clause_type == "NOT") {
280280
return ClauseTypeCategory::COMPOUND;
281281
} else if (clause_type == "TERM" || clause_type == "PREFIX" || clause_type == "WILDCARD" ||
282-
clause_type == "REGEXP" || clause_type == "RANGE" || clause_type == "LIST") {
282+
clause_type == "REGEXP" || clause_type == "RANGE" || clause_type == "LIST" ||
283+
clause_type == "EXACT") {
283284
// Non-tokenized queries: exact matching, pattern matching, range, list operations
284285
return ClauseTypeCategory::NON_TOKENIZED;
285286
} else if (clause_type == "PHRASE" || clause_type == "MATCH" || clause_type == "ANY" ||
@@ -349,6 +350,9 @@ InvertedIndexQueryType FunctionSearch::clause_type_to_query_type(
349350
{"MATCH", InvertedIndexQueryType::MATCH_ANY_QUERY},
350351
{"ANY", InvertedIndexQueryType::MATCH_ANY_QUERY},
351352
{"ALL", InvertedIndexQueryType::MATCH_ALL_QUERY},
353+
354+
// Exact match without tokenization
355+
{"EXACT", InvertedIndexQueryType::EQUAL_QUERY},
352356
};
353357

354358
auto it = clause_type_map.find(clause_type);
@@ -532,6 +536,17 @@ Status FunctionSearch::build_leaf_query(const FunctionSearch& function, const TS
532536
}
533537

534538
if (category == FunctionSearch::ClauseTypeCategory::NON_TOKENIZED) {
539+
if (clause_type == "EXACT") {
540+
// EXACT match: exact string matching without tokenization
541+
// Note: EXACT prefers the untokenized index (STRING_TYPE), which does not support lowercasing.
542+
// If only a tokenized index exists, EXACT may return empty results, because
543+
// tokenized indexes store individual tokens rather than complete strings.
544+
*out = make_term_query(value_wstr);
545+
VLOG_DEBUG << "search: EXACT clause processed, field=" << field_name << ", value='"
546+
<< value << "'";
547+
return Status::OK();
548+
}
549+
535550
if (clause_type == "PREFIX" || clause_type == "WILDCARD" || clause_type == "REGEXP" ||
536551
clause_type == "RANGE" || clause_type == "LIST") {
537552
VLOG_DEBUG << "search: clause type '" << clause_type

fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchLexer.g4

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,10 @@ REGEXP : '/' (~[/] | '\\/')* '/' ;
5858
LBRACKET : '[' -> pushMode(RANGE_MODE) ;
5959
LBRACE : '{' -> pushMode(RANGE_MODE) ;
6060

61-
IN_LPAREN : [Ii][Nn] '(' -> pushMode(LIST_MODE) ;
62-
ANY_LPAREN : [Aa][Nn][Yy] '(' -> pushMode(STRING_MODE) ;
63-
ALL_LPAREN : [Aa][Ll][Ll] '(' -> pushMode(STRING_MODE) ;
61+
IN_LPAREN : [Ii][Nn] '(' -> pushMode(LIST_MODE) ;
62+
ANY_LPAREN : [Aa][Nn][Yy] '(' -> pushMode(STRING_MODE) ;
63+
ALL_LPAREN : [Aa][Ll][Ll] '(' -> pushMode(STRING_MODE) ;
64+
EXACT_LPAREN : [Ee][Xx][Aa][Cc][Tt] '(' -> pushMode(STRING_MODE) ;
6465

6566
WS : [ \t\r\n\u3000]+ -> skip ;
6667

fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchParser.g4

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ searchValue
3737
| rangeValue
3838
| listValue
3939
| anyAllValue
40+
| exactValue
4041
;
4142

4243
rangeValue
@@ -47,4 +48,5 @@ rangeValue
4748
rangeEndpoint : RANGE_NUMBER | RANGE_STAR ;
4849

4950
listValue : IN_LPAREN LIST_TERM* LIST_RPAREN ;
50-
anyAllValue : (ANY_LPAREN | ALL_LPAREN) STRING_CONTENT? STRING_RPAREN ;
51+
anyAllValue : (ANY_LPAREN | ALL_LPAREN) STRING_CONTENT? STRING_RPAREN ;
52+
exactValue : EXACT_LPAREN STRING_CONTENT? STRING_RPAREN ;

fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ public enum QsClauseType {
125125
LIST, // field:IN(value1 value2)
126126
ANY, // field:ANY(value) - any match
127127
ALL, // field:ALL(value) - all match
128+
EXACT, // field:EXACT(value) - exact match without tokenization
128129
AND, // clause1 AND clause2
129130
OR, // clause1 OR clause2
130131
NOT // NOT clause
@@ -298,6 +299,10 @@ public QsNode visitSearchValue(SearchParser.SearchValueContext ctx) {
298299
return createAnyAllNode(fieldName, ctx.anyAllValue().getText());
299300
}
300301

302+
if (ctx.exactValue() != null) {
303+
return createExactNode(fieldName, ctx.exactValue().getText());
304+
}
305+
301306
// Fallback for unknown types
302307
return createTermNode(fieldName, ctx.getText());
303308
}
@@ -372,6 +377,12 @@ private QsNode createAnyAllNode(String fieldName, String anyAllText) {
372377
return new QsNode(QsClauseType.ANY, fieldName, innerContent);
373378
}
374379

380+
private QsNode createExactNode(String fieldName, String exactText) {
381+
// Extract content between parentheses
382+
String innerContent = extractParenthesesContent(exactText);
383+
return new QsNode(QsClauseType.EXACT, fieldName, innerContent);
384+
}
385+
375386
private String extractParenthesesContent(String text) {
376387
int openParen = text.indexOf('(');
377388
int closeParen = text.lastIndexOf(')');
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
-- This file is automatically generated. You should know what you did if you want to edit this
2+
-- !exact_whole --
3+
2 apple banana
4+
5+
-- !exact_partial --
6+
1 apple
7+
8+
-- !exact_case --
9+
4 Apple
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
-- This file is automatically generated. You should know what you did if you want to edit this
2+
-- !normal_exact_lower --
3+
2 hello world
4+
5+
-- !normal_exact_upper --
6+
3 HELLO WORLD
7+
8+
-- !normal_exact_mixed --
9+
1 Hello World
10+
11+
-- !mixed_exact_lower --
12+
2 hello world
13+
14+
-- !mixed_exact_upper --
15+
3 HELLO WORLD
16+
17+
-- !mixed_exact_mixed --
18+
1 Hello World
19+
20+
-- !mixed_any_lowercase --
21+
1 Hello World
22+
2 hello world
23+
3 HELLO WORLD
24+
4 HeLLo WoRLd
25+
26+
-- !exact_case_sensitive --
27+
5 Test Case
28+
29+
-- !any_case_insensitive --
30+
5
31+
6
32+
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
-- This file is automatically generated. You should know what you did if you want to edit this
2+
-- !exact_basic --
3+
1 Hello World
4+
5+
-- !exact_no_partial --
6+
4 Hello
7+
8+
-- !exact_case_sensitive --
9+
2 hello world
10+
11+
-- !exact_different_case --
12+
3 HELLO WORLD
13+
14+
-- !exact_spaces --
15+
6 a b c
16+
17+
-- !exact_vs_any_exact --
18+
19+
-- !exact_vs_any_any --
20+
1 This is a test document
21+
2 Another test document
22+
3 Third test document
23+
24+
-- !exact_mixed_index --
25+
7 machine learning
26+
27+
-- !exact_special_chars --
28+
9 Special!@#Chars
29+
30+
-- !exact_and --
31+
1 Hello World
32+
33+
-- !exact_or --
34+
1 Hello World
35+
2 hello world
36+
37+
-- !exact_not --
38+
1 Hello World
39+
2 hello world
40+
4 Hello
41+
5 World
42+
6 a b c
43+
7 machine learning
44+
8 deep learning
45+
9 Special!@#Chars
46+
47+
-- !exact_null --
48+
49+
-- !exact_multiple --
50+
1 Hello World test keyword
51+
52+
-- !term_query --
53+
4 Hello
54+
55+
-- !exact_query --
56+
4 Hello
57+
58+
-- !exact_and_all --
59+
1 Hello World This is a test document
60+
61+
-- !exact_case_hello --
62+
4 hello
63+
64+
-- !exact_case_HELLO --
65+
66+
-- !exact_mixed_exact_match --
67+
1 tokenized value
68+
69+
-- !any_mixed_token_match --
70+
1 tokenized value
71+
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
-- This file is automatically generated. You should know what you did if you want to edit this
2+
-- !exact_full_match --
3+
1 machine learning
4+
5+
-- !exact_no_partial --
6+
3 machine
7+
8+
-- !any_token_match --
9+
1 machine learning
10+
2 deep learning
11+
3 machine
12+
4 learning
13+
14+
-- !all_token_match --
15+
1 machine learning
16+
17+
-- !exact_strict --
18+
2 deep learning
19+
20+
-- !any_loose --
21+
1 machine learning
22+
2 deep learning
23+
4 learning
24+
25+
-- !mixed_exact_any --
26+
1 machine learning
27+
5 artificial intelligence
28+
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
suite("test_search_exact_basic") {
19+
def tableName = "exact_basic_test"
20+
21+
sql "DROP TABLE IF EXISTS ${tableName}"
22+
23+
// Simple table with basic index
24+
sql """
25+
CREATE TABLE ${tableName} (
26+
id INT,
27+
name VARCHAR(200),
28+
INDEX idx_name (name) USING INVERTED
29+
) ENGINE=OLAP
30+
DUPLICATE KEY(id)
31+
DISTRIBUTED BY HASH(id) BUCKETS 1
32+
PROPERTIES ("replication_allocation" = "tag.location.default: 1")
33+
"""
34+
35+
// Insert simple test data
36+
sql """INSERT INTO ${tableName} VALUES
37+
(1, 'apple'),
38+
(2, 'apple banana'),
39+
(3, 'banana'),
40+
(4, 'Apple'),
41+
(5, 'APPLE'),
42+
(6, 'apple banana cherry')
43+
"""
44+
45+
Thread.sleep(3000)
46+
47+
// Test 1: EXACT should match the whole value
48+
qt_exact_whole "SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, name FROM ${tableName} WHERE search('name:EXACT(apple banana)') ORDER BY id"
49+
50+
// Test 2: EXACT should not match partial values (only the row whose whole value is 'apple')
51+
qt_exact_partial "SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, name FROM ${tableName} WHERE search('name:EXACT(apple)') ORDER BY id"
52+
53+
// Test 3: EXACT is case sensitive (without lowercase config)
54+
qt_exact_case "SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, name FROM ${tableName} WHERE search('name:EXACT(Apple)') ORDER BY id"
55+
}
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
suite("test_search_exact_lowercase") {
19+
def tableName = "exact_lowercase_test"
20+
21+
sql "DROP TABLE IF EXISTS ${tableName}"
22+
23+
// EXACT on mixed indexes: it prefers the untokenized index, but the untokenized index doesn't support lowercase,
24+
// so EXACT is always case-sensitive regardless of the lowercase config.
25+
// This test verifies that EXACT behavior is consistent.
26+
sql """
27+
CREATE TABLE ${tableName} (
28+
id INT,
29+
text_normal VARCHAR(200),
30+
text_mixed VARCHAR(200),
31+
INDEX idx_normal (text_normal) USING INVERTED,
32+
INDEX idx_mixed_tokenized (text_mixed) USING INVERTED PROPERTIES("parser" = "unicode", "lower_case" = "true"),
33+
INDEX idx_mixed_untokenized (text_mixed) USING INVERTED
34+
) ENGINE=OLAP
35+
DUPLICATE KEY(id)
36+
DISTRIBUTED BY HASH(id) BUCKETS 1
37+
PROPERTIES ("replication_allocation" = "tag.location.default: 1")
38+
"""
39+
40+
// Insert test data with various cases
41+
sql """INSERT INTO ${tableName} VALUES
42+
(1, 'Hello World', 'Hello World'),
43+
(2, 'hello world', 'hello world'),
44+
(3, 'HELLO WORLD', 'HELLO WORLD'),
45+
(4, 'HeLLo WoRLd', 'HeLLo WoRLd'),
46+
(5, 'Test Case', 'Test Case'),
47+
(6, 'TEST CASE', 'TEST CASE')
48+
"""
49+
50+
Thread.sleep(3000)
51+
52+
// Test 1: EXACT on normal field (untokenized) - case sensitive
53+
qt_normal_exact_lower "SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, text_normal FROM ${tableName} WHERE search('text_normal:EXACT(hello world)') ORDER BY id"
54+
qt_normal_exact_upper "SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, text_normal FROM ${tableName} WHERE search('text_normal:EXACT(HELLO WORLD)') ORDER BY id"
55+
qt_normal_exact_mixed "SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, text_normal FROM ${tableName} WHERE search('text_normal:EXACT(Hello World)') ORDER BY id"
56+
57+
// Test 2: EXACT on mixed index field
58+
// EXACT prefers untokenized index, so it's case sensitive (lowercase config is ignored)
59+
qt_mixed_exact_lower "SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, text_mixed FROM ${tableName} WHERE search('text_mixed:EXACT(hello world)') ORDER BY id"
60+
qt_mixed_exact_upper "SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, text_mixed FROM ${tableName} WHERE search('text_mixed:EXACT(HELLO WORLD)') ORDER BY id"
61+
qt_mixed_exact_mixed "SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, text_mixed FROM ${tableName} WHERE search('text_mixed:EXACT(Hello World)') ORDER BY id"
62+
63+
// Test 3: Verify that ANY on mixed index uses tokenized index with lowercase
64+
qt_mixed_any_lowercase "SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, text_mixed FROM ${tableName} WHERE search('text_mixed:ANY(hello world)') ORDER BY id"
65+
66+
// Test 4: Compare EXACT (case-sensitive) vs ANY (with lowercase)
67+
qt_exact_case_sensitive "SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id FROM ${tableName} WHERE search('text_mixed:EXACT(Test Case)') ORDER BY id"
68+
qt_any_case_insensitive "SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id FROM ${tableName} WHERE search('text_mixed:ANY(test case)') ORDER BY id"
69+
}

0 commit comments

Comments
 (0)