Skip to content

Commit a73aa8b

Browse files
Query Processing Improvements (#438)
- Escape Char Handling of fulltext content that enables users to escape query syntax characters - Query syntax compat improvements - Default text field (no field specification) handling - Index based Punctuation - Stemming, Stopwords - Updated TextPredicate structures - Applying the text properties based on the FT.SEARCH command arguments - Added unit and integ testing Signed-off-by: Karthik Subbarao <[email protected]>
1 parent 8bb3eae commit a73aa8b

File tree

19 files changed

+767
-384
lines changed

19 files changed

+767
-384
lines changed

.config/typos.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,5 @@ updat = "updat" # Used for stem matching
2828
extend-ignore-re = [
2929
"baNAna",
3030
"eXIst",
31+
"Hel",
3132
]

integration/test_fulltext.py

Lines changed: 73 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,10 @@
2929
]
3030
text_query_term = ["FT.SEARCH", "products", '@desc:"wonder"']
3131
text_query_term_nomatch = ["FT.SEARCH", "products", '@desc:"nomatch"']
32-
text_query_prefix = ["FT.SEARCH", "products", '@desc:"wond*"']
33-
text_query_prefix2 = ["FT.SEARCH", "products", '@desc:"wond*"']
34-
text_query_prefix_nomatch = ["FT.SEARCH", "products", '@desc:"nomatch*"']
35-
text_query_prefix_multimatch = ["FT.SEARCH", "products", '@desc:"grea*"']
32+
text_query_prefix = ["FT.SEARCH", "products", '@desc:wond*']
33+
text_query_prefix2 = ["FT.SEARCH", "products", '@desc:wond*']
34+
text_query_prefix_nomatch = ["FT.SEARCH", "products", '@desc:nomatch*']
35+
text_query_prefix_multimatch = ["FT.SEARCH", "products", '@desc:grea*']
3636
text_query_exact_phrase1 = ["FT.SEARCH", "products", '@desc:"word wonder"']
3737
text_query_exact_phrase2 = ["FT.SEARCH", "products", '@desc:"random word wonder"']
3838

@@ -56,9 +56,9 @@
5656

5757
# Search queries for specific fields
5858
text_query_desc_field = ["FT.SEARCH", "products2", '@desc:"wonder"']
59-
text_query_desc_prefix = ["FT.SEARCH", "products2", '@desc:"wonde*"']
59+
text_query_desc_prefix = ["FT.SEARCH", "products2", '@desc:wonde*']
6060
text_query_desc2_field = ["FT.SEARCH", "products2", '@desc2:"wonder"']
61-
text_query_desc2_prefix = ["FT.SEARCH", "products2", '@desc2:"wonde*"']
61+
text_query_desc2_prefix = ["FT.SEARCH", "products2", '@desc2:wonde*']
6262

6363
# Expected results for desc field search
6464
expected_desc_hash_key = b'product:4'
@@ -125,18 +125,18 @@ def test_text_search(self):
125125
result3 = client.execute_command("FT.SEARCH", "products", '@desc:xpe*')
126126
assert result1[0] == 1 and result2[0] == 1 and result3[0] == 0
127127
assert result1[1] == b"product:3" and result2[1] == b"product:3"
128-
# TODO: Update these queries to non stemmed versions after queries are stemmed.
128+
# TODO: Update these queries to non stemmed versions once the stem tree is supported and ingestion is updated.
129129
# Perform an exact phrase search operation on a unique phrase (exists in one doc).
130130
result1 = client.execute_command("FT.SEARCH", "products", '@desc:"great oak from littl"')
131131
result2 = client.execute_command("FT.SEARCH", "products", '@desc:"great oak from littl grey acorn grow"')
132132
assert result1[0] == 1 and result2[0] == 1
133133
assert result1[1] == b"product:1" and result2[1] == b"product:1"
134-
result3 = client.execute_command("FT.SEARCH", "products", '@desc:great @desc:oa* @desc:from @desc:lit* @desc:gr* @desc:acorn @desc:gr*')
134+
result3 = client.execute_command("FT.SEARCH", "products", 'great oa* from lit* gr* acorn gr*')
135135
assert result3[0] == 1
136136
assert result3[1] == b"product:1"
137-
result3 = client.execute_command("FT.SEARCH", "products", '@desc:great @desc:oa* @desc:from @desc:lit* @desc:gr* @desc:acorn @desc:grea*')
137+
result3 = client.execute_command("FT.SEARCH", "products", 'great oa* from lit* gr* acorn grea*')
138138
assert result3[0] == 0
139-
result3 = client.execute_command("FT.SEARCH", "products", '@desc:great @desc:oa* @desc:from @desc:lit* @desc:gr* @desc:acorn @desc:great')
139+
result3 = client.execute_command("FT.SEARCH", "products", 'great oa* from lit* gr* acorn great')
140140
assert result3[0] == 0
141141
# Perform an exact phrase search operation on a phrase existing in 2 documents.
142142
result = client.execute_command("FT.SEARCH", "products", '@desc:"interest desc"')
@@ -174,7 +174,6 @@ def test_text_search(self):
174174
result = client.execute_command("FT.SEARCH", "products", '@desc:"1 2 3 4 5 6 7 8 9 0"')
175175
assert result[0] == 1
176176
assert result[1] == b"product:1"
177-
178177
# TODO: We can test this once the queries are tokenized with punctuation applied.
179178
# result = client.execute_command("FT.SEARCH", "products", '@desc:"inspector\'s palm"')
180179
# TODO: We can test this once the queries are tokenized with punctuation and stopword removal applied.
@@ -365,21 +364,22 @@ def test_default_tokenization(self):
365364
client: Valkey = self.server.get_new_client()
366365
client.execute_command("FT.CREATE idx ON HASH SCHEMA content TEXT")
367366
client.execute_command("HSET", "doc:1", "content", "The quick-running searches are finding EFFECTIVE results!")
368-
369-
# List of queries with pass/fail expectations
367+
client.execute_command("HSET", "doc:2", "content", "But slow searches aren't working...")
368+
# List of queries with match / no match expectations
370369
test_cases = [
371370
("quick*", True, "Punctuation tokenization - hyphen creates word boundaries"),
372371
("effect*", True, "Case insensitivity - lowercase matches uppercase"),
373-
("the", False, "Stop word filtering - common words filtered out"),
372+
("\"The quick-running searches are finding EFFECTIVE results!\"", False, "Stop word cannot be used in exact phrase searches"),
373+
# TODO: Change to True once the stem tree is supported and ingestion is updated.
374+
("\"quick-running searches finding EFFECTIVE results!\"", False, "Exact phrase without stopwords"),
375+
("\"quick-run search find EFFECT result!\"", True, "Exact Phrase Query without stopwords and using stemmed words"),
374376
("find*", True, "Prefix wildcard - matches 'finding'"),
375377
("nonexistent", False, "Non-existent terms return no results")
376378
]
377-
378379
expected_key = b'doc:1'
379380
expected_fields = [b'content', b"The quick-running searches are finding EFFECTIVE results!"]
380-
381381
for query_term, should_match, description in test_cases:
382-
result = client.execute_command("FT.SEARCH", "idx", f'@content:"{query_term}"')
382+
result = client.execute_command("FT.SEARCH", "idx", f'@content:{query_term}')
383383
if should_match:
384384
assert result[0] == 1 and result[1] == expected_key and result[2] == expected_fields, f"Failed: {description}"
385385
else:
@@ -413,16 +413,44 @@ def test_custom_stopwords(self):
413413
client: Valkey = self.server.get_new_client()
414414
client.execute_command("FT.CREATE idx ON HASH STOPWORDS 2 the and SCHEMA content TEXT")
415415
client.execute_command("HSET", "doc:1", "content", "the cat and dog are good")
416+
# non stop words should be findable
417+
result = client.execute_command("FT.SEARCH", "idx", '@content:"cat dog are good"')
418+
assert result[0] == 1 # Regular word indexed
419+
assert result[1] == b'doc:1'
420+
assert result[2] == [b'content', b"the cat and dog are good"]
416421

417422
# Stop words should not be findable
418423
result = client.execute_command("FT.SEARCH", "idx", '@content:"and"')
419424
assert result[0] == 0 # Stop word "and" filtered out
420-
421425
# non stop words should be findable
422426
result = client.execute_command("FT.SEARCH", "idx", '@content:"are"')
423427
assert result[0] == 1 # Regular word indexed
424428
assert result[1] == b'doc:1'
425429
assert result[2] == [b'content', b"the cat and dog are good"]
430+
# Stop words should not be findable
431+
result = client.execute_command("FT.SEARCH", "idx", '@content:"and"')
432+
assert result[0] == 0 # Stop word "and" filtered out
433+
434+
def test_nostem(self):
435+
"""
436+
End-to-end test: FT.CREATE NOSTEM config actually affects stemming in search
437+
"""
438+
client: Valkey = self.server.get_new_client()
439+
client.execute_command("FT.CREATE idx ON HASH NOSTEM SCHEMA content TEXT")
440+
client.execute_command("HSET", "doc:1", "content", "running quickly")
441+
# With NOSTEM, exact tokens should be findable with exact phrase
442+
result = client.execute_command("FT.SEARCH", "idx", '@content:"running"')
443+
assert result[0] == 1 # Exact form "running" found
444+
assert result[1] == b'doc:1'
445+
assert result[2] == [b'content', b"running quickly"]
446+
# With NOSTEM, exact tokens should be findable with non exact phrase
447+
result = client.execute_command("FT.SEARCH", "idx", '@content:"running"')
448+
assert result[0] == 1 # Exact form "running" found
449+
assert result[1] == b'doc:1'
450+
assert result[2] == [b'content', b"running quickly"]
451+
# With NOSTEM, stemmed tokens should not be findable
452+
result = client.execute_command("FT.SEARCH", "idx", '@content:"run"')
453+
assert result[0] == 0
426454

427455
def test_custom_punctuation(self):
428456
"""
@@ -431,16 +459,18 @@ def test_custom_punctuation(self):
431459
client: Valkey = self.server.get_new_client()
432460
client.execute_command("FT.CREATE idx ON HASH PUNCTUATION . SCHEMA content TEXT")
433461
client.execute_command("HSET", "doc:1", "content", "hello.world test@email")
434-
435462
# Dot configured as separator - should find split words
436463
result = client.execute_command("FT.SEARCH", "idx", '@content:"hello"')
437464
assert result[0] == 1 # Found "hello" as separate token
438465
assert result[1] == b'doc:1'
439466
assert result[2] == [b'content', b"hello.world test@email"]
440-
441467
# @ NOT configured as separator - words joined by @ should not be findable as split tokens
442468
result = client.execute_command("FT.SEARCH", "idx", '@content:"test"')
443469
assert result[0] == 0
470+
result = client.execute_command("FT.SEARCH", "idx", '@content:"test@email"')
471+
assert result[0] == 1 # Found "test@email" as a single whole token
472+
assert result[1] == b'doc:1'
473+
assert result[2] == [b'content', b"hello.world test@email"]
444474

445475
def test_add_update_delete_documents_single_client(self):
446476
"""
@@ -638,8 +668,29 @@ def delete_documents(client_id):
638668
perform_concurrent_searches(clients, num_clients, delete_searches, "DELETE")
639669

640670
def test_suffix_search(self):
641-
# TODO
642-
pass
671+
"""Test suffix search functionality using *suffix pattern"""
672+
# Create index
673+
self.client.execute_command("FT.CREATE", "idx", "ON", "HASH", "PREFIX", "1", "doc:", "SCHEMA", "content", "TEXT", "WITHSUFFIXTRIE", "NOSTEM")
674+
# Add test documents
675+
self.client.execute_command("HSET", "doc:1", "content", "running jumping walking")
676+
self.client.execute_command("HSET", "doc:2", "content", "testing debugging coding")
677+
self.client.execute_command("HSET", "doc:3", "content", "reading writing speaking")
678+
self.client.execute_command("HSET", "doc:4", "content", "swimming diving surfing")
679+
# Test suffix search with *ing
680+
result = self.client.execute_command("FT.SEARCH", "idx", "@content:*ing")
681+
assert result[0] == 4 # All documents contain words ending with 'ing'
682+
# Test suffix search with *ning (should match only "running")
683+
result = self.client.execute_command("FT.SEARCH", "idx", "@content:*ning")
684+
assert result[0] == 1 # Only doc:1 has "running"
685+
# Test suffix search with *ping
686+
result = self.client.execute_command("FT.SEARCH", "idx", "@content:*ping")
687+
assert result[0] == 1 # Only doc:1 has "jumping"
688+
# Test suffix search with *ding
689+
result = self.client.execute_command("FT.SEARCH", "idx", "@content:*ding")
690+
assert result[0] == 2 # doc:2 has "coding", doc:3 has "reading"
691+
# Test non-matching suffix
692+
result = self.client.execute_command("FT.SEARCH", "idx", "@content:*xyz")
693+
assert result[0] == 0 # No matches
643694

644695
class TestFullTextDebugMode(ValkeySearchTestCaseDebugMode):
645696
"""

0 commit comments

Comments
 (0)