Commit bc18788

implement unit test for fixed token length chunker
Signed-off-by: yuye-aws <[email protected]>
1 parent b8989ef commit bc18788

1 file changed: 141 additions, 1 deletion
@@ -1,2 +1,142 @@
-package org.opensearch.neuralsearch.processor.chunker;public class FixedTokenLengthChunkerTests {
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package org.opensearch.neuralsearch.processor.chunker;
+
+import lombok.SneakyThrows;
+import org.apache.lucene.tests.analysis.MockTokenizer;
+import org.junit.Before;
+import org.opensearch.common.settings.Settings;
+import org.opensearch.env.Environment;
+import org.opensearch.env.TestEnvironment;
+import org.opensearch.index.analysis.TokenizerFactory;
+import org.opensearch.index.analysis.AnalysisRegistry;
+import org.opensearch.indices.analysis.AnalysisModule;
+import org.opensearch.plugins.AnalysisPlugin;
+import org.opensearch.test.OpenSearchTestCase;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static java.util.Collections.singletonList;
+import static java.util.Collections.singletonMap;
+import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT;
+import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.OVERLAP_RATE;
+import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKENIZER;
+
+public class FixedTokenLengthChunkerTests extends OpenSearchTestCase {
+
+    private FixedTokenLengthChunker FixedTokenLengthChunker;
+
+    @Before
+    @SneakyThrows
+    public void setup() {
+        Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
+        Environment environment = TestEnvironment.newEnvironment(settings);
+
+        AnalysisPlugin plugin = new AnalysisPlugin() {
+
+            @Override
+            public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
+                return singletonMap(
+                    "keyword",
+                    (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory(
+                        name,
+                        () -> new MockTokenizer(MockTokenizer.KEYWORD, false)
+                    )
+                );
+            }
+        };
+        AnalysisRegistry analysisRegistry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry();
+        FixedTokenLengthChunker = new FixedTokenLengthChunker(analysisRegistry);
+    }
+
+    public void testValidateParameters_whenNoParams_thenSuccessful() {
+        Map<String, Object> parameters = new HashMap<>();
+        ;
+        FixedTokenLengthChunker.validateParameters(parameters);
+    }
+
+    public void testValidateParameters_whenIllegalTokenLimitType_thenFail() {
+        Map<String, Object> parameters = new HashMap<>();
+        parameters.put(TOKEN_LIMIT, "invalid token limit");
+        IllegalArgumentException illegalArgumentException = assertThrows(
+            IllegalArgumentException.class,
+            () -> FixedTokenLengthChunker.validateParameters(parameters)
+        );
+        assertEquals("fixed length parameter [token_limit] cannot be cast to [java.lang.Number]", illegalArgumentException.getMessage());
+    }
+
+    public void testValidateParameters_whenIllegalTokenLimitValue_thenFail() {
+        Map<String, Object> parameters = new HashMap<>();
+        parameters.put(TOKEN_LIMIT, -1);
+        IllegalArgumentException illegalArgumentException = assertThrows(
+            IllegalArgumentException.class,
+            () -> FixedTokenLengthChunker.validateParameters(parameters)
+        );
+        assertEquals("fixed length parameter [token_limit] must be positive", illegalArgumentException.getMessage());
+    }
+
+    public void testValidateParameters_whenIllegalOverlapRateType_thenFail() {
+        Map<String, Object> parameters = new HashMap<>();
+        parameters.put(OVERLAP_RATE, "invalid overlap rate");
+        IllegalArgumentException illegalArgumentException = assertThrows(
+            IllegalArgumentException.class,
+            () -> FixedTokenLengthChunker.validateParameters(parameters)
+        );
+        assertEquals("fixed length parameter [overlap_rate] cannot be cast to [java.lang.Number]", illegalArgumentException.getMessage());
+    }
+
+    public void testValidateParameters_whenIllegalOverlapRateValue_thenFail() {
+        Map<String, Object> parameters = new HashMap<>();
+        parameters.put(OVERLAP_RATE, 1.0);
+        IllegalArgumentException illegalArgumentException = assertThrows(
+            IllegalArgumentException.class,
+            () -> FixedTokenLengthChunker.validateParameters(parameters)
+        );
+        assertEquals(
+            "fixed length parameter [overlap_rate] must be between 0 and 1, 1 is not included.",
+            illegalArgumentException.getMessage()
+        );
+    }
+
+    public void testValidateParameters_whenIllegalTokenizerType_thenFail() {
+        Map<String, Object> parameters = new HashMap<>();
+        parameters.put(TOKENIZER, 111);
+        IllegalArgumentException illegalArgumentException = assertThrows(
+            IllegalArgumentException.class,
+            () -> FixedTokenLengthChunker.validateParameters(parameters)
+        );
+        assertEquals("fixed length parameter [tokenizer] cannot be cast to [java.lang.String]", illegalArgumentException.getMessage());
+    }
+
+    public void testChunk_withTokenLimit_10() {
+        Map<String, Object> parameters = new HashMap<>();
+        parameters.put(TOKEN_LIMIT, 10);
+        String content =
+            "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
+        List<String> passages = FixedTokenLengthChunker.chunk(content, parameters);
+        List<String> expectedPassages = new ArrayList<>();
+        expectedPassages.add("This is an example document to be chunked The document");
+        expectedPassages.add("The document contains a single paragraph two sentences and 24");
+        expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch");
+        assertEquals(expectedPassages, passages);
+    }
+
+    public void testChunk_withTokenLimit_20() {
+        Map<String, Object> parameters = new HashMap<>();
+        parameters.put(TOKEN_LIMIT, 20);
+        String content =
+            "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
+        List<String> passages = FixedTokenLengthChunker.chunk(content, parameters);
+        List<String> expectedPassages = new ArrayList<>();
+        expectedPassages.add(
+            "This is an example document to be chunked The document contains a single paragraph two sentences and 24 tokens by"
+        );
+        expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch");
+        assertEquals(expectedPassages, passages);
+    }
 }
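
For context, the expected passages in the two chunk tests overlap by two tokens (TOKEN_LIMIT 10) and four tokens (TOKEN_LIMIT 20), which is consistent with a default overlap rate of 0.2. That default and the chunker's exact splitting strategy are not visible in this diff, so the following is only a minimal sketch of the boundary arithmetic those expectations imply:

// Minimal sketch (not part of the commit) of the chunk-boundary arithmetic implied by
// the expected passages above. The 0.2 overlap rate is an assumed default.
public class OverlapArithmeticSketch {
    public static void main(String[] args) {
        int totalTokens = 24;        // the example sentence yields 24 tokens
        int tokenLimit = 10;         // parameters.put(TOKEN_LIMIT, 10)
        double overlapRate = 0.2;    // assumption: default OVERLAP_RATE
        int overlapTokens = (int) Math.floor(tokenLimit * overlapRate); // 2
        int stride = tokenLimit - overlapTokens;                        // 8
        // Prints token ranges [0, 10), [8, 18), [16, 24): three passages, matching
        // the three expected passages in testChunk_withTokenLimit_10.
        for (int start = 0; start < totalTokens; start += stride) {
            int end = Math.min(start + tokenLimit, totalTokens);
            System.out.println("passage tokens [" + start + ", " + end + ")");
            if (end == totalTokens) {
                break;
            }
        }
    }
}

With tokenLimit = 20 the same arithmetic yields ranges [0, 20) and [16, 24), matching the two expected passages in testChunk_withTokenLimit_20.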
