  */
 package org.opensearch.neuralsearch.processor;

-import com.google.common.collect.ImmutableList;
 import lombok.SneakyThrows;
-import org.apache.hc.core5.http.HttpHeaders;
-import org.apache.hc.core5.http.io.entity.EntityUtils;
-import org.apache.hc.core5.http.message.BasicHeader;
 import org.junit.Before;

 import java.net.URL;
 import java.util.Map;
 import java.util.Objects;

-import org.opensearch.client.Response;
-import org.opensearch.common.xcontent.XContentHelper;
-import org.opensearch.common.xcontent.XContentType;
 import org.opensearch.index.query.MatchAllQueryBuilder;
 import org.opensearch.neuralsearch.BaseNeuralSearchIT;

@@ -73,7 +66,9 @@ public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmStandardToken
         try {
             createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
             createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
-            ingestDocument(TEST_DOCUMENT);
+
+            String document = getDocumentFromFilePath(TEST_DOCUMENT);
+            ingestDocument(INDEX_NAME, document);

             List<String> expectedPassages = new ArrayList<>();
             expectedPassages.add("This is an example document to be chunked. The document ");
@@ -90,7 +85,9 @@ public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmLetterTokeniz
         try {
             createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME);
             createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME);
-            ingestDocument(TEST_DOCUMENT);
+
+            String document = getDocumentFromFilePath(TEST_DOCUMENT);
+            ingestDocument(INDEX_NAME, document);

             List<String> expectedPassages = new ArrayList<>();
             expectedPassages.add("This is an example document to be chunked. The document ");
@@ -107,7 +104,9 @@ public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmLowercaseToke
         try {
             createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
             createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
-            ingestDocument(TEST_DOCUMENT);
+
+            String document = getDocumentFromFilePath(TEST_DOCUMENT);
+            ingestDocument(INDEX_NAME, document);

             List<String> expectedPassages = new ArrayList<>();
             expectedPassages.add("This is an example document to be chunked. The document ");
@@ -124,7 +123,10 @@ public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmStandardToken
         try {
             createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
             createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
-            Exception exception = assertThrows(Exception.class, () -> ingestDocument(TEST_LONG_DOCUMENT));
+            Exception exception = assertThrows(Exception.class, () -> {
+                String document = getDocumentFromFilePath(TEST_LONG_DOCUMENT);
+                ingestDocument(INDEX_NAME, document);
+            });
             // max_token_count is 100 by index settings
             assert (exception.getMessage()
                 .contains("The number of tokens produced by calling _analyze has exceeded the allowed maximum of [100]."));
@@ -139,7 +141,9 @@ public void testTextChunkingProcessor_withDelimiterAlgorithm_successful() {
         try {
             createPipelineProcessor(DELIMITER_PIPELINE_NAME);
             createTextChunkingIndex(INDEX_NAME, DELIMITER_PIPELINE_NAME);
-            ingestDocument(TEST_DOCUMENT);
+
+            String document = getDocumentFromFilePath(TEST_DOCUMENT);
+            ingestDocument(INDEX_NAME, document);

             List<String> expectedPassages = new ArrayList<>();
             expectedPassages.add("This is an example document to be chunked.");
@@ -157,7 +161,9 @@ public void testTextChunkingProcessor_withCascadePipeline_successful() {
         try {
             createPipelineProcessor(CASCADE_PIPELINE_NAME);
             createTextChunkingIndex(INDEX_NAME, CASCADE_PIPELINE_NAME);
-            ingestDocument(TEST_DOCUMENT);
+
+            String document = getDocumentFromFilePath(TEST_DOCUMENT);
+            ingestDocument(INDEX_NAME, document);

             List<String> expectedPassages = new ArrayList<>();
             expectedPassages.add("This is an example document to be chunked.");
@@ -176,6 +182,23 @@ public void testTextChunkingProcessor_withCascadePipeline_successful() {
         }
     }

+    public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_whenReindexingDocument_thenSuccessful()
+        throws Exception {
+        try {
+            String fromIndexName = "test-reindex-from";
+            createIndexWithConfiguration(fromIndexName, "{ \"settings\": { \"number_of_shards\": 1, \"number_of_replicas\": 0 } }", null);
+            String document = getDocumentFromFilePath(TEST_DOCUMENT);
+            ingestDocument(fromIndexName, document);
+
+            createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
+            createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
+            reindex(fromIndexName, INDEX_NAME);
+            assertEquals(1, getDocCount(INDEX_NAME));
+        } finally {
+            wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME, null, null);
+        }
+    }
+
     private void validateIndexIngestResults(String indexName, String fieldName, Object expected) {
         assertEquals(1, getDocCount(indexName));
         MatchAllQueryBuilder query = new MatchAllQueryBuilder();
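The new test above seeds a plain source index that has no chunking pipeline attached, then copies the document into INDEX_NAME with the Reindex API so that the chunking pipeline configured on the destination index (via createTextChunkingIndex, presumably as its default ingest pipeline) processes the document on the way in. The reindex(fromIndexName, INDEX_NAME) helper itself is not shown in this diff; a minimal sketch, assuming it simply wraps a _reindex request using the same makeRequest/toHttpEntity test utilities seen elsewhere in this class, could look like:

    // Hypothetical sketch only; the actual helper (likely in the shared test base) may differ.
    protected void reindex(String fromIndexName, String toIndexName) throws Exception {
        // Build a standard Reindex API body copying every document from the source index to the destination index.
        String body = "{ \"source\": { \"index\": \"" + fromIndexName + "\" }, \"dest\": { \"index\": \"" + toIndexName + "\" } }";
        Response response = makeRequest(
            client(),
            "POST",
            "_reindex?refresh=true",
            null,
            toHttpEntity(body),
            ImmutableList.of(new BasicHeader(HttpHeaders.USER_AGENT, "Kibana"))
        );
        // A 200 response means the synchronous reindex request completed.
        assertEquals(200, response.getStatusLine().getStatusCode());
    }

With such a helper, the assertEquals(1, getDocCount(INDEX_NAME)) check above verifies that the reindexed document actually landed in the pipeline-backed index.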
@@ -205,23 +228,9 @@ private void createTextChunkingIndex(String indexName, String pipelineName) thro
         createIndexWithConfiguration(indexName, Files.readString(Path.of(indexSettingsURLPath.toURI())), pipelineName);
     }

-    private void ingestDocument(String documentPath) throws Exception {
-        URL documentURLPath = classLoader.getResource(documentPath);
+    private String getDocumentFromFilePath(String filePath) throws Exception {
+        URL documentURLPath = classLoader.getResource(filePath);
         Objects.requireNonNull(documentURLPath);
-        String document = Files.readString(Path.of(documentURLPath.toURI()));
-        Response response = makeRequest(
-            client(),
-            "POST",
-            INDEX_NAME + "/_doc?refresh",
-            null,
-            toHttpEntity(document),
-            ImmutableList.of(new BasicHeader(HttpHeaders.USER_AGENT, "Kibana"))
-        );
-        Map<String, Object> map = XContentHelper.convertToMap(
-            XContentType.JSON.xContent(),
-            EntityUtils.toString(response.getEntity()),
-            false
-        );
-        assertEquals("created", map.get("result"));
+        return Files.readString(Path.of(documentURLPath.toURI()));
     }
 }
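For reference, the change splits the old private ingestDocument(String documentPath) helper into two pieces: getDocumentFromFilePath(...) now only loads the raw request body from the test resources, while the ingestDocument(indexName, document) call used throughout the tests takes an explicit target index, which is what lets the reindex test ingest into an index other than INDEX_NAME. That two-argument helper is not part of this file after the change; a minimal sketch, assuming it lives in the shared test base (e.g. BaseNeuralSearchIT) and simply parameterizes the removed code above by index name, might be:

    // Hypothetical sketch: the removed single-argument helper, parameterized by index name.
    protected void ingestDocument(String indexName, String document) throws Exception {
        // Index the document and refresh so it is immediately searchable by the assertions.
        Response response = makeRequest(
            client(),
            "POST",
            indexName + "/_doc?refresh",
            null,
            toHttpEntity(document),
            ImmutableList.of(new BasicHeader(HttpHeaders.USER_AGENT, "Kibana"))
        );
        Map<String, Object> map = XContentHelper.convertToMap(
            XContentType.JSON.xContent(),
            EntityUtils.toString(response.getEntity()),
            false
        );
        // The indexing request should report that a new document was created.
        assertEquals("created", map.get("result"));
    }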