Skip to content

Commit bb99ce1

Browse files
authored
Support configuring reusing existing embedding for the semantic field. (opensearch-project#1480)
Signed-off-by: Bo Zhang <[email protected]>
1 parent 901307f commit bb99ce1

19 files changed

+654
-38
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
1414
- [Semantic Field] Allow configuring prune strategies for sparse encoding in semantic fields. ([#1434](https://github.com/opensearch-project/neural-search/pull/1434))
1515
- Enable inner hits within collapse parameter for hybrid query ([#1447](https://github.com/opensearch-project/neural-search/pull/1447))
1616
- [Semantic Field] Support configuring the chunking strategies through the semantic field. ([#1446](https://github.com/opensearch-project/neural-search/pull/1446))
17+
- [Semantic Field] Support configuring reusing existing embedding for the semantic field. ([#1480](https://github.com/opensearch-project/neural-search/pull/1480))
1718

1819
### Bug Fixes
1920
- Fix for collapse bug with knn query not deduplicating results ([#1413](https://github.com/opensearch-project/neural-search/pull/1413))
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
package org.opensearch.neuralsearch.constants;
6+
7+
/**
8+
* Constants for the names of the metadata fields in the SourceAndMetadata of a doc.
9+
*/
10+
public class DocFieldNames {
11+
/**
12+
* Name of the field holding the index of the doc.
13+
*/
14+
public static final String INDEX_FIELD = "_index";
15+
/**
16+
* Name of the field holding the id of the doc.
17+
*/
18+
public static final String ID_FIELD = "_id";
19+
}

src/main/java/org/opensearch/neuralsearch/constants/SemanticFieldConstants.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,4 +59,11 @@ public class SemanticFieldConstants {
5959
* {@link org.opensearch.neuralsearch.mapper.dto.SparseEncodingConfig}
6060
*/
6161
public static final String SPARSE_ENCODING_CONFIG = "sparse_encoding_config";
62+
63+
/**
64+
* Name of the field to configure if we should skip the embedding generation for the semantic field with existing
65+
* reusable embedding. We think the embedding is reusable if it exists and the semantic field value and the
66+
* model are not changed.
67+
*/
68+
public static final String SKIP_EXISTING_EMBEDDING = "skip_existing_embedding";
6269
}

src/main/java/org/opensearch/neuralsearch/mapper/SemanticFieldMapper.java

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.DENSE_EMBEDDING_CONFIG;
4040
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.MODEL_ID;
4141
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.RAW_FIELD_TYPE;
42+
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SKIP_EXISTING_EMBEDDING;
4243
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SEARCH_MODEL_ID;
4344
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SEMANTIC_INFO_FIELD_NAME;
4445
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SEMANTIC_FIELD_SEARCH_ANALYZER;
@@ -196,6 +197,14 @@ public static class Builder extends ParametrizedFieldMapper.Builder {
196197
}
197198
}, (value) -> value == null ? null : value.toString());
198199

200+
@Getter
201+
protected final Parameter<Boolean> skipExistingEmbedding = Parameter.boolParam(
202+
SKIP_EXISTING_EMBEDDING,
203+
true,
204+
m -> ((SemanticFieldMapper) m).semanticParameters.getSkipExistingEmbedding(),
205+
false
206+
);
207+
199208
@Setter
200209
protected ParametrizedFieldMapper.Builder delegateBuilder;
201210

@@ -213,7 +222,8 @@ protected List<Parameter<?>> getParameters() {
213222
chunkingConfig,
214223
semanticFieldSearchAnalyzer,
215224
denseEmbeddingConfig,
216-
sparseEncodingConfig
225+
sparseEncodingConfig,
226+
skipExistingEmbedding
217227
);
218228
}
219229

@@ -244,6 +254,7 @@ public SemanticParameters getSemanticParameters() {
244254
.semanticFieldSearchAnalyzer(semanticFieldSearchAnalyzer.getValue())
245255
.denseEmbeddingConfig(denseEmbeddingConfig.getValue())
246256
.sparseEncodingConfig(sparseEncodingConfig.getValue())
257+
.skipExistingEmbedding(skipExistingEmbedding.getValue())
247258
.build();
248259
}
249260
}

src/main/java/org/opensearch/neuralsearch/mapper/dto/SemanticParameters.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,6 @@ public boolean isChunkingEnabled() {
3030
}
3131
return chunkingConfig.isEnabled();
3232
}
33+
34+
private final Boolean skipExistingEmbedding;
3335
}

src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,8 @@ public Map<String, Processor.Factory> getSystemIngestProcessors(Processor.Parame
311311
clientAccessor,
312312
parameters.env,
313313
parameters.ingestService.getClusterService(),
314-
parameters.analysisRegistry
314+
parameters.analysisRegistry,
315+
parameters.client
315316
)
316317
);
317318
}

src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@
5252
import org.opensearch.neuralsearch.util.prune.PruneType;
5353
import org.opensearch.neuralsearch.util.prune.PruneUtils;
5454

55+
import static org.opensearch.neuralsearch.constants.DocFieldNames.ID_FIELD;
56+
import static org.opensearch.neuralsearch.constants.DocFieldNames.INDEX_FIELD;
57+
5558
/**
5659
* The abstract class for text processing use cases. Users provide a field name map and a model id.
5760
* During ingestion, the processor will use the corresponding model to inference the input texts,
@@ -62,8 +65,6 @@ public abstract class InferenceProcessor extends AbstractBatchingProcessor {
6265

6366
public static final String MODEL_ID_FIELD = "model_id";
6467
public static final String FIELD_MAP_FIELD = "field_map";
65-
public static final String INDEX_FIELD = "_index";
66-
public static final String ID_FIELD = "_id";
6768
public static final String SKIP_EXISTING = "skip_existing";
6869
public static final boolean DEFAULT_SKIP_EXISTING = false;
6970
private static final BiFunction<Object, Object, Object> REMAPPING_FUNCTION = (v1, v2) -> {

src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@
3232
import org.opensearch.neuralsearch.util.prune.PruneUtils;
3333
import org.opensearch.transport.client.OpenSearchClient;
3434

35+
import static org.opensearch.neuralsearch.constants.DocFieldNames.ID_FIELD;
36+
import static org.opensearch.neuralsearch.constants.DocFieldNames.INDEX_FIELD;
37+
3538
/**
3639
* This processor is used for user input data text sparse encoding processing, model_id can be used to indicate which model the user uses,
3740
* and field_map can be used to indicate which fields need text embedding and the corresponding keys for the sparse encoding results.

src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@
2727
import org.opensearch.neuralsearch.stats.events.EventStatsManager;
2828
import org.opensearch.neuralsearch.stats.events.EventStatName;
2929

30+
import static org.opensearch.neuralsearch.constants.DocFieldNames.ID_FIELD;
31+
import static org.opensearch.neuralsearch.constants.DocFieldNames.INDEX_FIELD;
32+
3033
/**
3134
* This processor is used for user input data text embedding processing, model_id can be used to indicate which model the user uses,
3235
* and field_map can be used to indicate which fields need text embedding and the corresponding keys for the text embedding results.

src/main/java/org/opensearch/neuralsearch/processor/dto/SemanticFieldInfo.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ public class SemanticFieldInfo {
4545
* The full path to the semantic field in the index mapping
4646
*/
4747
private String semanticFieldFullPathInMapping;
48+
/**
49+
* The full path to the semantic field in the doc
50+
*/
51+
private String semanticFieldFullPathInDoc;
4852
/**
4953
* The full path to the semantic info fields in the doc. The path in the doc will contain the index of the inter
5054
* nested object.
@@ -67,6 +71,17 @@ public class SemanticFieldInfo {
6771

6872
private SparseEncodingConfig sparseEncodingConfig;
6973

74+
/**
75+
* If we should skip the embedding generation for the semantic field with an existing reusable embedding. We think the
76+
* embedding is reusable if it exists and semantic field value and model are not changed.
77+
*/
78+
private Boolean skipExistingEmbedding;
79+
80+
/**
81+
* The id of the doc of the semantic field
82+
*/
83+
private String docId;
84+
7085
/**
7186
* @return full path to the chunks field of the semantic field in a doc
7287
*/

0 commit comments

Comments
 (0)