|
| 1 | +/* |
| 2 | + * Copyright OpenSearch Contributors |
| 3 | + * SPDX-License-Identifier: Apache-2.0 |
| 4 | + */ |
| 5 | + |
| 6 | +package org.opensearch.knn.index.codec.derivedsource; |
| 7 | + |
| 8 | +import lombok.extern.log4j.Log4j2; |
| 9 | +import org.apache.lucene.document.KnnByteVectorField; |
| 10 | +import org.apache.lucene.document.KnnFloatVectorField; |
| 11 | +import org.apache.lucene.index.IndexableField; |
| 12 | +import org.apache.lucene.util.BytesRef; |
| 13 | +import org.opensearch.common.collect.Tuple; |
| 14 | +import org.opensearch.common.io.stream.BytesStreamOutput; |
| 15 | +import org.opensearch.common.xcontent.XContentHelper; |
| 16 | +import org.opensearch.common.xcontent.XContentType; |
| 17 | +import org.opensearch.common.xcontent.support.XContentMapValues; |
| 18 | +import org.opensearch.core.index.shard.ShardId; |
| 19 | +import org.opensearch.core.xcontent.MediaType; |
| 20 | +import org.opensearch.core.xcontent.MediaTypeRegistry; |
| 21 | +import org.opensearch.core.xcontent.XContentBuilder; |
| 22 | +import org.opensearch.index.engine.Engine; |
| 23 | +import org.opensearch.index.mapper.ParseContext; |
| 24 | +import org.opensearch.index.shard.IndexingOperationListener; |
| 25 | +import org.opensearch.knn.index.VectorDataType; |
| 26 | +import org.opensearch.knn.index.mapper.KNNVectorFieldMapperUtil; |
| 27 | + |
| 28 | +import java.io.IOException; |
| 29 | +import java.util.ArrayList; |
| 30 | +import java.util.HashMap; |
| 31 | +import java.util.Iterator; |
| 32 | +import java.util.List; |
| 33 | +import java.util.Locale; |
| 34 | +import java.util.Map; |
| 35 | +import java.util.function.Function; |
| 36 | + |
| 37 | +import static org.opensearch.index.mapper.SourceFieldMapper.RECOVERY_SOURCE_NAME; |
| 38 | + |
| 39 | +/** |
| 40 | + * Before applying the indexing operation, we need to ensure that the source that gets added to the translog matches |
| 41 | + * exactly what we will reconstruct. To do this, we reconstruct the source from the binary source and then apply the |
| 42 | + * transformation on top of it and then set the source back. |
| 43 | + */ |
| 44 | +@Log4j2 |
| 45 | +public class DerivedSourceIndexOperationListener implements IndexingOperationListener { |
| 46 | + |
| 47 | + @Override |
| 48 | + public Engine.Index preIndex(ShardId shardId, Engine.Index operation) { |
| 49 | + // If recovery source is enabled, we do not need to modify the translog source. The recovery source will be the |
| 50 | + // original, user provided source |
| 51 | + if (isRecoverySourceEnabled(operation)) { |
| 52 | + return operation; |
| 53 | + } |
| 54 | + Tuple<? extends MediaType, Map<String, Object>> originalSource = XContentHelper.convertToMap( |
| 55 | + operation.parsedDoc().source(), |
| 56 | + true, |
| 57 | + operation.parsedDoc().getMediaType() |
| 58 | + ); |
| 59 | + Map<String, Object> derivedSource = createInjectTransformer(operation).apply(originalSource.v2()); |
| 60 | + |
| 61 | + try (BytesStreamOutput bStream = new BytesStreamOutput();) { |
| 62 | + XContentBuilder builder = MediaTypeRegistry.contentBuilder(originalSource.v1(), bStream).map(derivedSource); |
| 63 | + builder.close(); |
| 64 | + operation.parsedDoc().setSource(bStream.bytes(), XContentType.valueOf(originalSource.v1().subtype().toUpperCase(Locale.ROOT))); |
| 65 | + } catch (IOException e) { |
| 66 | + throw new RuntimeException(e); |
| 67 | + } |
| 68 | + return operation; |
| 69 | + } |
| 70 | + |
| 71 | + private Function<Map<String, Object>, Map<String, Object>> createInjectTransformer(Engine.Index operation) { |
| 72 | + Map<String, List<Object>> injectedVectors = new HashMap<>(); |
| 73 | + |
| 74 | + // For each document, we get the relevant vector fields to compute the injection logic |
| 75 | + for (ParseContext.Document document : operation.parsedDoc().docs()) { |
| 76 | + for (Iterator<IndexableField> it = document.iterator(); it.hasNext();) { |
| 77 | + IndexableField indexableField = it.next(); |
| 78 | + if (indexableField instanceof KnnFloatVectorField knnVectorFieldType) { |
| 79 | + injectedVectors.computeIfAbsent(indexableField.name(), k -> new ArrayList<>()) |
| 80 | + .add(formatVector(VectorDataType.FLOAT, knnVectorFieldType.vectorValue())); |
| 81 | + } |
| 82 | + |
| 83 | + if (indexableField instanceof KnnByteVectorField knnByteVectorField) { |
| 84 | + injectedVectors.computeIfAbsent(indexableField.name(), k -> new ArrayList<>()) |
| 85 | + .add(formatVector(VectorDataType.BYTE, knnByteVectorField.vectorValue())); |
| 86 | + } |
| 87 | + } |
| 88 | + } |
| 89 | + Map<String, Function<Object, Object>> injectTransformers = new HashMap<>(); |
| 90 | + for (Map.Entry<String, List<Object>> entry : injectedVectors.entrySet()) { |
| 91 | + Iterator<Object> iterator = entry.getValue().iterator(); |
| 92 | + injectTransformers.put(entry.getKey(), (Object o) -> o == null ? o : iterator.next()); |
| 93 | + } |
| 94 | + return XContentMapValues.transform(injectTransformers, true); |
| 95 | + } |
| 96 | + |
| 97 | + private boolean isRecoverySourceEnabled(Engine.Index operation) { |
| 98 | + return operation.parsedDoc().rootDoc().getField(RECOVERY_SOURCE_NAME) != null; |
| 99 | + } |
| 100 | + |
| 101 | + protected Object formatVector(VectorDataType vectorDataType, Object vectorValue) { |
| 102 | + if (vectorValue instanceof byte[]) { |
| 103 | + BytesRef vectorBytesRef = new BytesRef((byte[]) vectorValue); |
| 104 | + return KNNVectorFieldMapperUtil.deserializeStoredVector(vectorBytesRef, vectorDataType); |
| 105 | + } |
| 106 | + return vectorValue; |
| 107 | + } |
| 108 | +} |
0 commit comments