opensearch-project · oaganesh · Apr 2, 2025 · Apr 14, 2025 · Apr 18, 2025 · Apr 18, 2025
@@ -25,6 +25,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 ## [Unreleased 2.x](https://github.com/opensearch-project/k-NN/compare/2.19...2.x)
 ### Features
+* [Vector Profiler] Adding basic generic vector profiler implementation and tests. [#2624](https://github.com/opensearch-project/k-NN/pull/2624)
+* [Vector Profiler] Adding main segment implementation for API and indexing. [#2653](https://github.com/opensearch-project/k-NN/pull/2653)
+
 ### Enhancements
 ### Bug Fixes
 ### Infrastructure

@@ -321,6 +321,7 @@ dependencies {
     testFixturesImplementation group: 'net.minidev', name: 'json-smart', version: "${versions.json_smart}"
     testFixturesImplementation "org.opensearch:common-utils:${version}"
     implementation 'com.github.oshi:oshi-core:6.4.13'
+    implementation 'org.apache.commons:commons-math3:3.6.1'
     api "net.java.dev.jna:jna:${versions.jna}"
     api "net.java.dev.jna:jna-platform:${versions.jna}"
     // OpenSearch core is using slf4j 1.7.36. Therefore, we cannot change the version here.

@@ -80,6 +80,7 @@ public class KNNConstants {
     public static final String RADIAL_SEARCH_KEY = "radial_search";
     public static final String MODEL_VERSION = "model_version";
     public static final String QUANTIZATION_STATE_FILE_SUFFIX = "osknnqstate";
+    public static final String SEGMENT_PROFILE_STATE_FILE_SUFFIX = "segpfstate";
 
     // Lucene specific constants
     public static final String LUCENE_NAME = "lucene";

@@ -9,6 +9,9 @@
 import lombok.AllArgsConstructor;
 import lombok.Getter;
 import lombok.extern.log4j.Log4j2;
+import org.apache.commons.math3.stat.descriptive.AggregateSummaryStatistics;
+import org.apache.commons.math3.stat.descriptive.StatisticalSummaryValues;
+import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
@@ -28,6 +31,8 @@
 import org.opensearch.knn.index.memory.NativeMemoryEntryContext;
 import org.opensearch.knn.index.memory.NativeMemoryLoadStrategy;
 import org.opensearch.knn.index.engine.KNNEngine;
+import org.opensearch.knn.index.query.SegmentProfilerUtil;
+import org.opensearch.knn.profiler.SegmentProfilerState;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -82,6 +87,83 @@ public String getIndexName() {
         return indexShard.shardId().getIndexName();
     }
 
+    /**
+     * Profile a vector field in this shard by aggregating the per-segment profiler states into one
+     * statistical summary per vector dimension.
+     * <p>
+     * Profiling is best-effort: a segment whose profile state cannot be obtained is logged and skipped.
+     * If no segment yields a state, or if anything fails while aggregating, an empty list is returned so
+     * that callers never observe a partially aggregated result.
+     *
+     * @param fieldName The name of the vector field to profile
+     * @return List of statistical summaries for each dimension; empty if nothing could be profiled
+     */
+    // TODO: Write unit tests to ensure that the segment statistic aggregation is correct.
+    public List<StatisticalSummaryValues> profile(String fieldName) {
+        try (Engine.Searcher searcher = indexShard.acquireSearcher("knn-profile")) {
+            log.info("[KNN] Beginning profiling for field: {} in shard: {}", fieldName, indexShard.shardId());
+
+            // For each leaf, collect the profile. A failing segment must not abort the whole shard.
+            final List<SegmentProfilerState> segmentLevelProfilerStates = new ArrayList<>();
+            for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
+                try {
+                    log.debug("[KNN] Processing leaf reader for segment: {}", leaf.reader());
+                    segmentLevelProfilerStates.add(SegmentProfilerUtil.getSegmentProfileState(leaf.reader(), fieldName));
+                    log.debug("[KNN] Successfully obtained segment profile state");
+                } catch (Exception e) {
+                    log.error("[KNN] Error profiling segment: {}", e.getMessage(), e);
+                }
+            }
+
+            if (segmentLevelProfilerStates.isEmpty()) {
+                log.info("[KNN] No segment profiles were collected for field: {} in shard: {}", fieldName, indexShard.shardId());
+                return new ArrayList<>();
+            }
+
+            log.debug("[KNN] Collected {} segment profiles", segmentLevelProfilerStates.size());
+
+            // The dimension is taken from the first collected segment state.
+            final int dimension = segmentLevelProfilerStates.get(0).getDimension();
+            log.debug("[KNN] Vector dimension: {}", dimension);
+
+            // Transpose our list to aggregate per dimension. The result is built locally and only
+            // returned once every dimension has been aggregated successfully.
+            final List<StatisticalSummaryValues> shardVectorProfile = new ArrayList<>();
+            for (int i = 0; i < dimension; i++) {
+                final int dimensionId = i;
+                List<SummaryStatistics> transposed = segmentLevelProfilerStates.stream()
+                    .map(state -> state.getStatistics().get(dimensionId))
+                    .collect(Collectors.toList());
+
+                shardVectorProfile.add(AggregateSummaryStatistics.aggregate(transposed));
+            }
+
+            // Per-dimension output is one line per dimension, so it is only produced at debug level.
+            if (log.isDebugEnabled()) {
+                for (int i = 0; i < shardVectorProfile.size(); i++) {
+                    StatisticalSummaryValues stats = shardVectorProfile.get(i);
+                    log.debug(
+                        "[KNN] Dimension {}: count={}, min={}, max={}, mean={}, sum={}, variance={}, std_deviation={}",
+                        i,
+                        stats.getN(),
+                        stats.getMin(),
+                        stats.getMax(),
+                        stats.getMean(),
+                        stats.getSum(),
+                        stats.getVariance(),
+                        stats.getStandardDeviation()
+                    );
+                }
+            }
+
+            log.info("[KNN] Profiling completed for field: {} in shard: {}", fieldName, indexShard.shardId());
+            return shardVectorProfile;
+        } catch (Exception e) {
+            log.error(
+                "[KNN] Critical error during profiling for field: {} in shard: {}: {}",
+                fieldName,
+                indexShard.shardId(),
+                e.getMessage(),
+                e
+            );
+            // Do not hand back a partially aggregated profile.
+            return new ArrayList<>();
+        }
+    }
+
     /**
      * Load all of the k-NN segments for this shard into the cache.
      *
@@ -235,4 +317,13 @@ static class EngineFileContext {
         private final VectorDataType vectorDataType;
         private final SegmentInfo segmentInfo;
     }
+
+    /**
+     * Convenience overload of {@link #profile(String)} that profiles the field named
+     * {@code "my_vector_field"}.
+     *
+     * @return List of statistical summaries for each dimension
+     */
+    public List<StatisticalSummaryValues> profile() {
+        final String defaultFieldName = "my_vector_field";
+        return profile(defaultFieldName);
+    }
 }
@@ -12,6 +12,7 @@
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.store.IndexOutput;
 import org.opensearch.knn.common.KNNConstants;
+import org.opensearch.knn.profiler.SegmentProfilerState;
 import org.opensearch.knn.quantization.models.quantizationState.QuantizationState;
 
 import java.io.IOException;
@@ -85,6 +86,20 @@ public void writeState(int fieldNumber, QuantizationState quantizationState) thr
         fieldQuantizationStates.add(new FieldQuantizationState(fieldNumber, stateBytes, position));
     }
 
+    /**
+     * Serializes a segment profiler state and appends the bytes to the output.
+     * <p>
+     * The output position at which the bytes begin is captured before writing, and an entry holding the
+     * field number, the bytes and that position is added to {@code fieldQuantizationStates}.
+     * NOTE(review): this is the same list the quantization-state overload appends to; confirm that
+     * readers can tell the two kinds of entry apart for a field that has both.
+     *
+     * @param fieldNumber field number
+     * @param segmentProfilerState segment profiler state
+     * @throws IOException could be thrown while writing
+     */
+    public void writeState(int fieldNumber, SegmentProfilerState segmentProfilerState) throws IOException {
+        final long startOffset = output.getFilePointer();
+        final byte[] serializedState = segmentProfilerState.toByteArray();
+        output.writeBytes(serializedState, serializedState.length);
+
+        final FieldQuantizationState entry = new FieldQuantizationState(fieldNumber, serializedState, startOffset);
+        fieldQuantizationStates.add(entry);
+    }
+
     /**
      * Writes index footer and other index information for parsing later
      * @throws IOException could be thrown while writing

@@ -35,6 +35,10 @@
 import org.opensearch.knn.index.quantizationservice.QuantizationService;
 import org.opensearch.knn.memoryoptsearch.VectorSearcher;
 import org.opensearch.knn.memoryoptsearch.VectorSearcherFactory;
+import org.opensearch.knn.profiler.KNN990ProfileStateReader;
+import org.opensearch.knn.profiler.SegmentProfileKNNCollector;
+import org.opensearch.knn.profiler.SegmentProfileStateReadConfig;
+import org.opensearch.knn.profiler.SegmentProfilerState;
 import org.opensearch.knn.quantization.models.quantizationState.QuantizationState;
 import org.opensearch.knn.quantization.models.quantizationState.QuantizationStateCacheManager;
 import org.opensearch.knn.quantization.models.quantizationState.QuantizationStateReadConfig;
@@ -163,6 +167,14 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits
             return;
         }
 
+        if (knnCollector instanceof SegmentProfileKNNCollector) {
+            SegmentProfilerState segmentProfileState = KNN990ProfileStateReader.read(
+                new SegmentProfileStateReadConfig(segmentReadState, field)
+            );
+            ((SegmentProfileKNNCollector) knnCollector).setSegmentProfilerState(segmentProfileState);
+            return;
+        }
+
         if (trySearchWithMemoryOptimizedSearch(field, target, knnCollector, acceptDocs, true)) {
             return;
         }

@@ -29,6 +29,7 @@
 import org.opensearch.knn.index.quantizationservice.QuantizationService;
 import org.opensearch.knn.index.vectorvalues.KNNVectorValues;
 import org.opensearch.knn.plugin.stats.KNNGraphValue;
+import org.opensearch.knn.profiler.SegmentProfilerState;
 import org.opensearch.knn.quantization.models.quantizationParams.QuantizationParams;
 import org.opensearch.knn.quantization.models.quantizationState.QuantizationState;
 
@@ -107,6 +108,7 @@ public void flush(int maxDoc, final Sorter.DocMap sortMap) throws IOException {
                 field.getVectors()
             );
             final QuantizationState quantizationState = train(field.getFieldInfo(), knnVectorValuesSupplier, totalLiveDocs);
+            profile(field.getFieldInfo(), knnVectorValuesSupplier, totalLiveDocs);
             // should skip graph building only for non quantization use case and if threshold is met
             if (quantizationState == null && shouldSkipBuildingVectorDataStructure(totalLiveDocs)) {
                 log.info(
@@ -150,6 +152,10 @@ public void mergeOneField(final FieldInfo fieldInfo, final MergeState mergeState
         }
 
         final QuantizationState quantizationState = train(fieldInfo, knnVectorValuesSupplier, totalLiveDocs);
+
+        // Write the segment profile state to the directory
+        profile(fieldInfo, knnVectorValuesSupplier, totalLiveDocs);
+
         // should skip graph building only for non quantization use case and if threshold is met
         if (quantizationState == null && shouldSkipBuildingVectorDataStructure(totalLiveDocs)) {
             log.info(
@@ -188,6 +194,7 @@ public void finish() throws IOException {
         if (quantizationStateWriter != null) {
             quantizationStateWriter.writeFooter();
         }
+
         flatVectorsWriter.finish();
     }
 
@@ -241,6 +248,23 @@ private QuantizationState train(
         return quantizationState;
     }
 
+    private SegmentProfilerState profile(
+        final FieldInfo fieldInfo,
+        final Supplier<KNNVectorValues<?>> knnVectorValuesSupplier,
+        final int totalLiveDocs
+    ) throws IOException {
+
+        SegmentProfilerState segmentProfilerState = null;
+        if (totalLiveDocs > 0) {
+            // TODO:Refactor to another init
+            initQuantizationStateWriterIfNecessary();
+            SegmentProfilerState profileResultForSegment = SegmentProfilerState.profileVectors(knnVectorValuesSupplier);
+            quantizationStateWriter.writeState(fieldInfo.getFieldNumber(), profileResultForSegment);
+        }
+
+        return segmentProfilerState;
+    }
+
     /**
      * The {@link KNNVectorValues} will be exhausted after this function run. So make sure that you are not sending the
      * vectorsValues object which you plan to use later

@@ -0,0 +1,37 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.knn.index.query;
+
+import lombok.experimental.UtilityClass;
+import org.apache.lucene.index.LeafReader;
+import org.opensearch.knn.profiler.SegmentProfileKNNCollector;
+import org.opensearch.knn.profiler.SegmentProfilerState;
+
+import java.io.IOException;
+import java.util.Locale;
+
+/**
+ * Static helper for fetching the {@link SegmentProfilerState} of a field from a single segment.
+ */
+@UtilityClass
+public class SegmentProfilerUtil {
+
+    /**
+     * Retrieves the segment profiler state of a field from the given leaf reader.
+     * <p>
+     * A {@link SegmentProfileKNNCollector} is passed to {@code searchNearestVectors} with an empty target
+     * vector and no accept-docs filter, and the state found on the collector afterwards is returned.
+     *
+     * @param leafReader The leaf reader to query
+     * @param fieldName The field name to profile
+     * @return The segment profiler state
+     * @throws IOException If there's an error reading the segment
+     * @throws IllegalStateException If the collector holds no state after the call
+     */
+    public static SegmentProfilerState getSegmentProfileState(final LeafReader leafReader, String fieldName) throws IOException {
+        final SegmentProfileKNNCollector collector = new SegmentProfileKNNCollector();
+        leafReader.searchNearestVectors(fieldName, new float[0], collector, null);
+
+        final SegmentProfilerState state = collector.getSegmentProfilerState();
+        if (state != null) {
+            return state;
+        }
+        throw new IllegalStateException(String.format(Locale.ROOT, "No segment state found for field %s", fieldName));
+    }
+}
@@ -57,6 +57,9 @@
 import org.opensearch.knn.plugin.script.KNNScoringScriptEngine;
 import org.opensearch.knn.plugin.search.KNNConcurrentSearchRequestDecider;
 import org.opensearch.knn.plugin.stats.KNNStats;
+//import org.opensearch.knn.plugin.transport.*;
+import org.opensearch.knn.plugin.transport.KNNProfileTransportAction;
+import org.opensearch.knn.plugin.transport.KNNProfileAction;
 import org.opensearch.knn.plugin.transport.ClearCacheAction;
 import org.opensearch.knn.plugin.transport.ClearCacheTransportAction;
 import org.opensearch.knn.plugin.transport.DeleteModelAction;
@@ -82,6 +85,7 @@
 import org.opensearch.knn.plugin.transport.UpdateModelGraveyardTransportAction;
 import org.opensearch.knn.plugin.transport.UpdateModelMetadataAction;
 import org.opensearch.knn.plugin.transport.UpdateModelMetadataTransportAction;
+import org.opensearch.knn.profiler.RestKNNProfileHandler;
 import org.opensearch.knn.quantization.models.quantizationState.QuantizationStateCache;
 import org.opensearch.knn.training.TrainingJobClusterStateListener;
 import org.opensearch.knn.training.TrainingJobRunner;
@@ -249,6 +253,12 @@ public List<RestHandler> getRestHandlers(
             clusterService,
             indexNameExpressionResolver
         );
+        RestKNNProfileHandler restKNNProfileHandler = new RestKNNProfileHandler(
+            settings,
+            restController,
+            clusterService,
+            indexNameExpressionResolver
+        );
         RestGetModelHandler restGetModelHandler = new RestGetModelHandler();
         RestDeleteModelHandler restDeleteModelHandler = new RestDeleteModelHandler();
         RestTrainModelHandler restTrainModelHandler = new RestTrainModelHandler();
@@ -258,6 +268,7 @@ public List<RestHandler> getRestHandlers(
         return ImmutableList.of(
             restKNNStatsHandler,
             restKNNWarmupHandler,
+            restKNNProfileHandler,
             restGetModelHandler,
             restDeleteModelHandler,
             restTrainModelHandler,
@@ -274,6 +285,7 @@ public List<RestHandler> getRestHandlers(
         return Arrays.asList(
             new ActionHandler<>(KNNStatsAction.INSTANCE, KNNStatsTransportAction.class),
             new ActionHandler<>(KNNWarmupAction.INSTANCE, KNNWarmupTransportAction.class),
+            new ActionHandler<>(KNNProfileAction.INSTANCE, KNNProfileTransportAction.class),
             new ActionHandler<>(UpdateModelMetadataAction.INSTANCE, UpdateModelMetadataTransportAction.class),
             new ActionHandler<>(TrainingJobRouteDecisionInfoAction.INSTANCE, TrainingJobRouteDecisionInfoTransportAction.class),
             new ActionHandler<>(GetModelAction.INSTANCE, GetModelTransportAction.class),

@@ -0,0 +1,20 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.knn.plugin.transport;
+
+import org.opensearch.action.ActionType;
+
+/**
+ * Action for profiling KNN vectors in an index.
+ * <p>
+ * Declares the action name and binds {@link KNNProfileResponse} as the response type. The class is
+ * used through the shared {@link #INSTANCE}; its constructor is private.
+ */
+public class KNNProfileAction extends ActionType<KNNProfileResponse> {
+    // Name under which this action is registered.
+    public static final String NAME = "indices:knn/vector/profile";
+    // Single shared instance of this action type.
+    public static final KNNProfileAction INSTANCE = new KNNProfileAction();
+
+    // Private: callers use INSTANCE. Passes the action name and the response reader to ActionType.
+    private KNNProfileAction() {
+        super(NAME, KNNProfileResponse::new);
+    }
+}