Skip to content
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
10fa912
Adding basic generic vector profiler implementation and tests. (#2624)
oaganesh Apr 2, 2025
b955792
Extend QuantizationStateWriter to Vector Profiler
markwu-sde Apr 14, 2025
fefe7c8
Updating main implementation for serialization and aggregation for pr…
oaganesh Apr 18, 2025
49cadd5
Updating Changelog file.
oaganesh Apr 18, 2025
d844f27
Updating import changes.
oaganesh Apr 18, 2025
0b49596
Optimize first child lookup for nested docs (#2637)
jmazanec15 Apr 2, 2025
9f49e51
Add cmake policy flag (#2645)
owenhalpert Apr 2, 2025
04f3265
Add github action to run ITs against remote index builder (#2620)
anntians Apr 3, 2025
a5356f2
Enhance derived source its (#2648)
jmazanec15 Apr 6, 2025
bd9d2d1
[Remote Vector Index Build] Add metric collection (#2615)
owenhalpert Apr 7, 2025
37b3912
Update engine for version 2.19 or above (#2498)
VijayanB Apr 7, 2025
04adc36
Add multi-vector-support faiss patch to IndexHNSW::search_level_0 (#2…
anntians Apr 9, 2025
a0870a2
Combine method and lucene mappers to EngineFieldMapper (#2646)
kotwanikunal Apr 9, 2025
88774a9
3.0.0 Beta Release Notes (#2656)
Vikasht34 Apr 10, 2025
4201fb0
Fix build due to phasing off SecurityManager usage in favor of Java A…
reta Apr 14, 2025
105b970
Prevent derived source from open reader per transform (#2652)
jmazanec15 Apr 14, 2025
ac813ff
Removing redundant type conversions for script scoring for hamming sp…
kasundra07 Apr 14, 2025
eb2a30c
Explain API changes for Exact/ANN/Radial/Disk based KNN search (#2403)
neetikasinghal Apr 15, 2025
5165dfc
Fix a bug to save the best matching similarity function in meta info.…
0ctopus13prime Apr 15, 2025
9feb203
[BUGFIX] FIX nested vector query at efficient filter scenarios (#2641)
luyuncheng Apr 16, 2025
7a509a3
Fix concurrency bug to share non-thread safe graph structure. (#2663)
0ctopus13prime Apr 17, 2025
044c7d1
Switch derived to default (#2664)
jmazanec15 Apr 17, 2025
03f4e18
Change skip building vector data structure log to debug level (#2639)
owenhalpert Apr 17, 2025
d4022d3
Fix quantization cache bugs. (#2666)
0ctopus13prime Apr 18, 2025
23622da
Extend QuantizationStateWriter to Vector Profiler
markwu-sde Apr 14, 2025
b9eb1b8
Adding basic generic vector profiler implementation and tests. (#2624)
oaganesh Apr 2, 2025
9802027
Applying spotless changes.
oaganesh Apr 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

## [Unreleased 2.x](https://github.com/opensearch-project/k-NN/compare/2.19...2.x)
### Features
* [Vector Profiler] Adding basic generic vector profiler implementation and tests. [#2624](https://github.com/opensearch-project/k-NN/pull/2624)
* [Vector Profiler] Adding main segment implementation for API and indexing. [#2653](https://github.com/opensearch-project/k-NN/pull/2653)

### Enhancements
### Bug Fixes
### Infrastructure
Expand Down
1 change: 1 addition & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,7 @@ dependencies {
testFixturesImplementation group: 'net.minidev', name: 'json-smart', version: "${versions.json_smart}"
testFixturesImplementation "org.opensearch:common-utils:${version}"
implementation 'com.github.oshi:oshi-core:6.4.13'
implementation 'org.apache.commons:commons-math3:3.6.1'
api "net.java.dev.jna:jna:${versions.jna}"
api "net.java.dev.jna:jna-platform:${versions.jna}"
// OpenSearch core is using slf4j 1.7.36. Therefore, we cannot change the version here.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ public class KNNConstants {
public static final String RADIAL_SEARCH_KEY = "radial_search";
public static final String MODEL_VERSION = "model_version";
public static final String QUANTIZATION_STATE_FILE_SUFFIX = "osknnqstate";
public static final String SEGMENT_PROFILE_STATE_FILE_SUFFIX = "segpfstate";

// Lucene specific constants
public static final String LUCENE_NAME = "lucene";
Expand Down
91 changes: 91 additions & 0 deletions src/main/java/org/opensearch/knn/index/KNNIndexShard.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.math3.stat.descriptive.AggregateSummaryStatistics;
import org.apache.commons.math3.stat.descriptive.StatisticalSummaryValues;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
Expand All @@ -28,6 +31,8 @@
import org.opensearch.knn.index.memory.NativeMemoryEntryContext;
import org.opensearch.knn.index.memory.NativeMemoryLoadStrategy;
import org.opensearch.knn.index.engine.KNNEngine;
import org.opensearch.knn.index.query.SegmentProfilerUtil;
import org.opensearch.knn.profiler.SegmentProfilerState;

import java.io.IOException;
import java.util.ArrayList;
Expand Down Expand Up @@ -82,6 +87,83 @@ public String getIndexName() {
return indexShard.shardId().getIndexName();
}

/**
* Profile the vector fields in this shard and return statistical information.
*
* @param fieldName The name of the vector field to profile
* @return List of statistical summaries for each dimension
*/
// TODO: Write unit tests to ensure that the segment statistic aggregation is correct.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should also write an integration test that covers multiple segments. Let's leave this towards the end.

public List<StatisticalSummaryValues> profile(String fieldName) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: final

List<StatisticalSummaryValues> shardVectorProfile = new ArrayList<>();

try (Engine.Searcher searcher = indexShard.acquireSearcher("knn-profile")) {
List<SegmentProfilerState> segmentLevelProfilerStates = new ArrayList<>();

log.info("[KNN] Beginning profiling for field: {} in shard: {}", fieldName, indexShard.shardId());

// For each leaf, collect the profile
searcher.getIndexReader().leaves().forEach(leaf -> {
try {
log.info("[KNN] Processing leaf reader for segment: {}", leaf.reader());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we avoid clutter in the logs by reducing these info lines? They don't add any value when debugging, as we already have exception handling.

segmentLevelProfilerStates.add(SegmentProfilerUtil.getSegmentProfileState(leaf.reader(), fieldName));
log.info("[KNN] Successfully obtained segment profile state");
} catch (Exception e) {
log.error("[KNN] Error profiling segment: {}", e.getMessage(), e);
}
});

if (segmentLevelProfilerStates.isEmpty()) {
log.info("[KNN] No segment profiles were collected for field: {} in shard: {}", fieldName, indexShard.shardId());
return shardVectorProfile; // Return empty list
}

log.info("[KNN] Collected {} segment profiles", segmentLevelProfilerStates.size());

// Get dimension
int dimension = segmentLevelProfilerStates.get(0).getDimension();
log.info("[KNN] Vector dimension: {}", dimension);

// Transpose our list to aggregate per dimension
for (int i = 0; i < dimension; i++) {
final int dimensionId = i;
List<SummaryStatistics> transposed = segmentLevelProfilerStates.stream()
.map(state -> state.getStatistics().get(dimensionId))
.collect(Collectors.toList());

shardVectorProfile.add(AggregateSummaryStatistics.aggregate(transposed));
}

// Log the results for each dimension
for (int i = 0; i < shardVectorProfile.size(); i++) {
StatisticalSummaryValues stats = shardVectorProfile.get(i);
log.info(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've mentioned this before but would it just be easier to use toString here given we're not really focused on a specific output?

"[KNN] Dimension {}: count={}, min={}, max={}, mean={}, sum={}, variance={}, std_deviation={}",
i,
stats.getN(),
stats.getMin(),
stats.getMax(),
stats.getMean(),
stats.getSum(),
stats.getVariance(),
Math.sqrt(stats.getVariance())
);
}

log.info("[KNN] Profiling completed for field: {} in shard: {}", fieldName, indexShard.shardId());
} catch (Exception e) {
log.error(
"[KNN] Critical error during profiling for field: {} in shard: {}: {}",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does Critical mean in this sense? Error messages should be as succinct and objective as possible.

fieldName,
indexShard.shardId(),
e.getMessage(),
e
);
}

return shardVectorProfile;
}

/**
* Load all of the k-NN segments for this shard into the cache.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: Does this comment need to be changed?

*
Expand Down Expand Up @@ -235,4 +317,13 @@ static class EngineFileContext {
private final VectorDataType vectorDataType;
private final SegmentInfo segmentInfo;
}

/**
 * Convenience overload of {@link #profile(String)} that profiles a fixed, hard-coded field name.
 *
 * NOTE(review): the field name "my_vector_field" looks like a placeholder rather than a real default;
 * confirm whether callers should always pass the field name explicitly.
 *
 * @return whatever {@link #profile(String)} returns for the hard-coded field name
 */
public List<StatisticalSummaryValues> profile() {
    final String defaultFieldName = "my_vector_field";
    return profile(defaultFieldName);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.opensearch.knn.common.KNNConstants;
import org.opensearch.knn.profiler.SegmentProfilerState;
import org.opensearch.knn.quantization.models.quantizationState.QuantizationState;

import java.io.IOException;
Expand Down Expand Up @@ -85,6 +86,20 @@ public void writeState(int fieldNumber, QuantizationState quantizationState) thr
fieldQuantizationStates.add(new FieldQuantizationState(fieldNumber, stateBytes, position));
}

/**
 * Serializes a segment profiler state and appends the bytes to the output.
 *
 * The bytes, the field number and the file offset at which the bytes start are also recorded in
 * {@code fieldQuantizationStates}, the same list the quantization-state overload appends to.
 * NOTE(review): presumably the footer written later indexes these entries; confirm that a field
 * carrying both a quantization state and a profiler state is handled correctly on read.
 *
 * @param fieldNumber field number the state belongs to
 * @param segmentProfilerState segment profiler state to serialize and write
 * @throws IOException could be thrown while writing
 */
public void writeState(int fieldNumber, SegmentProfilerState segmentProfilerState) throws IOException {
    final byte[] serializedState = segmentProfilerState.toByteArray();
    // Capture the offset before writing so the entry points at the first byte of this state.
    final long startOffset = output.getFilePointer();
    output.writeBytes(serializedState, serializedState.length);
    final FieldQuantizationState entry = new FieldQuantizationState(fieldNumber, serializedState, startOffset);
    fieldQuantizationStates.add(entry);
}

/**
* Writes index footer and other index information for parsing later
* @throws IOException could be thrown while writing
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@
import org.opensearch.knn.index.quantizationservice.QuantizationService;
import org.opensearch.knn.memoryoptsearch.VectorSearcher;
import org.opensearch.knn.memoryoptsearch.VectorSearcherFactory;
import org.opensearch.knn.profiler.KNN990ProfileStateReader;
import org.opensearch.knn.profiler.SegmentProfileKNNCollector;
import org.opensearch.knn.profiler.SegmentProfileStateReadConfig;
import org.opensearch.knn.profiler.SegmentProfilerState;
import org.opensearch.knn.quantization.models.quantizationState.QuantizationState;
import org.opensearch.knn.quantization.models.quantizationState.QuantizationStateCacheManager;
import org.opensearch.knn.quantization.models.quantizationState.QuantizationStateReadConfig;
Expand Down Expand Up @@ -163,6 +167,14 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits
return;
}

if (knnCollector instanceof SegmentProfileKNNCollector) {
SegmentProfilerState segmentProfileState = KNN990ProfileStateReader.read(
new SegmentProfileStateReadConfig(segmentReadState, field)
);
((SegmentProfileKNNCollector) knnCollector).setSegmentProfilerState(segmentProfileState);
return;
}

if (trySearchWithMemoryOptimizedSearch(field, target, knnCollector, acceptDocs, true)) {
return;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.opensearch.knn.index.quantizationservice.QuantizationService;
import org.opensearch.knn.index.vectorvalues.KNNVectorValues;
import org.opensearch.knn.plugin.stats.KNNGraphValue;
import org.opensearch.knn.profiler.SegmentProfilerState;
import org.opensearch.knn.quantization.models.quantizationParams.QuantizationParams;
import org.opensearch.knn.quantization.models.quantizationState.QuantizationState;

Expand Down Expand Up @@ -107,6 +108,7 @@ public void flush(int maxDoc, final Sorter.DocMap sortMap) throws IOException {
field.getVectors()
);
final QuantizationState quantizationState = train(field.getFieldInfo(), knnVectorValuesSupplier, totalLiveDocs);
profile(field.getFieldInfo(), knnVectorValuesSupplier, totalLiveDocs);
// should skip graph building only for non quantization use case and if threshold is met
if (quantizationState == null && shouldSkipBuildingVectorDataStructure(totalLiveDocs)) {
log.info(
Expand Down Expand Up @@ -150,6 +152,10 @@ public void mergeOneField(final FieldInfo fieldInfo, final MergeState mergeState
}

final QuantizationState quantizationState = train(fieldInfo, knnVectorValuesSupplier, totalLiveDocs);

// Write the segment profile state to the directory
profile(fieldInfo, knnVectorValuesSupplier, totalLiveDocs);

// should skip graph building only for non quantization use case and if threshold is met
if (quantizationState == null && shouldSkipBuildingVectorDataStructure(totalLiveDocs)) {
log.info(
Expand Down Expand Up @@ -188,6 +194,7 @@ public void finish() throws IOException {
if (quantizationStateWriter != null) {
quantizationStateWriter.writeFooter();
}

flatVectorsWriter.finish();
}

Expand Down Expand Up @@ -241,6 +248,23 @@ private QuantizationState train(
return quantizationState;
}

/**
 * Profiles the vectors of a field and writes the resulting segment profile state through the
 * quantization state writer.
 *
 * Nothing is computed or written when there are no live documents.
 *
 * @param fieldInfo field whose number is used to key the written state
 * @param knnVectorValuesSupplier supplier of the vector values to profile
 * @param totalLiveDocs number of live documents for the field
 * @return the profile state that was computed and written, or {@code null} when
 *         {@code totalLiveDocs} is not positive
 * @throws IOException could be thrown while profiling or writing the state
 */
private SegmentProfilerState profile(
    final FieldInfo fieldInfo,
    final Supplier<KNNVectorValues<?>> knnVectorValuesSupplier,
    final int totalLiveDocs
) throws IOException {
    if (totalLiveDocs <= 0) {
        return null;
    }

    // TODO:Refactor to another init
    initQuantizationStateWriterIfNecessary();
    final SegmentProfilerState segmentProfilerState = SegmentProfilerState.profileVectors(knnVectorValuesSupplier);
    quantizationStateWriter.writeState(fieldInfo.getFieldNumber(), segmentProfilerState);

    // Return the state that was actually written; previously an unassigned local meant this was always null.
    return segmentProfilerState;
}

/**
* The {@link KNNVectorValues} will be exhausted after this function run. So make sure that you are not sending the
* vectorsValues object which you plan to use later
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.knn.index.query;

import lombok.experimental.UtilityClass;
import org.apache.lucene.index.LeafReader;
import org.opensearch.knn.profiler.SegmentProfileKNNCollector;
import org.opensearch.knn.profiler.SegmentProfilerState;

import java.io.IOException;
import java.util.Locale;

/**
* Utility class to get segment profiler state for a given field
*/
@UtilityClass
public class SegmentProfilerUtil {

/**
* Gets the segment profile state for a given field
* @param leafReader The leaf reader to query
* @param fieldName The field name to profile
* @return The segment profiler state
* @throws IOException If there's an error reading the segment
*/
public static SegmentProfilerState getSegmentProfileState(final LeafReader leafReader, String fieldName) throws IOException {
final SegmentProfileKNNCollector tempCollector = new SegmentProfileKNNCollector();
leafReader.searchNearestVectors(fieldName, new float[0], tempCollector, null);
if (tempCollector.getSegmentProfilerState() == null) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to throw an exception if we're unable to get the SegmentProfilerState? What if the user has the feature disabled?

throw new IllegalStateException(String.format(Locale.ROOT, "No segment state found for field %s", fieldName));
}
return tempCollector.getSegmentProfilerState();
}
}
12 changes: 12 additions & 0 deletions src/main/java/org/opensearch/knn/plugin/KNNPlugin.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@
import org.opensearch.knn.plugin.script.KNNScoringScriptEngine;
import org.opensearch.knn.plugin.search.KNNConcurrentSearchRequestDecider;
import org.opensearch.knn.plugin.stats.KNNStats;
//import org.opensearch.knn.plugin.transport.*;
import org.opensearch.knn.plugin.transport.KNNProfileTransportAction;
import org.opensearch.knn.plugin.transport.KNNProfileAction;
import org.opensearch.knn.plugin.transport.ClearCacheAction;
import org.opensearch.knn.plugin.transport.ClearCacheTransportAction;
import org.opensearch.knn.plugin.transport.DeleteModelAction;
Expand All @@ -82,6 +85,7 @@
import org.opensearch.knn.plugin.transport.UpdateModelGraveyardTransportAction;
import org.opensearch.knn.plugin.transport.UpdateModelMetadataAction;
import org.opensearch.knn.plugin.transport.UpdateModelMetadataTransportAction;
import org.opensearch.knn.profiler.RestKNNProfileHandler;
import org.opensearch.knn.quantization.models.quantizationState.QuantizationStateCache;
import org.opensearch.knn.training.TrainingJobClusterStateListener;
import org.opensearch.knn.training.TrainingJobRunner;
Expand Down Expand Up @@ -249,6 +253,12 @@ public List<RestHandler> getRestHandlers(
clusterService,
indexNameExpressionResolver
);
RestKNNProfileHandler restKNNProfileHandler = new RestKNNProfileHandler(
settings,
restController,
clusterService,
indexNameExpressionResolver
);
RestGetModelHandler restGetModelHandler = new RestGetModelHandler();
RestDeleteModelHandler restDeleteModelHandler = new RestDeleteModelHandler();
RestTrainModelHandler restTrainModelHandler = new RestTrainModelHandler();
Expand All @@ -258,6 +268,7 @@ public List<RestHandler> getRestHandlers(
return ImmutableList.of(
restKNNStatsHandler,
restKNNWarmupHandler,
restKNNProfileHandler,
restGetModelHandler,
restDeleteModelHandler,
restTrainModelHandler,
Expand All @@ -274,6 +285,7 @@ public List<RestHandler> getRestHandlers(
return Arrays.asList(
new ActionHandler<>(KNNStatsAction.INSTANCE, KNNStatsTransportAction.class),
new ActionHandler<>(KNNWarmupAction.INSTANCE, KNNWarmupTransportAction.class),
new ActionHandler<>(KNNProfileAction.INSTANCE, KNNProfileTransportAction.class),
new ActionHandler<>(UpdateModelMetadataAction.INSTANCE, UpdateModelMetadataTransportAction.class),
new ActionHandler<>(TrainingJobRouteDecisionInfoAction.INSTANCE, TrainingJobRouteDecisionInfoTransportAction.class),
new ActionHandler<>(GetModelAction.INSTANCE, GetModelTransportAction.class),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.knn.plugin.transport;

import org.opensearch.action.ActionType;

/**
* Action for profiling KNN vectors in an index
*/
public class KNNProfileAction extends ActionType<KNNProfileResponse> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know we synced offline on this but is it possible to move the API implementation to another PR? I don't think there's

public static final String NAME = "indices:knn/vector/profile";
public static final KNNProfileAction INSTANCE = new KNNProfileAction();

private KNNProfileAction() {
super(NAME, KNNProfileResponse::new);
}
}
Loading
Loading