lucene/CHANGES.txt (2 additions, 0 deletions)
@@ -61,6 +61,8 @@ Optimizations

* GITHUB#14874: Improve off-heap KNN byte vector query performance in cases where indexing and search are performed by the same process. (Kaival Parikh)

* GITHUB#14932: Switched to GroupVarInt encoding for HNSW graph edges, with backwards compatibility for segments written with the older VarInt encoding. (Akira Lonske)

Bug Fixes
---------------------
* GITHUB#14049: Randomize KNN codec params in RandomCodec. Fixes scalar quantization div-by-zero
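For each graph node the neighbor ordinals are sorted and delta-encoded; this change writes the gaps with Lucene's group-varint scheme (four values share one tag byte recording each value's width) instead of one VInt per gap. Below is a minimal round-trip sketch, not part of the patch, mirroring the writeGroupVInts / GroupVIntUtil.readGroupVInts calls from the diff; the demo class name is illustrative, and the int[] overloads are assumed to be the ones the patch relies on.

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.util.GroupVIntUtil;

public class GroupVarIntNeighborsDemo {
  public static void main(String[] args) throws Exception {
    int[] neighbors = {3, 7, 8, 42, 100}; // sorted neighbor ords for one node
    int n = neighbors.length;

    // Encode: first ord as-is, then the gap to the previous ord.
    int[] scratch = new int[n];
    scratch[0] = neighbors[0];
    for (int i = 1; i < n; i++) {
      scratch[i] = neighbors[i] - neighbors[i - 1];
    }
    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    out.writeVInt(n); // neighbor count, as in the writer
    out.writeGroupVInts(scratch, n); // groups of four ints share a tag byte

    // Decode: bulk-read the gaps, then prefix-sum to recover the ords.
    ByteArrayDataInput in = new ByteArrayDataInput(out.toArrayCopy());
    int count = in.readVInt();
    int[] decoded = new int[count];
    GroupVIntUtil.readGroupVInts(in, decoded, count);
    for (int i = 1; i < count; i++) {
      decoded[i] += decoded[i - 1];
    }
    // decoded now equals neighbors
  }
}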
lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java
@@ -87,7 +87,8 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat {
static final String VECTOR_INDEX_EXTENSION = "vex";

public static final int VERSION_START = 0;
public static final int VERSION_CURRENT = VERSION_START;
public static final int VERSION_GROUPVARINT = 1;
public static final int VERSION_CURRENT = VERSION_GROUPVARINT;

/**
* A maximum configurable maximum max conn.
@@ -137,6 +138,8 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat {
private final int numMergeWorkers;
private final TaskExecutor mergeExec;

private final int writeVersion;

/** Constructs a format using default graph construction parameters */
public Lucene99HnswVectorsFormat() {
this(DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, DEFAULT_NUM_MERGE_WORKER, null);
@@ -153,7 +156,7 @@ public Lucene99HnswVectorsFormat(int maxConn, int beamWidth) {
}

/**
* Constructs a format using the given graph construction parameters and scalar quantization.
* Constructs a format using the given graph construction parameters.
*
* @param maxConn the maximum number of connections to a node in the HNSW graph
* @param beamWidth the size of the queue maintained during graph construction.
@@ -165,6 +168,28 @@ public Lucene99HnswVectorsFormat(int maxConn, int beamWidth) {
*/
public Lucene99HnswVectorsFormat(
int maxConn, int beamWidth, int numMergeWorkers, ExecutorService mergeExec) {
this(maxConn, beamWidth, numMergeWorkers, mergeExec, VERSION_CURRENT);
}

/**
* Constructs a format using the given graph construction parameters. (Test-only constructor.)
*
* @param maxConn the maximum number of connections to a node in the HNSW graph
* @param beamWidth the size of the queue maintained during graph construction.
* @param numMergeWorkers number of workers (threads) that will be used when doing merge. If
* larger than 1, a non-null {@link ExecutorService} must be passed as mergeExec
* @param mergeExec the {@link ExecutorService} that will be used by ALL vector writers that are
* generated by this format to do the merge. If null, the configured {@link
* MergeScheduler#getIntraMergeExecutor(MergePolicy.OneMerge)} is used.
* @param writeVersion the version used by the writer to encode docIDs (VarInt=0, GroupVarInt=1)
*/
Lucene99HnswVectorsFormat(
int maxConn,
int beamWidth,
int numMergeWorkers,
ExecutorService mergeExec,
int writeVersion) {
super("Lucene99HnswVectorsFormat");
if (maxConn <= 0 || maxConn > MAXIMUM_MAX_CONN) {
throw new IllegalArgumentException(
@@ -182,6 +207,7 @@ public Lucene99HnswVectorsFormat(
}
this.maxConn = maxConn;
this.beamWidth = beamWidth;
this.writeVersion = writeVersion;
if (numMergeWorkers == 1 && mergeExec != null) {
throw new IllegalArgumentException(
"No executor service is needed as we'll use single thread to merge");
@@ -202,7 +228,8 @@ public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException
beamWidth,
flatVectorsFormat.fieldsWriter(state),
numMergeWorkers,
mergeExec);
mergeExec,
writeVersion);
}

@Override
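The package-private five-argument constructor above is the hook for pinning the on-disk version. A minimal usage sketch, assuming the default parameter values from this class (maxConn=16, beamWidth=100, one merge worker); note it only compiles from code in the org.apache.lucene.codecs.lucene99 package:

// writeVersion 0 keeps the legacy one-VInt-per-gap edge encoding;
// writeVersion 1 (VERSION_GROUPVARINT) enables the group-varint layout.
KnnVectorsFormat legacyFormat = new Lucene99HnswVectorsFormat(16, 100, 1, null, 0);
KnnVectorsFormat currentFormat = new Lucene99HnswVectorsFormat(16, 100, 1, null, 1);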
lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java
@@ -17,6 +17,7 @@

package org.apache.lucene.codecs.lucene99;

import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.VERSION_GROUPVARINT;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

import java.io.IOException;
@@ -49,6 +50,7 @@
import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.GroupVIntUtil;
import org.apache.lucene.util.IOSupplier;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
@@ -78,6 +80,7 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
private final FieldInfos fieldInfos;
private final IntObjectHashMap<FieldEntry> fields;
private final IndexInput vectorIndex;
private final int version;

public Lucene99HnswVectorsReader(SegmentReadState state, FlatVectorsReader flatVectorsReader)
throws IOException {
@@ -105,6 +108,7 @@ public Lucene99HnswVectorsReader(SegmentReadState state, FlatVectorsReader flatV
} finally {
CodecUtil.checkFooter(meta, priorE);
}
this.version = versionMeta;
this.vectorIndex =
openDataInput(
state,
@@ -130,6 +134,7 @@ private Lucene99HnswVectorsReader(
this.fieldInfos = reader.fieldInfos;
this.fields = reader.fields;
this.vectorIndex = reader.vectorIndex;
this.version = reader.version;
}

@Override
@@ -501,7 +506,7 @@ static FieldEntry create(
}

/** Read the nearest-neighbors graph from the index input */
private static final class OffHeapHnswGraph extends HnswGraph {
private final class OffHeapHnswGraph extends HnswGraph {

final IndexInput dataIn;
final int[][] nodesByLevel;
@@ -552,9 +557,16 @@ public void seek(int level, int targetOrd) throws IOException {
arcCount = dataIn.readVInt();
assert arcCount <= currentNeighborsBuffer.length : "too many neighbors: " + arcCount;
if (arcCount > 0) {
currentNeighborsBuffer[0] = dataIn.readVInt();
for (int i = 1; i < arcCount; i++) {
currentNeighborsBuffer[i] = currentNeighborsBuffer[i - 1] + dataIn.readVInt();
if (version >= VERSION_GROUPVARINT) {
GroupVIntUtil.readGroupVInts(dataIn, currentNeighborsBuffer, arcCount);
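// gaps are decoded in bulk (four values per tag byte), so a second pass restores the ords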
for (int i = 1; i < arcCount; i++) {
currentNeighborsBuffer[i] = currentNeighborsBuffer[i - 1] + currentNeighborsBuffer[i];
}
} else {
currentNeighborsBuffer[0] = dataIn.readVInt();
for (int i = 1; i < arcCount; i++) {
currentNeighborsBuffer[i] = currentNeighborsBuffer[i - 1] + dataIn.readVInt();
}
}
}
arc = -1;
Expand Down
lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java
@@ -19,6 +19,8 @@

import static org.apache.lucene.codecs.KnnVectorsWriter.MergedVectorValues.hasVectorValues;
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.VERSION_CURRENT;
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.VERSION_GROUPVARINT;
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.SIMILARITY_FUNCTIONS;

import java.io.IOException;
@@ -76,6 +78,7 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter {
private final FlatVectorsWriter flatVectorWriter;
private final int numMergeWorkers;
private final TaskExecutor mergeExec;
private final int version;

private final List<FieldWriter<?>> fields = new ArrayList<>();
private boolean finished;
@@ -88,11 +91,25 @@ public Lucene99HnswVectorsWriter(
int numMergeWorkers,
TaskExecutor mergeExec)
throws IOException {
this(state, M, beamWidth, flatVectorWriter, numMergeWorkers, mergeExec, VERSION_CURRENT);
}

// Test-only constructor; should not be called directly outside of tests.
Lucene99HnswVectorsWriter(
Review comment (Contributor): Maybe add a "test-only" comment here too?

SegmentWriteState state,
int M,
int beamWidth,
FlatVectorsWriter flatVectorWriter,
int numMergeWorkers,
TaskExecutor mergeExec,
int version)
throws IOException {
this.M = M;
this.flatVectorWriter = flatVectorWriter;
this.beamWidth = beamWidth;
this.numMergeWorkers = numMergeWorkers;
this.mergeExec = mergeExec;
this.version = version;
segmentWriteState = state;
String metaFileName =
IndexFileNames.segmentFileName(
@@ -111,13 +128,13 @@ public Lucene99HnswVectorsWriter(
CodecUtil.writeIndexHeader(
meta,
Lucene99HnswVectorsFormat.META_CODEC_NAME,
Lucene99HnswVectorsFormat.VERSION_CURRENT,
version,
state.segmentInfo.getId(),
state.segmentSuffix);
CodecUtil.writeIndexHeader(
vectorIndex,
Lucene99HnswVectorsFormat.VECTOR_INDEX_CODEC_NAME,
Lucene99HnswVectorsFormat.VERSION_CURRENT,
version,
state.segmentInfo.getId(),
state.segmentSuffix);
} catch (Throwable t) {
@@ -342,8 +359,12 @@ private void reconstructAndWriteNeighbours(
}
// Write the size after duplicates are removed
vectorIndex.writeVInt(actualSize);
for (int i = 0; i < actualSize; i++) {
vectorIndex.writeVInt(scratch[i]);
if (version >= VERSION_GROUPVARINT) {
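// scratch already holds the delta-encoded ords; group-varint packs four per tag byte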
vectorIndex.writeGroupVInts(scratch, actualSize);
} else {
for (int i = 0; i < actualSize; i++) {
vectorIndex.writeVInt(scratch[i]);
}
}
}

@@ -444,9 +465,14 @@ private int[][] writeGraph(OnHeapHnswGraph graph) throws IOException {
}
// Write the size after duplicates are removed
vectorIndex.writeVInt(actualSize);
for (int i = 0; i < actualSize; i++) {
vectorIndex.writeVInt(scratch[i]);
if (version >= VERSION_GROUPVARINT) {
vectorIndex.writeGroupVInts(scratch, actualSize);
} else {
for (int i = 0; i < actualSize; i++) {
vectorIndex.writeVInt(scratch[i]);
}
}

offsets[level][nodeOffsetId++] =
Math.toIntExact(vectorIndex.getFilePointer() - offsetStart);
}
lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswVectorsFormatV0.java (new file)
@@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.codecs.lucene99;

import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN;
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_NUM_MERGE_WORKER;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase;
import org.apache.lucene.tests.util.TestUtil;

public class TestLucene99HnswVectorsFormatV0 extends BaseKnnVectorsFormatTestCase {

@Override
protected Codec getCodec() {
return TestUtil.alwaysKnnVectorsFormat(
new Lucene99HnswVectorsFormat(
DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, DEFAULT_NUM_MERGE_WORKER, null, 0));
}
}
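Complementing the format-level test above, here is a hedged sketch of an explicit round-trip check: a hypothetical test (class and field names are illustrative; it must live in the same package to reach the package-private constructor) that writes a segment with the legacy VarInt edge encoding and searches it through the current, GroupVarInt-aware read path.

package org.apache.lucene.codecs.lucene99;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.KnnFloatVectorQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;

public class TestLegacyGraphRoundTrip extends LuceneTestCase {

  public void testVarIntSegmentReadableByCurrentReader() throws Exception {
    // Pin the legacy VarInt edge encoding (writeVersion = 0).
    Codec codec =
        TestUtil.alwaysKnnVectorsFormat(new Lucene99HnswVectorsFormat(16, 100, 1, null, 0));
    try (Directory dir = newDirectory()) {
      try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setCodec(codec))) {
        Document doc = new Document();
        doc.add(new KnnFloatVectorField("v", new float[] {1f, 0f}));
        w.addDocument(doc);
      }
      // The reader picks its decode path from the version in the segment meta,
      // so the GroupVarInt-aware reader handles the old layout transparently.
      try (DirectoryReader r = DirectoryReader.open(dir)) {
        TopDocs td =
            new IndexSearcher(r).search(new KnnFloatVectorQuery("v", new float[] {1f, 0f}, 1), 1);
        assertEquals(1, td.scoreDocs.length);
      }
    }
  }
}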