diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f0e58fb082ae..e0e46e5feac0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -61,6 +61,8 @@ Optimizations * GITHUB#14874: Improve off-heap KNN byte vector query performance in cases where indexing and search are performed by the same process. (Kaival Parikh) +* GITHUB#14932: Switch HNSW graph edge encoding to GroupVarInt, keeping read support for the previous VarInt format. (Akira Lonske) + Bug Fixes --------------------- * GITHUB#14049: Randomize KNN codec params in RandomCodec. Fixes scalar quantization div-by-zero diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java index 04ad32d8aa9c..baf15174704e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java @@ -87,7 +87,8 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat { static final String VECTOR_INDEX_EXTENSION = "vex"; public static final int VERSION_START = 0; - public static final int VERSION_CURRENT = VERSION_START; + public static final int VERSION_GROUPVARINT = 1; + public static final int VERSION_CURRENT = VERSION_GROUPVARINT; /** * A maximum configurable maximum max conn. @@ -137,6 +138,8 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat { private final int numMergeWorkers; private final TaskExecutor mergeExec; + private final int writeVersion; + /** Constructs a format using default graph construction parameters */ public Lucene99HnswVectorsFormat() { this(DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, DEFAULT_NUM_MERGE_WORKER, null); @@ -153,7 +156,7 @@ public Lucene99HnswVectorsFormat(int maxConn, int beamWidth) { } /** - * Constructs a format using the given graph construction parameters and scalar quantization. 
+ * Constructs a format using the given graph construction parameters. * * @param maxConn the maximum number of connections to a node in the HNSW graph * @param beamWidth the size of the queue maintained during graph construction. @@ -165,6 +168,28 @@ public Lucene99HnswVectorsFormat(int maxConn, int beamWidth) { */ public Lucene99HnswVectorsFormat( int maxConn, int beamWidth, int numMergeWorkers, ExecutorService mergeExec) { + this(maxConn, beamWidth, numMergeWorkers, mergeExec, VERSION_CURRENT); + } + + /** + * Constructs a format using the given graph construction parameters. (This is a Test-Only + * Constructor) + * + * @param maxConn the maximum number of connections to a node in the HNSW graph + * @param beamWidth the size of the queue maintained during graph construction. + * @param numMergeWorkers number of workers (threads) that will be used when doing merge. If + * larger than 1, a non-null {@link ExecutorService} must be passed as mergeExec + * @param mergeExec the {@link ExecutorService} that will be used by ALL vector writers that are + * generated by this format to do the merge. If null, the configured {@link + * MergeScheduler#getIntraMergeExecutor(MergePolicy.OneMerge)} is used. 
+ * @param writeVersion the version the writer uses to encode graph edges (VarInt=0, GroupVarInt=1) + */ + Lucene99HnswVectorsFormat( + int maxConn, + int beamWidth, + int numMergeWorkers, + ExecutorService mergeExec, + int writeVersion) { super("Lucene99HnswVectorsFormat"); if (maxConn <= 0 || maxConn > MAXIMUM_MAX_CONN) { throw new IllegalArgumentException( @@ -182,6 +207,7 @@ public Lucene99HnswVectorsFormat( } this.maxConn = maxConn; this.beamWidth = beamWidth; + this.writeVersion = writeVersion; if (numMergeWorkers == 1 && mergeExec != null) { throw new IllegalArgumentException( "No executor service is needed as we'll use single thread to merge"); @@ -202,7 +228,8 @@ public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException beamWidth, flatVectorsFormat.fieldsWriter(state), numMergeWorkers, - mergeExec); + mergeExec, + writeVersion); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java index faa885f7b2ea..9671073bb646 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java @@ -17,6 +17,7 @@ package org.apache.lucene.codecs.lucene99; +import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.VERSION_GROUPVARINT; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import java.io.IOException; @@ -49,6 +50,7 @@ import org.apache.lucene.store.RandomAccessInput; import org.apache.lucene.util.BitSet; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.GroupVIntUtil; import org.apache.lucene.util.IOSupplier; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.RamUsageEstimator; @@ -78,6 +80,7 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader private final FieldInfos fieldInfos; 
private final IntObjectHashMap fields; private final IndexInput vectorIndex; + private final int version; public Lucene99HnswVectorsReader(SegmentReadState state, FlatVectorsReader flatVectorsReader) throws IOException { @@ -105,6 +108,7 @@ public Lucene99HnswVectorsReader(SegmentReadState state, FlatVectorsReader flatV } finally { CodecUtil.checkFooter(meta, priorE); } + this.version = versionMeta; this.vectorIndex = openDataInput( state, @@ -130,6 +134,7 @@ private Lucene99HnswVectorsReader( this.fieldInfos = reader.fieldInfos; this.fields = reader.fields; this.vectorIndex = reader.vectorIndex; + this.version = reader.version; } @Override @@ -501,7 +506,7 @@ static FieldEntry create( } /** Read the nearest-neighbors graph from the index input */ - private static final class OffHeapHnswGraph extends HnswGraph { + private final class OffHeapHnswGraph extends HnswGraph { final IndexInput dataIn; final int[][] nodesByLevel; @@ -552,9 +557,16 @@ public void seek(int level, int targetOrd) throws IOException { arcCount = dataIn.readVInt(); assert arcCount <= currentNeighborsBuffer.length : "too many neighbors: " + arcCount; if (arcCount > 0) { - currentNeighborsBuffer[0] = dataIn.readVInt(); - for (int i = 1; i < arcCount; i++) { - currentNeighborsBuffer[i] = currentNeighborsBuffer[i - 1] + dataIn.readVInt(); + if (version >= VERSION_GROUPVARINT) { + GroupVIntUtil.readGroupVInts(dataIn, currentNeighborsBuffer, arcCount); + for (int i = 1; i < arcCount; i++) { + currentNeighborsBuffer[i] = currentNeighborsBuffer[i - 1] + currentNeighborsBuffer[i]; + } + } else { + currentNeighborsBuffer[0] = dataIn.readVInt(); + for (int i = 1; i < arcCount; i++) { + currentNeighborsBuffer[i] = currentNeighborsBuffer[i - 1] + dataIn.readVInt(); + } } } arc = -1; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java index 1334eab7dc8b..49973956192e 100644 
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java @@ -19,6 +19,8 @@ import static org.apache.lucene.codecs.KnnVectorsWriter.MergedVectorValues.hasVectorValues; import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; +import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.VERSION_CURRENT; +import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.VERSION_GROUPVARINT; import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.SIMILARITY_FUNCTIONS; import java.io.IOException; @@ -76,6 +78,7 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter { private final FlatVectorsWriter flatVectorWriter; private final int numMergeWorkers; private final TaskExecutor mergeExec; + private final int version; private final List> fields = new ArrayList<>(); private boolean finished; @@ -88,11 +91,25 @@ public Lucene99HnswVectorsWriter( int numMergeWorkers, TaskExecutor mergeExec) throws IOException { + this(state, M, beamWidth, flatVectorWriter, numMergeWorkers, mergeExec, VERSION_CURRENT); + } + + // Test-only constructor, should not be called directly outside of tests. 
+ Lucene99HnswVectorsWriter( + SegmentWriteState state, + int M, + int beamWidth, + FlatVectorsWriter flatVectorWriter, + int numMergeWorkers, + TaskExecutor mergeExec, + int version) + throws IOException { this.M = M; this.flatVectorWriter = flatVectorWriter; this.beamWidth = beamWidth; this.numMergeWorkers = numMergeWorkers; this.mergeExec = mergeExec; + this.version = version; segmentWriteState = state; String metaFileName = IndexFileNames.segmentFileName( @@ -111,13 +128,13 @@ public Lucene99HnswVectorsWriter( CodecUtil.writeIndexHeader( meta, Lucene99HnswVectorsFormat.META_CODEC_NAME, - Lucene99HnswVectorsFormat.VERSION_CURRENT, + version, state.segmentInfo.getId(), state.segmentSuffix); CodecUtil.writeIndexHeader( vectorIndex, Lucene99HnswVectorsFormat.VECTOR_INDEX_CODEC_NAME, - Lucene99HnswVectorsFormat.VERSION_CURRENT, + version, state.segmentInfo.getId(), state.segmentSuffix); } catch (Throwable t) { @@ -342,8 +359,12 @@ private void reconstructAndWriteNeighbours( } // Write the size after duplicates are removed vectorIndex.writeVInt(actualSize); - for (int i = 0; i < actualSize; i++) { - vectorIndex.writeVInt(scratch[i]); + if (version >= VERSION_GROUPVARINT) { + vectorIndex.writeGroupVInts(scratch, actualSize); + } else { + for (int i = 0; i < actualSize; i++) { + vectorIndex.writeVInt(scratch[i]); + } } } @@ -444,9 +465,14 @@ private int[][] writeGraph(OnHeapHnswGraph graph) throws IOException { } // Write the size after duplicates are removed vectorIndex.writeVInt(actualSize); - for (int i = 0; i < actualSize; i++) { - vectorIndex.writeVInt(scratch[i]); + if (version >= VERSION_GROUPVARINT) { + vectorIndex.writeGroupVInts(scratch, actualSize); + } else { + for (int i = 0; i < actualSize; i++) { + vectorIndex.writeVInt(scratch[i]); + } } + offsets[level][nodeOffsetId++] = Math.toIntExact(vectorIndex.getFilePointer() - offsetStart); } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswVectorsFormatV0.java 
b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswVectorsFormatV0.java new file mode 100644 index 000000000000..e30cafdaf680 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswVectorsFormatV0.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.codecs.lucene99; + +import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH; +import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN; +import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_NUM_MERGE_WORKER; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; + +/** Supplies the tests inherited from {@link BaseKnnVectorsFormatTestCase} with a codec whose {@link Lucene99HnswVectorsFormat} is built with write version 0 ({@code VERSION_START}, the VarInt neighbor encoding that precedes {@code VERSION_GROUPVARINT}) and default graph construction parameters. */ public class TestLucene99HnswVectorsFormatV0 extends BaseKnnVectorsFormatTestCase { + + @Override + protected Codec getCodec() { + return TestUtil.alwaysKnnVectorsFormat( + new Lucene99HnswVectorsFormat( + DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, DEFAULT_NUM_MERGE_WORKER, /* mergeExec= */ null, /* writeVersion= */ 0)); + } +}