Skip to content

Commit 5fbd3ea

Browse files
committed
HSEARCH-5020 Increase the max allowed dimension for a Lucene backend
1 parent 686421e commit 5fbd3ea

File tree

4 files changed

+160
-6
lines changed

4 files changed

+160
-6
lines changed

backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/codec/impl/HibernateSearchKnnVectorsFormat.java

+4-2
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,10 @@
2020
import org.apache.lucene.index.SegmentWriteState;
2121

2222
public class HibernateSearchKnnVectorsFormat extends KnnVectorsFormat {
23-
public static final int DEFAULT_MAX_DIMENSIONS = KnnVectorsFormat.DEFAULT_MAX_DIMENSIONS;
23+
// OpenSearch has a limit of 16000
24+
// Elasticsearch has a limit of 4096
25+
// We'll keep it at 4096 for now as well:
26+
public static final int DEFAULT_MAX_DIMENSIONS = 4096;
2427
private static final KnnVectorsFormat DEFAULT_KNN_VECTORS_FORMAT = new HibernateSearchKnnVectorsFormat();
2528

2629
public static KnnVectorsFormat defaultFormat() {
@@ -59,7 +62,6 @@ public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException
5962

6063
@Override
6164
public int getMaxDimensions(String fieldName) {
62-
// TODO: HSEARCH-5020: we can make this configurable, apparently there are models that produce larger vectors than this default allows.
6365
return DEFAULT_MAX_DIMENSIONS;
6466
}
6567

documentation/src/main/asciidoc/public/reference/_mapping-directfieldmapping.adoc

+6-2
Original file line numberDiff line numberDiff line change
@@ -364,10 +364,14 @@ highlighter, if they already support the other two (`[PLAIN, UNIFIED]`).
364364
include::../components/_incubating-warning.adoc[]
365365
+
366366
The size of the stored vectors. This is a required field. This size should match the size of the vectors produced by
367-
the model used to convert the data into vector representation. It is expected to be a positive integer value in range `[1,1024]`.
367+
the model used to convert the data into vector representation.
368+
It is expected to be a positive integer value. The maximum accepted value is backend-specific.
369+
For the <<backend-lucene, Lucene backend>>, the dimension must be in the range `[1, 4096]`.
370+
For the <<backend-elasticsearch, Elasticsearch backend>>, the range depends on the distribution.
371+
See the link:{elasticsearchDocUrl}/dense-vector.html#dense-vector-params[Elasticsearch]/link:{openSearchDocUrl}/search-plugins/knn/approximate-knn/#get-started-with-approximate-k-nn[OpenSearch]
372+
specific documentation to learn about the vector types of these distributions.
368373
+
369374
Only available on `@VectorField`.
370-
// TODO: HSEARCH-5020: to update the section once we make this configurable
371375

372376
[[mapping-directfieldmapping-vectorSimilarity]] `vectorSimilarity`::
373377
+

integrationtest/backend/lucene/src/test/java/org/hibernate/search/integrationtest/backend/lucene/mapping/LuceneVectorFieldIT.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ class LuceneVectorFieldIT {
2222
public final SearchSetupHelper setupHelper = SearchSetupHelper.create();
2323

2424
@ParameterizedTest
25-
@ValueSource(ints = { -1, -1000, 1025, 10000, Integer.MAX_VALUE, Integer.MIN_VALUE })
25+
@ValueSource(ints = { -1, -1000, 4097, 10000, Integer.MAX_VALUE, Integer.MIN_VALUE })
2626
void assertDimension(int dimension) {
27-
test( dimension, 5, 10, "dimension", dimension, 1024 );
27+
test( dimension, 5, 10, "dimension", dimension, 4096 );
2828
}
2929

3030
@ParameterizedTest

integrationtest/mapper/pojo-standalone-realbackend/src/test/java/org/hibernate/search/integrationtest/mapper/pojo/standalone/realbackend/mapping/VectorFieldIT.java

+148
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,19 @@
66
*/
77
package org.hibernate.search.integrationtest.mapper.pojo.standalone.realbackend.mapping;
88

9+
import static org.assertj.core.api.Assertions.assertThat;
910
import static org.assertj.core.api.Assertions.assertThatThrownBy;
1011

1112
import java.lang.invoke.MethodHandles;
1213
import java.util.ArrayList;
14+
import java.util.Arrays;
1315
import java.util.Collection;
1416
import java.util.List;
1517
import java.util.Objects;
1618

19+
import org.hibernate.search.backend.elasticsearch.ElasticsearchDistributionName;
20+
import org.hibernate.search.engine.backend.types.VectorSimilarity;
21+
import org.hibernate.search.engine.search.projection.dsl.SearchProjectionFactory;
1722
import org.hibernate.search.integrationtest.mapper.pojo.standalone.realbackend.testsupport.BackendConfigurations;
1823
import org.hibernate.search.mapper.pojo.bridge.ValueBridge;
1924
import org.hibernate.search.mapper.pojo.bridge.binding.ValueBindingContext;
@@ -24,21 +29,148 @@
2429
import org.hibernate.search.mapper.pojo.mapping.definition.annotation.DocumentId;
2530
import org.hibernate.search.mapper.pojo.mapping.definition.annotation.Indexed;
2631
import org.hibernate.search.mapper.pojo.mapping.definition.annotation.VectorField;
32+
import org.hibernate.search.mapper.pojo.mapping.definition.programmatic.TypeMappingStep;
33+
import org.hibernate.search.mapper.pojo.standalone.cfg.StandalonePojoMapperSettings;
34+
import org.hibernate.search.mapper.pojo.standalone.mapping.SearchMapping;
35+
import org.hibernate.search.mapper.pojo.standalone.mapping.StandalonePojoMappingConfigurer;
36+
import org.hibernate.search.mapper.pojo.standalone.session.SearchSession;
37+
import org.hibernate.search.mapper.pojo.standalone.work.SearchIndexingPlan;
2738
import org.hibernate.search.util.common.SearchException;
39+
import org.hibernate.search.util.impl.integrationtest.backend.elasticsearch.dialect.ElasticsearchTestDialect;
40+
import org.hibernate.search.util.impl.integrationtest.common.extension.BackendConfiguration;
41+
import org.hibernate.search.util.impl.integrationtest.common.reporting.FailureReportChecker;
2842
import org.hibernate.search.util.impl.integrationtest.common.reporting.FailureReportUtils;
2943
import org.hibernate.search.util.impl.integrationtest.mapper.pojo.standalone.StandalonePojoMappingSetupHelper;
3044

3145
import org.junit.jupiter.api.Test;
3246
import org.junit.jupiter.api.extension.RegisterExtension;
47+
import org.junit.jupiter.params.ParameterizedTest;
48+
import org.junit.jupiter.params.provider.ValueSource;
3349

3450
class VectorFieldIT {
3551

3652
private static final String INDEX_NAME = "IndexName";
53+
private static final int BATCHES = 20;
54+
private static final int BATCH_SIZE = 1_000;
3755

3856
@RegisterExtension
3957
public StandalonePojoMappingSetupHelper setupHelper = StandalonePojoMappingSetupHelper.withSingleBackend(
4058
MethodHandles.lookup(), BackendConfigurations.simple() );
4159

60+
/*
61+
* While for the test of the max-allowed dimension it would be enough to index a single document and then search for it,
62+
* in this case we want to generate more than a few documents to see how the backends,
63+
* would handle the relatively high number of large vectors.
64+
* For Lucene-specific limit tests see LuceneVectorFieldIT.
65+
* As for the Elasticsearch/OpenSearch -- we only transmit the error-response from the backend to the user,
66+
* so there's no need to do that much of extensive testing for this backend and `vectorSizeLimits_more_than_max` covers the basics.
67+
*/
68+
@Test
69+
void vectorSizeLimits_max_allowed_dimension_with_lots_of_documents() {
70+
int maxDimension = maxDimension();
71+
@Indexed(index = INDEX_NAME)
72+
class IndexedEntity {
73+
@DocumentId
74+
Integer id;
75+
float[] floats;
76+
byte[] bytes;
77+
78+
public IndexedEntity(Integer id) {
79+
this.id = id;
80+
this.floats = new float[maxDimension];
81+
this.bytes = new byte[maxDimension];
82+
Arrays.fill( floats, id / (float) maxDimension );
83+
Arrays.fill( floats, id * maxDimension % Byte.MAX_VALUE );
84+
}
85+
}
86+
87+
SearchMapping setup = setupHelper.start().expectCustomBeans().withProperty(
88+
StandalonePojoMapperSettings.MAPPING_CONFIGURER,
89+
(StandalonePojoMappingConfigurer) context -> {
90+
TypeMappingStep book = context.programmaticMapping()
91+
.type( IndexedEntity.class );
92+
book.property( "floats" )
93+
.vectorField( maxDimension ).vectorSimilarity( VectorSimilarity.L2 );
94+
book.property( "bytes" )
95+
.vectorField( maxDimension ).vectorSimilarity( VectorSimilarity.L2 );
96+
}
97+
).setup( IndexedEntity.class );
98+
99+
for ( int j = 0; j < BATCHES; j++ ) {
100+
try ( SearchSession session = setup.createSession() ) {
101+
SearchIndexingPlan searchIndexingPlan = session.indexingPlan();
102+
for ( int i = 0; i < BATCH_SIZE; i++ ) {
103+
searchIndexingPlan.add( new IndexedEntity( i + j * BATCH_SIZE ) );
104+
}
105+
}
106+
}
107+
108+
try ( SearchSession session = setup.createSession() ) {
109+
List<Object> bytes = session.search( IndexedEntity.class ).select( SearchProjectionFactory::id )
110+
.where( f -> f.knn( BATCHES ).field( "bytes" ).matching( new byte[maxDimension] ) )
111+
.fetchAllHits();
112+
assertThat( bytes ).hasSizeGreaterThanOrEqualTo( BATCHES );
113+
114+
List<Object> floats = session.search( IndexedEntity.class ).select( SearchProjectionFactory::id )
115+
.where( f -> f.knn( BATCHES ).field( "floats" ).matching( new float[maxDimension] ) )
116+
.fetchAllHits();
117+
assertThat( floats ).hasSizeGreaterThanOrEqualTo( BATCHES );
118+
}
119+
}
120+
121+
@ParameterizedTest
122+
@ValueSource(ints = { 1, 2, 5, 10, 150, 500, 500000000 })
123+
void vectorSizeLimits_more_than_max(int increment) {
124+
int dimension = maxDimension() + increment;
125+
@Indexed(index = INDEX_NAME)
126+
class IndexedEntity {
127+
@DocumentId
128+
Integer id;
129+
float[] floats;
130+
byte[] bytes;
131+
132+
public IndexedEntity(Integer id) {
133+
this.id = id;
134+
this.floats = new float[dimension];
135+
this.bytes = new byte[dimension];
136+
Arrays.fill( floats, id / (float) dimension );
137+
Arrays.fill( floats, id * dimension % Byte.MAX_VALUE );
138+
}
139+
}
140+
141+
FailureReportChecker failure;
142+
if ( BackendConfiguration.isLucene() ) {
143+
String[] message = new String[] {
144+
"Vector 'dimension' cannot be equal to",
145+
Objects.toString( dimension ),
146+
"It must be a positive integer value lesser than or equal to"
147+
};
148+
failure = FailureReportUtils.hasFailureReport()
149+
.typeContext( IndexedEntity.class.getName() )
150+
.pathContext( ".floats" )
151+
.failure( message )
152+
.pathContext( ".bytes" )
153+
.failure( message );
154+
}
155+
else {
156+
failure = FailureReportUtils.hasFailureReport()
157+
.typeContext( IndexedEntity.class.getName() );
158+
}
159+
assertThatThrownBy( () -> setupHelper.start().expectCustomBeans().withProperty(
160+
StandalonePojoMapperSettings.MAPPING_CONFIGURER,
161+
(StandalonePojoMappingConfigurer) context -> {
162+
TypeMappingStep book = context.programmaticMapping()
163+
.type( IndexedEntity.class );
164+
book.property( "floats" )
165+
.vectorField( dimension ).vectorSimilarity( VectorSimilarity.L2 );
166+
book.property( "bytes" )
167+
.vectorField( dimension ).vectorSimilarity( VectorSimilarity.L2 );
168+
}
169+
).setup( IndexedEntity.class ) )
170+
.isInstanceOf( SearchException.class )
171+
.satisfies( failure );
172+
}
173+
42174
/*
43175
* This test relies on a backend implementation to make sure that the vector dimension was somehow set for the field.
44176
* hence it requires a real backend.
@@ -67,6 +199,22 @@ class IndexedEntity {
67199
) );
68200
}
69201

202+
private static int maxDimension() {
203+
if ( BackendConfiguration.isLucene() ) {
204+
return 4096;
205+
}
206+
else {
207+
ElasticsearchDistributionName distribution = ElasticsearchTestDialect.getActualVersion().distribution();
208+
if ( ElasticsearchDistributionName.ELASTIC.equals( distribution ) ) {
209+
return 4096;
210+
}
211+
else {
212+
// looks like there's a bug in OpenSearch? it won't accept 16000
213+
return 1024;
214+
}
215+
}
216+
}
217+
70218
@SuppressWarnings("rawtypes")
71219
public static class ValidImplicitTypeBridge implements ValueBridge<Collection, float[]> {
72220

0 commit comments

Comments
 (0)