From 301d85db5114ee3d70241717b59b854c1e1bcd3e Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Tue, 30 Sep 2025 16:24:47 +0200 Subject: [PATCH 1/4] Fix returned Impacts when frequencies are not indexed --- .../codecs/lucene103/Lucene103PostingsReader.java | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java index e2c7d023928f..3fc3dd113311 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java @@ -68,12 +68,9 @@ public final class Lucene103PostingsReader extends PostingsReaderBase { static final VectorizationProvider VECTORIZATION_PROVIDER = VectorizationProvider.getInstance(); - // Dummy impacts, composed of the maximum possible term frequency and the lowest possible - // (unsigned) norm value. This is typically used on tail blocks, which don't actually record - // impacts as the storage overhead would not be worth any query evaluation speedup, since there's - // less than 128 docs left to evaluate anyway. 
- private static final List DUMMY_IMPACTS = - Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); + + private static final List DUMMY_IMPACTS_NO_FREQ = + Collections.singletonList(new Impact(1, 1L)); private final IndexInput docIn; private final IndexInput posIn; @@ -1382,7 +1379,8 @@ public List getImpacts(int level) { return readImpacts(level1SerializedImpacts, level1Impacts); } } - return DUMMY_IMPACTS; + // Max freq is 1 since freqs are not indexed + return DUMMY_IMPACTS_NO_FREQ; } private List readImpacts(BytesRef serialized, MutableImpactList impactsList) { From 652d3ee62878fa619583a72671ea1fbc2677bd11 Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Tue, 30 Sep 2025 17:54:21 +0200 Subject: [PATCH 2/4] doh --- .../codecs/lucene103/Lucene103PostingsReader.java | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java index 3fc3dd113311..93e2253b6ac8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java @@ -68,9 +68,14 @@ public final class Lucene103PostingsReader extends PostingsReaderBase { static final VectorizationProvider VECTORIZATION_PROVIDER = VectorizationProvider.getInstance(); - - private static final List DUMMY_IMPACTS_NO_FREQ = - Collections.singletonList(new Impact(1, 1L)); + // Dummy impacts, composed of the maximum possible term frequency and the lowest possible + // (unsigned) norm value. This is typically used on tail blocks, which don't actually record + // impacts as the storage overhead would not be worth any query evaluation speedup, since there's + // less than 128 docs left to evaluate anyway. 
+ private static final List DUMMY_IMPACTS = + Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); + // impacts when there is no frequency, max frequency is 1. + private static final List IMPACTS_NO_FREQ = Collections.singletonList(new Impact(1, 1L)); private final IndexInput docIn; private final IndexInput posIn; @@ -1378,9 +1383,9 @@ public List getImpacts(int level) { if (level == 1) { return readImpacts(level1SerializedImpacts, level1Impacts); } + return DUMMY_IMPACTS; } - // Max freq is 1 since freqs are not indexed - return DUMMY_IMPACTS_NO_FREQ; + return IMPACTS_NO_FREQ; } private List readImpacts(BytesRef serialized, MutableImpactList impactsList) { From 1ba118b86356346d6d07fe71311033e51cbeac9b Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Tue, 30 Sep 2025 18:37:11 +0200 Subject: [PATCH 3/4] add test --- .../TestLucene103PostingsFormat.java | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestLucene103PostingsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestLucene103PostingsFormat.java index 210fde9fa23c..c6d115d97794 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestLucene103PostingsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestLucene103PostingsFormat.java @@ -29,8 +29,12 @@ import org.apache.lucene.document.Field; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.Impact; +import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.Directory; @@ -39,7 +43,9 @@ import org.apache.lucene.store.IndexOutput; import 
org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.index.BasePostingsFormatTestCase; +import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.BytesRef; public class TestLucene103PostingsFormat extends BasePostingsFormatTestCase { @@ -154,4 +160,26 @@ private void doTestImpactSerialization(List impacts) throws IOException } } } + + public void testImpactsNoFreqs() throws Exception { + try (Directory dir = newDirectory()) { + IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); + iwc.setCodec(getCodec()); + try (RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc)) { + Document doc = new Document(); + doc.add(newStringField("field", "value", Field.Store.NO)); + iw.addDocument(doc); + try (DirectoryReader ir = iw.getReader()) { + LeafReader ar = getOnlyLeafReader(ir); + TermsEnum termsEnum = ar.terms("field").iterator(); + termsEnum.seekExact(new BytesRef("value")); + ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS); + List impacts = impactsEnum.getImpacts().getImpacts(0); + assertEquals(1, impacts.size()); + assertEquals(1, impacts.get(0).freq); + assertEquals(1L, impacts.get(0).norm); + } + } + } + } } From d3f00a9558dd4de9bb57ffcfb81f90c1327ad853 Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Wed, 1 Oct 2025 06:59:06 +0200 Subject: [PATCH 4/4] Add entry in CHANGES.txt --- lucene/CHANGES.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index d9351003d1f6..af3f15b9138b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -7,7 +7,9 @@ http://s.apache.org/luceneversions Bug Fixes --------------------- -(No changes) +* GITHUB#15263: Fix the Impact returned from Lucene103PostingsReader when frequencies + are not indexed. It was returning a wrong frequency in that case, affecting scoring, which + might lead to performance issues. 
(Ignacio Vera) ======================= Lucene 10.3.0 =======================