Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions docs/changelog/138524.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
pr: 138524
summary: Remove feature flag to enable binary doc value compression
area: Mapping
type: feature
issues: []
highlight:
title: Remove feature flag to enable binary doc value compression
body: |-
Add compression for binary doc values using Zstd and blocks with a variable number of values.

Block-wise LZ4 compression was previously added to Lucene in LUCENE-9211 and removed in LUCENE-9378 due to query performance issues. That approach stored a constant number of values per block (specifically, 32 values). This made it easy to map a given value index (e.g., docId) to the block containing it by computing blockId = docId / 32.
Unfortunately, even when values were very large, each block still had to contain exactly 32 values, so (de)compressing a block could cause very high memory usage. As a result, the number of values per block had to be kept small, meaning that in the average case a block was much smaller than ideal.
To overcome the issues of blocks with a constant number of values, this PR adds block-wise compression with a variable number of values per block. Each block stores at least one document, and values stop being added once either the size of the block or the number of values in it exceeds a threshold.
Like the previous version, it stores an array of addresses for the start of each block. Additionally, it stores a parallel array with the first docId of each block. When looking up a given docId, if it is not in the current block, we binary search the array of first docIds to find the blockId containing the value. We then look up the address of the block. After this, decompression works very similarly to the code from LUCENE-9211; the main difference is that Zstd at compression level 1 is used instead of LZ4.

notable: true
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.elasticsearch.common.util.FeatureFlag;
import org.elasticsearch.core.SuppressForbidden;
import org.elasticsearch.index.codec.tsdb.BinaryDVCompressionMode;

Expand All @@ -37,8 +36,6 @@
*/
public class ES819TSDBDocValuesFormat extends org.apache.lucene.codecs.DocValuesFormat {

public static final boolean BINARY_DV_COMPRESSION_FEATURE_FLAG = new FeatureFlag("binary_dv_compression").isEnabled();

static final int NUMERIC_BLOCK_SHIFT = 7;
public static final int NUMERIC_BLOCK_SIZE = 1 << NUMERIC_BLOCK_SHIFT;
static final int NUMERIC_BLOCK_MASK = NUMERIC_BLOCK_SIZE - 1;
Expand Down Expand Up @@ -145,7 +142,7 @@ public ES819TSDBDocValuesFormat() {
DEFAULT_SKIP_INDEX_INTERVAL_SIZE,
ORDINAL_RANGE_ENCODING_MIN_DOC_PER_ORDINAL,
OPTIMIZED_MERGE_ENABLE_DEFAULT,
BINARY_DV_COMPRESSION_FEATURE_FLAG ? BinaryDVCompressionMode.COMPRESSED_ZSTD_LEVEL_1 : BinaryDVCompressionMode.NO_COMPRESS,
BinaryDVCompressionMode.COMPRESSED_ZSTD_LEVEL_1,
true
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,9 @@ protected Codec getCodec() {
return codec;
}

public void testBinaryCompressionFeatureFlag() {
public void testBinaryCompressionEnabled() {
ES819TSDBDocValuesFormat docValueFormat = new ES819TSDBDocValuesFormat();
if (ES819TSDBDocValuesFormat.BINARY_DV_COMPRESSION_FEATURE_FLAG) {
assertThat(docValueFormat.binaryDVCompressionMode, equalTo(BinaryDVCompressionMode.COMPRESSED_ZSTD_LEVEL_1));
} else {
assertThat(docValueFormat.binaryDVCompressionMode, equalTo(BinaryDVCompressionMode.NO_COMPRESS));
}
assertThat(docValueFormat.binaryDVCompressionMode, equalTo(BinaryDVCompressionMode.COMPRESSED_ZSTD_LEVEL_1));
}

public void testBlockWiseBinary() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@ public enum FeatureFlag {
),
RANDOM_SAMPLING("es.random_sampling_feature_flag_enabled=true", Version.fromString("9.2.0"), null),
INFERENCE_API_CCM("es.inference_api_ccm_feature_flag_enabled=true", Version.fromString("9.3.0"), null),
GENERIC_VECTOR_FORMAT("es.generic_vector_format_feature_flag_enabled=true", Version.fromString("9.3.0"), null),
BINARY_DOC_VALUE_COMPRESSION("es.binary_dv_compression_feature_flag_enabled=true", Version.fromString("9.3.0"), null);
GENERIC_VECTOR_FORMAT("es.generic_vector_format_feature_flag_enabled=true", Version.fromString("9.3.0"), null);

public final String systemProperty;
public final Version from;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ public class LogsdbTestSuiteIT extends ESClientYamlSuiteTestCase {
.setting("xpack.security.autoconfiguration.enabled", "false")
.setting("xpack.license.self_generated.type", "trial")
.feature(FeatureFlag.DOC_VALUES_SKIPPER)
.feature(FeatureFlag.BINARY_DOC_VALUE_COMPRESSION)
.build();

public LogsdbTestSuiteIT(@Name("yaml") ClientYamlTestCandidate testCandidate) {
Expand Down