Skip to content

Commit 82bbdfb

Browse files
Shailesh-Kumar-SinghShailesh Singhreta
authored
Fix Bug - Handle unsigned long in sorting order assertion of LongHashSet (#17207)
* Fix Bug - handle unsigned long in assertion of LongHashSet Signed-off-by: Shailesh Singh <[email protected]> * renamed TestDocValuesUnsignedLongHashSet.java to DocValuesUnsignedLongHashSetTests.java Signed-off-by: Shailesh Singh <[email protected]> * Update server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java Co-authored-by: Andriy Redko <[email protected]> Signed-off-by: Shailesh Singh <[email protected]> --------- Signed-off-by: Shailesh Singh <[email protected]> Signed-off-by: Shailesh Singh <[email protected]> Co-authored-by: Shailesh Singh <[email protected]> Co-authored-by: Andriy Redko <[email protected]>
1 parent 09af518 commit 82bbdfb

File tree

4 files changed

+284
-3
lines changed

4 files changed

+284
-3
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
1111
- Introduce a setting to disable download of full cluster state from remote on term mismatch([#16798](https://github.com/opensearch-project/OpenSearch/pull/16798/))
1212
- Added ability to retrieve value from DocValues in a flat_object filed([#16802](https://github.com/opensearch-project/OpenSearch/pull/16802))
1313
- Improve performace of NumericTermAggregation by avoiding unnecessary sorting([#17252](https://github.com/opensearch-project/OpenSearch/pull/17252))
14+
- Fix Bug - Handle unsigned long in sorting order assertion of LongHashSet ([#17207](https://github.com/opensearch-project/OpenSearch/pull/17207))
1415
- Implemented computation of segment replication stats at shard level ([#17055](https://github.com/opensearch-project/OpenSearch/pull/17055))
1516
- [Rule Based Auto-tagging] Add in-memory attribute value store ([#17342](https://github.com/opensearch-project/OpenSearch/pull/17342))
1617

server/src/main/java/org/opensearch/index/document/SortedUnsignedLongDocValuesSetQuery.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
import org.apache.lucene.search.ScorerSupplier;
2626
import org.apache.lucene.search.TwoPhaseIterator;
2727
import org.apache.lucene.search.Weight;
28-
import org.opensearch.lucene.util.LongHashSet;
28+
import org.opensearch.lucene.util.UnsignedLongHashSet;
2929

3030
import java.io.IOException;
3131
import java.math.BigInteger;
@@ -40,12 +40,12 @@
4040
public abstract class SortedUnsignedLongDocValuesSetQuery extends Query {
4141

4242
private final String field;
43-
private final LongHashSet numbers;
43+
private final UnsignedLongHashSet numbers;
4444

4545
SortedUnsignedLongDocValuesSetQuery(String field, BigInteger[] numbers) {
4646
this.field = Objects.requireNonNull(field);
4747
Arrays.sort(numbers);
48-
this.numbers = new LongHashSet(Arrays.stream(numbers).mapToLong(n -> n.longValue()).toArray());
48+
this.numbers = new UnsignedLongHashSet(Arrays.stream(numbers).mapToLong(n -> n.longValue()).toArray());
4949
}
5050

5151
@Override
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
*
4+
* The OpenSearch Contributors require contributions made to
5+
* this file be licensed under the Apache-2.0 license or a
6+
* compatible open source license.
7+
*/
8+
9+
package org.opensearch.lucene.util;
10+
11+
import org.apache.lucene.util.Accountable;
12+
import org.apache.lucene.util.RamUsageEstimator;
13+
import org.apache.lucene.util.packed.PackedInts;
14+
import org.opensearch.common.Numbers;
15+
16+
import java.util.Arrays;
17+
import java.util.Objects;
18+
import java.util.stream.Collectors;
19+
import java.util.stream.LongStream;
20+
21+
/** Set of unsigned-longs, optimized for docvalues usage */
22+
public final class UnsignedLongHashSet implements Accountable {
23+
private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(UnsignedLongHashSet.class);
24+
25+
private static final long MISSING = Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG;
26+
27+
final long[] table;
28+
final int mask;
29+
final boolean hasMissingValue;
30+
final int size;
31+
/** minimum value in the set, or Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG for an empty set */
32+
public final long minValue;
33+
/** maximum value in the set, or Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG for an empty set */
34+
public final long maxValue;
35+
36+
/** Construct a set. Values must be in sorted order. */
37+
public UnsignedLongHashSet(long[] values) {
38+
int tableSize = Math.toIntExact(values.length * 3L / 2);
39+
tableSize = 1 << PackedInts.bitsRequired(tableSize); // make it a power of 2
40+
assert tableSize >= values.length * 3L / 2;
41+
table = new long[tableSize];
42+
Arrays.fill(table, MISSING);
43+
mask = tableSize - 1;
44+
boolean hasMissingValue = false;
45+
int size = 0;
46+
long previousValue = Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG; // for assert
47+
for (long value : values) {
48+
if (value == MISSING) {
49+
size += hasMissingValue ? 0 : 1;
50+
hasMissingValue = true;
51+
} else if (add(value)) {
52+
++size;
53+
}
54+
assert Long.compareUnsigned(value, previousValue) >= 0 : " values must be provided in sorted order";
55+
previousValue = value;
56+
}
57+
this.hasMissingValue = hasMissingValue;
58+
this.size = size;
59+
this.minValue = values.length == 0 ? Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG : values[0];
60+
this.maxValue = values.length == 0 ? Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG : values[values.length - 1];
61+
}
62+
63+
private boolean add(long l) {
64+
assert l != MISSING;
65+
final int slot = Long.hashCode(l) & mask;
66+
for (int i = slot;; i = (i + 1) & mask) {
67+
if (table[i] == MISSING) {
68+
table[i] = l;
69+
return true;
70+
} else if (table[i] == l) {
71+
// already added
72+
return false;
73+
}
74+
}
75+
}
76+
77+
/**
78+
* check for membership in the set.
79+
*
80+
* <p>You should use {@link #minValue} and {@link #maxValue} to guide/terminate iteration before
81+
* calling this.
82+
*/
83+
public boolean contains(long l) {
84+
if (l == MISSING) {
85+
return hasMissingValue;
86+
}
87+
final int slot = Long.hashCode(l) & mask;
88+
for (int i = slot;; i = (i + 1) & mask) {
89+
if (table[i] == MISSING) {
90+
return false;
91+
} else if (table[i] == l) {
92+
return true;
93+
}
94+
}
95+
}
96+
97+
/** returns a stream of all values contained in this set */
98+
public LongStream stream() {
99+
LongStream stream = Arrays.stream(table).filter(v -> v != MISSING);
100+
if (hasMissingValue) {
101+
stream = LongStream.concat(LongStream.of(MISSING), stream);
102+
}
103+
return stream;
104+
}
105+
106+
@Override
107+
public int hashCode() {
108+
return Objects.hash(size, minValue, maxValue, mask, hasMissingValue, Arrays.hashCode(table));
109+
}
110+
111+
@Override
112+
public boolean equals(Object obj) {
113+
if (obj != null && obj instanceof UnsignedLongHashSet) {
114+
UnsignedLongHashSet that = (UnsignedLongHashSet) obj;
115+
return size == that.size
116+
&& minValue == that.minValue
117+
&& maxValue == that.maxValue
118+
&& mask == that.mask
119+
&& hasMissingValue == that.hasMissingValue
120+
&& Arrays.equals(table, that.table);
121+
}
122+
return false;
123+
}
124+
125+
@Override
126+
public String toString() {
127+
return stream().mapToObj(Long::toUnsignedString).collect(Collectors.joining(", ", "[", "]"));
128+
}
129+
130+
/** number of elements in the set */
131+
public int size() {
132+
return size;
133+
}
134+
135+
@Override
136+
public long ramBytesUsed() {
137+
return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(table);
138+
}
139+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
package org.opensearch.lucene.util;
2+
3+
/*
4+
* SPDX-License-Identifier: Apache-2.0
5+
*
6+
* The OpenSearch Contributors require contributions made to
7+
* this file be licensed under the Apache-2.0 license or a
8+
* compatible open source license.
9+
*/
10+
11+
import org.apache.lucene.tests.util.LuceneTestCase;
12+
import org.opensearch.common.Numbers;
13+
14+
import java.util.Arrays;
15+
import java.util.HashSet;
16+
import java.util.Set;
17+
import java.util.stream.Collectors;
18+
import java.util.stream.LongStream;
19+
20+
public class DocValuesUnsignedLongHashSetTests extends LuceneTestCase {
21+
22+
private void assertEquals(Set<Long> set1, UnsignedLongHashSet unsignedLongHashSet) {
23+
assertEquals(set1.size(), unsignedLongHashSet.size());
24+
25+
Set<Long> set2 = unsignedLongHashSet.stream().boxed().collect(Collectors.toSet());
26+
LuceneTestCase.assertEquals(set1, set2);
27+
28+
if (set1.isEmpty() == false) {
29+
Set<Long> set3 = new HashSet<>(set1);
30+
long removed = set3.iterator().next();
31+
while (true) {
32+
long next = random().nextLong();
33+
if (next != removed && set3.add(next)) {
34+
assertFalse(unsignedLongHashSet.contains(next));
35+
break;
36+
}
37+
}
38+
assertNotEquals(set3, unsignedLongHashSet);
39+
}
40+
41+
assertTrue(set1.stream().allMatch(unsignedLongHashSet::contains));
42+
}
43+
44+
private void assertNotEquals(Set<Long> set1, UnsignedLongHashSet unsignedLongHashSet) {
45+
Set<Long> set2 = unsignedLongHashSet.stream().boxed().collect(Collectors.toSet());
46+
47+
LuceneTestCase.assertNotEquals(set1, set2);
48+
49+
UnsignedLongHashSet set3 = new UnsignedLongHashSet(
50+
set1.stream().sorted(Long::compareUnsigned).mapToLong(Long::longValue).toArray()
51+
);
52+
53+
LuceneTestCase.assertNotEquals(set2, set3.stream().boxed().collect(Collectors.toSet()));
54+
55+
assertFalse(set1.stream().allMatch(unsignedLongHashSet::contains));
56+
}
57+
58+
public void testEmpty() {
59+
Set<Long> set1 = new HashSet<>();
60+
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] {});
61+
assertEquals(0, set2.size());
62+
assertEquals(Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue);
63+
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue);
64+
assertEquals(set1, set2);
65+
}
66+
67+
public void testOneValue() {
68+
Set<Long> set1 = new HashSet<>(Arrays.asList(42L));
69+
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L });
70+
assertEquals(1, set2.size());
71+
assertEquals(42L, set2.minValue);
72+
assertEquals(42L, set2.maxValue);
73+
assertEquals(set1, set2);
74+
75+
set1 = new HashSet<>(Arrays.asList(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG));
76+
set2 = new UnsignedLongHashSet(new long[] { Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG });
77+
assertEquals(1, set2.size());
78+
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue);
79+
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue);
80+
assertEquals(set1, set2);
81+
}
82+
83+
public void testTwoValues() {
84+
Set<Long> set1 = new HashSet<>(Arrays.asList(42L, Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG));
85+
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L, Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG });
86+
assertEquals(2, set2.size());
87+
assertEquals(42, set2.minValue);
88+
assertEquals(Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue);
89+
assertEquals(set1, set2);
90+
91+
set1 = new HashSet<>(Arrays.asList(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, 42L));
92+
set2 = new UnsignedLongHashSet(new long[] { Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, 42L });
93+
assertEquals(2, set2.size());
94+
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue);
95+
assertEquals(42, set2.maxValue);
96+
assertEquals(set1, set2);
97+
}
98+
99+
public void testSameValue() {
100+
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L, 42L });
101+
assertEquals(1, set2.size());
102+
assertEquals(42L, set2.minValue);
103+
assertEquals(42L, set2.maxValue);
104+
}
105+
106+
public void testSameMissingPlaceholder() {
107+
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { Long.MIN_VALUE, Long.MIN_VALUE });
108+
assertEquals(1, set2.size());
109+
assertEquals(Long.MIN_VALUE, set2.minValue);
110+
assertEquals(Long.MIN_VALUE, set2.maxValue);
111+
}
112+
113+
public void testRandom() {
114+
final int iters = atLeast(10);
115+
for (int iter = 0; iter < iters; ++iter) {
116+
long[] values = new long[random().nextInt(1 << random().nextInt(16))];
117+
for (int i = 0; i < values.length; ++i) {
118+
if (i == 0 || random().nextInt(10) < 9) {
119+
values[i] = random().nextLong();
120+
} else {
121+
values[i] = values[random().nextInt(i)];
122+
}
123+
}
124+
if (values.length > 0 && random().nextBoolean()) {
125+
values[values.length / 2] = Long.MIN_VALUE;
126+
}
127+
Set<Long> set1 = LongStream.of(values).boxed().collect(Collectors.toSet());
128+
Long[] longObjects = Arrays.stream(values).boxed().toArray(Long[]::new);
129+
// Sort using compareUnsigned
130+
Arrays.sort(longObjects, Long::compareUnsigned);
131+
132+
long[] arr = new long[values.length];
133+
// Convert back to long[]
134+
for (int i = 0; i < arr.length; i++) {
135+
arr[i] = longObjects[i];
136+
}
137+
UnsignedLongHashSet set2 = new UnsignedLongHashSet(arr);
138+
assertEquals(set1, set2);
139+
}
140+
}
141+
}

0 commit comments

Comments
 (0)