Skip to content

Fix Bug - Handle unsigned long in sorting order assertion of LongHashSet #17207

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Introduce a setting to disable download of full cluster state from remote on term mismatch([#16798](https://github.com/opensearch-project/OpenSearch/pull/16798/))
- Added ability to retrieve value from DocValues in a flat_object filed([#16802](https://github.com/opensearch-project/OpenSearch/pull/16802))
- Improve performace of NumericTermAggregation by avoiding unnecessary sorting([#17252](https://github.com/opensearch-project/OpenSearch/pull/17252))
- Fix Bug - Handle unsigned long in sorting order assertion of LongHashSet ([#17207](https://github.com/opensearch-project/OpenSearch/pull/17207))
- Implemented computation of segment replication stats at shard level ([#17055](https://github.com/opensearch-project/OpenSearch/pull/17055))
- [Rule Based Auto-tagging] Add in-memory attribute value store ([#17342](https://github.com/opensearch-project/OpenSearch/pull/17342))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import org.apache.lucene.search.ScorerSupplier;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.opensearch.lucene.util.LongHashSet;
import org.opensearch.lucene.util.UnsignedLongHashSet;

import java.io.IOException;
import java.math.BigInteger;
Expand All @@ -40,12 +40,12 @@
public abstract class SortedUnsignedLongDocValuesSetQuery extends Query {

private final String field;
private final LongHashSet numbers;
private final UnsignedLongHashSet numbers;

SortedUnsignedLongDocValuesSetQuery(String field, BigInteger[] numbers) {
this.field = Objects.requireNonNull(field);
Arrays.sort(numbers);
this.numbers = new LongHashSet(Arrays.stream(numbers).mapToLong(n -> n.longValue()).toArray());
this.numbers = new UnsignedLongHashSet(Arrays.stream(numbers).mapToLong(n -> n.longValue()).toArray());
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.lucene.util;

import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts;
import org.opensearch.common.Numbers;

import java.util.Arrays;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.LongStream;

/** Set of unsigned-longs, optimized for docvalues usage */
public final class UnsignedLongHashSet implements Accountable {
private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(UnsignedLongHashSet.class);

private static final long MISSING = Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG;

final long[] table;
final int mask;
final boolean hasMissingValue;
final int size;
/** minimum value in the set, or Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG for an empty set */
public final long minValue;
/** maximum value in the set, or Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG for an empty set */
public final long maxValue;

/** Construct a set. Values must be in sorted order. */
public UnsignedLongHashSet(long[] values) {
int tableSize = Math.toIntExact(values.length * 3L / 2);
tableSize = 1 << PackedInts.bitsRequired(tableSize); // make it a power of 2
assert tableSize >= values.length * 3L / 2;
table = new long[tableSize];
Arrays.fill(table, MISSING);
mask = tableSize - 1;
boolean hasMissingValue = false;
int size = 0;
long previousValue = Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG; // for assert
for (long value : values) {
if (value == MISSING) {
size += hasMissingValue ? 0 : 1;
hasMissingValue = true;
} else if (add(value)) {
++size;
}
assert Long.compareUnsigned(value, previousValue) >= 0 : " values must be provided in sorted order";
previousValue = value;
}
this.hasMissingValue = hasMissingValue;
this.size = size;
this.minValue = values.length == 0 ? Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG : values[0];
this.maxValue = values.length == 0 ? Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG : values[values.length - 1];
}

private boolean add(long l) {
assert l != MISSING;
final int slot = Long.hashCode(l) & mask;
for (int i = slot;; i = (i + 1) & mask) {
if (table[i] == MISSING) {
table[i] = l;
return true;
} else if (table[i] == l) {
// already added
return false;
}
}
}

/**
* check for membership in the set.
*
* <p>You should use {@link #minValue} and {@link #maxValue} to guide/terminate iteration before
* calling this.
*/
public boolean contains(long l) {
if (l == MISSING) {
return hasMissingValue;
}
final int slot = Long.hashCode(l) & mask;
for (int i = slot;; i = (i + 1) & mask) {
if (table[i] == MISSING) {
return false;
} else if (table[i] == l) {
return true;
}
}
}

/** returns a stream of all values contained in this set */
public LongStream stream() {
LongStream stream = Arrays.stream(table).filter(v -> v != MISSING);
if (hasMissingValue) {
stream = LongStream.concat(LongStream.of(MISSING), stream);
}
return stream;
}

@Override
public int hashCode() {
return Objects.hash(size, minValue, maxValue, mask, hasMissingValue, Arrays.hashCode(table));

Check warning on line 108 in server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java#L108

Added line #L108 was not covered by tests
}

@Override
public boolean equals(Object obj) {
if (obj != null && obj instanceof UnsignedLongHashSet) {
UnsignedLongHashSet that = (UnsignedLongHashSet) obj;
return size == that.size
&& minValue == that.minValue
&& maxValue == that.maxValue
&& mask == that.mask
&& hasMissingValue == that.hasMissingValue
&& Arrays.equals(table, that.table);
}
return false;

Check warning on line 122 in server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java#L122

Added line #L122 was not covered by tests
}

@Override
public String toString() {
return stream().mapToObj(Long::toUnsignedString).collect(Collectors.joining(", ", "[", "]"));

Check warning on line 127 in server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java#L127

Added line #L127 was not covered by tests
}

/** number of elements in the set */
public int size() {
return size;
}

@Override
public long ramBytesUsed() {
return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(table);

Check warning on line 137 in server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java#L137

Added line #L137 was not covered by tests
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
package org.opensearch.lucene.util;

/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

import org.apache.lucene.tests.util.LuceneTestCase;
import org.opensearch.common.Numbers;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.LongStream;

public class DocValuesUnsignedLongHashSetTests extends LuceneTestCase {

private void assertEquals(Set<Long> set1, UnsignedLongHashSet unsignedLongHashSet) {
assertEquals(set1.size(), unsignedLongHashSet.size());

Set<Long> set2 = unsignedLongHashSet.stream().boxed().collect(Collectors.toSet());
LuceneTestCase.assertEquals(set1, set2);

if (set1.isEmpty() == false) {
Set<Long> set3 = new HashSet<>(set1);
long removed = set3.iterator().next();
while (true) {
long next = random().nextLong();
if (next != removed && set3.add(next)) {
assertFalse(unsignedLongHashSet.contains(next));
break;
}
}
assertNotEquals(set3, unsignedLongHashSet);
}

assertTrue(set1.stream().allMatch(unsignedLongHashSet::contains));
}

private void assertNotEquals(Set<Long> set1, UnsignedLongHashSet unsignedLongHashSet) {
Set<Long> set2 = unsignedLongHashSet.stream().boxed().collect(Collectors.toSet());

LuceneTestCase.assertNotEquals(set1, set2);

UnsignedLongHashSet set3 = new UnsignedLongHashSet(
set1.stream().sorted(Long::compareUnsigned).mapToLong(Long::longValue).toArray()
);

LuceneTestCase.assertNotEquals(set2, set3.stream().boxed().collect(Collectors.toSet()));

assertFalse(set1.stream().allMatch(unsignedLongHashSet::contains));
}

public void testEmpty() {
Set<Long> set1 = new HashSet<>();
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] {});
assertEquals(0, set2.size());
assertEquals(Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue);
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue);
assertEquals(set1, set2);
}

public void testOneValue() {
Set<Long> set1 = new HashSet<>(Arrays.asList(42L));
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L });
assertEquals(1, set2.size());
assertEquals(42L, set2.minValue);
assertEquals(42L, set2.maxValue);
assertEquals(set1, set2);

set1 = new HashSet<>(Arrays.asList(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG));
set2 = new UnsignedLongHashSet(new long[] { Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG });
assertEquals(1, set2.size());
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue);
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue);
assertEquals(set1, set2);
}

public void testTwoValues() {
Set<Long> set1 = new HashSet<>(Arrays.asList(42L, Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG));
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L, Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG });
assertEquals(2, set2.size());
assertEquals(42, set2.minValue);
assertEquals(Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue);
assertEquals(set1, set2);

set1 = new HashSet<>(Arrays.asList(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, 42L));
set2 = new UnsignedLongHashSet(new long[] { Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, 42L });
assertEquals(2, set2.size());
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue);
assertEquals(42, set2.maxValue);
assertEquals(set1, set2);
}

public void testSameValue() {
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L, 42L });
assertEquals(1, set2.size());
assertEquals(42L, set2.minValue);
assertEquals(42L, set2.maxValue);
}

public void testSameMissingPlaceholder() {
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { Long.MIN_VALUE, Long.MIN_VALUE });
assertEquals(1, set2.size());
assertEquals(Long.MIN_VALUE, set2.minValue);
assertEquals(Long.MIN_VALUE, set2.maxValue);
}

public void testRandom() {
final int iters = atLeast(10);
for (int iter = 0; iter < iters; ++iter) {
long[] values = new long[random().nextInt(1 << random().nextInt(16))];
for (int i = 0; i < values.length; ++i) {
if (i == 0 || random().nextInt(10) < 9) {
values[i] = random().nextLong();
} else {
values[i] = values[random().nextInt(i)];
}
}
if (values.length > 0 && random().nextBoolean()) {
values[values.length / 2] = Long.MIN_VALUE;
}
Set<Long> set1 = LongStream.of(values).boxed().collect(Collectors.toSet());
Long[] longObjects = Arrays.stream(values).boxed().toArray(Long[]::new);
// Sort using compareUnsigned
Arrays.sort(longObjects, Long::compareUnsigned);

long[] arr = new long[values.length];
// Convert back to long[]
for (int i = 0; i < arr.length; i++) {
arr[i] = longObjects[i];
}
UnsignedLongHashSet set2 = new UnsignedLongHashSet(arr);
assertEquals(set1, set2);
}
}
}
Loading