Update Lucene dependencies #1336

Merged (3 commits) on May 22, 2025.

CHANGELOG.md (1 change: 1 addition & 0 deletions):

```diff
@@ -26,5 +26,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 ### Documentation

 ### Maintenance
+- Update Lucene dependencies ([#1336](https://github.com/opensearch-project/neural-search/pull/1336))

 ### Refactoring
```

Upload-model request body used by the tests (first of two identical test resources):

```diff
@@ -2,6 +2,7 @@
   "name": "traced_small_model",
   "version": "1.0.0",
   "model_format": "TORCH_SCRIPT",
+  "function_name": "TEXT_EMBEDDING",
   "model_task_type": "text_embedding",
   "model_content_hash_value": "e13b74006290a9d0f58c1376f9629d4ebc05a0f9385f40db837452b167ae9021",
   "model_group_id": "%s",
```

The same addition in a second, identical test resource:

```diff
@@ -2,6 +2,7 @@
   "name": "traced_small_model",
   "version": "1.0.0",
   "model_format": "TORCH_SCRIPT",
+  "function_name": "TEXT_EMBEDDING",
   "model_task_type": "text_embedding",
   "model_content_hash_value": "e13b74006290a9d0f58c1376f9629d4ebc05a0f9385f40db837452b167ae9021",
   "model_group_id": "%s",
```
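
Both request bodies now declare `"function_name": "TEXT_EMBEDDING"` alongside the existing `model_task_type`, presumably because the model-registration flow in the updated dependency stack expects an explicit function name; the PR itself does not state the reason.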

HybridBulkScorer:

```diff
@@ -35,6 +35,7 @@ public class HybridBulkScorer extends BulkScorer {
     @Getter
     private final float[][] windowScores;
     private final HybridQueryDocIdStream hybridQueryDocIdStream;
+    @Getter
     private final int maxDoc;
     private int[] docIds;
```
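
The new `@Getter` on `maxDoc` generates the `getMaxDoc()` accessor that `HybridQueryDocIdStream` (next file) calls to clamp its `upTo` bound.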

HybridQueryDocIdStream:

```diff
@@ -5,7 +5,6 @@
 package org.opensearch.neuralsearch.query;

 import lombok.RequiredArgsConstructor;
-import lombok.Setter;
 import org.apache.lucene.search.CheckedIntConsumer;
 import org.apache.lucene.search.DocIdStream;
 import org.apache.lucene.util.FixedBitSet;
@@ -20,16 +19,17 @@
 public class HybridQueryDocIdStream extends DocIdStream {
     private static final int BLOCK_SHIFT = 6;
     private final HybridBulkScorer hybridBulkScorer;
-    @Setter
     private int base;
+    private int upTo;

     /**
      * Iterate over all doc ids and collect each doc id with leaf collector
      * @param consumer consumer that is called for each accepted doc id
      * @throws IOException in case of IO exception
      */
     @Override
-    public void forEach(CheckedIntConsumer<IOException> consumer) throws IOException {
+    public void forEach(int upTo, CheckedIntConsumer<IOException> consumer) throws IOException {
+        upTo = Math.min(upTo, hybridBulkScorer.getMaxDoc());
         // bitset that represents matching documents, bit is set (1) if doc id is a match
         FixedBitSet matchingBitSet = hybridBulkScorer.getMatching();
         long[] bitArray = matchingBitSet.getBits();
@@ -52,12 +52,33 @@ public void forEach(CheckedIntConsumer<IOException> consumer) throws IOException
                     hybridBulkScorer.getHybridSubQueryScorer().getSubQueryScores()[subQueryIndex] = scoreOfDocIdForSubQuery;
                 }
                 // process the document with its base offset
-                consumer.accept(base | docIndexInWindow);
+                int doc = base | docIndexInWindow;
+                if (doc < upTo + base) {
+                    consumer.accept(doc);
+                    this.upTo++;
+                }
                 // reset scores after processing of one doc, this is required because scorer object is re-used
                 hybridBulkScorer.getHybridSubQueryScorer().resetScores();
                 // reset bit for this doc id to indicate that it has been consumed
                 bits ^= 1L << numberOfTrailingZeros;
             }
         }
     }
+
+    @Override
+    public int count(int upTo) throws IOException {
+        int[] count = new int[1];
+        forEach(upTo, (doc -> count[0]++));
+        return count[0];
+    }
+
+    @Override
+    public boolean mayHaveRemaining() {
+        return this.upTo + 1 < hybridBulkScorer.getMaxDoc();
+    }
+
+    public void setBase(int base) {
+        this.base = base;
+        this.upTo = base;
+    }
 }
```
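
The signature change follows the Lucene 10 `DocIdStream` contract, where consumption is bounded: `forEach(int upTo, ...)` and `count(int upTo)` visit only doc IDs below `upTo`, and `mayHaveRemaining()` tells the caller whether another slice is left. Below is a minimal sketch of that contract over a plain sorted array; the class name is made up, and only the `DocIdStream` methods exercised by the diff above are assumed:

```java
import java.io.IOException;

import org.apache.lucene.search.CheckedIntConsumer;
import org.apache.lucene.search.DocIdStream;

// Minimal DocIdStream over a sorted doc-id array, illustrating the upTo-based
// contract: each call consumes only doc IDs strictly below upTo and remembers
// its position, so callers can drain the stream in slices.
final class ArrayDocIdStream extends DocIdStream {
    private final int[] docs; // matching doc IDs, ascending
    private int cursor;       // index of the next unconsumed doc

    ArrayDocIdStream(int[] sortedDocs) {
        this.docs = sortedDocs;
    }

    @Override
    public void forEach(int upTo, CheckedIntConsumer<IOException> consumer) throws IOException {
        while (cursor < docs.length && docs[cursor] < upTo) {
            consumer.accept(docs[cursor++]);
        }
    }

    @Override
    public int count(int upTo) throws IOException {
        // same counting trick as HybridQueryDocIdStream.count above
        int[] count = new int[1];
        forEach(upTo, doc -> count[0]++);
        return count[0];
    }

    @Override
    public boolean mayHaveRemaining() {
        return cursor < docs.length;
    }
}
```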

HybridQueryScorer:

```diff
@@ -51,10 +51,16 @@ public HybridQueryScorer(final List<Scorer> subScorers) throws IOException {
         super();
         this.subScorers = Collections.unmodifiableList(subScorers);
         this.numSubqueries = subScorers.size();
-        this.subScorersPQ = initializeSubScorersPQ();
+        List<HybridDisiWrapper> hybridDisiWrappers = initializeSubScorersList();
+        if (hybridDisiWrappers.isEmpty()) {
+            throw new IllegalArgumentException("There must be at least 1 subScorers");
+        }
+        this.subScorersPQ = DisiPriorityQueue.ofMaxSize(numSubqueries);
+        this.subScorersPQ.addAll(hybridDisiWrappers.toArray(new DisiWrapper[0]), 0, hybridDisiWrappers.size());
         boolean needsScores = scoreMode != ScoreMode.COMPLETE_NO_SCORES;

-        this.approximation = new HybridSubqueriesDISIApproximation(this.subScorersPQ);
+        this.approximation = new HybridSubqueriesDISIApproximation(hybridDisiWrappers, subScorersPQ);

         if (scoreMode == ScoreMode.TOP_SCORES) {
             this.disjunctionBlockPropagator = new HybridScoreBlockBoundaryPropagator(subScorers);
         } else {
@@ -181,44 +187,20 @@ public int docID() {
         return subScorersPQ.top().doc;
     }

-    /**
-     * Return array of scores per sub-query for doc id that is defined by current iterator position
-     * @return
-     * @throws IOException
-     */
-    public float[] hybridScores() throws IOException {
-        float[] scores = new float[numSubqueries];
-        // retrieves sub-matches using DisjunctionDisiScorer's two-phase iteration process.
-        // while the two-phase iterator can efficiently skip blocks of document IDs during matching,
-        // the DisiWrapper (obtained from subScorersPQ.topList()) ensures sequential document ID iteration.
-        // this is necessary for maintaining correct scoring order.
-        DisiWrapper topList = getSubMatches();
-
-        for (HybridDisiWrapper disiWrapper = (HybridDisiWrapper) topList; disiWrapper != null; disiWrapper =
-            (HybridDisiWrapper) disiWrapper.next) {
-            // check if this doc has match in the subQuery. If not, add score as 0.0 and continue
-            Scorer scorer = disiWrapper.scorer;
-            if (scorer.docID() == DocIdSetIterator.NO_MORE_DOCS) {
-                continue;
-            }
-            scores[disiWrapper.getSubQueryIndex()] = scorer.score();
-        }
-        return scores;
-    }
-
-    private DisiPriorityQueue initializeSubScorersPQ() {
+    private List<HybridDisiWrapper> initializeSubScorersList() {
         Objects.requireNonNull(subScorers, "should not be null");
         // we need to count this way in order to include all identical sub-queries
-        DisiPriorityQueue subScorersPQ = new DisiPriorityQueue(numSubqueries);
+        List<HybridDisiWrapper> hybridDisiWrappers = new ArrayList<>();
         for (int idx = 0; idx < numSubqueries; idx++) {
             Scorer scorer = subScorers.get(idx);
             if (scorer == null) {
                 continue;
             }
             final HybridDisiWrapper disiWrapper = new HybridDisiWrapper(scorer, idx);
-            subScorersPQ.add(disiWrapper);
+            hybridDisiWrappers.add(disiWrapper);

         }
-        return subScorersPQ;
+        return hybridDisiWrappers;
     }
@@ -324,9 +306,12 @@ static class HybridSubqueriesDISIApproximation extends DocIdSetIterator {
         final DocIdSetIterator docIdSetIterator;
         final DisiPriorityQueue subIterators;

-        public HybridSubqueriesDISIApproximation(final DisiPriorityQueue subIterators) {
-            docIdSetIterator = new DisjunctionDISIApproximation(subIterators);
-            this.subIterators = subIterators;
+        public HybridSubqueriesDISIApproximation(
+            final Collection<? extends DisiWrapper> subIterators,
+            final DisiPriorityQueue subIteratorsPQ
+        ) {
+            docIdSetIterator = new DisjunctionDISIApproximation(subIterators, 0);
+            this.subIterators = subIteratorsPQ;
         }

         @Override
```
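
Two Lucene API shifts drive the constructor rewrite: the priority queue is now obtained via the `DisiPriorityQueue.ofMaxSize` factory and bulk-loaded with `addAll`, and `DisjunctionDISIApproximation` is built from the wrapper collection plus a lead-cost argument (the PR passes 0). A hedged sketch of that wiring follows; the helper class and method names are made up, and only the Lucene calls appearing in the diff above are assumed:

```java
import java.util.Collection;

import org.apache.lucene.search.DisiPriorityQueue;
import org.apache.lucene.search.DisiWrapper;
import org.apache.lucene.search.DisjunctionDISIApproximation;
import org.apache.lucene.search.DocIdSetIterator;

// Hypothetical helper mirroring the updated HybridQueryScorer constructor.
final class DisjunctionWiring {
    static DocIdSetIterator approximationFor(final Collection<? extends DisiWrapper> wrappers) {
        // Lucene 10.x style: build the queue through the factory method and
        // bulk-load it, instead of `new DisiPriorityQueue(n)` plus per-wrapper add().
        DisiPriorityQueue pq = DisiPriorityQueue.ofMaxSize(wrappers.size());
        pq.addAll(wrappers.toArray(new DisiWrapper[0]), 0, wrappers.size());
        // The approximation is now constructed from the wrapper collection and a
        // lead cost (0 in the PR); the scorer keeps `pq` for docID() bookkeeping.
        return new DisjunctionDISIApproximation(wrappers, 0);
    }
}
```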

HybridQueryDocIdStreamTests:

```diff
@@ -124,6 +124,7 @@ public void testForEach_whenCrossing64BitBoundary_thenAllDocsProcessed() {
     private HybridBulkScorer createMockScorerWithDocs(FixedBitSet matchingDocs, int numDocs) {
         HybridBulkScorer mockScorer = mock(HybridBulkScorer.class);
         when(mockScorer.getMatching()).thenReturn(matchingDocs);
+        when(mockScorer.getMaxDoc()).thenReturn(200);

         // setup window scores with the specified number of docs
         float[][] windowScores = new float[2][numDocs];
@@ -144,6 +145,7 @@ private HybridBulkScorer createMockScorerWithDocs(FixedBitSet matchingDocs, int
     private HybridBulkScorer createMockScorerWithDocs(FixedBitSet matchingDocs) {
         HybridBulkScorer mockScorer = mock(HybridBulkScorer.class);
         when(mockScorer.getMatching()).thenReturn(matchingDocs);
+        when(mockScorer.getMaxDoc()).thenReturn(200);

         // setup window scores
         float[][] windowScores = new float[2][NUM_DOCS]; // 2 sub-queries
```
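
Both mock factories now stub `getMaxDoc()` (arbitrarily at 200) because the updated `forEach` clamps `upTo` against it; with Mockito's default of 0 the stream would clamp `upTo` to 0 and consume nothing.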

HybridQueryScorerTests:

```diff
@@ -7,8 +7,6 @@
 import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
 import static org.mockito.ArgumentMatchers.anyInt;
 import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.never;
-import static org.mockito.Mockito.times;
 import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;
 import static org.opensearch.neuralsearch.util.TestUtils.DELTA_FOR_SCORE_ASSERTION;
@@ -21,7 +19,6 @@
 import java.util.List;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicInteger;
-import java.util.stream.Collectors;

 import org.apache.commons.lang3.tuple.ImmutablePair;
 import org.apache.commons.lang3.tuple.Pair;
@@ -58,113 +55,6 @@ public void testWithRandomDocuments_whenOneSubScorer_thenReturnSuccessfully() {
         testWithQuery(docs, scores, hybridQueryScorer);
     }

-    @SneakyThrows
-    public void testWithRandomDocumentsAndHybridScores_whenMultipleScorers_thenReturnSuccessfully() {
-        int maxDocId1 = TestUtil.nextInt(random(), 10, 10_000);
-        Pair<int[], float[]> docsAndScores1 = generateDocuments(maxDocId1);
-        int[] docs1 = docsAndScores1.getLeft();
-        float[] scores1 = docsAndScores1.getRight();
-        int maxDocId2 = TestUtil.nextInt(random(), 10, 10_000);
-        Pair<int[], float[]> docsAndScores2 = generateDocuments(maxDocId2);
-        int[] docs2 = docsAndScores2.getLeft();
-        float[] scores2 = docsAndScores2.getRight();
-
-        Weight weight = mock(Weight.class);
-
-        HybridQueryScorer hybridQueryScorer = new HybridQueryScorer(
-            Arrays.asList(
-                scorer(docs1, scores1, fakeWeight(new MatchAllDocsQuery())),
-                scorer(docs2, scores2, fakeWeight(new MatchNoDocsQuery()))
-            )
-        );
-        int doc = -1;
-        int numOfActualDocs = 0;
-        Set<Integer> uniqueDocs1 = Arrays.stream(docs1).boxed().collect(Collectors.toSet());
-        Set<Integer> uniqueDocs2 = Arrays.stream(docs2).boxed().collect(Collectors.toSet());
-        while (doc != NO_MORE_DOCS) {
-            doc = hybridQueryScorer.iterator().nextDoc();
-            if (doc == DocIdSetIterator.NO_MORE_DOCS) {
-                continue;
-            }
-            float[] actualTotalScores = hybridQueryScorer.hybridScores();
-            float actualTotalScore = 0.0f;
-            for (float score : actualTotalScores) {
-                actualTotalScore += score;
-            }
-            float expectedScore = 0.0f;
-            if (uniqueDocs1.contains(doc)) {
-                int idx = Arrays.binarySearch(docs1, doc);
-                expectedScore += scores1[idx];
-            }
-            if (uniqueDocs2.contains(doc)) {
-                int idx = Arrays.binarySearch(docs2, doc);
-                expectedScore += scores2[idx];
-            }
-            assertEquals(expectedScore, actualTotalScore, DELTA_FOR_SCORE_ASSERTION);
-            numOfActualDocs++;
-        }
-
-        int totalUniqueCount = uniqueDocs1.size();
-        for (int n : uniqueDocs2) {
-            if (!uniqueDocs1.contains(n)) {
-                totalUniqueCount++;
-            }
-        }
-        assertEquals(totalUniqueCount, numOfActualDocs);
-    }
-
-    @SneakyThrows
-    public void testWithRandomDocumentsAndCombinedScore_whenMultipleScorers_thenReturnSuccessfully() {
-        int maxDocId1 = TestUtil.nextInt(random(), 10, 10_000);
-        Pair<int[], float[]> docsAndScores1 = generateDocuments(maxDocId1);
-        int[] docs1 = docsAndScores1.getLeft();
-        float[] scores1 = docsAndScores1.getRight();
-        int maxDocId2 = TestUtil.nextInt(random(), 10, 10_000);
-        Pair<int[], float[]> docsAndScores2 = generateDocuments(maxDocId2);
-        int[] docs2 = docsAndScores2.getLeft();
-        float[] scores2 = docsAndScores2.getRight();
-
-        HybridQueryScorer hybridQueryScorer = new HybridQueryScorer(
-            Arrays.asList(
-                scorer(docs1, scores1, fakeWeight(new MatchAllDocsQuery())),
-                scorer(docs2, scores2, fakeWeight(new MatchNoDocsQuery()))
-            )
-        );
-        int doc = -1;
-        int numOfActualDocs = 0;
-        Set<Integer> uniqueDocs1 = Arrays.stream(docs1).boxed().collect(Collectors.toSet());
-        Set<Integer> uniqueDocs2 = Arrays.stream(docs2).boxed().collect(Collectors.toSet());
-        while (doc != NO_MORE_DOCS) {
-            doc = hybridQueryScorer.iterator().nextDoc();
-            if (doc == DocIdSetIterator.NO_MORE_DOCS) {
-                continue;
-            }
-            float expectedScore = 0.0f;
-            if (uniqueDocs1.contains(doc)) {
-                int idx = Arrays.binarySearch(docs1, doc);
-                expectedScore += scores1[idx];
-            }
-            if (uniqueDocs2.contains(doc)) {
-                int idx = Arrays.binarySearch(docs2, doc);
-                expectedScore += scores2[idx];
-            }
-            float hybridScore = 0.0f;
-            for (float score : hybridQueryScorer.hybridScores()) {
-                hybridScore += score;
-            }
-            assertEquals(expectedScore, hybridScore, DELTA_FOR_SCORE_ASSERTION);
-            numOfActualDocs++;
-        }
-
-        int totalUniqueCount = uniqueDocs1.size();
-        for (int n : uniqueDocs2) {
-            if (!uniqueDocs1.contains(n)) {
-                totalUniqueCount++;
-            }
-        }
-        assertEquals(totalUniqueCount, numOfActualDocs);
-    }
-
     @SneakyThrows
     public void testWithRandomDocuments_whenMultipleScorersAndSomeScorersEmpty_thenReturnSuccessfully() {
         int maxDocId = TestUtil.nextInt(random(), 10, 10_000);
@@ -202,11 +92,6 @@ public void testMaxScore_whenMultipleScorers_thenSuccessful() {

         maxScore = hybridQueryScorerWithSomeNullSubScorers.getMaxScore(Integer.MAX_VALUE);
         assertTrue(maxScore > 0.0f);
-
-        HybridQueryScorer hybridQueryScorerWithAllNullSubScorers = new HybridQueryScorer(Arrays.asList(null, null));
-
-        maxScore = hybridQueryScorerWithAllNullSubScorers.getMaxScore(Integer.MAX_VALUE);
-        assertEquals(0.0f, maxScore, 0.0f);
     }

     @SneakyThrows
@@ -517,14 +402,6 @@ public void testScore_whenMultipleQueries_thenCombineScores() {
         assertEquals("Combined score should be sum of bool and neural scores", 1.6f, combinedScore, DELTA_FOR_SCORE_ASSERTION);
     }

-    @SneakyThrows
-    public void testScore_whenEmptySubScorers_thenReturnZero() {
-        HybridQueryScorer hybridScorer = new HybridQueryScorer(Collections.emptyList());
-        float score = hybridScorer.score(null);
-
-        assertEquals("Score should be 0.0 for null wrapper", 0.0f, score, DELTA_FOR_SCORE_ASSERTION);
-    }
-
     @SneakyThrows
     public void testInitialization_whenValidScorer_thenSuccessful() {
         // Create scorer with iterator
@@ -558,46 +435,6 @@ public void testInitialization_whenValidScorer_thenSuccessful() {
         assertEquals("Cost should be 1", 1L, wrapper.cost);
     }

-    @SneakyThrows
-    public void testHybridScores_withTwoPhaseIterator() throws IOException {
-        // Create weight and scorers
-        Scorer scorer1 = mock(Scorer.class);
-        TwoPhaseIterator twoPhaseIterator = mock(TwoPhaseIterator.class);
-        DocIdSetIterator approximation = mock(DocIdSetIterator.class);
-
-        // Setup two-phase behavior
-        when(scorer1.twoPhaseIterator()).thenReturn(twoPhaseIterator);
-        when(twoPhaseIterator.approximation()).thenReturn(approximation);
-        when(scorer1.iterator()).thenReturn(approximation);
-        when(approximation.cost()).thenReturn(1L);
-
-        // Setup DocIdSetIterator behavior - use different docIDs
-        when(approximation.docID()).thenReturn(5); // approximation at doc 5
-        when(scorer1.docID()).thenReturn(5); // scorer at same doc
-        when(scorer1.score()).thenReturn(2.0f);
-
-        // matches() always returns false - document should never match
-        when(twoPhaseIterator.matches()).thenReturn(false);
-
-        // Create HybridQueryScorer with two-phase iterator
-        List<Scorer> subScorers = Collections.singletonList(scorer1);
-        HybridQueryScorer hybridScorer = new HybridQueryScorer(subScorers);
-
-        // Call matches() first to establish non-matching state
-        TwoPhaseIterator hybridTwoPhase = hybridScorer.twoPhaseIterator();
-        assertNotNull("Should have two phase iterator", hybridTwoPhase);
-        assertFalse("Document should not match", hybridTwoPhase.matches());
-
-        // Get scores - should be zero since document doesn't match
-        float[] scores = hybridScorer.hybridScores();
-        assertEquals("Should have one score entry", 1, scores.length);
-        assertEquals("Score should be 0 for non-matching document", 0.0f, scores[0], DELTA_FOR_SCORE_ASSERTION);
-
-        // Verify score() was never called since document didn't match
-        verify(scorer1, never()).score();
-        verify(twoPhaseIterator, times(1)).matches();
-    }
-
     @SneakyThrows
     public void testTwoPhaseIterator_withNestedTwoPhaseQuery() {
         // Create a scorer that uses two-phase iteration
```
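
All of the deleted tests exercised behavior that no longer exists after the scorer changes above: `hybridScores()` was removed from `HybridQueryScorer`, and its constructor now throws `IllegalArgumentException` for empty or all-null sub-scorer lists instead of yielding a zero-scoring scorer.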