Skip to content

Commit 6d5dea2

Browse files
ryanbogan authored and YeonghyeonKO committed
Update Lucene dependencies (opensearch-project#1336)
* Update Lucene dependencies (Signed-off-by: Ryan Bogan <[email protected]>)
* Add changelog entry (Signed-off-by: Ryan Bogan <[email protected]>)
* Update model request body for bwc and integ tests (Signed-off-by: Ryan Bogan <[email protected]>)
---------
Signed-off-by: Ryan Bogan <[email protected]>
1 parent 4dd9371 commit 6d5dea2

File tree

10 files changed

+60
-205
lines changed

10 files changed

+60
-205
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,5 +18,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
1818
### Documentation
1919

2020
### Maintenance
21+
- Update Lucene dependencies ([#1336](https://github.com/opensearch-project/neural-search/pull/1336))
2122

2223
### Refactoring

qa/restart-upgrade/src/test/resources/processor/UploadModelRequestBody.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
"name": "traced_small_model",
33
"version": "1.0.0",
44
"model_format": "TORCH_SCRIPT",
5+
"function_name": "TEXT_EMBEDDING",
56
"model_task_type": "text_embedding",
67
"model_content_hash_value": "e13b74006290a9d0f58c1376f9629d4ebc05a0f9385f40db837452b167ae9021",
78
"model_group_id": "%s",

qa/rolling-upgrade/src/test/resources/processor/UploadModelRequestBody.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
"name": "traced_small_model",
33
"version": "1.0.0",
44
"model_format": "TORCH_SCRIPT",
5+
"function_name": "TEXT_EMBEDDING",
56
"model_task_type": "text_embedding",
67
"model_content_hash_value": "e13b74006290a9d0f58c1376f9629d4ebc05a0f9385f40db837452b167ae9021",
78
"model_group_id": "%s",

src/main/java/org/opensearch/neuralsearch/query/HybridBulkScorer.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ public class HybridBulkScorer extends BulkScorer {
3535
@Getter
3636
private final float[][] windowScores;
3737
private final HybridQueryDocIdStream hybridQueryDocIdStream;
38+
@Getter
3839
private final int maxDoc;
3940
private int[] docIds;
4041

src/main/java/org/opensearch/neuralsearch/query/HybridQueryDocIdStream.java

Lines changed: 25 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55
package org.opensearch.neuralsearch.query;
66

77
import lombok.RequiredArgsConstructor;
8-
import lombok.Setter;
98
import org.apache.lucene.search.CheckedIntConsumer;
109
import org.apache.lucene.search.DocIdStream;
1110
import org.apache.lucene.util.FixedBitSet;
@@ -20,16 +19,17 @@
2019
public class HybridQueryDocIdStream extends DocIdStream {
2120
private static final int BLOCK_SHIFT = 6;
2221
private final HybridBulkScorer hybridBulkScorer;
23-
@Setter
2422
private int base;
23+
private int upTo;
2524

2625
/**
2726
* Iterate over all doc ids and collect each doc id with leaf collector
2827
* @param consumer consumer that is called for each accepted doc id
2928
* @throws IOException in case of IO exception
3029
*/
3130
@Override
32-
public void forEach(CheckedIntConsumer<IOException> consumer) throws IOException {
31+
public void forEach(int upTo, CheckedIntConsumer<IOException> consumer) throws IOException {
32+
upTo = Math.min(upTo, hybridBulkScorer.getMaxDoc());
3333
// bitset that represents matching documents, bit is set (1) if doc id is a match
3434
FixedBitSet matchingBitSet = hybridBulkScorer.getMatching();
3535
long[] bitArray = matchingBitSet.getBits();
@@ -52,12 +52,33 @@ public void forEach(CheckedIntConsumer<IOException> consumer) throws IOException
5252
hybridBulkScorer.getHybridSubQueryScorer().getSubQueryScores()[subQueryIndex] = scoreOfDocIdForSubQuery;
5353
}
5454
// process the document with its base offset
55-
consumer.accept(base | docIndexInWindow);
55+
int doc = base | docIndexInWindow;
56+
if (doc < upTo + base) {
57+
consumer.accept(doc);
58+
this.upTo++;
59+
}
5660
// reset scores after processing of one doc, this is required because scorer object is re-used
5761
hybridBulkScorer.getHybridSubQueryScorer().resetScores();
5862
// reset bit for this doc id to indicate that it has been consumed
5963
bits ^= 1L << numberOfTrailingZeros;
6064
}
6165
}
6266
}
67+
68+
@Override
69+
public int count(int upTo) throws IOException {
70+
int[] count = new int[1];
71+
forEach(upTo, (doc -> count[0]++));
72+
return count[0];
73+
}
74+
75+
@Override
76+
public boolean mayHaveRemaining() {
77+
return this.upTo + 1 < hybridBulkScorer.getMaxDoc();
78+
}
79+
80+
public void setBase(int base) {
81+
this.base = base;
82+
this.upTo = base;
83+
}
6384
}

src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java

Lines changed: 19 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -51,10 +51,16 @@ public HybridQueryScorer(final List<Scorer> subScorers) throws IOException {
5151
super();
5252
this.subScorers = Collections.unmodifiableList(subScorers);
5353
this.numSubqueries = subScorers.size();
54-
this.subScorersPQ = initializeSubScorersPQ();
54+
List<HybridDisiWrapper> hybridDisiWrappers = initializeSubScorersList();
55+
if (hybridDisiWrappers.isEmpty()) {
56+
throw new IllegalArgumentException("There must be at least 1 subScorers");
57+
}
58+
this.subScorersPQ = DisiPriorityQueue.ofMaxSize(numSubqueries);
59+
this.subScorersPQ.addAll(hybridDisiWrappers.toArray(new DisiWrapper[0]), 0, hybridDisiWrappers.size());
5560
boolean needsScores = scoreMode != ScoreMode.COMPLETE_NO_SCORES;
5661

57-
this.approximation = new HybridSubqueriesDISIApproximation(this.subScorersPQ);
62+
this.approximation = new HybridSubqueriesDISIApproximation(hybridDisiWrappers, subScorersPQ);
63+
5864
if (scoreMode == ScoreMode.TOP_SCORES) {
5965
this.disjunctionBlockPropagator = new HybridScoreBlockBoundaryPropagator(subScorers);
6066
} else {
@@ -181,44 +187,20 @@ public int docID() {
181187
return subScorersPQ.top().doc;
182188
}
183189

184-
/**
185-
* Return array of scores per sub-query for doc id that is defined by current iterator position
186-
* @return
187-
* @throws IOException
188-
*/
189-
public float[] hybridScores() throws IOException {
190-
float[] scores = new float[numSubqueries];
191-
// retrieves sub-matches using DisjunctionDisiScorer's two-phase iteration process.
192-
// while the two-phase iterator can efficiently skip blocks of document IDs during matching,
193-
// the DisiWrapper (obtained from subScorersPQ.topList()) ensures sequential document ID iteration.
194-
// this is necessary for maintaining correct scoring order.
195-
DisiWrapper topList = getSubMatches();
196-
197-
for (HybridDisiWrapper disiWrapper = (HybridDisiWrapper) topList; disiWrapper != null; disiWrapper =
198-
(HybridDisiWrapper) disiWrapper.next) {
199-
// check if this doc has match in the subQuery. If not, add score as 0.0 and continue
200-
Scorer scorer = disiWrapper.scorer;
201-
if (scorer.docID() == DocIdSetIterator.NO_MORE_DOCS) {
202-
continue;
203-
}
204-
scores[disiWrapper.getSubQueryIndex()] = scorer.score();
205-
}
206-
return scores;
207-
}
208-
209-
private DisiPriorityQueue initializeSubScorersPQ() {
190+
private List<HybridDisiWrapper> initializeSubScorersList() {
210191
Objects.requireNonNull(subScorers, "should not be null");
211192
// we need to count this way in order to include all identical sub-queries
212-
DisiPriorityQueue subScorersPQ = new DisiPriorityQueue(numSubqueries);
193+
List<HybridDisiWrapper> hybridDisiWrappers = new ArrayList<>();
213194
for (int idx = 0; idx < numSubqueries; idx++) {
214195
Scorer scorer = subScorers.get(idx);
215196
if (scorer == null) {
216197
continue;
217198
}
218199
final HybridDisiWrapper disiWrapper = new HybridDisiWrapper(scorer, idx);
219-
subScorersPQ.add(disiWrapper);
200+
hybridDisiWrappers.add(disiWrapper);
201+
220202
}
221-
return subScorersPQ;
203+
return hybridDisiWrappers;
222204
}
223205

224206
@Override
@@ -324,9 +306,12 @@ static class HybridSubqueriesDISIApproximation extends DocIdSetIterator {
324306
final DocIdSetIterator docIdSetIterator;
325307
final DisiPriorityQueue subIterators;
326308

327-
public HybridSubqueriesDISIApproximation(final DisiPriorityQueue subIterators) {
328-
docIdSetIterator = new DisjunctionDISIApproximation(subIterators);
329-
this.subIterators = subIterators;
309+
public HybridSubqueriesDISIApproximation(
310+
final Collection<? extends DisiWrapper> subIterators,
311+
final DisiPriorityQueue subIteratorsPQ
312+
) {
313+
docIdSetIterator = new DisjunctionDISIApproximation(subIterators, 0);
314+
this.subIterators = subIteratorsPQ;
330315
}
331316

332317
@Override

src/test/java/org/opensearch/neuralsearch/query/HybridQueryDocIdStreamTests.java

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -124,6 +124,7 @@ public void testForEach_whenCrossing64BitBoundary_thenAllDocsProcessed() {
124124
private HybridBulkScorer createMockScorerWithDocs(FixedBitSet matchingDocs, int numDocs) {
125125
HybridBulkScorer mockScorer = mock(HybridBulkScorer.class);
126126
when(mockScorer.getMatching()).thenReturn(matchingDocs);
127+
when(mockScorer.getMaxDoc()).thenReturn(200);
127128

128129
// setup window scores with the specified number of docs
129130
float[][] windowScores = new float[2][numDocs];
@@ -144,6 +145,7 @@ private HybridBulkScorer createMockScorerWithDocs(FixedBitSet matchingDocs, int
144145
private HybridBulkScorer createMockScorerWithDocs(FixedBitSet matchingDocs) {
145146
HybridBulkScorer mockScorer = mock(HybridBulkScorer.class);
146147
when(mockScorer.getMatching()).thenReturn(matchingDocs);
148+
when(mockScorer.getMaxDoc()).thenReturn(200);
147149

148150
// setup window scores
149151
float[][] windowScores = new float[2][NUM_DOCS]; // 2 sub-queries

src/test/java/org/opensearch/neuralsearch/query/HybridQueryScorerTests.java

Lines changed: 0 additions & 163 deletions
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,6 @@
77
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
88
import static org.mockito.ArgumentMatchers.anyInt;
99
import static org.mockito.Mockito.mock;
10-
import static org.mockito.Mockito.never;
11-
import static org.mockito.Mockito.times;
1210
import static org.mockito.Mockito.verify;
1311
import static org.mockito.Mockito.when;
1412
import static org.opensearch.neuralsearch.util.TestUtils.DELTA_FOR_SCORE_ASSERTION;
@@ -21,7 +19,6 @@
2119
import java.util.List;
2220
import java.util.Set;
2321
import java.util.concurrent.atomic.AtomicInteger;
24-
import java.util.stream.Collectors;
2522

2623
import org.apache.commons.lang3.tuple.ImmutablePair;
2724
import org.apache.commons.lang3.tuple.Pair;
@@ -58,113 +55,6 @@ public void testWithRandomDocuments_whenOneSubScorer_thenReturnSuccessfully() {
5855
testWithQuery(docs, scores, hybridQueryScorer);
5956
}
6057

61-
@SneakyThrows
62-
public void testWithRandomDocumentsAndHybridScores_whenMultipleScorers_thenReturnSuccessfully() {
63-
int maxDocId1 = TestUtil.nextInt(random(), 10, 10_000);
64-
Pair<int[], float[]> docsAndScores1 = generateDocuments(maxDocId1);
65-
int[] docs1 = docsAndScores1.getLeft();
66-
float[] scores1 = docsAndScores1.getRight();
67-
int maxDocId2 = TestUtil.nextInt(random(), 10, 10_000);
68-
Pair<int[], float[]> docsAndScores2 = generateDocuments(maxDocId2);
69-
int[] docs2 = docsAndScores2.getLeft();
70-
float[] scores2 = docsAndScores2.getRight();
71-
72-
Weight weight = mock(Weight.class);
73-
74-
HybridQueryScorer hybridQueryScorer = new HybridQueryScorer(
75-
Arrays.asList(
76-
scorer(docs1, scores1, fakeWeight(new MatchAllDocsQuery())),
77-
scorer(docs2, scores2, fakeWeight(new MatchNoDocsQuery()))
78-
)
79-
);
80-
int doc = -1;
81-
int numOfActualDocs = 0;
82-
Set<Integer> uniqueDocs1 = Arrays.stream(docs1).boxed().collect(Collectors.toSet());
83-
Set<Integer> uniqueDocs2 = Arrays.stream(docs2).boxed().collect(Collectors.toSet());
84-
while (doc != NO_MORE_DOCS) {
85-
doc = hybridQueryScorer.iterator().nextDoc();
86-
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
87-
continue;
88-
}
89-
float[] actualTotalScores = hybridQueryScorer.hybridScores();
90-
float actualTotalScore = 0.0f;
91-
for (float score : actualTotalScores) {
92-
actualTotalScore += score;
93-
}
94-
float expectedScore = 0.0f;
95-
if (uniqueDocs1.contains(doc)) {
96-
int idx = Arrays.binarySearch(docs1, doc);
97-
expectedScore += scores1[idx];
98-
}
99-
if (uniqueDocs2.contains(doc)) {
100-
int idx = Arrays.binarySearch(docs2, doc);
101-
expectedScore += scores2[idx];
102-
}
103-
assertEquals(expectedScore, actualTotalScore, DELTA_FOR_SCORE_ASSERTION);
104-
numOfActualDocs++;
105-
}
106-
107-
int totalUniqueCount = uniqueDocs1.size();
108-
for (int n : uniqueDocs2) {
109-
if (!uniqueDocs1.contains(n)) {
110-
totalUniqueCount++;
111-
}
112-
}
113-
assertEquals(totalUniqueCount, numOfActualDocs);
114-
}
115-
116-
@SneakyThrows
117-
public void testWithRandomDocumentsAndCombinedScore_whenMultipleScorers_thenReturnSuccessfully() {
118-
int maxDocId1 = TestUtil.nextInt(random(), 10, 10_000);
119-
Pair<int[], float[]> docsAndScores1 = generateDocuments(maxDocId1);
120-
int[] docs1 = docsAndScores1.getLeft();
121-
float[] scores1 = docsAndScores1.getRight();
122-
int maxDocId2 = TestUtil.nextInt(random(), 10, 10_000);
123-
Pair<int[], float[]> docsAndScores2 = generateDocuments(maxDocId2);
124-
int[] docs2 = docsAndScores2.getLeft();
125-
float[] scores2 = docsAndScores2.getRight();
126-
127-
HybridQueryScorer hybridQueryScorer = new HybridQueryScorer(
128-
Arrays.asList(
129-
scorer(docs1, scores1, fakeWeight(new MatchAllDocsQuery())),
130-
scorer(docs2, scores2, fakeWeight(new MatchNoDocsQuery()))
131-
)
132-
);
133-
int doc = -1;
134-
int numOfActualDocs = 0;
135-
Set<Integer> uniqueDocs1 = Arrays.stream(docs1).boxed().collect(Collectors.toSet());
136-
Set<Integer> uniqueDocs2 = Arrays.stream(docs2).boxed().collect(Collectors.toSet());
137-
while (doc != NO_MORE_DOCS) {
138-
doc = hybridQueryScorer.iterator().nextDoc();
139-
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
140-
continue;
141-
}
142-
float expectedScore = 0.0f;
143-
if (uniqueDocs1.contains(doc)) {
144-
int idx = Arrays.binarySearch(docs1, doc);
145-
expectedScore += scores1[idx];
146-
}
147-
if (uniqueDocs2.contains(doc)) {
148-
int idx = Arrays.binarySearch(docs2, doc);
149-
expectedScore += scores2[idx];
150-
}
151-
float hybridScore = 0.0f;
152-
for (float score : hybridQueryScorer.hybridScores()) {
153-
hybridScore += score;
154-
}
155-
assertEquals(expectedScore, hybridScore, DELTA_FOR_SCORE_ASSERTION);
156-
numOfActualDocs++;
157-
}
158-
159-
int totalUniqueCount = uniqueDocs1.size();
160-
for (int n : uniqueDocs2) {
161-
if (!uniqueDocs1.contains(n)) {
162-
totalUniqueCount++;
163-
}
164-
}
165-
assertEquals(totalUniqueCount, numOfActualDocs);
166-
}
167-
16858
@SneakyThrows
16959
public void testWithRandomDocuments_whenMultipleScorersAndSomeScorersEmpty_thenReturnSuccessfully() {
17060
int maxDocId = TestUtil.nextInt(random(), 10, 10_000);
@@ -202,11 +92,6 @@ public void testMaxScore_whenMultipleScorers_thenSuccessful() {
20292

20393
maxScore = hybridQueryScorerWithSomeNullSubScorers.getMaxScore(Integer.MAX_VALUE);
20494
assertTrue(maxScore > 0.0f);
205-
206-
HybridQueryScorer hybridQueryScorerWithAllNullSubScorers = new HybridQueryScorer(Arrays.asList(null, null));
207-
208-
maxScore = hybridQueryScorerWithAllNullSubScorers.getMaxScore(Integer.MAX_VALUE);
209-
assertEquals(0.0f, maxScore, 0.0f);
21095
}
21196

21297
@SneakyThrows
@@ -517,14 +402,6 @@ public void testScore_whenMultipleQueries_thenCombineScores() {
517402
assertEquals("Combined score should be sum of bool and neural scores", 1.6f, combinedScore, DELTA_FOR_SCORE_ASSERTION);
518403
}
519404

520-
@SneakyThrows
521-
public void testScore_whenEmptySubScorers_thenReturnZero() {
522-
HybridQueryScorer hybridScorer = new HybridQueryScorer(Collections.emptyList());
523-
float score = hybridScorer.score(null);
524-
525-
assertEquals("Score should be 0.0 for null wrapper", 0.0f, score, DELTA_FOR_SCORE_ASSERTION);
526-
}
527-
528405
@SneakyThrows
529406
public void testInitialization_whenValidScorer_thenSuccessful() {
530407
// Create scorer with iterator
@@ -558,46 +435,6 @@ public void testInitialization_whenValidScorer_thenSuccessful() {
558435
assertEquals("Cost should be 1", 1L, wrapper.cost);
559436
}
560437

561-
@SneakyThrows
562-
public void testHybridScores_withTwoPhaseIterator() throws IOException {
563-
// Create weight and scorers
564-
Scorer scorer1 = mock(Scorer.class);
565-
TwoPhaseIterator twoPhaseIterator = mock(TwoPhaseIterator.class);
566-
DocIdSetIterator approximation = mock(DocIdSetIterator.class);
567-
568-
// Setup two-phase behavior
569-
when(scorer1.twoPhaseIterator()).thenReturn(twoPhaseIterator);
570-
when(twoPhaseIterator.approximation()).thenReturn(approximation);
571-
when(scorer1.iterator()).thenReturn(approximation);
572-
when(approximation.cost()).thenReturn(1L);
573-
574-
// Setup DocIdSetIterator behavior - use different docIDs
575-
when(approximation.docID()).thenReturn(5); // approximation at doc 5
576-
when(scorer1.docID()).thenReturn(5); // scorer at same doc
577-
when(scorer1.score()).thenReturn(2.0f);
578-
579-
// matches() always returns false - document should never match
580-
when(twoPhaseIterator.matches()).thenReturn(false);
581-
582-
// Create HybridQueryScorer with two-phase iterator
583-
List<Scorer> subScorers = Collections.singletonList(scorer1);
584-
HybridQueryScorer hybridScorer = new HybridQueryScorer(subScorers);
585-
586-
// Call matches() first to establish non-matching state
587-
TwoPhaseIterator hybridTwoPhase = hybridScorer.twoPhaseIterator();
588-
assertNotNull("Should have two phase iterator", hybridTwoPhase);
589-
assertFalse("Document should not match", hybridTwoPhase.matches());
590-
591-
// Get scores - should be zero since document doesn't match
592-
float[] scores = hybridScorer.hybridScores();
593-
assertEquals("Should have one score entry", 1, scores.length);
594-
assertEquals("Score should be 0 for non-matching document", 0.0f, scores[0], DELTA_FOR_SCORE_ASSERTION);
595-
596-
// Verify score() was never called since document didn't match
597-
verify(scorer1, never()).score();
598-
verify(twoPhaseIterator, times(1)).matches();
599-
}
600-
601438
@SneakyThrows
602439
public void testTwoPhaseIterator_withNestedTwoPhaseQuery() {
603440
// Create a scorer that uses two-phase iteration

0 commit comments

Comments (0)