Skip to content

Commit 6d5dea2

Browse files
ryanbogan authored and YeonghyeonKO committed
Update Lucene dependencies (opensearch-project#1336)
* Update Lucene dependencies (Signed-off-by: Ryan Bogan <[email protected]>)
* Add changelog entry (Signed-off-by: Ryan Bogan <[email protected]>)
* Update model request body for bwc and integ tests (Signed-off-by: Ryan Bogan <[email protected]>)
---------
Signed-off-by: Ryan Bogan <[email protected]>
1 parent 4dd9371 commit 6d5dea2

File tree

10 files changed

+60
-205
lines changed

10 files changed

+60
-205
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,5 +18,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
1818
### Documentation
1919

2020
### Maintenance
21+
- Update Lucene dependencies ([#1336](https://github.com/opensearch-project/neural-search/pull/1336))
2122

2223
### Refactoring

qa/restart-upgrade/src/test/resources/processor/UploadModelRequestBody.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
"name": "traced_small_model",
33
"version": "1.0.0",
44
"model_format": "TORCH_SCRIPT",
5+
"function_name": "TEXT_EMBEDDING",
56
"model_task_type": "text_embedding",
67
"model_content_hash_value": "e13b74006290a9d0f58c1376f9629d4ebc05a0f9385f40db837452b167ae9021",
78
"model_group_id": "%s",

qa/rolling-upgrade/src/test/resources/processor/UploadModelRequestBody.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
"name": "traced_small_model",
33
"version": "1.0.0",
44
"model_format": "TORCH_SCRIPT",
5+
"function_name": "TEXT_EMBEDDING",
56
"model_task_type": "text_embedding",
67
"model_content_hash_value": "e13b74006290a9d0f58c1376f9629d4ebc05a0f9385f40db837452b167ae9021",
78
"model_group_id": "%s",

src/main/java/org/opensearch/neuralsearch/query/HybridBulkScorer.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ public class HybridBulkScorer extends BulkScorer {
3535
@Getter
3636
private final float[][] windowScores;
3737
private final HybridQueryDocIdStream hybridQueryDocIdStream;
38+
@Getter
3839
private final int maxDoc;
3940
private int[] docIds;
4041

src/main/java/org/opensearch/neuralsearch/query/HybridQueryDocIdStream.java

Lines changed: 25 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55
package org.opensearch.neuralsearch.query;
66

77
import lombok.RequiredArgsConstructor;
8-
import lombok.Setter;
98
import org.apache.lucene.search.CheckedIntConsumer;
109
import org.apache.lucene.search.DocIdStream;
1110
import org.apache.lucene.util.FixedBitSet;
@@ -20,16 +19,17 @@
2019
public class HybridQueryDocIdStream extends DocIdStream {
2120
private static final int BLOCK_SHIFT = 6;
2221
private final HybridBulkScorer hybridBulkScorer;
23-
@Setter
2422
private int base;
23+
private int upTo;
2524

2625
/**
2726
* Iterate over all doc ids and collect each doc id with leaf collector
2827
* @param consumer consumer that is called for each accepted doc id
2928
* @throws IOException in case of IO exception
3029
*/
3130
@Override
32-
public void forEach(CheckedIntConsumer<IOException> consumer) throws IOException {
31+
public void forEach(int upTo, CheckedIntConsumer<IOException> consumer) throws IOException {
32+
upTo = Math.min(upTo, hybridBulkScorer.getMaxDoc());
3333
// bitset that represents matching documents, bit is set (1) if doc id is a match
3434
FixedBitSet matchingBitSet = hybridBulkScorer.getMatching();
3535
long[] bitArray = matchingBitSet.getBits();
@@ -52,12 +52,33 @@ public void forEach(CheckedIntConsumer<IOException> consumer) throws IOException
5252
hybridBulkScorer.getHybridSubQueryScorer().getSubQueryScores()[subQueryIndex] = scoreOfDocIdForSubQuery;
5353
}
5454
// process the document with its base offset
55-
consumer.accept(base | docIndexInWindow);
55+
int doc = base | docIndexInWindow;
56+
if (doc < upTo + base) {
57+
consumer.accept(doc);
58+
this.upTo++;
59+
}
5660
// reset scores after processing of one doc, this is required because scorer object is re-used
5761
hybridBulkScorer.getHybridSubQueryScorer().resetScores();
5862
// reset bit for this doc id to indicate that it has been consumed
5963
bits ^= 1L << numberOfTrailingZeros;
6064
}
6165
}
6266
}
67+
68+
@Override
69+
public int count(int upTo) throws IOException {
70+
int[] count = new int[1];
71+
forEach(upTo, (doc -> count[0]++));
72+
return count[0];
73+
}
74+
75+
@Override
76+
public boolean mayHaveRemaining() {
77+
return this.upTo + 1 < hybridBulkScorer.getMaxDoc();
78+
}
79+
80+
public void setBase(int base) {
81+
this.base = base;
82+
this.upTo = base;
83+
}
6384
}

src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java

Lines changed: 19 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -51,10 +51,16 @@ public HybridQueryScorer(final List<Scorer> subScorers) throws IOException {
5151
super();
5252
this.subScorers = Collections.unmodifiableList(subScorers);
5353
this.numSubqueries = subScorers.size();
54-
this.subScorersPQ = initializeSubScorersPQ();
54+
List<HybridDisiWrapper> hybridDisiWrappers = initializeSubScorersList();
55+
if (hybridDisiWrappers.isEmpty()) {
56+
throw new IllegalArgumentException("There must be at least 1 subScorers");
57+
}
58+
this.subScorersPQ = DisiPriorityQueue.ofMaxSize(numSubqueries);
59+
this.subScorersPQ.addAll(hybridDisiWrappers.toArray(new DisiWrapper[0]), 0, hybridDisiWrappers.size());
5560
boolean needsScores = scoreMode != ScoreMode.COMPLETE_NO_SCORES;
5661

57-
this.approximation = new HybridSubqueriesDISIApproximation(this.subScorersPQ);
62+
this.approximation = new HybridSubqueriesDISIApproximation(hybridDisiWrappers, subScorersPQ);
63+
5864
if (scoreMode == ScoreMode.TOP_SCORES) {
5965
this.disjunctionBlockPropagator = new HybridScoreBlockBoundaryPropagator(subScorers);
6066
} else {
@@ -181,44 +187,20 @@ public int docID() {
181187
return subScorersPQ.top().doc;
182188
}
183189

184-
/**
185-
* Return array of scores per sub-query for doc id that is defined by current iterator position
186-
* @return
187-
* @throws IOException
188-
*/
189-
public float[] hybridScores() throws IOException {
190-
float[] scores = new float[numSubqueries];
191-
// retrieves sub-matches using DisjunctionDisiScorer's two-phase iteration process.
192-
// while the two-phase iterator can efficiently skip blocks of document IDs during matching,
193-
// the DisiWrapper (obtained from subScorersPQ.topList()) ensures sequential document ID iteration.
194-
// this is necessary for maintaining correct scoring order.
195-
DisiWrapper topList = getSubMatches();
196-
197-
for (HybridDisiWrapper disiWrapper = (HybridDisiWrapper) topList; disiWrapper != null; disiWrapper =
198-
(HybridDisiWrapper) disiWrapper.next) {
199-
// check if this doc has match in the subQuery. If not, add score as 0.0 and continue
200-
Scorer scorer = disiWrapper.scorer;
201-
if (scorer.docID() == DocIdSetIterator.NO_MORE_DOCS) {
202-
continue;
203-
}
204-
scores[disiWrapper.getSubQueryIndex()] = scorer.score();
205-
}
206-
return scores;
207-
}
208-
209-
private DisiPriorityQueue initializeSubScorersPQ() {
190+
private List<HybridDisiWrapper> initializeSubScorersList() {
210191
Objects.requireNonNull(subScorers, "should not be null");
211192
// we need to count this way in order to include all identical sub-queries
212-
DisiPriorityQueue subScorersPQ = new DisiPriorityQueue(numSubqueries);
193+
List<HybridDisiWrapper> hybridDisiWrappers = new ArrayList<>();
213194
for (int idx = 0; idx < numSubqueries; idx++) {
214195
Scorer scorer = subScorers.get(idx);
215196
if (scorer == null) {
216197
continue;
217198
}
218199
final HybridDisiWrapper disiWrapper = new HybridDisiWrapper(scorer, idx);
219-
subScorersPQ.add(disiWrapper);
200+
hybridDisiWrappers.add(disiWrapper);
201+
220202
}
221-
return subScorersPQ;
203+
return hybridDisiWrappers;
222204
}
223205

224206
@Override
@@ -324,9 +306,12 @@ static class HybridSubqueriesDISIApproximation extends DocIdSetIterator {
324306
final DocIdSetIterator docIdSetIterator;
325307
final DisiPriorityQueue subIterators;
326308

327-
public HybridSubqueriesDISIApproximation(final DisiPriorityQueue subIterators) {
328-
docIdSetIterator = new DisjunctionDISIApproximation(subIterators);
329-
this.subIterators = subIterators;
309+
public HybridSubqueriesDISIApproximation(
310+
final Collection<? extends DisiWrapper> subIterators,
311+
final DisiPriorityQueue subIteratorsPQ
312+
) {
313+
docIdSetIterator = new DisjunctionDISIApproximation(subIterators, 0);
314+
this.subIterators = subIteratorsPQ;
330315
}
331316

332317
@Override

src/test/java/org/opensearch/neuralsearch/query/HybridQueryDocIdStreamTests.java

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -124,6 +124,7 @@ public void testForEach_whenCrossing64BitBoundary_thenAllDocsProcessed() {
124124
private HybridBulkScorer createMockScorerWithDocs(FixedBitSet matchingDocs, int numDocs) {
125125
HybridBulkScorer mockScorer = mock(HybridBulkScorer.class);
126126
when(mockScorer.getMatching()).thenReturn(matchingDocs);
127+
when(mockScorer.getMaxDoc()).thenReturn(200);
127128

128129
// setup window scores with the specified number of docs
129130
float[][] windowScores = new float[2][numDocs];
@@ -144,6 +145,7 @@ private HybridBulkScorer createMockScorerWithDocs(FixedBitSet matchingDocs, int
144145
private HybridBulkScorer createMockScorerWithDocs(FixedBitSet matchingDocs) {
145146
HybridBulkScorer mockScorer = mock(HybridBulkScorer.class);
146147
when(mockScorer.getMatching()).thenReturn(matchingDocs);
148+
when(mockScorer.getMaxDoc()).thenReturn(200);
147149

148150
// setup window scores
149151
float[][] windowScores = new float[2][NUM_DOCS]; // 2 sub-queries

src/test/java/org/opensearch/neuralsearch/query/HybridQueryScorerTests.java

Lines changed: 0 additions & 163 deletions
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,6 @@
77
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
88
import static org.mockito.ArgumentMatchers.anyInt;
99
import static org.mockito.Mockito.mock;
10-
import static org.mockito.Mockito.never;
11-
import static org.mockito.Mockito.times;
1210
import static org.mockito.Mockito.verify;
1311
import static org.mockito.Mockito.when;
1412
import static org.opensearch.neuralsearch.util.TestUtils.DELTA_FOR_SCORE_ASSERTION;
@@ -21,7 +19,6 @@
2119
import java.util.List;
2220
import java.util.Set;
2321
import java.util.concurrent.atomic.AtomicInteger;
24-
import java.util.stream.Collectors;
2522

2623
import org.apache.commons.lang3.tuple.ImmutablePair;
2724
import org.apache.commons.lang3.tuple.Pair;
@@ -58,113 +55,6 @@ public void testWithRandomDocuments_whenOneSubScorer_thenReturnSuccessfully() {
5855
testWithQuery(docs, scores, hybridQueryScorer);
5956
}
6057

61-
@SneakyThrows
62-
public void testWithRandomDocumentsAndHybridScores_whenMultipleScorers_thenReturnSuccessfully() {
63-
int maxDocId1 = TestUtil.nextInt(random(), 10, 10_000);
64-
Pair<int[], float[]> docsAndScores1 = generateDocuments(maxDocId1);
65-
int[] docs1 = docsAndScores1.getLeft();
66-
float[] scores1 = docsAndScores1.getRight();
67-
int maxDocId2 = TestUtil.nextInt(random(), 10, 10_000);
68-
Pair<int[], float[]> docsAndScores2 = generateDocuments(maxDocId2);
69-
int[] docs2 = docsAndScores2.getLeft();
70-
float[] scores2 = docsAndScores2.getRight();
71-
72-
Weight weight = mock(Weight.class);
73-
74-
HybridQueryScorer hybridQueryScorer = new HybridQueryScorer(
75-
Arrays.asList(
76-
scorer(docs1, scores1, fakeWeight(new MatchAllDocsQuery())),
77-
scorer(docs2, scores2, fakeWeight(new MatchNoDocsQuery()))
78-
)
79-
);
80-
int doc = -1;
81-
int numOfActualDocs = 0;
82-
Set<Integer> uniqueDocs1 = Arrays.stream(docs1).boxed().collect(Collectors.toSet());
83-
Set<Integer> uniqueDocs2 = Arrays.stream(docs2).boxed().collect(Collectors.toSet());
84-
while (doc != NO_MORE_DOCS) {
85-
doc = hybridQueryScorer.iterator().nextDoc();
86-
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
87-
continue;
88-
}
89-
float[] actualTotalScores = hybridQueryScorer.hybridScores();
90-
float actualTotalScore = 0.0f;
91-
for (float score : actualTotalScores) {
92-
actualTotalScore += score;
93-
}
94-
float expectedScore = 0.0f;
95-
if (uniqueDocs1.contains(doc)) {
96-
int idx = Arrays.binarySearch(docs1, doc);
97-
expectedScore += scores1[idx];
98-
}
99-
if (uniqueDocs2.contains(doc)) {
100-
int idx = Arrays.binarySearch(docs2, doc);
101-
expectedScore += scores2[idx];
102-
}
103-
assertEquals(expectedScore, actualTotalScore, DELTA_FOR_SCORE_ASSERTION);
104-
numOfActualDocs++;
105-
}
106-
107-
int totalUniqueCount = uniqueDocs1.size();
108-
for (int n : uniqueDocs2) {
109-
if (!uniqueDocs1.contains(n)) {
110-
totalUniqueCount++;
111-
}
112-
}
113-
assertEquals(totalUniqueCount, numOfActualDocs);
114-
}
115-
116-
@SneakyThrows
117-
public void testWithRandomDocumentsAndCombinedScore_whenMultipleScorers_thenReturnSuccessfully() {
118-
int maxDocId1 = TestUtil.nextInt(random(), 10, 10_000);
119-
Pair<int[], float[]> docsAndScores1 = generateDocuments(maxDocId1);
120-
int[] docs1 = docsAndScores1.getLeft();
121-
float[] scores1 = docsAndScores1.getRight();
122-
int maxDocId2 = TestUtil.nextInt(random(), 10, 10_000);
123-
Pair<int[], float[]> docsAndScores2 = generateDocuments(maxDocId2);
124-
int[] docs2 = docsAndScores2.getLeft();
125-
float[] scores2 = docsAndScores2.getRight();
126-
127-
HybridQueryScorer hybridQueryScorer = new HybridQueryScorer(
128-
Arrays.asList(
129-
scorer(docs1, scores1, fakeWeight(new MatchAllDocsQuery())),
130-
scorer(docs2, scores2, fakeWeight(new MatchNoDocsQuery()))
131-
)
132-
);
133-
int doc = -1;
134-
int numOfActualDocs = 0;
135-
Set<Integer> uniqueDocs1 = Arrays.stream(docs1).boxed().collect(Collectors.toSet());
136-
Set<Integer> uniqueDocs2 = Arrays.stream(docs2).boxed().collect(Collectors.toSet());
137-
while (doc != NO_MORE_DOCS) {
138-
doc = hybridQueryScorer.iterator().nextDoc();
139-
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
140-
continue;
141-
}
142-
float expectedScore = 0.0f;
143-
if (uniqueDocs1.contains(doc)) {
144-
int idx = Arrays.binarySearch(docs1, doc);
145-
expectedScore += scores1[idx];
146-
}
147-
if (uniqueDocs2.contains(doc)) {
148-
int idx = Arrays.binarySearch(docs2, doc);
149-
expectedScore += scores2[idx];
150-
}
151-
float hybridScore = 0.0f;
152-
for (float score : hybridQueryScorer.hybridScores()) {
153-
hybridScore += score;
154-
}
155-
assertEquals(expectedScore, hybridScore, DELTA_FOR_SCORE_ASSERTION);
156-
numOfActualDocs++;
157-
}
158-
159-
int totalUniqueCount = uniqueDocs1.size();
160-
for (int n : uniqueDocs2) {
161-
if (!uniqueDocs1.contains(n)) {
162-
totalUniqueCount++;
163-
}
164-
}
165-
assertEquals(totalUniqueCount, numOfActualDocs);
166-
}
167-
16858
@SneakyThrows
16959
public void testWithRandomDocuments_whenMultipleScorersAndSomeScorersEmpty_thenReturnSuccessfully() {
17060
int maxDocId = TestUtil.nextInt(random(), 10, 10_000);
@@ -202,11 +92,6 @@ public void testMaxScore_whenMultipleScorers_thenSuccessful() {
20292

20393
maxScore = hybridQueryScorerWithSomeNullSubScorers.getMaxScore(Integer.MAX_VALUE);
20494
assertTrue(maxScore > 0.0f);
205-
206-
HybridQueryScorer hybridQueryScorerWithAllNullSubScorers = new HybridQueryScorer(Arrays.asList(null, null));
207-
208-
maxScore = hybridQueryScorerWithAllNullSubScorers.getMaxScore(Integer.MAX_VALUE);
209-
assertEquals(0.0f, maxScore, 0.0f);
21095
}
21196

21297
@SneakyThrows
@@ -517,14 +402,6 @@ public void testScore_whenMultipleQueries_thenCombineScores() {
517402
assertEquals("Combined score should be sum of bool and neural scores", 1.6f, combinedScore, DELTA_FOR_SCORE_ASSERTION);
518403
}
519404

520-
@SneakyThrows
521-
public void testScore_whenEmptySubScorers_thenReturnZero() {
522-
HybridQueryScorer hybridScorer = new HybridQueryScorer(Collections.emptyList());
523-
float score = hybridScorer.score(null);
524-
525-
assertEquals("Score should be 0.0 for null wrapper", 0.0f, score, DELTA_FOR_SCORE_ASSERTION);
526-
}
527-
528405
@SneakyThrows
529406
public void testInitialization_whenValidScorer_thenSuccessful() {
530407
// Create scorer with iterator
@@ -558,46 +435,6 @@ public void testInitialization_whenValidScorer_thenSuccessful() {
558435
assertEquals("Cost should be 1", 1L, wrapper.cost);
559436
}
560437

561-
@SneakyThrows
562-
public void testHybridScores_withTwoPhaseIterator() throws IOException {
563-
// Create weight and scorers
564-
Scorer scorer1 = mock(Scorer.class);
565-
TwoPhaseIterator twoPhaseIterator = mock(TwoPhaseIterator.class);
566-
DocIdSetIterator approximation = mock(DocIdSetIterator.class);
567-
568-
// Setup two-phase behavior
569-
when(scorer1.twoPhaseIterator()).thenReturn(twoPhaseIterator);
570-
when(twoPhaseIterator.approximation()).thenReturn(approximation);
571-
when(scorer1.iterator()).thenReturn(approximation);
572-
when(approximation.cost()).thenReturn(1L);
573-
574-
// Setup DocIdSetIterator behavior - use different docIDs
575-
when(approximation.docID()).thenReturn(5); // approximation at doc 5
576-
when(scorer1.docID()).thenReturn(5); // scorer at same doc
577-
when(scorer1.score()).thenReturn(2.0f);
578-
579-
// matches() always returns false - document should never match
580-
when(twoPhaseIterator.matches()).thenReturn(false);
581-
582-
// Create HybridQueryScorer with two-phase iterator
583-
List<Scorer> subScorers = Collections.singletonList(scorer1);
584-
HybridQueryScorer hybridScorer = new HybridQueryScorer(subScorers);
585-
586-
// Call matches() first to establish non-matching state
587-
TwoPhaseIterator hybridTwoPhase = hybridScorer.twoPhaseIterator();
588-
assertNotNull("Should have two phase iterator", hybridTwoPhase);
589-
assertFalse("Document should not match", hybridTwoPhase.matches());
590-
591-
// Get scores - should be zero since document doesn't match
592-
float[] scores = hybridScorer.hybridScores();
593-
assertEquals("Should have one score entry", 1, scores.length);
594-
assertEquals("Score should be 0 for non-matching document", 0.0f, scores[0], DELTA_FOR_SCORE_ASSERTION);
595-
596-
// Verify score() was never called since document didn't match
597-
verify(scorer1, never()).score();
598-
verify(twoPhaseIterator, times(1)).matches();
599-
}
600-
601438
@SneakyThrows
602439
public void testTwoPhaseIterator_withNestedTwoPhaseQuery() {
603440
// Create a scorer that uses two-phase iteration

0 commit comments

Comments (0)