Skip to content

Commit 7711e98

Browse files
committed
Update unit and integ tests
Signed-off-by: Andy Qin <[email protected]>
1 parent 2e55f3e commit 7711e98

File tree

4 files changed: +92 -6 lines changed

4 files changed: +92 -6 lines changed

src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -195,7 +195,6 @@ public IngestDocument execute(final IngestDocument ingestDocument) {
195195
runtimeParameters.put(MAX_CHUNK_LIMIT_FIELD, maxChunkLimit);
196196
runtimeParameters.put(CHUNK_STRING_COUNT_FIELD, chunkStringCount);
197197
chunkMapType(sourceAndMetadataMap, fieldMap, runtimeParameters);
198-
199198
recordChunkingExecutionStats(chunker.getAlgorithmName());
200199
return ingestDocument;
201200
}

src/main/java/org/opensearch/neuralsearch/stats/info/InfoStatsManager.java

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -143,8 +143,8 @@ private void addIngestProcessorStats(Map<InfoStatName, CountableInfoStatSnapshot
143143

144144
/**
145145
* Counts text chunking processor stats based on processor config
146-
* @param stats
147-
* @param processorConfig
146+
* @param stats map containing the stat to increment
147+
* @param processorConfig map of the processor config, parsed to add stats
148148
*/
149149
private void countTextChunkingProcessorStats(Map<InfoStatName, CountableInfoStatSnapshot> stats, Map<String, Object> processorConfig) {
150150
increment(stats, InfoStatName.TEXT_CHUNKING_PROCESSORS);
@@ -157,6 +157,8 @@ private void countTextChunkingProcessorStats(Map<InfoStatName, CountableInfoStat
157157
switch (algorithmKey) {
158158
case DelimiterChunker.ALGORITHM_NAME -> increment(stats, InfoStatName.TEXT_CHUNKING_DELIMITER_PROCESSORS);
159159
case FixedTokenLengthChunker.ALGORITHM_NAME -> increment(stats, InfoStatName.TEXT_CHUNKING_FIXED_LENGTH_PROCESSORS);
160+
// If no algorithm is specified, the default is fixed length
161+
default -> increment(stats, InfoStatName.TEXT_CHUNKING_FIXED_LENGTH_PROCESSORS);
160162
}
161163
}
162164

src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorIT.java

Lines changed: 18 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -186,11 +186,22 @@ public void testTextChunkingProcessor_processorStats() {
186186
ingestDocument(INDEX_NAME, document);
187187
ingestDocument(INDEX_NAME, document);
188188

189+
List<String> expectedPassages = new ArrayList<>();
190+
expectedPassages.add("This is an example document to be chunked. The document ");
191+
expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by ");
192+
expectedPassages.add("standard tokenizer in OpenSearch.");
193+
validateIndexIngestResultsWithMultipleDocs(INDEX_NAME, OUTPUT_FIELD, expectedPassages, 2);
194+
189195
ingestDocument(INDEX_NAME2, document);
190196
ingestDocument(INDEX_NAME2, document);
191197
ingestDocument(INDEX_NAME2, document);
192198

193-
// Get stats request
199+
expectedPassages = new ArrayList<>();
200+
expectedPassages.add("This is an example document to be chunked.");
201+
expectedPassages.add(" The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.");
202+
validateIndexIngestResultsWithMultipleDocs(INDEX_NAME2, OUTPUT_FIELD, expectedPassages, 3);
203+
204+
// Get stats
194205
String responseBody = executeNeuralStatRequest(new ArrayList<>(), new ArrayList<>());
195206
Map<String, Object> stats = parseInfoStatsResponse(responseBody);
196207
Map<String, Object> allNodesStats = parseAggregatedNodeStatsResponse(responseBody);
@@ -208,8 +219,8 @@ public void testTextChunkingProcessor_processorStats() {
208219
updateClusterSettings("plugins.neural_search.stats_enabled", false);
209220
}
210221

211-
private void validateIndexIngestResults(String indexName, String fieldName, Object expected) {
212-
assertEquals(1, getDocCount(indexName));
222+
private void validateIndexIngestResultsWithMultipleDocs(String indexName, String fieldName, Object expected, int docCount) {
223+
assertEquals(docCount, getDocCount(indexName));
213224
MatchAllQueryBuilder query = new MatchAllQueryBuilder();
214225
Map<String, Object> searchResults = search(indexName, query, 10);
215226
assertNotNull(searchResults);
@@ -224,6 +235,10 @@ private void validateIndexIngestResults(String indexName, String fieldName, Obje
224235
assertEquals(expected, ingestOutputs);
225236
}
226237

238+
private void validateIndexIngestResults(String indexName, String fieldName, Object expected) {
239+
validateIndexIngestResultsWithMultipleDocs(indexName, fieldName, expected, 1);
240+
}
241+
227242
private void createPipelineProcessor(String pipelineName) throws Exception {
228243
URL pipelineURLPath = classLoader.getResource(PIPELINE_CONFIGS_BY_NAME.get(pipelineName));
229244
Objects.requireNonNull(pipelineURLPath);

src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java

Lines changed: 70 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
import org.apache.lucene.tests.analysis.MockTokenizer;
1010
import org.junit.Before;
1111
import java.util.ArrayList;
12+
import java.util.EnumSet;
1213
import java.util.HashMap;
1314
import java.util.List;
1415
import java.util.Locale;
@@ -38,7 +39,9 @@
3839
import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;
3940
import org.opensearch.neuralsearch.processor.factory.TextChunkingProcessorFactory;
4041
import org.opensearch.neuralsearch.settings.NeuralSearchSettingsAccessor;
42+
import org.opensearch.neuralsearch.stats.events.EventStatName;
4143
import org.opensearch.neuralsearch.stats.events.EventStatsManager;
44+
import org.opensearch.neuralsearch.stats.events.TimestampedEventStatSnapshot;
4245
import org.opensearch.plugins.AnalysisPlugin;
4346
import org.opensearch.test.OpenSearchTestCase;
4447
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
@@ -95,6 +98,7 @@ public void setup() {
9598
when(clusterService.state()).thenReturn(clusterState);
9699
textChunkingProcessorFactory = new TextChunkingProcessorFactory(environment, clusterService, getAnalysisRegistry());
97100

101+
EventStatsManager.instance().reset();
98102
NeuralSearchSettingsAccessor settingsAccessor = mock(NeuralSearchSettingsAccessor.class);
99103
when(settingsAccessor.isStatsEnabled()).thenReturn(true);
100104
EventStatsManager.instance().initialize(settingsAccessor);
@@ -978,4 +982,70 @@ public void testExecute_withIgnoreMissing_thenSucceed() {
978982
IngestDocument document = processor.execute(ingestDocument);
979983
assertFalse(document.getSourceAndMetadata().containsKey(OUTPUT_FIELD));
980984
}
985+
986+
@SneakyThrows
987+
public void testExecute_statsDisabled_thenSucceed() {
988+
NeuralSearchSettingsAccessor settingsAccessor = mock(NeuralSearchSettingsAccessor.class);
989+
when(settingsAccessor.isStatsEnabled()).thenReturn(false);
990+
EventStatsManager.instance().initialize(settingsAccessor);
991+
992+
TextChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap());
993+
IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
994+
IngestDocument document = processor.execute(ingestDocument);
995+
assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD);
996+
Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD);
997+
assert (passages instanceof List<?>);
998+
List<String> expectedPassages = new ArrayList<>();
999+
expectedPassages.add("This is an example document to be chunked. The document ");
1000+
expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by ");
1001+
expectedPassages.add("standard tokenizer in OpenSearch.");
1002+
assertEquals(expectedPassages, passages);
1003+
1004+
Map<EventStatName, TimestampedEventStatSnapshot> snapshots = EventStatsManager.instance()
1005+
.getTimestampedEventStatSnapshots(EnumSet.allOf(EventStatName.class));
1006+
1007+
assertEquals(0L, snapshots.get(EventStatName.TEXT_CHUNKING_PROCESSOR_EXECUTIONS).getValue().longValue());
1008+
assertEquals(0L, snapshots.get(EventStatName.TEXT_CHUNKING_FIXED_LENGTH_EXECUTIONS).getValue().longValue());
1009+
}
1010+
1011+
@SneakyThrows
1012+
public void testExecute_statsEnabled_withFixedTokenLength_andSourceDataString_thenSucceed() {
1013+
TextChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap());
1014+
IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
1015+
IngestDocument document = processor.execute(ingestDocument);
1016+
assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD);
1017+
Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD);
1018+
assert (passages instanceof List<?>);
1019+
List<String> expectedPassages = new ArrayList<>();
1020+
expectedPassages.add("This is an example document to be chunked. The document ");
1021+
expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by ");
1022+
expectedPassages.add("standard tokenizer in OpenSearch.");
1023+
assertEquals(expectedPassages, passages);
1024+
1025+
Map<EventStatName, TimestampedEventStatSnapshot> snapshots = EventStatsManager.instance()
1026+
.getTimestampedEventStatSnapshots(EnumSet.allOf(EventStatName.class));
1027+
1028+
assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_PROCESSOR_EXECUTIONS).getValue().longValue());
1029+
assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_FIXED_LENGTH_EXECUTIONS).getValue().longValue());
1030+
}
1031+
1032+
@SneakyThrows
1033+
public void testExecute_statsEnabled_withDelimiter_andSourceDataString_thenSucceed() {
1034+
TextChunkingProcessor processor = createDelimiterInstance();
1035+
IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
1036+
IngestDocument document = processor.execute(ingestDocument);
1037+
assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD);
1038+
Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD);
1039+
assert (passages instanceof List<?>);
1040+
List<String> expectedPassages = new ArrayList<>();
1041+
expectedPassages.add("This is an example document to be chunked.");
1042+
expectedPassages.add(" The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.");
1043+
assertEquals(expectedPassages, passages);
1044+
1045+
Map<EventStatName, TimestampedEventStatSnapshot> snapshots = EventStatsManager.instance()
1046+
.getTimestampedEventStatSnapshots(EnumSet.allOf(EventStatName.class));
1047+
1048+
assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_PROCESSOR_EXECUTIONS).getValue().longValue());
1049+
assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_DELIMITER_EXECUTIONS).getValue().longValue());
1050+
}
9811051
}

0 commit comments

Comments
 (0)