
Commit 4bf94d7

[TEST] Add integration tests for the fixed_char_length chunking algorithm
Signed-off-by: yeonghyeonKo <[email protected]>
1 parent: 985558a · commit: 4bf94d7

File tree: 4 files changed (+89, −4 lines)


src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorIT.java

Lines changed: 22 additions & 2 deletions
@@ -39,6 +39,8 @@ public class TextChunkingProcessorIT extends BaseNeuralSearchIT {
     private static final String DELIMITER_PIPELINE_NAME = "pipeline-text-chunking-delimiter";
+    private static final String FIXED_CHAR_LENGTH_PIPELINE_NAME = "pipeline-text-chunking-fixed-char-length";
     private static final String CASCADE_PIPELINE_NAME = "pipeline-text-chunking-cascade";
     private static final String TEST_DOCUMENT = "processor/chunker/TextChunkingTestDocument.json";
@@ -54,6 +56,8 @@ public class TextChunkingProcessorIT extends BaseNeuralSearchIT {
         "processor/chunker/PipelineForFixedTokenLengthChunkerWithLowercaseTokenizer.json",
         DELIMITER_PIPELINE_NAME,
         "processor/chunker/PipelineForDelimiterChunker.json",
+        FIXED_CHAR_LENGTH_PIPELINE_NAME,
+        "processor/chunker/PipelineForFixedCharLengthChunker.json",
         CASCADE_PIPELINE_NAME,
         "processor/chunker/PipelineForCascadedChunker.json"
     );
@@ -137,6 +141,22 @@ public void testTextChunkingProcessor_withDelimiterAlgorithm_successful() {
         validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages);
     }
 
+    @SneakyThrows
+    public void testTextChunkingProcessor_withFixedCharLengthAlgorithm_thenSucceed() {
+        createPipelineProcessor(FIXED_CHAR_LENGTH_PIPELINE_NAME);
+        createTextChunkingIndex(INDEX_NAME, FIXED_CHAR_LENGTH_PIPELINE_NAME);
+
+        String document = getDocumentFromFilePath(TEST_DOCUMENT);
+        ingestDocument(INDEX_NAME, document);
+
+        List<String> expectedPassages = new ArrayList<>();
+        expectedPassages.add("This is an example document to be chunked. The doc");
+        expectedPassages.add("d. The document contains a single paragraph, two s");
+        expectedPassages.add("aph, two sentences and 24 tokens by standard token");
+        expectedPassages.add("dard tokenizer in OpenSearch.");
+        validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages);
+    }
+
     @SneakyThrows
     public void testTextChunkingProcessor_withCascadePipeline_successful() {
         createPipelineProcessor(CASCADE_PIPELINE_NAME);
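
The expected passages in the new test above follow from the fixed_char_length parameters used throughout this change (char_limit 50, overlap_rate 0.2): each chunk spans at most 50 characters, and the next chunk starts 50 × (1 − 0.2) = 40 characters later, so consecutive chunks repeat the previous chunk's last 10 characters and the final chunk holds the remainder. The sketch below illustrates that arithmetic only; it is not the plugin's FixedCharLengthChunker implementation, the class and method names are illustrative, and the source sentence is reconstructed from the expected passages rather than taken from TextChunkingTestDocument.json.

import java.util.ArrayList;
import java.util.List;

// Minimal sketch of fixed-character-length chunking with overlap (illustration only).
public class FixedCharLengthSketch {

    // Split content into chunks of at most charLimit characters, advancing by
    // charLimit * (1 - overlapRate) characters so consecutive chunks overlap.
    static List<String> chunk(String content, int charLimit, double overlapRate) {
        List<String> passages = new ArrayList<>();
        int step = (int) Math.floor(charLimit * (1 - overlapRate)); // 50 * 0.8 = 40
        for (int start = 0; start < content.length(); start += step) {
            int end = Math.min(start + charLimit, content.length());
            passages.add(content.substring(start, end));
            if (end == content.length()) {
                break; // last chunk may be shorter than charLimit
            }
        }
        return passages;
    }

    public static void main(String[] args) {
        // Source text reconstructed from the expected passages in the test above.
        String doc = "This is an example document to be chunked. The document contains a single "
            + "paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
        // Prints the four 50/50/50/29-character passages asserted in the integration test.
        chunk(doc, 50, 0.2).forEach(System.out::println);
    }
}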
@@ -209,11 +229,11 @@ public void testTextChunkingProcessor_processorStats_successful() {
         // Parse json to get stats
         assertEquals(5, getNestedValue(allNodesStats, EventStatName.TEXT_CHUNKING_PROCESSOR_EXECUTIONS));
         assertEquals(3, getNestedValue(allNodesStats, EventStatName.TEXT_CHUNKING_DELIMITER_EXECUTIONS));
-        assertEquals(2, getNestedValue(allNodesStats, EventStatName.TEXT_CHUNKING_FIXED_LENGTH_EXECUTIONS));
+        assertEquals(2, getNestedValue(allNodesStats, EventStatName.TEXT_CHUNKING_FIXED_TOKEN_LENGTH_EXECUTIONS));
 
         assertEquals(3, getNestedValue(stats, InfoStatName.TEXT_CHUNKING_PROCESSORS));
         assertEquals(1, getNestedValue(stats, InfoStatName.TEXT_CHUNKING_DELIMITER_PROCESSORS));
-        assertEquals(2, getNestedValue(stats, InfoStatName.TEXT_CHUNKING_FIXED_LENGTH_PROCESSORS));
+        assertEquals(2, getNestedValue(stats, InfoStatName.TEXT_CHUNKING_FIXED_TOKEN_LENGTH_PROCESSORS));
 
         // Reset stats
         updateClusterSettings("plugins.neural_search.stats_enabled", false);

src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java

Lines changed: 43 additions & 2 deletions
@@ -36,6 +36,7 @@
 import org.opensearch.ingest.IngestDocument;
 import org.opensearch.ingest.Processor;
 import org.opensearch.neuralsearch.processor.chunker.DelimiterChunker;
+import org.opensearch.neuralsearch.processor.chunker.FixedCharLengthChunker;
 import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;
 import org.opensearch.neuralsearch.processor.factory.TextChunkingProcessorFactory;
 import org.opensearch.neuralsearch.settings.NeuralSearchSettingsAccessor;
@@ -127,6 +128,13 @@ private Map<String, Object> createDelimiterParameters() {
         return parameters;
     }
 
+    private Map<String, Object> createFixedCharLengthParameters() {
+        Map<String, Object> parameters = new HashMap<>();
+        parameters.put(FixedCharLengthChunker.CHAR_LIMIT_FIELD, 50);
+        parameters.put(FixedCharLengthChunker.OVERLAP_RATE_FIELD, 0.2);
+        return parameters;
+    }
+
     private Map<String, Object> createStringFieldMap() {
         Map<String, Object> fieldMap = new HashMap<>();
         fieldMap.put(INPUT_FIELD, OUTPUT_FIELD);
@@ -190,6 +198,17 @@ private TextChunkingProcessor createDelimiterInstance() {
         return textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
     }
 
+    @SneakyThrows
+    private TextChunkingProcessor createFixedCharLengthInstance(Map<String, Object> fieldMap) {
+        Map<String, Object> config = new HashMap<>();
+        Map<String, Object> algorithmMap = new HashMap<>();
+        algorithmMap.put(FixedCharLengthChunker.ALGORITHM_NAME, createFixedCharLengthParameters());
+        config.put(FIELD_MAP_FIELD, fieldMap);
+        config.put(ALGORITHM_FIELD, algorithmMap);
+        Map<String, Processor.Factory> registry = new HashMap<>();
+        return textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
+    }
+
     @SneakyThrows
     private TextChunkingProcessor createIgnoreMissingInstance() {
         Map<String, Object> config = new HashMap<>();
@@ -1003,7 +1022,7 @@ public void testExecute_statsDisabled_thenSucceed() {
             .getTimestampedEventStatSnapshots(EnumSet.allOf(EventStatName.class));
 
         assertEquals(0L, snapshots.get(EventStatName.TEXT_CHUNKING_PROCESSOR_EXECUTIONS).getValue().longValue());
-        assertEquals(0L, snapshots.get(EventStatName.TEXT_CHUNKING_FIXED_LENGTH_EXECUTIONS).getValue().longValue());
+        assertEquals(0L, snapshots.get(EventStatName.TEXT_CHUNKING_FIXED_TOKEN_LENGTH_EXECUTIONS).getValue().longValue());
     }
 
     @SneakyThrows
@@ -1024,7 +1043,7 @@ public void testExecute_statsEnabled_withFixedTokenLength_andSourceDataString_thenSucceed() {
             .getTimestampedEventStatSnapshots(EnumSet.allOf(EventStatName.class));
 
         assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_PROCESSOR_EXECUTIONS).getValue().longValue());
-        assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_FIXED_LENGTH_EXECUTIONS).getValue().longValue());
+        assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_FIXED_TOKEN_LENGTH_EXECUTIONS).getValue().longValue());
     }
 
     @SneakyThrows
@@ -1046,4 +1065,26 @@ public void testExecute_statsEnabled_withDelimiter_andSourceDataString_thenSucceed() {
         assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_PROCESSOR_EXECUTIONS).getValue().longValue());
         assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_DELIMITER_EXECUTIONS).getValue().longValue());
     }
+
+    @SneakyThrows
+    public void testExecute_statsEnabled_withFixedCharLength_andSourceDataString_thenSucceed() {
+        TextChunkingProcessor processor = createFixedCharLengthInstance(createStringFieldMap());
+        IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
+        IngestDocument document = processor.execute(ingestDocument);
+        assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD);
+        Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD);
+        assert (passages instanceof List<?>);
+        List<String> expectedPassages = new ArrayList<>();
+        expectedPassages.add("This is an example document to be chunked. The doc");
+        expectedPassages.add("d. The document contains a single paragraph, two s");
+        expectedPassages.add("aph, two sentences and 24 tokens by standard token");
+        expectedPassages.add("dard tokenizer in OpenSearch.");
+        assertEquals(expectedPassages, passages);
+
+        Map<EventStatName, TimestampedEventStatSnapshot> snapshots = EventStatsManager.instance()
+            .getTimestampedEventStatSnapshots(EnumSet.allOf(EventStatName.class));
+
+        assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_PROCESSOR_EXECUTIONS).getValue().longValue());
+        assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_FIXED_CHAR_LENGTH_EXECUTIONS).getValue().longValue());
+    }
 }

src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java

Lines changed: 6 additions & 0 deletions
@@ -30,6 +30,12 @@ public void testCreate_Delimiter() {
         assert (chunker instanceof DelimiterChunker);
     }
 
+    public void testCreate_FixedCharLength() {
+        Chunker chunker = ChunkerFactory.create(FixedCharLengthChunker.ALGORITHM_NAME, createChunkParameters());
+        assertNotNull(chunker);
+        assert (chunker instanceof FixedCharLengthChunker);
+    }
+
     public void testCreate_Invalid() {
         String invalidChunkerName = "Invalid Chunker Algorithm";
         assertThrows(NullPointerException.class, () -> ChunkerFactory.create(invalidChunkerName, createChunkParameters()));
processor/chunker/PipelineForFixedCharLengthChunker.json (new file)

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+{
+    "description": "An example fixed character length chunker pipeline with overlap_rate",
+    "processors" : [
+        {
+            "text_chunking": {
+                "field_map": {
+                    "body": "body_chunk"
+                },
+                "algorithm": {
+                    "fixed_char_length": {
+                        "char_limit": 50,
+                        "overlap_rate": 0.2
+                    }
+                }
+            }
+        }
+    ]
+}
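
Note: with char_limit 50 and overlap_rate 0.2, this configuration yields 50-character chunks whose start positions advance by 40 characters, so each chunk repeats the previous chunk's last 10 characters; that is exactly the boundary pattern asserted by the expected passages in TextChunkingProcessorIT and TextChunkingProcessorTests above. The integration test registers this resource under FIXED_CHAR_LENGTH_PIPELINE_NAME via the pipeline-config map added in the first file.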
