 import org.opensearch.ingest.IngestDocument;
 import org.opensearch.ingest.Processor;
 import org.opensearch.neuralsearch.processor.chunker.DelimiterChunker;
+import org.opensearch.neuralsearch.processor.chunker.FixedCharLengthChunker;
 import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;
 import org.opensearch.neuralsearch.processor.factory.TextChunkingProcessorFactory;
 import org.opensearch.neuralsearch.settings.NeuralSearchSettingsAccessor;
@@ -127,6 +128,13 @@ private Map<String, Object> createDelimiterParameters() {
         return parameters;
     }
 
+    private Map<String, Object> createFixedCharLengthParameters() {
+        Map<String, Object> parameters = new HashMap<>();
+        parameters.put(FixedCharLengthChunker.CHAR_LIMIT_FIELD, 50);
+        parameters.put(FixedCharLengthChunker.OVERLAP_RATE_FIELD, 0.2);
+        return parameters;
+    }
+
     private Map<String, Object> createStringFieldMap() {
         Map<String, Object> fieldMap = new HashMap<>();
         fieldMap.put(INPUT_FIELD, OUTPUT_FIELD);
@@ -190,6 +198,17 @@ private TextChunkingProcessor createDelimiterInstance() {
         return textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
     }
 
+    @SneakyThrows
+    private TextChunkingProcessor createFixedCharLengthInstance(Map<String, Object> fieldMap) {
+        Map<String, Object> config = new HashMap<>();
+        Map<String, Object> algorithmMap = new HashMap<>();
+        algorithmMap.put(FixedCharLengthChunker.ALGORITHM_NAME, createFixedCharLengthParameters());
+        config.put(FIELD_MAP_FIELD, fieldMap);
+        config.put(ALGORITHM_FIELD, algorithmMap);
+        Map<String, Processor.Factory> registry = new HashMap<>();
+        return textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
+    }
+
     @SneakyThrows
     private TextChunkingProcessor createIgnoreMissingInstance() {
         Map<String, Object> config = new HashMap<>();
@@ -1003,7 +1022,7 @@ public void testExecute_statsDisabled_thenSucceed() {
             .getTimestampedEventStatSnapshots(EnumSet.allOf(EventStatName.class));
 
         assertEquals(0L, snapshots.get(EventStatName.TEXT_CHUNKING_PROCESSOR_EXECUTIONS).getValue().longValue());
-        assertEquals(0L, snapshots.get(EventStatName.TEXT_CHUNKING_FIXED_LENGTH_EXECUTIONS).getValue().longValue());
+        assertEquals(0L, snapshots.get(EventStatName.TEXT_CHUNKING_FIXED_TOKEN_LENGTH_EXECUTIONS).getValue().longValue());
     }
 
     @SneakyThrows
@@ -1024,7 +1043,7 @@ public void testExecute_statsEnabled_withFixedTokenLength_andSourceDataString_thenSucceed() {
             .getTimestampedEventStatSnapshots(EnumSet.allOf(EventStatName.class));
 
         assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_PROCESSOR_EXECUTIONS).getValue().longValue());
-        assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_FIXED_LENGTH_EXECUTIONS).getValue().longValue());
+        assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_FIXED_TOKEN_LENGTH_EXECUTIONS).getValue().longValue());
     }
 
     @SneakyThrows
@@ -1046,4 +1065,26 @@ public void testExecute_statsEnabled_withDelimiter_andSourceDataString_thenSucceed() {
         assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_PROCESSOR_EXECUTIONS).getValue().longValue());
         assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_DELIMITER_EXECUTIONS).getValue().longValue());
     }
+
+    @SneakyThrows
+    public void testExecute_statsEnabled_withFixedCharLength_andSourceDataString_thenSucceed() {
+        TextChunkingProcessor processor = createFixedCharLengthInstance(createStringFieldMap());
+        IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
+        IngestDocument document = processor.execute(ingestDocument);
+        assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD);
+        Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD);
+        assert (passages instanceof List<?>);
+        List<String> expectedPassages = new ArrayList<>();
+        expectedPassages.add("This is an example document to be chunked. The doc");
+        expectedPassages.add("d. The document contains a single paragraph, two s");
+        expectedPassages.add("aph, two sentences and 24 tokens by standard token");
+        expectedPassages.add("dard tokenizer in OpenSearch.");
+        assertEquals(expectedPassages, passages);
+
+        Map<EventStatName, TimestampedEventStatSnapshot> snapshots = EventStatsManager.instance()
+            .getTimestampedEventStatSnapshots(EnumSet.allOf(EventStatName.class));
+
+        assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_PROCESSOR_EXECUTIONS).getValue().longValue());
+        assertEquals(1L, snapshots.get(EventStatName.TEXT_CHUNKING_FIXED_CHAR_LENGTH_EXECUTIONS).getValue().longValue());
+    }
 }
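
A note on the expected passages in the new test: with CHAR_LIMIT_FIELD = 50 and OVERLAP_RATE_FIELD = 0.2, each chunk after the first starts 50 * (1 - 0.2) = 40 characters after the previous one, so consecutive passages share 10 characters (passage two's leading "d. The" repeats the tail of passage one's "chunked. The"). Below is a minimal standalone sketch of that arithmetic, for illustration only; it is not the plugin's actual FixedCharLengthChunker implementation, whose edge-case handling may differ.

import java.util.ArrayList;
import java.util.List;

class FixedCharLengthSketch {
    // Emit chunks of at most charLimit characters; each chunk starts
    // charLimit * (1 - overlapRate) characters after the previous one.
    static List<String> chunk(String content, int charLimit, double overlapRate) {
        List<String> passages = new ArrayList<>();
        int overlap = (int) Math.floor(charLimit * overlapRate); // 50 * 0.2 = 10 in the test
        int step = charLimit - overlap;                          // 40-character stride
        for (int start = 0; start < content.length(); start += step) {
            passages.add(content.substring(start, Math.min(start + charLimit, content.length())));
            if (start + charLimit >= content.length()) {
                break; // this chunk already reached the end of the content
            }
        }
        return passages;
    }
}

Running chunk(source, 50, 0.2) on the test's source sentence yields exactly the four expected passages, including the short 29-character final chunk.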