opensearch-project
diff --git a/‎CHANGELOG.md
Lines changed: 2 additions & 0 deletions b/‎CHANGELOG.md
Lines changed: 2 additions & 0 deletions
diff --git a/‎qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/rolling/RestNeuralStatsActionIT.java
Lines changed: 1 addition & 0 deletions b/‎qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/rolling/RestNeuralStatsActionIT.java
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
Lines changed: 4 additions & 1 deletion b/‎src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
Lines changed: 4 additions & 1 deletion
diff --git a/‎src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java
Lines changed: 3 additions & 1 deletion b/‎src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedCharLengthChunker.java
Lines changed: 129 additions & 0 deletions b/‎src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedCharLengthChunker.java
Lines changed: 129 additions & 0 deletions
diff --git a/‎src/main/java/org/opensearch/neuralsearch/stats/events/EventStatName.java
Lines changed: 8 additions & 2 deletions b/‎src/main/java/org/opensearch/neuralsearch/stats/events/EventStatName.java
Lines changed: 8 additions & 2 deletions
diff --git a/‎src/main/java/org/opensearch/neuralsearch/stats/info/InfoStatName.java
Lines changed: 8 additions & 2 deletions b/‎src/main/java/org/opensearch/neuralsearch/stats/info/InfoStatName.java
Lines changed: 8 additions & 2 deletions
diff --git a/‎src/main/java/org/opensearch/neuralsearch/stats/info/InfoStatsManager.java
Lines changed: 5 additions & 2 deletions b/‎src/main/java/org/opensearch/neuralsearch/stats/info/InfoStatsManager.java
Lines changed: 5 additions & 2 deletions
@@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - [Semantic Field] Implement the query logic for the semantic field. ([#1315](https://github.com/opensearch-project/neural-search/pull/1315))
 - [Semantic Field] Enhance semantic field to allow to enable/disable chunking. ([#1337](https://github.com/opensearch-project/neural-search/pull/1337))
 - [Semantic Field] Implement the search analyzer support for semantic field at query time. ([#1341](https://github.com/opensearch-project/neural-search/pull/1341))
+- Add `FixedCharLengthChunker` for character length-based chunking ([#1342](https://github.com/opensearch-project/neural-search/pull/1342))
 - [Semantic Field] Implement the search analyzer support for semantic field at semantic field index creation time. ([#1367](https://github.com/opensearch-project/neural-search/pull/1367))
 
 ### Enhancements
@@ -32,6 +33,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Filter requested stats based on minimum cluster version to fix BWC tests for stats API ([#1373](https://github.com/opensearch-project/neural-search/pull/1373))
 
 ### Infrastructure
+- [3.0] Update neural-search for OpenSearch 3.0 beta compatibility ([#1245](https://github.com/opensearch-project/neural-search/pull/1245))
 
 ### Documentation
 
 
@@ -120,3 +120,4 @@ public void testStats_E2EFlow() throws Exception {
         }
     }
 }
+
@@ -25,6 +25,7 @@
 import org.opensearch.index.mapper.IndexFieldMapper;
 import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
 import org.opensearch.neuralsearch.processor.chunker.DelimiterChunker;
+import org.opensearch.neuralsearch.processor.chunker.FixedCharLengthChunker;
 import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;
 import org.opensearch.neuralsearch.stats.events.EventStatName;
 import org.opensearch.neuralsearch.stats.events.EventStatsManager;
@@ -58,7 +59,9 @@ public final class TextChunkingProcessor extends AbstractProcessor {
         DelimiterChunker.ALGORITHM_NAME,
         () -> EventStatsManager.increment(EventStatName.TEXT_CHUNKING_DELIMITER_EXECUTIONS),
         FixedTokenLengthChunker.ALGORITHM_NAME,
-        () -> EventStatsManager.increment(EventStatName.TEXT_CHUNKING_FIXED_LENGTH_EXECUTIONS)
+        () -> EventStatsManager.increment(EventStatName.TEXT_CHUNKING_FIXED_TOKEN_LENGTH_EXECUTIONS),
+        FixedCharLengthChunker.ALGORITHM_NAME,
+        () -> EventStatsManager.increment(EventStatName.TEXT_CHUNKING_FIXED_CHAR_LENGTH_EXECUTIONS)
     );
 
     private int maxChunkLimit;
 
@@ -22,7 +22,9 @@ private ChunkerFactory() {} // no instance of this factory class
         FixedTokenLengthChunker.ALGORITHM_NAME,
         FixedTokenLengthChunker::new,
         DelimiterChunker.ALGORITHM_NAME,
-        DelimiterChunker::new
+        DelimiterChunker::new,
+        FixedCharLengthChunker.ALGORITHM_NAME,
+        FixedCharLengthChunker::new
     );
 
     /** Set of supported chunker algorithm types */
 
@@ -0,0 +1,129 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package org.opensearch.neuralsearch.processor.chunker;
+
+import java.util.Locale;
+import java.util.Map;
+import java.util.List;
+import java.util.ArrayList;
+
+import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseInteger;
+import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseDoubleWithDefault;
+import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parsePositiveIntegerWithDefault;
+
+/**
+ * The implementation of {@link Chunker} for the fixed character length algorithm.
+ * Lengths are counted in Java {@code char} units (UTF-16 code units, see {@link String#length()}),
+ * so a chunk boundary may fall inside a surrogate pair.
+ */
+public final class FixedCharLengthChunker extends Chunker {
+
+    /** The identifier for the fixed character length chunking algorithm. */
+    public static final String ALGORITHM_NAME = "fixed_char_length";
+
+    /** Field name for specifying the maximum number of characters per chunk. */
+    public static final String CHAR_LIMIT_FIELD = "char_limit";
+
+    /** Field name for specifying the overlap rate between consecutive chunks based on fixed character length. */
+    public static final String OVERLAP_RATE_FIELD = "overlap_rate";
+
+    // Default values for each non-runtime parameter
+    private static final int DEFAULT_CHAR_LIMIT = 2048; // Default character limit per chunk (512 tokens * 4 chars)
+    private static final double DEFAULT_OVERLAP_RATE = 0.0;
+
+    // Parameter restrictions
+    private static final double OVERLAP_RATE_LOWER_BOUND = 0.0;
+    private static final double OVERLAP_RATE_UPPER_BOUND = 0.5; // Max 50% overlap
+
+    // Parameter values
+    private int charLimit;
+    private double overlapRate;
+
+    /**
+     * Constructor that initializes the fixed character length chunker with the specified parameters.
+     * @param parameters a map with non-runtime parameters to be parsed
+     */
+    public FixedCharLengthChunker(final Map<String, Object> parameters) {
+        parseParameters(parameters);
+    }
+
+    /**
+     * Parse the parameters for fixed character length algorithm.
+     * Throw IllegalArgumentException when parameters are invalid.
+     *
+     * @param parameters a map with non-runtime parameters as the following:
+     * 1. char_limit: the character limit for each chunked passage
+     * 2. overlap_rate: the overlapping degree for each chunked passage, indicating how many characters come from the previous passage
+     * Here are requirements for non-runtime parameters:
+     * 1. char_limit must be a positive integer
+     * 2. overlap_rate must be within range [0, 0.5]
+     */
+    @Override
+    public void parseParameters(Map<String, Object> parameters) {
+        this.charLimit = parsePositiveIntegerWithDefault(parameters, CHAR_LIMIT_FIELD, DEFAULT_CHAR_LIMIT);
+        this.overlapRate = parseDoubleWithDefault(parameters, OVERLAP_RATE_FIELD, DEFAULT_OVERLAP_RATE);
+
+        // NaN makes both ordered comparisons false, so it must be rejected explicitly to enforce the range.
+        if (Double.isNaN(overlapRate) || overlapRate < OVERLAP_RATE_LOWER_BOUND || overlapRate > OVERLAP_RATE_UPPER_BOUND) {
+            throw new IllegalArgumentException(
+                String.format(
+                    Locale.ROOT,
+                    "Parameter [%s] must be between %s and %s, but was %s",
+                    OVERLAP_RATE_FIELD,
+                    OVERLAP_RATE_LOWER_BOUND,
+                    OVERLAP_RATE_UPPER_BOUND,
+                    overlapRate
+                )
+            );
+        }
+    }
+
+    /**
+     * Return the chunked passages for fixed character length algorithm.
+     * Throw IllegalArgumentException when runtime parameters are invalid.
+     *
+     * @param content input string
+     * @param runtimeParameters a map for runtime parameters, containing the following runtime parameters:
+     * 1. max_chunk_limit: field level max chunk limit
+     * 2. chunk_string_count: number of non-empty strings (including itself) which need to be chunked later
+     * @return the chunked passages in order of appearance; an empty list when content is empty
+     */
+    @Override
+    public List<String> chunk(final String content, final Map<String, Object> runtimeParameters) {
+        int runtimeMaxChunkLimit = parseInteger(runtimeParameters, MAX_CHUNK_LIMIT_FIELD);
+        int chunkStringCount = parseInteger(runtimeParameters, CHUNK_STRING_COUNT_FIELD);
+
+        List<String> chunkResult = new ArrayList<>();
+
+        int startCharIndex = 0;
+        int overlapCharNumber = (int) Math.floor(this.charLimit * this.overlapRate);
+        // Ensure `chunkInterval` is positive. charLimit is positive. overlapRate is [0, 0.5].
+        // So, (charLimit - overlapCharNumber) >= 0.5 * charLimit, which is always > 0 if charLimit >= 1.
+        int chunkInterval = this.charLimit - overlapCharNumber;
+
+        while (startCharIndex < content.length()) {
+            // When the runtime limit check fires, the rest of the content becomes the final chunk.
+            if (Chunker.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, chunkStringCount)) {
+                chunkResult.add(content.substring(startCharIndex));
+                break;
+            }
+
+            // Compare via subtraction: startCharIndex + charLimit could overflow int for very large inputs.
+            if (content.length() - startCharIndex <= this.charLimit) {
+                chunkResult.add(content.substring(startCharIndex)); // Last chunk runs to the very end
+                break;
+            }
+            chunkResult.add(content.substring(startCharIndex, startCharIndex + this.charLimit));
+            startCharIndex += chunkInterval;
+        }
+
+        return chunkResult;
+    }
+
+    @Override
+    public String getAlgorithmName() {
+        return ALGORITHM_NAME;
+    }
+}
@@ -39,8 +39,8 @@ public enum EventStatName implements StatName {
         EventStatType.TIMESTAMPED_EVENT_COUNTER,
         Version.V_3_1_0
     ),
-    TEXT_CHUNKING_FIXED_LENGTH_EXECUTIONS(
-        "text_chunking_fixed_length_executions",
+    TEXT_CHUNKING_FIXED_TOKEN_LENGTH_EXECUTIONS(
+        "text_chunking_fixed_token_length_executions",
         "processors.ingest",
         EventStatType.TIMESTAMPED_EVENT_COUNTER,
         Version.V_3_1_0
@@ -51,6 +51,12 @@ public enum EventStatName implements StatName {
         EventStatType.TIMESTAMPED_EVENT_COUNTER,
         Version.V_3_1_0
     ),
+    TEXT_CHUNKING_FIXED_CHAR_LENGTH_EXECUTIONS(
+        "text_chunking_fixed_char_length_executions",
+        "processors.ingest",
+        EventStatType.TIMESTAMPED_EVENT_COUNTER,
+        Version.V_3_1_0
+    ),
     SEMANTIC_FIELD_PROCESSOR_EXECUTIONS(
         "semantic_field_executions",
         "processors.ingest",
 
@@ -36,8 +36,14 @@ public enum InfoStatName implements StatName {
         InfoStatType.INFO_COUNTER,
         Version.V_3_1_0
     ),
-    TEXT_CHUNKING_FIXED_LENGTH_PROCESSORS(
-        "text_chunking_fixed_length_processors",
+    TEXT_CHUNKING_FIXED_TOKEN_LENGTH_PROCESSORS(
+        "text_chunking_fixed_token_length_processors",
+        "processors.ingest",
+        InfoStatType.INFO_COUNTER,
+        Version.V_3_1_0
+    ),
+    TEXT_CHUNKING_FIXED_CHAR_LENGTH_PROCESSORS(
+        "text_chunking_fixed_char_length_processors",
         "processors.ingest",
         InfoStatType.INFO_COUNTER,
         Version.V_3_1_0
 
@@ -18,6 +18,7 @@
 import org.opensearch.neuralsearch.processor.normalization.MinMaxScoreNormalizationTechnique;
 import org.opensearch.neuralsearch.processor.normalization.ZScoreNormalizationTechnique;
 import org.opensearch.neuralsearch.processor.chunker.DelimiterChunker;
+import org.opensearch.neuralsearch.processor.chunker.FixedCharLengthChunker;
 import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;
 import org.opensearch.neuralsearch.settings.NeuralSearchSettingsAccessor;
 import org.opensearch.neuralsearch.stats.common.StatSnapshot;
@@ -48,7 +49,9 @@ public class InfoStatsManager {
         DelimiterChunker.ALGORITHM_NAME,
         stats -> increment(stats, InfoStatName.TEXT_CHUNKING_DELIMITER_PROCESSORS),
         FixedTokenLengthChunker.ALGORITHM_NAME,
-        stats -> increment(stats, InfoStatName.TEXT_CHUNKING_FIXED_LENGTH_PROCESSORS)
+        stats -> increment(stats, InfoStatName.TEXT_CHUNKING_FIXED_TOKEN_LENGTH_PROCESSORS),
+        FixedCharLengthChunker.ALGORITHM_NAME,
+        stats -> increment(stats, InfoStatName.TEXT_CHUNKING_FIXED_CHAR_LENGTH_PROCESSORS)
     );
 
     private static final Map<String, Consumer<Map<InfoStatName, CountableInfoStatSnapshot>>> normTechniqueIncrementers = Map.of(
@@ -216,7 +219,7 @@ private void countTextChunkingProcessorStats(Map<InfoStatName, CountableInfoStat
 
         // If no algorithm is specified, default case is fixed token length
         if (chunkingAlgorithmIncrementers.containsKey(algorithmKey) == false) {
-            increment(stats, InfoStatName.TEXT_CHUNKING_FIXED_LENGTH_PROCESSORS);
+            increment(stats, InfoStatName.TEXT_CHUNKING_FIXED_TOKEN_LENGTH_PROCESSORS);
         } else {
             // Map is guaranteed to contain key in this block, so we can do direct map get
             chunkingAlgorithmIncrementers.get(algorithmKey).accept(stats);
Original file line number	Diff line number	Diff line change
`@@ -120,3 +120,4 @@ public void testStats_E2EFlow() throws Exception {`
`120`	`120`	`}`
`121`	`121`	`}`
`122`	`122`	`}`
	`123`	`+`