ryanbogan
diff --git a/‎CHANGELOG.md
Lines changed: 1 addition & 0 deletions b/‎CHANGELOG.md
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java
Lines changed: 6 additions & 1 deletion b/‎src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java
Lines changed: 6 additions & 1 deletion
diff --git a/‎src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java
Lines changed: 241 additions & 40 deletions b/‎src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java
Lines changed: 241 additions & 40 deletions
diff --git a/‎src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java
Lines changed: 81 additions & 7 deletions b/‎src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java
Lines changed: 81 additions & 7 deletions
diff --git a/‎src/main/java/org/opensearch/neuralsearch/processor/factory/TextEmbeddingProcessorFactory.java
Lines changed: 24 additions & 2 deletions b/‎src/main/java/org/opensearch/neuralsearch/processor/factory/TextEmbeddingProcessorFactory.java
Lines changed: 24 additions & 2 deletions
@@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Support filter function for HybridQueryBuilder and NeuralQueryBuilder ([#1206](https://github.com/opensearch-project/neural-search/pull/1206))
 - Add Z Score normalization technique ([#1224](https://github.com/opensearch-project/neural-search/pull/1224))
 - Support semantic sentence highlighter ([#1193](https://github.com/opensearch-project/neural-search/pull/1193))
+- Optimize embedding generation in Text Embedding Processor ([#1191](https://github.com/opensearch-project/neural-search/pull/1191))
 
 ### Enhancements
 
 
@@ -140,7 +140,12 @@ public Map<String, Processor.Factory> getProcessors(Processor.Parameters paramet
         clientAccessor = new MLCommonsClientAccessor(new MachineLearningNodeClient(parameters.client));
         return Map.of(
             TextEmbeddingProcessor.TYPE,
-            new TextEmbeddingProcessorFactory(clientAccessor, parameters.env, parameters.ingestService.getClusterService()),
+            new TextEmbeddingProcessorFactory(
+                parameters.client,
+                clientAccessor,
+                parameters.env,
+                parameters.ingestService.getClusterService()
+            ),
             SparseEncodingProcessor.TYPE,
             new SparseEncodingProcessorFactory(clientAccessor, parameters.env, parameters.ingestService.getClusterService()),
             TextImageEmbeddingProcessor.TYPE,
 
@@ -9,13 +9,20 @@
 import java.util.function.BiConsumer;
 import java.util.function.Consumer;
 
+import org.opensearch.action.get.GetAction;
+import org.opensearch.action.get.GetRequest;
+import org.opensearch.action.get.MultiGetAction;
 import org.opensearch.cluster.service.ClusterService;
 import org.opensearch.core.action.ActionListener;
+import org.opensearch.core.common.util.CollectionUtils;
 import org.opensearch.env.Environment;
 import org.opensearch.ingest.IngestDocument;
+import org.opensearch.ingest.IngestDocumentWrapper;
 import org.opensearch.neuralsearch.ml.MLCommonsClientAccessor;
 
 import lombok.extern.log4j.Log4j2;
+import org.opensearch.neuralsearch.processor.optimization.TextEmbeddingInferenceFilter;
+import org.opensearch.transport.client.OpenSearchClient;
 
 /**
  * This processor is used for user input data text embedding processing, model_id can be used to indicate which model user use,
@@ -26,33 +33,57 @@ public final class TextEmbeddingProcessor extends InferenceProcessor {
 
     public static final String TYPE = "text_embedding";
     public static final String LIST_TYPE_NESTED_MAP_KEY = "knn";
+    public static final String SKIP_EXISTING = "skip_existing";
+    public static final boolean DEFAULT_SKIP_EXISTING = false;
+    private static final String INDEX_FIELD = "_index";
+    private static final String ID_FIELD = "_id";
+    private final OpenSearchClient openSearchClient;
+    private final boolean skipExisting;
+    private final TextEmbeddingInferenceFilter textEmbeddingInferenceFilter;
 
     public TextEmbeddingProcessor(
         String tag,
         String description,
         int batchSize,
         String modelId,
         Map<String, Object> fieldMap,
+        boolean skipExisting,
+        TextEmbeddingInferenceFilter textEmbeddingInferenceFilter,
+        OpenSearchClient openSearchClient,
         MLCommonsClientAccessor clientAccessor,
         Environment environment,
         ClusterService clusterService
     ) {
         super(tag, description, batchSize, TYPE, LIST_TYPE_NESTED_MAP_KEY, modelId, fieldMap, clientAccessor, environment, clusterService);
+        // Client used by doExecute/subBatchExecute to issue Get / MultiGet requests when skipExisting is enabled.
+        this.openSearchClient = openSearchClient;
+        // Filter handed to the Get / MultiGet response handlers; the factory passes null when skipExisting is false.
+        this.textEmbeddingInferenceFilter = textEmbeddingInferenceFilter;
+        // When false, both execute paths call the model directly without looking up any stored document.
+        this.skipExisting = skipExisting;
     }
 
     @Override
     public void doExecute(
         IngestDocument ingestDocument,
-        Map<String, Object> ProcessMap,
+        Map<String, Object> processMap,
         List<String> inferenceList,
         BiConsumer<IngestDocument, Exception> handler
     ) {
-        mlCommonsClientAccessor.inferenceSentences(
-            TextInferenceRequest.builder().modelId(this.modelId).inputTexts(inferenceList).build(),
-            ActionListener.wrap(vectors -> {
-                setVectorFieldsToDocument(ingestDocument, ProcessMap, vectors);
-                handler.accept(ingestDocument, null);
-            }, e -> { handler.accept(null, e); })
+        // Single-document path. Without skipExisting every text in inferenceList is sent to the model.
+        // With skipExisting the document is first fetched by (_index, _id) and the Get response is handed,
+        // together with textEmbeddingInferenceFilter, to getResponseHandler.
+        if (skipExisting == false) {
+            makeInferenceCall(ingestDocument, processMap, inferenceList, handler);
+            return;
+        }
+        // The metadata map may not contain _index or _id (for example when no id has been assigned to the
+        // document yet). Without both values there is no stored document to look up, so run plain inference
+        // instead of failing with a NullPointerException on toString().
+        Object index = ingestDocument.getSourceAndMetadata().get(INDEX_FIELD);
+        Object id = ingestDocument.getSourceAndMetadata().get(ID_FIELD);
+        if (index == null || id == null) {
+            makeInferenceCall(ingestDocument, processMap, inferenceList, handler);
+            return;
+        }
+        openSearchClient.execute(
+            GetAction.INSTANCE,
+            new GetRequest(index.toString(), id.toString()),
+            ActionListener.wrap(
+                response -> getResponseHandler(response, ingestDocument, processMap, inferenceList, handler, textEmbeddingInferenceFilter),
+                e -> {
+                    // a failed Get request fails the document: report the exception and no document
+                    handler.accept(null, e);
+                }
+            )
         );
     }
 
@@ -63,4 +94,47 @@ public void doBatchExecute(List<String> inferenceList, Consumer<List<?>> handler
             ActionListener.wrap(handler::accept, onException)
         );
     }
+
+    /**
+     * Processes one sub-batch of ingest documents.
+     * <p>
+     * An empty batch, or a batch that yields no inference texts, is passed to {@code handler} as is.
+     * When {@code skipExisting} is false the batch goes straight to {@code doSubBatchExecute}; otherwise a
+     * MultiGet request built from the batch is executed first and its response is handed to
+     * {@code multiGetResponseHandler} together with the inference filter.
+     *
+     * @param ingestDocumentWrappers documents of the current sub-batch
+     * @param handler callback that receives the wrappers once processing finishes or fails
+     */
+    @Override
+    public void subBatchExecute(List<IngestDocumentWrapper> ingestDocumentWrappers, Consumer<List<IngestDocumentWrapper>> handler) {
+        try {
+            // nothing to process: hand the batch back untouched
+            if (CollectionUtils.isEmpty(ingestDocumentWrappers)) {
+                handler.accept(ingestDocumentWrappers);
+                return;
+            }
+            final List<DataForInference> dataForInferences = getDataForInference(ingestDocumentWrappers);
+            final List<String> inferenceList = constructInferenceTexts(dataForInferences);
+            // no text needs an embedding: hand the batch back untouched
+            if (inferenceList.isEmpty()) {
+                handler.accept(ingestDocumentWrappers);
+                return;
+            }
+            if (skipExisting) {
+                // skipExisting is on: fetch the documents with a MultiGet request first; the response handler
+                // receives the inference texts, the per-document inference data and the filter
+                openSearchClient.execute(
+                    MultiGetAction.INSTANCE,
+                    buildMultiGetRequest(dataForInferences),
+                    ActionListener.wrap(
+                        multiGetResponse -> multiGetResponseHandler(
+                            multiGetResponse,
+                            ingestDocumentWrappers,
+                            inferenceList,
+                            dataForInferences,
+                            handler,
+                            textEmbeddingInferenceFilter
+                        ),
+                        // when the MultiGet request fails, the exception is set on the wrappers behind dataForInferences
+                        multiGetFailure -> updateWithExceptions(getIngestDocumentWrappers(dataForInferences), handler, multiGetFailure)
+                    )
+                );
+            } else {
+                // skipExisting is off: run inference on the whole sub-batch without filtering
+                doSubBatchExecute(ingestDocumentWrappers, inferenceList, dataForInferences, handler);
+            }
+        } catch (Exception unexpected) {
+            // any synchronous failure above is reported on every wrapper of the sub-batch
+            updateWithExceptions(ingestDocumentWrappers, handler, unexpected);
+        }
+    }
 }
@@ -4,8 +4,11 @@
  */
 package org.opensearch.neuralsearch.processor.factory;
 
+import static org.opensearch.ingest.ConfigurationUtils.readBooleanProperty;
 import static org.opensearch.ingest.ConfigurationUtils.readMap;
 import static org.opensearch.ingest.ConfigurationUtils.readStringProperty;
+import static org.opensearch.neuralsearch.processor.TextEmbeddingProcessor.SKIP_EXISTING;
+import static org.opensearch.neuralsearch.processor.TextEmbeddingProcessor.DEFAULT_SKIP_EXISTING;
 import static org.opensearch.neuralsearch.processor.TextEmbeddingProcessor.TYPE;
 import static org.opensearch.neuralsearch.processor.TextEmbeddingProcessor.MODEL_ID_FIELD;
 import static org.opensearch.neuralsearch.processor.TextEmbeddingProcessor.FIELD_MAP_FIELD;
@@ -17,24 +20,30 @@
 import org.opensearch.ingest.AbstractBatchingProcessor;
 import org.opensearch.neuralsearch.ml.MLCommonsClientAccessor;
 import org.opensearch.neuralsearch.processor.TextEmbeddingProcessor;
+import org.opensearch.neuralsearch.processor.optimization.TextEmbeddingInferenceFilter;
+import org.opensearch.transport.client.OpenSearchClient;
 
 /**
  * Factory for text embedding ingest processor for ingestion pipeline. Instantiates processor based on user provided input.
  */
 public final class TextEmbeddingProcessorFactory extends AbstractBatchingProcessor.Factory {
 
+    private final OpenSearchClient openSearchClient;
+
     private final MLCommonsClientAccessor clientAccessor;
 
     private final Environment environment;
 
     private final ClusterService clusterService;
 
     public TextEmbeddingProcessorFactory(
+        final OpenSearchClient openSearchClient,
         final MLCommonsClientAccessor clientAccessor,
         final Environment environment,
         final ClusterService clusterService
     ) {
         super(TYPE);
+        this.openSearchClient = openSearchClient;
         this.clientAccessor = clientAccessor;
         this.environment = environment;
         this.clusterService = clusterService;
@@ -43,7 +52,20 @@ public TextEmbeddingProcessorFactory(
     @Override
     protected AbstractBatchingProcessor newProcessor(String tag, String description, int batchSize, Map<String, Object> config) {
         String modelId = readStringProperty(TYPE, tag, config, MODEL_ID_FIELD);
-        Map<String, Object> filedMap = readMap(TYPE, tag, config, FIELD_MAP_FIELD);
-        return new TextEmbeddingProcessor(tag, description, batchSize, modelId, filedMap, clientAccessor, environment, clusterService);
+        Map<String, Object> fieldMap = readMap(TYPE, tag, config, FIELD_MAP_FIELD);
+        // skip_existing is read with DEFAULT_SKIP_EXISTING supplied as the default value
+        boolean skipExisting = readBooleanProperty(TYPE, tag, config, SKIP_EXISTING, DEFAULT_SKIP_EXISTING);
+        // The inference filter is only created when skip_existing is enabled; otherwise the processor receives null.
+        TextEmbeddingInferenceFilter inferenceFilter = null;
+        if (skipExisting) {
+            inferenceFilter = new TextEmbeddingInferenceFilter(fieldMap);
+        }
+        return new TextEmbeddingProcessor(
+            tag,
+            description,
+            batchSize,
+            modelId,
+            fieldMap,
+            skipExisting,
+            inferenceFilter,
+            openSearchClient,
+            clientAccessor,
+            environment,
+            clusterService
+        );
     }
 }