Skip to content

Commit f6371a2

Browse files
committed
Added aggregation precomputation for the rare string terms aggregator and the mapped string terms aggregator. Signed-off-by: Anthony Leong <[email protected]>
1 parent 8d3386c commit f6371a2

File tree

5 files changed

+211
-1
lines changed

5 files changed

+211
-1
lines changed

server/src/main/java/org/opensearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,10 @@
3232
package org.opensearch.search.aggregations.bucket.terms;
3333

3434
import org.apache.lucene.index.LeafReaderContext;
35+
import org.apache.lucene.index.Terms;
36+
import org.apache.lucene.index.TermsEnum;
3537
import org.apache.lucene.search.ScoreMode;
38+
import org.apache.lucene.search.Weight;
3639
import org.apache.lucene.util.BytesRef;
3740
import org.apache.lucene.util.BytesRefBuilder;
3841
import org.apache.lucene.util.PriorityQueue;
@@ -75,8 +78,35 @@
7578
public class MapStringTermsAggregator extends AbstractStringTermsAggregator {
7679
private final CollectorSource collectorSource;
7780
private final ResultStrategy<?, ?> resultStrategy;
81+
private Weight weight;
7882
private final BytesKeyedBucketOrds bucketOrds;
7983
private final IncludeExclude.StringFilter includeExclude;
84+
protected final String fieldName;
85+
86+
/**
 * Constructor variant that takes the index field name explicitly, allowing the
 * terms-dictionary precomputation path to be used even when the field name
 * cannot be derived from the collector source (e.g. significant-text).
 *
 * @param fieldName name of the indexed field backing this aggregation, or
 *                  {@code null} to disable leaf precomputation
 */
public MapStringTermsAggregator(
    String name,
    AggregatorFactories factories,
    CollectorSource collectorSource,
    Function<MapStringTermsAggregator, ResultStrategy<?, ?>> resultStrategy,
    BucketOrder order,
    DocValueFormat format,
    BucketCountThresholds bucketCountThresholds,
    IncludeExclude.StringFilter includeExclude,
    SearchContext context,
    Aggregator parent,
    SubAggCollectionMode collectionMode,
    boolean showTermDocCountError,
    CardinalityUpperBound cardinality,
    Map<String, Object> metadata,
    String fieldName
) throws IOException {
    super(name, factories, context, parent, order, format, bucketCountThresholds, collectionMode, showTermDocCountError, metadata);
    this.fieldName = fieldName;
    this.includeExclude = includeExclude;
    this.collectorSource = collectorSource;
    this.bucketOrds = BytesKeyedBucketOrds.build(context.bigArrays(), cardinality);
    // ResultStrategy needs a reference to the Aggregator to do its job.
    this.resultStrategy = resultStrategy.apply(this);
}
80110

81111
public MapStringTermsAggregator(
82112
String name,
@@ -99,6 +129,20 @@ public MapStringTermsAggregator(
99129
this.resultStrategy = resultStrategy.apply(this); // ResultStrategy needs a reference to the Aggregator to do its job.
100130
this.includeExclude = includeExclude;
101131
bucketOrds = BytesKeyedBucketOrds.build(context.bigArrays(), cardinality);
132+
if (collectorSource instanceof ValuesSourceCollectorSource) {
133+
ValuesSource valuesCollectorSource = ((ValuesSourceCollectorSource) collectorSource).getValuesSource();
134+
if (valuesCollectorSource instanceof ValuesSource.Bytes.FieldData) {
135+
this.fieldName = ((ValuesSource.Bytes.FieldData) valuesCollectorSource).getIndexFieldName();
136+
} else {
137+
this.fieldName = null;
138+
}
139+
} else {
140+
this.fieldName = null;
141+
}
142+
}
143+
144+
/**
 * Sets the {@link Weight} of the top-level query. The precomputation path uses
 * it to check whether the query matches every document in a segment
 * (via {@code weight.count(ctx)} against {@code maxDoc}).
 *
 * @param weight weight of the top-level query driving this aggregation
 */
public void setWeight(Weight weight) {
    this.weight = weight;
}
103147

104148
@Override
@@ -130,6 +174,51 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol
130174
);
131175
}
132176

177+
@Override
178+
protected boolean tryPrecomputeAggregationForLeaf(LeafReaderContext ctx) throws IOException {
179+
if (subAggregators.length > 0 || includeExclude != null || fieldName == null) {
180+
// The optimization does not work when there are subaggregations or if there is a filter.
181+
// The query has to be a match all, otherwise
182+
return false;
183+
}
184+
185+
// The optimization could only be used if there are no deleted documents and the top-level
186+
// query matches all documents in the segment.
187+
if (weight == null) {
188+
return false;
189+
} else {
190+
if (weight.count(ctx) == 0) {
191+
return true;
192+
} else if (weight.count(ctx) != ctx.reader().maxDoc()) {
193+
return false;
194+
}
195+
}
196+
197+
Terms stringTerms = ctx.reader().terms(fieldName);
198+
if (stringTerms == null) {
199+
// Field is not indexed.
200+
return false;
201+
}
202+
203+
TermsEnum stringTermsEnum = stringTerms.iterator();
204+
BytesRef stringTerm = stringTermsEnum.next();
205+
206+
// Here, we will iterate over all the terms in the segment and add the counts into the bucket.
207+
while (stringTerm != null) {
208+
long bucketOrdinal = bucketOrds.add(0L, stringTerm);
209+
if (bucketOrdinal < 0) { // already seen
210+
bucketOrdinal = -1 - bucketOrdinal;
211+
}
212+
int amount = stringTermsEnum.docFreq();
213+
if (resultStrategy instanceof SignificantTermsResults) {
214+
((SignificantTermsResults)resultStrategy).updateSubsetSizes(0L, amount);
215+
}
216+
incrementBucketDocCount(bucketOrdinal, amount);
217+
stringTerm = stringTermsEnum.next();
218+
}
219+
return true;
220+
}
221+
133222
@Override
134223
public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException {
135224
return resultStrategy.buildAggregations(owningBucketOrds);
@@ -196,6 +285,10 @@ public boolean needsScores() {
196285
return valuesSource.needsScores();
197286
}
198287

288+
/**
 * Exposes the backing {@link ValuesSource} so the aggregator can inspect it —
 * used to resolve the index field name when the source is
 * {@code ValuesSource.Bytes.FieldData}.
 */
public ValuesSource getValuesSource() {
    return valuesSource;
}
291+
199292
@Override
200293
public LeafBucketCollector getLeafCollector(
201294
IncludeExclude.StringFilter includeExclude,
@@ -501,6 +594,11 @@ String describe() {
501594
return "significant_terms";
502595
}
503596

597+
/**
 * Adds {@code amount} to the subset size tracked for {@code owningBucketOrd},
 * growing the backing big array as needed. Called by the precomputation path,
 * which tallies term doc freqs without running the leaf collector.
 *
 * @param owningBucketOrd ordinal of the owning bucket
 * @param amount          number of documents to add to the subset size
 */
public void updateSubsetSizes(long owningBucketOrd, int amount) {
    subsetSizes = context.bigArrays().grow(subsetSizes, owningBucketOrd + 1);
    subsetSizes.increment(owningBucketOrd, amount);
}
601+
504602
@Override
505603
LeafBucketCollector wrapCollector(LeafBucketCollector primary) {
506604
return new LeafBucketCollectorBase(primary, null) {

server/src/main/java/org/opensearch/search/aggregations/bucket/terms/SignificantTextAggregatorFactory.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,8 @@ protected Aggregator createInternal(
168168
SubAggCollectionMode.BREADTH_FIRST,
169169
false,
170170
cardinality,
171-
metadata
171+
metadata,
172+
indexedFieldName
172173
);
173174
}
174175

server/src/main/java/org/opensearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@
3232
package org.opensearch.search.aggregations.bucket.terms;
3333

3434
import org.apache.lucene.index.LeafReaderContext;
35+
import org.apache.lucene.index.Terms;
36+
import org.apache.lucene.index.TermsEnum;
37+
import org.apache.lucene.search.Weight;
3538
import org.apache.lucene.util.BytesRef;
3639
import org.apache.lucene.util.BytesRefBuilder;
3740
import org.opensearch.common.lease.Releasables;
@@ -64,7 +67,9 @@
6467
public class StringRareTermsAggregator extends AbstractRareTermsAggregator {
6568
private final ValuesSource.Bytes valuesSource;
6669
private final IncludeExclude.StringFilter filter;
70+
private Weight weight;
6771
private final BytesKeyedBucketOrds bucketOrds;
72+
protected final String fieldName;
6873

6974
StringRareTermsAggregator(
7075
String name,
@@ -83,6 +88,13 @@ public class StringRareTermsAggregator extends AbstractRareTermsAggregator {
8388
this.valuesSource = valuesSource;
8489
this.filter = filter;
8590
this.bucketOrds = BytesKeyedBucketOrds.build(context.bigArrays(), cardinality);
91+
this.fieldName = (valuesSource instanceof ValuesSource.Bytes.WithOrdinals.FieldData)
92+
? ((ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource).getIndexFieldName()
93+
: null;
94+
}
95+
96+
/**
 * Sets the {@link Weight} of the top-level query. The precomputation path uses
 * it to check whether the query matches every document in a segment
 * (via {@code weight.count(ctx)} against {@code maxDoc}).
 *
 * @param weight weight of the top-level query driving this aggregation
 */
public void setWeight(Weight weight) {
    this.weight = weight;
}
8799

88100
@Override
@@ -122,6 +134,47 @@ public void collect(int docId, long owningBucketOrd) throws IOException {
122134
};
123135
}
124136

137+
@Override
138+
protected boolean tryPrecomputeAggregationForLeaf(LeafReaderContext ctx) throws IOException {
139+
if (subAggregators.length > 0 || filter != null) {
140+
// The optimization does not work when there are subaggregations or if there is a filter.
141+
// The query has to be a match all, otherwise
142+
return false;
143+
}
144+
145+
// The optimization could only be used if there are no deleted documents and the top-level
146+
// query matches all documents in the segment.
147+
if (weight == null) {
148+
return false;
149+
} else {
150+
if (weight.count(ctx) == 0) {
151+
return true;
152+
} else if (weight.count(ctx) != ctx.reader().maxDoc()) {
153+
return false;
154+
}
155+
}
156+
157+
Terms stringTerms = ctx.reader().terms(fieldName);
158+
if (stringTerms == null) {
159+
// Field is not indexed.
160+
return false;
161+
}
162+
163+
TermsEnum stringTermsEnum = stringTerms.iterator();
164+
BytesRef stringTerm = stringTermsEnum.next();
165+
166+
// Here, we will iterate over all the terms in the segment and add the counts into the bucket.
167+
while (stringTerm != null) {
168+
long bucketOrdinal = bucketOrds.add(0L, stringTerm);
169+
if (bucketOrdinal < 0) { // already seen
170+
bucketOrdinal = -1 - bucketOrdinal;
171+
}
172+
incrementBucketDocCount(bucketOrdinal, stringTermsEnum.docFreq());
173+
stringTerm = stringTermsEnum.next();
174+
}
175+
return true;
176+
}
177+
125178
@Override
126179
public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException {
127180
/*

server/src/main/java/org/opensearch/search/aggregations/support/ValuesSource.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,10 @@ public SortedBinaryDocValues bytesValues(LeafReaderContext context) {
304304
return indexFieldData.load(context).getBytesValues();
305305
}
306306

307+
/**
 * Returns the name of the index field backing this values source, as reported
 * by the underlying {@code indexFieldData}. Used by terms aggregators to read
 * doc freqs directly from the terms dictionary when precomputing results.
 */
public String getIndexFieldName() {
    return this.indexFieldData.getFieldName();
}
310+
307311
}
308312

309313
/**

server/src/test/java/org/opensearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
import org.apache.lucene.document.SortedDocValuesField;
4040
import org.apache.lucene.document.SortedNumericDocValuesField;
4141
import org.apache.lucene.document.SortedSetDocValuesField;
42+
import org.apache.lucene.document.StringField;
4243
import org.apache.lucene.index.DirectoryReader;
4344
import org.apache.lucene.index.IndexReader;
4445
import org.apache.lucene.search.FieldExistsQuery;
@@ -156,6 +157,12 @@ public void testMatchAllDocs() throws IOException {
156157
assertThat(bucket.getKeyAsString(), equalTo("1"));
157158
assertThat(bucket.getDocCount(), equalTo(1L));
158159
});
160+
testSearchCaseIndexString(query, dataset, aggregation -> aggregation.field(KEYWORD_FIELD).maxDocCount(1), agg -> {
161+
assertEquals(1, agg.getBuckets().size());
162+
StringRareTerms.Bucket bucket = (StringRareTerms.Bucket) agg.getBuckets().get(0);
163+
assertThat(bucket.getKeyAsString(), equalTo("1"));
164+
assertThat(bucket.getDocCount(), equalTo(1L));
165+
}, true);
159166
}
160167

161168
public void testManyDocsOneRare() throws IOException {
@@ -581,6 +588,21 @@ private void testSearchCase(
581588

582589
}
583590

591+
private void testSearchCaseIndexString(
592+
Query query,
593+
List<Long> dataset,
594+
Consumer<RareTermsAggregationBuilder> configure,
595+
Consumer<InternalMappedRareTerms<?, ?>> verify,
596+
boolean shouldIndex
597+
) throws IOException {
598+
RareTermsAggregationBuilder aggregationBuilder = new RareTermsAggregationBuilder("_name");
599+
if (configure != null) {
600+
configure.accept(aggregationBuilder);
601+
}
602+
verify.accept(executeTestCaseIndexString(query, dataset, aggregationBuilder, shouldIndex));
603+
604+
}
605+
584606
private <A extends InternalAggregation> A executeTestCase(Query query, List<Long> dataset, AggregationBuilder aggregationBuilder)
585607
throws IOException {
586608
try (Directory directory = newDirectory()) {
@@ -610,6 +632,38 @@ private <A extends InternalAggregation> A executeTestCase(Query query, List<Long
610632
}
611633
}
612634

635+
/**
 * Indexes {@code dataset} (shuffled), optionally adding the keyword field as an
 * indexed {@link StringField} so the terms-dictionary precomputation path can be
 * exercised, then runs the aggregation and returns the reduced result.
 *
 * @param query              top-level query to run
 * @param dataset            values to index (one document per value)
 * @param aggregationBuilder aggregation to execute
 * @param shouldIndex        whether to index KEYWORD_FIELD in addition to doc values
 * @return the reduced aggregation result
 * @throws IOException on index read/write failure
 */
private <A extends InternalAggregation> A executeTestCaseIndexString(Query query, List<Long> dataset, AggregationBuilder aggregationBuilder, boolean shouldIndex)
    throws IOException {
    try (Directory directory = newDirectory()) {
        try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
            Document document = new Document();
            // Shuffle so results do not depend on insertion order.
            List<Long> shuffledDataset = new ArrayList<>(dataset);
            Collections.shuffle(shuffledDataset, random());
            for (Long value : shuffledDataset) {
                document.add(new SortedNumericDocValuesField(LONG_FIELD, value));
                document.add(new LongPoint(LONG_FIELD, value));
                document.add(new SortedSetDocValuesField(KEYWORD_FIELD, new BytesRef(Long.toString(value))));
                if (shouldIndex) {
                    // Index the keyword so the aggregator can read term doc freqs
                    // straight from the terms dictionary.
                    document.add(new StringField(KEYWORD_FIELD, Long.toString(value), Field.Store.NO));
                }
                document.add(new SortedSetDocValuesField("even_odd", new BytesRef(value % 2 == 0 ? "even" : "odd")));
                indexWriter.addDocument(document);
                document.clear(); // reuse the Document instance for the next row
            }
        }

        try (IndexReader indexReader = DirectoryReader.open(directory)) {
            IndexSearcher indexSearcher = newIndexSearcher(indexReader);

            MappedFieldType[] types = new MappedFieldType[] {
                keywordField(KEYWORD_FIELD),
                longField(LONG_FIELD),
                keywordField("even_odd") };
            return searchAndReduce(indexSearcher, query, aggregationBuilder, types);
        }
    }
}
666+
613667
@Override
614668
public void doAssertReducedMultiBucketConsumer(Aggregation agg, MultiBucketConsumerService.MultiBucketConsumer bucketConsumer) {
615669
/*

0 commit comments

Comments
 (0)