opensearch-project · ajleong623 · Jun 1, 2025 · Jun 1, 2025 · Jun 26, 2025 · Jun 26, 2025
@@ -31,8 +31,13 @@
 
 package org.opensearch.search.aggregations.bucket.missing;
 
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.search.Weight;
 import org.opensearch.index.fielddata.DocValueBits;
+import org.opensearch.index.mapper.DocCountFieldMapper;
 import org.opensearch.search.aggregations.Aggregator;
 import org.opensearch.search.aggregations.AggregatorFactories;
 import org.opensearch.search.aggregations.CardinalityUpperBound;
@@ -46,7 +51,11 @@
 import org.opensearch.search.internal.SearchContext;
 
 import java.io.IOException;
+import java.util.HashSet;
 import java.util.Map;
+import java.util.Set;
+
+import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_DOCS;
 
 /**
  * Aggregate all docs that are missing a value.
@@ -55,7 +64,10 @@
  */
 public class MissingAggregator extends BucketsAggregator implements SingleBucketAggregator {
 
+    private Weight weight;
     private final ValuesSource valuesSource;
+    protected final String fieldName;
+    private final ValuesSourceConfig valuesSourceConfig;
 
     public MissingAggregator(
         String name,
@@ -69,6 +81,16 @@
         super(name, factories, aggregationContext, parent, cardinality, metadata);
         // TODO: Stop using nulls here
         this.valuesSource = valuesSourceConfig.hasValues() ? valuesSourceConfig.getValuesSource() : null;
+        if (this.valuesSource != null) {
+            this.fieldName = valuesSource.getIndexFieldName();
+        } else {
+            this.fieldName = null;
+        }
+        this.valuesSourceConfig = valuesSourceConfig;
+    }
+
+    /**
+     * Stores the {@link Weight} that {@code tryPrecomputeAggregationForLeaf} consults to find out
+     * how many documents of a segment are matched. While no weight has been set, the precompute
+     * shortcut is not applied.
+     *
+     * @param weight the weight to use for per-segment match counts
+     */
+    public void setWeight(Weight weight) {
+        this.weight = weight;
     }
 
     @Override
@@ -94,6 +116,66 @@
         };
     }
 
+    /**
+     * Tries to compute the number of documents in a segment that are missing {@link #fieldName}
+     * from per-field index statistics instead of visiting every document.
+     * <p>
+     * The shortcut is applied only when there are no sub-aggregations, a field name and a weight
+     * are available, the weight reports that every document of the segment matches, and the
+     * segment contains no {@code _doc_count} values.
+     *
+     * @param ctx the segment to precompute
+     * @return {@code true} if the segment has been fully accounted for (either nothing had to be
+     *         counted or the count was added to bucket {@code 0}); {@code false} if the shortcut
+     *         is not applicable to this segment
+     * @throws IOException if reading from the segment fails
+     */
+    @Override
+    protected boolean tryPrecomputeAggregationForLeaf(LeafReaderContext ctx) throws IOException {
+        if (subAggregators.length > 0) {
+            // The optimization does not work when there are sub-aggregations.
+            return false;
+        }
+
+        if (valuesSourceConfig.missing() != null) {
+            // We do not collect any documents through the missing aggregation when the missing
+            // parameter is set, so there is nothing to add for this segment.
+            return true;
+        }
+
+        if (fieldName == null) {
+            // Without an index field name there are no per-field statistics to look up.
+            return false;
+        }
+
+        // The optimization can only be used if there are no deleted documents and the top-level
+        // query matches all documents in the segment.
+        if (weight == null) {
+            return false;
+        }
+        // Ask the weight once and reuse the answer for both checks below.
+        final int matchingDocCount = weight.count(ctx);
+        if (matchingDocCount == 0) {
+            // Nothing in this segment matches, so there is nothing to count.
+            return true;
+        }
+        final int totalDocCount = ctx.reader().maxDoc();
+        if (matchingDocCount != totalDocCount) {
+            return false;
+        }
+
+        Set<String> indexedFields = new HashSet<>(FieldInfos.getIndexedFields(ctx.reader()));
+
+        // The per-field doc count is only usable when the field is indexed. A field that exists in
+        // the segment without being indexed cannot be handled here. A field that is absent from
+        // the segment altogether is still allowed through to the count below.
+        if (indexedFields.contains(fieldName) == false && ctx.reader().getFieldInfos().fieldInfo(fieldName) != null) {
+            return false;
+        }
+
+        NumericDocValues docCountValues = DocValues.getNumeric(ctx.reader(), DocCountFieldMapper.NAME);
+        if (docCountValues.nextDoc() != NO_MORE_DOCS) {
+            // This segment has at least one document with the _doc_count field.
+            return false;
+        }
+
+        long docCountWithFieldName = ctx.reader().getDocCount(fieldName);
+
+        // The missing aggregation bucket counts the documents where the field is either null or
+        // not present. Subtract the documents that do have a value from the segment total.
+        incrementBucketDocCount(0, totalDocCount - docCountWithFieldName);
+
+        return true;
+    }
+
     @Override
     public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException {
         return buildAggregationsForSingleBucket(

@@ -73,6 +73,7 @@
 import org.opensearch.search.aggregations.bucket.terms.SignificanceLookup.BackgroundFrequencyForBytes;
 import org.opensearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
 import org.opensearch.search.aggregations.support.ValuesSource;
+import org.opensearch.search.aggregations.support.ValuesSourceConfig;
 import org.opensearch.search.internal.SearchContext;
 import org.opensearch.search.startree.StarTreeQueryHelper;
 import org.opensearch.search.startree.StarTreeTraversalUtil;
@@ -108,6 +109,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
     protected int segmentsWithSingleValuedOrds = 0;
     protected int segmentsWithMultiValuedOrds = 0;
     LongUnaryOperator globalOperator;
+    private final ValuesSourceConfig config;
 
     /**
      * Lookup global ordinals
@@ -133,7 +135,8 @@ public GlobalOrdinalsStringTermsAggregator(
         SubAggCollectionMode collectionMode,
         boolean showTermDocCountError,
         CardinalityUpperBound cardinality,
-        Map<String, Object> metadata
+        Map<String, Object> metadata,
+        ValuesSourceConfig config
     ) throws IOException {
         super(name, factories, context, parent, order, format, bucketCountThresholds, collectionMode, showTermDocCountError, metadata);
         this.resultStrategy = resultStrategy.apply(this); // ResultStrategy needs a reference to the Aggregator to do its job.
@@ -154,9 +157,8 @@ public GlobalOrdinalsStringTermsAggregator(
                 return new DenseGlobalOrds();
             });
         }
-        this.fieldName = (valuesSource instanceof ValuesSource.Bytes.WithOrdinals.FieldData)
-            ? ((ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource).getIndexFieldName()
-            : null;
+        this.fieldName = valuesSource.getIndexFieldName();
+        this.config = config;
     }
 
     String descriptCollectionStrategy() {
@@ -193,6 +195,14 @@ boolean tryCollectFromTermFrequencies(LeafReaderContext ctx, BiConsumer<Long, In
             }
         }
 
+        // If the missing property is specified in the builder, and there are documents with the
+        // field missing, we might not be able to use the index unless there is a way to
+        // calculate which ordinal value that missing field is (something I am not sure how to
+        // do yet).
+        if (config != null && config.missing() != null && ((weight.count(ctx) == ctx.reader().getDocCount(fieldName)) == false)) {
+            return false;
+        }
+
         Terms segmentTerms = ctx.reader().terms(this.fieldName);
         if (segmentTerms == null) {
             // Field is not indexed.
@@ -482,7 +492,8 @@ static class LowCardinality extends GlobalOrdinalsStringTermsAggregator {
             boolean remapGlobalOrds,
             SubAggCollectionMode collectionMode,
             boolean showTermDocCountError,
-            Map<String, Object> metadata
+            Map<String, Object> metadata,
+            ValuesSourceConfig config
         ) throws IOException {
             super(
                 name,
@@ -499,7 +510,8 @@ static class LowCardinality extends GlobalOrdinalsStringTermsAggregator {
                 collectionMode,
                 showTermDocCountError,
                 CardinalityUpperBound.ONE,
-                metadata
+                metadata,
+                config
             );
             assert factories == null || factories.countAggregators() == 0;
             this.segmentDocCounts = context.bigArrays().newLongArray(1, true);

@@ -31,15 +31,21 @@
 
 package org.opensearch.search.aggregations.bucket.terms;
 
+import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.ScoreMode;
+import org.apache.lucene.search.Weight;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.PriorityQueue;
 import org.opensearch.common.lease.Releasable;
 import org.opensearch.common.lease.Releasables;
 import org.opensearch.common.util.LongArray;
 import org.opensearch.index.fielddata.SortedBinaryDocValues;
+import org.opensearch.index.mapper.DocCountFieldMapper;
 import org.opensearch.search.DocValueFormat;
 import org.opensearch.search.aggregations.Aggregator;
 import org.opensearch.search.aggregations.AggregatorFactories;
@@ -54,6 +60,7 @@
 import org.opensearch.search.aggregations.bucket.terms.SignificanceLookup.BackgroundFrequencyForBytes;
 import org.opensearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
 import org.opensearch.search.aggregations.support.ValuesSource;
+import org.opensearch.search.aggregations.support.ValuesSourceConfig;
 import org.opensearch.search.internal.SearchContext;
 
 import java.io.IOException;
@@ -65,6 +72,8 @@
 import java.util.function.Supplier;
 
 import static org.opensearch.search.aggregations.InternalOrder.isKeyOrder;
+import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_DOCS;
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
 
 /**
  * An aggregator of string values that hashes the strings on the fly rather
@@ -75,8 +84,11 @@
 public class MapStringTermsAggregator extends AbstractStringTermsAggregator {
     private final CollectorSource collectorSource;
     private final ResultStrategy<?, ?> resultStrategy;
+    private Weight weight;
     private final BytesKeyedBucketOrds bucketOrds;
     private final IncludeExclude.StringFilter includeExclude;
+    protected final String fieldName;
+    private final ValuesSourceConfig config;
 
     public MapStringTermsAggregator(
         String name,
@@ -92,13 +104,52 @@
         SubAggCollectionMode collectionMode,
         boolean showTermDocCountError,
         CardinalityUpperBound cardinality,
-        Map<String, Object> metadata
+        Map<String, Object> metadata,
+        String fieldName,
+        ValuesSourceConfig config
     ) throws IOException {
         super(name, factories, context, parent, order, format, bucketCountThresholds, collectionMode, showTermDocCountError, metadata);
         this.collectorSource = collectorSource;
         this.resultStrategy = resultStrategy.apply(this); // ResultStrategy needs a reference to the Aggregator to do its job.
         this.includeExclude = includeExclude;
         bucketOrds = BytesKeyedBucketOrds.build(context.bigArrays(), cardinality);
+        this.fieldName = fieldName;
+        this.config = config;
+    }
+
+    /**
+     * Creates the aggregator and derives {@link #fieldName} from the collector source: when the
+     * source is a {@code ValuesSourceCollectorSource}, the index field name of its values source is
+     * used; otherwise the field name is {@code null}.
+     *
+     * @throws IOException propagated from the superclass constructor
+     */
+    public MapStringTermsAggregator(
+        String name,
+        AggregatorFactories factories,
+        CollectorSource collectorSource,
+        Function<MapStringTermsAggregator, ResultStrategy<?, ?>> resultStrategy,
+        BucketOrder order,
+        DocValueFormat format,
+        BucketCountThresholds bucketCountThresholds,
+        IncludeExclude.StringFilter includeExclude,
+        SearchContext context,
+        Aggregator parent,
+        SubAggCollectionMode collectionMode,
+        boolean showTermDocCountError,
+        CardinalityUpperBound cardinality,
+        Map<String, Object> metadata,
+        ValuesSourceConfig config
+    ) throws IOException {
+        super(name, factories, context, parent, order, format, bucketCountThresholds, collectionMode, showTermDocCountError, metadata);
+        this.collectorSource = collectorSource;
+        // ResultStrategy needs a reference to the Aggregator to do its job.
+        this.resultStrategy = resultStrategy.apply(this);
+        this.includeExclude = includeExclude;
+        this.bucketOrds = BytesKeyedBucketOrds.build(context.bigArrays(), cardinality);
+        // Only a values-source-backed collector source can name an index field.
+        this.fieldName = (collectorSource instanceof ValuesSourceCollectorSource)
+            ? ((ValuesSourceCollectorSource) collectorSource).getValuesSource().getIndexFieldName()
+            : null;
+        this.config = config;
+    }
+
+    /**
+     * Stores the {@link Weight} that {@code tryPrecomputeAggregationForLeaf} uses to obtain the
+     * number of matching documents in a segment.
+     *
+     * @param weight the weight to use for per-segment match counts
+     */
+    public void setWeight(Weight weight) {
+        this.weight = weight;
     }
 
     @Override
@@ -130,6 +181,68 @@
         );
     }
 
+    /**
+     * Tries to fill the term buckets for a segment directly from the terms of {@link #fieldName},
+     * using each term's document frequency as its doc count, instead of visiting every document.
+     * <p>
+     * The shortcut is applied only when there are no sub-aggregations, no include/exclude filter,
+     * a field name and a weight are available, the weight reports that every document of the
+     * segment matches, the field has terms in the segment, and the segment contains no
+     * {@code _doc_count} values. When a {@code missing} value is configured, the number of matching
+     * documents must also equal the number of documents that have the field.
+     *
+     * @param ctx the segment to precompute
+     * @return {@code true} if the segment has been fully accounted for; {@code false} if the
+     *         shortcut is not applicable to this segment
+     * @throws IOException if reading from the segment fails
+     */
+    @Override
+    protected boolean tryPrecomputeAggregationForLeaf(LeafReaderContext ctx) throws IOException {
+        // TODO: A note is that in scripted aggregations, the way of collecting from buckets is determined from
+        // the script aggregator. For now, we will not be able to support the script aggregation.
+
+        if (subAggregators.length > 0 || includeExclude != null || fieldName == null) {
+            // The optimization does not work when there are sub-aggregations, when an
+            // include/exclude filter is present, or when there is no index field name.
+            return false;
+        }
+
+        // The weight must be checked before it is used for anything below.
+        if (weight == null) {
+            return false;
+        }
+        // Ask the weight once and reuse the answer for every check below.
+        final int matchingDocCount = weight.count(ctx);
+
+        // If the missing property is specified in the builder and some matching documents lack the
+        // field, the term statistics cannot be used, because it is not yet known here which bucket
+        // the missing value would belong to.
+        if (config != null && config.missing() != null && matchingDocCount != ctx.reader().getDocCount(fieldName)) {
+            return false;
+        }
+
+        // The optimization can only be used if there are no deleted documents and the top-level
+        // query matches all documents in the segment.
+        if (matchingDocCount == 0) {
+            // Nothing in this segment matches, so there is nothing to count.
+            return true;
+        }
+        if (matchingDocCount != ctx.reader().maxDoc()) {
+            return false;
+        }
+
+        Terms stringTerms = ctx.reader().terms(fieldName);
+        if (stringTerms == null) {
+            // Field is not indexed.
+            return false;
+        }
+
+        NumericDocValues docCountValues = DocValues.getNumeric(ctx.reader(), DocCountFieldMapper.NAME);
+        if (docCountValues.nextDoc() != NO_MORE_DOCS) {
+            // This segment has at least one document with the _doc_count field.
+            return false;
+        }
+
+        // Iterate over all the terms in the segment and add their counts to the buckets.
+        TermsEnum stringTermsEnum = stringTerms.iterator();
+        for (BytesRef stringTerm = stringTermsEnum.next(); stringTerm != null; stringTerm = stringTermsEnum.next()) {
+            long bucketOrdinal = bucketOrds.add(0L, stringTerm);
+            if (bucketOrdinal < 0) { // already seen
+                bucketOrdinal = -1 - bucketOrdinal;
+            }
+            int amount = stringTermsEnum.docFreq();
+            if (resultStrategy instanceof SignificantTermsResults) {
+                ((SignificantTermsResults) resultStrategy).updateSubsetSizes(0L, amount);
+            }
+            incrementBucketDocCount(bucketOrdinal, amount);
+        }
+        return true;
+    }
+
     @Override
     public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException {
         return resultStrategy.buildAggregations(owningBucketOrds);
@@ -196,6 +309,10 @@
             return valuesSource.needsScores();
         }
 
+        /**
+         * Returns the values source held by this collector source.
+         *
+         * @return the {@code valuesSource} field
+         */
+        public ValuesSource getValuesSource() {
+            return valuesSource;
+        }
+
         @Override
         public LeafBucketCollector getLeafCollector(
             IncludeExclude.StringFilter includeExclude,
@@ -501,6 +618,11 @@
             return "significant_terms";
         }
 
+        /**
+         * Adds {@code amount} to the subset size recorded for {@code owningBucketOrd}, growing the
+         * backing array first so that the ordinal is addressable.
+         *
+         * @param owningBucketOrd the owning bucket ordinal whose subset size is updated
+         * @param amount          the number to add to that subset size
+         */
+        public void updateSubsetSizes(long owningBucketOrd, int amount) {
+            final long requiredSize = owningBucketOrd + 1;
+            subsetSizes = context.bigArrays().grow(subsetSizes, requiredSize);
+            subsetSizes.increment(owningBucketOrd, amount);
+        }
+
         @Override
         LeafBucketCollector wrapCollector(LeafBucketCollector primary) {
             return new LeafBucketCollectorBase(primary, null) {