Skip to content

Added aggregation precomputation for rare terms #18106

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,13 @@

package org.opensearch.search.aggregations.bucket.missing;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.Weight;
import org.opensearch.index.fielddata.DocValueBits;
import org.opensearch.index.mapper.DocCountFieldMapper;
import org.opensearch.search.aggregations.Aggregator;
import org.opensearch.search.aggregations.AggregatorFactories;
import org.opensearch.search.aggregations.CardinalityUpperBound;
Expand All @@ -46,7 +51,11 @@
import org.opensearch.search.internal.SearchContext;

import java.io.IOException;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_DOCS;

/**
* Aggregate all docs that are missing a value.
Expand All @@ -55,7 +64,10 @@
*/
public class MissingAggregator extends BucketsAggregator implements SingleBucketAggregator {

private Weight weight;
private final ValuesSource valuesSource;
protected final String fieldName;
private final ValuesSourceConfig valuesSourceConfig;

public MissingAggregator(
String name,
Expand All @@ -69,6 +81,16 @@
super(name, factories, aggregationContext, parent, cardinality, metadata);
// TODO: Stop using nulls here
this.valuesSource = valuesSourceConfig.hasValues() ? valuesSourceConfig.getValuesSource() : null;
if (this.valuesSource != null) {
this.fieldName = valuesSource.getIndexFieldName();
} else {
this.fieldName = null;
}
this.valuesSourceConfig = valuesSourceConfig;
}

/**
 * Stores the {@link Weight} that {@link #tryPrecomputeAggregationForLeaf} uses to count the
 * documents matched in a segment.
 *
 * @param weight the weight to store; while it is {@code null} the precomputation is not attempted
 */
public void setWeight(Weight weight) {
this.weight = weight;
}

@Override
Expand All @@ -94,6 +116,66 @@
};
}

@Override
protected boolean tryPrecomputeAggregationForLeaf(LeafReaderContext ctx) throws IOException {
if (subAggregators.length > 0) {
// The optimization does not work when there are subaggregations or if there is a filter.
// The query has to be a match all, otherwise
//
return false;

Check warning on line 125 in server/src/main/java/org/opensearch/search/aggregations/bucket/missing/MissingAggregator.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/search/aggregations/bucket/missing/MissingAggregator.java#L125

Added line #L125 was not covered by tests
}

if (valuesSourceConfig.missing() != null) {
// we do not collect any documents through the missing aggregation when the missing parameter
// is up.
return true;
}

if (fieldName == null) {
// The optimization does not work when there are subaggregations or if there is a filter.
// The query has to be a match all, otherwise
//
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the comment is misplaced here.
Can you please check the comments on the entire PR once. Also, please remove empty comment lines.

return false;
}

// The optimization could only be used if there are no deleted documents and the top-level
// query matches all documents in the segment.
if (weight == null) {
return false;

Check warning on line 144 in server/src/main/java/org/opensearch/search/aggregations/bucket/missing/MissingAggregator.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/search/aggregations/bucket/missing/MissingAggregator.java#L144

Added line #L144 was not covered by tests
} else {
if (weight.count(ctx) == 0) {
return true;

Check warning on line 147 in server/src/main/java/org/opensearch/search/aggregations/bucket/missing/MissingAggregator.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/search/aggregations/bucket/missing/MissingAggregator.java#L147

Added line #L147 was not covered by tests
} else if (weight.count(ctx) != ctx.reader().maxDoc()) {
return false;

Check warning on line 149 in server/src/main/java/org/opensearch/search/aggregations/bucket/missing/MissingAggregator.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/search/aggregations/bucket/missing/MissingAggregator.java#L149

Added line #L149 was not covered by tests
}
}

Set<String> indexedFields = new HashSet<>(FieldInfos.getIndexedFields(ctx.reader()));

// This will only work if the field name is indexed because otherwise, the reader would not
// have kept track of the doc count of the fieldname. There is a case where a field might be nonexistent
// but still can be calculated.
if (indexedFields.contains(fieldName) == false && ctx.reader().getFieldInfos().fieldInfo(fieldName) != null) {
return false;
}

NumericDocValues docCountValues = DocValues.getNumeric(ctx.reader(), DocCountFieldMapper.NAME);
if (docCountValues.nextDoc() != NO_MORE_DOCS) {
// This segment has at least one document with the _doc_count field.
return false;

Check warning on line 165 in server/src/main/java/org/opensearch/search/aggregations/bucket/missing/MissingAggregator.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/search/aggregations/bucket/missing/MissingAggregator.java#L165

Added line #L165 was not covered by tests
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if you separate out the test cases as I commented in test files - that can give you a good code coverage as well.

}

long docCountWithFieldName = ctx.reader().getDocCount(fieldName);
int totalDocCount = ctx.reader().maxDoc();

// The missing aggregation bucket will count the number of documents where the field name is
// either null or not present in that document. We are subtracting the documents where the field
// value is valid.
incrementBucketDocCount(0, totalDocCount - docCountWithFieldName);

return true;
}

@Override
public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException {
return buildAggregationsForSingleBucket(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
import org.opensearch.search.aggregations.bucket.terms.SignificanceLookup.BackgroundFrequencyForBytes;
import org.opensearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
import org.opensearch.search.aggregations.support.ValuesSource;
import org.opensearch.search.aggregations.support.ValuesSourceConfig;
import org.opensearch.search.internal.SearchContext;
import org.opensearch.search.startree.StarTreeQueryHelper;
import org.opensearch.search.startree.StarTreeTraversalUtil;
Expand Down Expand Up @@ -108,6 +109,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
protected int segmentsWithSingleValuedOrds = 0;
protected int segmentsWithMultiValuedOrds = 0;
LongUnaryOperator globalOperator;
private final ValuesSourceConfig config;

/**
* Lookup global ordinals
Expand All @@ -133,7 +135,8 @@ public GlobalOrdinalsStringTermsAggregator(
SubAggCollectionMode collectionMode,
boolean showTermDocCountError,
CardinalityUpperBound cardinality,
Map<String, Object> metadata
Map<String, Object> metadata,
ValuesSourceConfig config
) throws IOException {
super(name, factories, context, parent, order, format, bucketCountThresholds, collectionMode, showTermDocCountError, metadata);
this.resultStrategy = resultStrategy.apply(this); // ResultStrategy needs a reference to the Aggregator to do its job.
Expand All @@ -154,9 +157,8 @@ public GlobalOrdinalsStringTermsAggregator(
return new DenseGlobalOrds();
});
}
this.fieldName = (valuesSource instanceof ValuesSource.Bytes.WithOrdinals.FieldData)
? ((ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource).getIndexFieldName()
: null;
this.fieldName = valuesSource.getIndexFieldName();
this.config = config;
}

String descriptCollectionStrategy() {
Expand Down Expand Up @@ -193,6 +195,14 @@ boolean tryCollectFromTermFrequencies(LeafReaderContext ctx, BiConsumer<Long, In
}
}

// If a `missing` value is specified in the builder and the segment contains documents
// without the field, the term index cannot be used: there is currently no way to
// determine which ordinal the substituted missing value would map to.
if (config != null && config.missing() != null && ((weight.count(ctx) == ctx.reader().getDocCount(fieldName)) == false)) {
return false;
}

Terms segmentTerms = ctx.reader().terms(this.fieldName);
if (segmentTerms == null) {
// Field is not indexed.
Expand Down Expand Up @@ -482,7 +492,8 @@ static class LowCardinality extends GlobalOrdinalsStringTermsAggregator {
boolean remapGlobalOrds,
SubAggCollectionMode collectionMode,
boolean showTermDocCountError,
Map<String, Object> metadata
Map<String, Object> metadata,
ValuesSourceConfig config
) throws IOException {
super(
name,
Expand All @@ -499,7 +510,8 @@ static class LowCardinality extends GlobalOrdinalsStringTermsAggregator {
collectionMode,
showTermDocCountError,
CardinalityUpperBound.ONE,
metadata
metadata,
config
);
assert factories == null || factories.countAggregators() == 0;
this.segmentDocCounts = context.bigArrays().newLongArray(1, true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,21 @@

package org.opensearch.search.aggregations.bucket.terms;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.PriorityQueue;
import org.opensearch.common.lease.Releasable;
import org.opensearch.common.lease.Releasables;
import org.opensearch.common.util.LongArray;
import org.opensearch.index.fielddata.SortedBinaryDocValues;
import org.opensearch.index.mapper.DocCountFieldMapper;
import org.opensearch.search.DocValueFormat;
import org.opensearch.search.aggregations.Aggregator;
import org.opensearch.search.aggregations.AggregatorFactories;
Expand All @@ -54,6 +60,7 @@
import org.opensearch.search.aggregations.bucket.terms.SignificanceLookup.BackgroundFrequencyForBytes;
import org.opensearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
import org.opensearch.search.aggregations.support.ValuesSource;
import org.opensearch.search.aggregations.support.ValuesSourceConfig;
import org.opensearch.search.internal.SearchContext;

import java.io.IOException;
Expand All @@ -65,6 +72,8 @@
import java.util.function.Supplier;

import static org.opensearch.search.aggregations.InternalOrder.isKeyOrder;
import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_DOCS;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

/**
* An aggregator of string values that hashes the strings on the fly rather
Expand All @@ -75,8 +84,11 @@
public class MapStringTermsAggregator extends AbstractStringTermsAggregator {
private final CollectorSource collectorSource;
private final ResultStrategy<?, ?> resultStrategy;
private Weight weight;
private final BytesKeyedBucketOrds bucketOrds;
private final IncludeExclude.StringFilter includeExclude;
protected final String fieldName;
private final ValuesSourceConfig config;

public MapStringTermsAggregator(
String name,
Expand All @@ -92,13 +104,52 @@
SubAggCollectionMode collectionMode,
boolean showTermDocCountError,
CardinalityUpperBound cardinality,
Map<String, Object> metadata
Map<String, Object> metadata,
String fieldName,
ValuesSourceConfig config
) throws IOException {
super(name, factories, context, parent, order, format, bucketCountThresholds, collectionMode, showTermDocCountError, metadata);
this.collectorSource = collectorSource;
this.resultStrategy = resultStrategy.apply(this); // ResultStrategy needs a reference to the Aggregator to do its job.
this.includeExclude = includeExclude;
bucketOrds = BytesKeyedBucketOrds.build(context.bigArrays(), cardinality);
this.fieldName = fieldName;
this.config = config;
}

/**
 * Creates the aggregator and derives the index field name from {@code collectorSource}.
 * The field name is only available when the collector source is a
 * {@code ValuesSourceCollectorSource}; for any other source it is {@code null}.
 */
public MapStringTermsAggregator(
    String name,
    AggregatorFactories factories,
    CollectorSource collectorSource,
    Function<MapStringTermsAggregator, ResultStrategy<?, ?>> resultStrategy,
    BucketOrder order,
    DocValueFormat format,
    BucketCountThresholds bucketCountThresholds,
    IncludeExclude.StringFilter includeExclude,
    SearchContext context,
    Aggregator parent,
    SubAggCollectionMode collectionMode,
    boolean showTermDocCountError,
    CardinalityUpperBound cardinality,
    Map<String, Object> metadata,
    ValuesSourceConfig config
) throws IOException {
    super(name, factories, context, parent, order, format, bucketCountThresholds, collectionMode, showTermDocCountError, metadata);
    this.collectorSource = collectorSource;
    // ResultStrategy needs a reference to the Aggregator to do its job.
    this.resultStrategy = resultStrategy.apply(this);
    this.includeExclude = includeExclude;
    this.bucketOrds = BytesKeyedBucketOrds.build(context.bigArrays(), cardinality);
    // Only a values-source-backed collector source can name the field it reads from.
    this.fieldName = (collectorSource instanceof ValuesSourceCollectorSource)
        ? ((ValuesSourceCollectorSource) collectorSource).getValuesSource().getIndexFieldName()
        : null;
    this.config = config;
}

/**
 * Stores the {@link Weight} that {@link #tryPrecomputeAggregationForLeaf} uses to count the
 * documents matched in a segment.
 *
 * @param weight the weight to store
 */
public void setWeight(Weight weight) {
this.weight = weight;
}

@Override
Expand Down Expand Up @@ -130,6 +181,68 @@
);
}

@Override
protected boolean tryPrecomputeAggregationForLeaf(LeafReaderContext ctx) throws IOException {
// TODO: A note is that in scripted aggregations, the way of collecting from buckets is determined from
// the script aggregator. For now, we will not be able to support the script aggregation.

if (subAggregators.length > 0 || includeExclude != null || fieldName == null) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can pull up null checks for weight and config here so that you don't have to assert it again.

Right now you are checking for config != null twice, and checking up (weight.count(ctx) == ctx.reader().getDocCount(fieldName) before checking for weight == null.

Copy link
Contributor Author

@ajleong623 ajleong623 Jun 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might be able to proceed if config == null, but if there is a script or there is both a missing parameter and there are actual missing values, we will not be able to use the precomputation optimization. But I can move up the weight check.

// The optimization does not work when there are subaggregations or if there is a filter.
// The query has to be a match all, otherwise
return false;
}

// If the missing property is specified in the builder, and there are documents with the
// field missing, we might not be able to use the index unless there is some way we can
// calculate which ordinal value that missing field is (something I am not sure how to
// do yet).
if (config != null && config.missing() != null && ((weight.count(ctx) == ctx.reader().getDocCount(fieldName)) == false)) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: weight.count(ctx) != ctx.reader().getDocCount(fieldName) instead of asserting equality as false.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right. I looked at the formatting guidelines again, and I only have to assert the equality as false for unary negations.

return false;

Check warning on line 200 in server/src/main/java/org/opensearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java#L200

Added line #L200 was not covered by tests
}

// The optimization could only be used if there are no deleted documents and the top-level
// query matches all documents in the segment.
if (weight == null) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Moving this null check towards the start of method can make this more readable.

return false;
} else {
if (weight.count(ctx) == 0) {
return true;
} else if (weight.count(ctx) != ctx.reader().maxDoc()) {
return false;
}
}

Terms stringTerms = ctx.reader().terms(fieldName);
if (stringTerms == null) {
// Field is not indexed.
return false;
}

NumericDocValues docCountValues = DocValues.getNumeric(ctx.reader(), DocCountFieldMapper.NAME);
if (docCountValues.nextDoc() != NO_MORE_DOCS) {
// This segment has at least one document with the _doc_count field.
return false;

Check warning on line 224 in server/src/main/java/org/opensearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java#L224

Added line #L224 was not covered by tests
}

TermsEnum stringTermsEnum = stringTerms.iterator();
BytesRef stringTerm = stringTermsEnum.next();

// Here, we will iterate over all the terms in the segment and add the counts into the bucket.
while (stringTerm != null) {
long bucketOrdinal = bucketOrds.add(0L, stringTerm);
if (bucketOrdinal < 0) { // already seen
bucketOrdinal = -1 - bucketOrdinal;

Check warning on line 234 in server/src/main/java/org/opensearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java#L234

Added line #L234 was not covered by tests
}
int amount = stringTermsEnum.docFreq();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: rename amount to docCount or docFreq

if (resultStrategy instanceof SignificantTermsResults) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:

           if (resultStrategy instanceof SignificantTermsResults sigTermsResultStrategy) {
               sigTermsResultStrategy.updateSubsetSizes(0L, docCount);
           }

((SignificantTermsResults) resultStrategy).updateSubsetSizes(0L, amount);
}
incrementBucketDocCount(bucketOrdinal, amount);
stringTerm = stringTermsEnum.next();
}
return true;
}

@Override
public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException {
return resultStrategy.buildAggregations(owningBucketOrds);
Expand Down Expand Up @@ -196,6 +309,10 @@
return valuesSource.needsScores();
}

/**
 * Returns the {@link ValuesSource} held in the {@code valuesSource} field.
 *
 * @return the values source
 */
public ValuesSource getValuesSource() {
return valuesSource;
}

@Override
public LeafBucketCollector getLeafCollector(
IncludeExclude.StringFilter includeExclude,
Expand Down Expand Up @@ -501,6 +618,11 @@
return "significant_terms";
}

/**
 * Adds {@code amount} to the subset size recorded for {@code owningBucketOrd}.
 *
 * @param owningBucketOrd ordinal of the owning bucket whose subset size is updated
 * @param amount          number to add to that bucket's subset size
 */
public void updateSubsetSizes(long owningBucketOrd, int amount) {
// Grow first so that the slot for owningBucketOrd exists before it is incremented.
subsetSizes = context.bigArrays().grow(subsetSizes, owningBucketOrd + 1);
subsetSizes.increment(owningBucketOrd, amount);
}

@Override
LeafBucketCollector wrapCollector(LeafBucketCollector primary) {
return new LeafBucketCollectorBase(primary, null) {
Expand Down
Loading
Loading