Skip to content

Commit a80478b

Browse files
committed
Adds various Query overrides to Keyword Field
The keywordfield mapper provides access to various query types, e.g. the termsQuery, fuzzyQuery. These are inherited as is from the StringType. But we do not take into account the fact that keyword fields can have doc_values enabled. This PR adds the ability for various queries to first check if doc_values are enabled and if so out-source the work to lucene to decide if it's better to use index values or doc_values when running queries. Signed-off-by: Harsha Vamsi Kalluri <[email protected]>
1 parent 9d85e56 commit a80478b

File tree

7 files changed

+446
-18
lines changed

7 files changed

+446
-18
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6363
- Return 409 Conflict HTTP status instead of 503 on failure to concurrently execute snapshots ([#8986](https://github.com/opensearch-project/OpenSearch/pull/5855))
6464
- Add task completion count in search backpressure stats API ([#10028](https://github.com/opensearch-project/OpenSearch/pull/10028/))
6565
- Performance improvement for Datetime field caching ([#4558](https://github.com/opensearch-project/OpenSearch/issues/4558))
66+
- Performance improvement for MultiTerm Queries on Keyword fields ([#7057](https://github.com/opensearch-project/OpenSearch/issues/7057))
6667

6768

6869
### Deprecated
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
---
2+
"search on keyword fields with doc_values enabled":
3+
- do:
4+
indices.create:
5+
index: test
6+
body:
7+
mappings:
8+
properties:
9+
"some_keyword":
10+
type: "keyword"
11+
index: true
12+
doc_values: true
13+
14+
- do:
15+
bulk:
16+
index: test
17+
refresh: true
18+
body:
19+
- '{"index": {"_index": "test", "_id": "1" }}'
20+
- '{ "some_keyword": "ingesting some random keyword data" }'
21+
- '{ "index": { "_index": "test", "_id": "2" }}'
22+
- '{ "some_keyword": "400" }'
23+
- '{ "index": { "_index": "test", "_id": "3" } }'
24+
- '{ "some_keyword": "5" }'
25+
26+
- do:
27+
search:
28+
index: test
29+
body:
30+
query:
31+
prefix:
32+
some_keyword: "ing"
33+
34+
- match: { hits.hits.0._source.some_keyword: "ingesting some random keyword data" }
35+
36+
- do:
37+
search:
38+
index: test
39+
body:
40+
query:
41+
range: {
42+
"some_keyword": {
43+
"lt": 500
44+
} }
45+
46+
- match: { hits.total.value: 2 }

server/src/main/java/org/opensearch/index/mapper/KeywordFieldMapper.java

Lines changed: 232 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,24 @@
3838
import org.apache.lucene.document.FieldType;
3939
import org.apache.lucene.document.SortedSetDocValuesField;
4040
import org.apache.lucene.index.IndexOptions;
41+
import org.apache.lucene.index.Term;
42+
import org.apache.lucene.search.FuzzyQuery;
43+
import org.apache.lucene.search.IndexOrDocValuesQuery;
4144
import org.apache.lucene.search.MultiTermQuery;
45+
import org.apache.lucene.search.PrefixQuery;
4246
import org.apache.lucene.search.Query;
47+
import org.apache.lucene.search.RegexpQuery;
48+
import org.apache.lucene.search.TermInSetQuery;
49+
import org.apache.lucene.search.TermRangeQuery;
50+
import org.apache.lucene.search.WildcardQuery;
4351
import org.apache.lucene.util.BytesRef;
52+
import org.apache.lucene.util.automaton.Operations;
53+
import org.opensearch.OpenSearchException;
4454
import org.opensearch.common.Nullable;
55+
import org.opensearch.common.lucene.BytesRefs;
4556
import org.opensearch.common.lucene.Lucene;
57+
import org.opensearch.common.lucene.search.AutomatonQueries;
58+
import org.opensearch.common.unit.Fuzziness;
4659
import org.opensearch.core.xcontent.XContentParser;
4760
import org.opensearch.index.analysis.IndexAnalyzers;
4861
import org.opensearch.index.analysis.NamedAnalyzer;
@@ -62,6 +75,8 @@
6275
import java.util.Objects;
6376
import java.util.function.Supplier;
6477

78+
import static org.opensearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES;
79+
6580
/**
6681
* A field mapper for keywords. This mapper accepts strings and indexes them as-is.
6782
*
@@ -317,7 +332,7 @@ public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName, S
317332
@Override
318333
public ValueFetcher valueFetcher(QueryShardContext context, SearchLookup searchLookup, String format) {
319334
if (format != null) {
320-
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] doesn't support formats.");
335+
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] doesn't " + "support formats.");
321336
}
322337

323338
return new SourceValueFetcher(name(), context, nullValue) {
@@ -372,17 +387,226 @@ protected BytesRef indexedValueForSearch(Object value) {
372387
return getTextSearchInfo().getSearchAnalyzer().normalize(name(), value.toString());
373388
}
374389

390+
@Override
391+
public Query termsQuery(List<?> values, QueryShardContext context) {
392+
failIfNotIndexedAndNoDocValues();
393+
// has index and doc_values enabled
394+
if (isSearchable() && hasDocValues()) {
395+
BytesRef[] bytesRefs = new BytesRef[values.size()];
396+
for (int i = 0; i < bytesRefs.length; i++) {
397+
bytesRefs[i] = indexedValueForSearch(values.get(i));
398+
}
399+
Query indexQuery = new TermInSetQuery(name(), bytesRefs);
400+
Query dvQuery = new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, name(), bytesRefs);
401+
return new IndexOrDocValuesQuery(indexQuery, dvQuery);
402+
}
403+
// if we only have doc_values enabled, we construct a new query with doc_values re-written
404+
if (hasDocValues()) {
405+
BytesRef[] bytesRefs = new BytesRef[values.size()];
406+
for (int i = 0; i < bytesRefs.length; i++) {
407+
bytesRefs[i] = indexedValueForSearch(values.get(i));
408+
}
409+
return new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, name(), bytesRefs);
410+
}
411+
// has index enabled, we're going to return the query as is
412+
return super.termsQuery(values, context);
413+
}
414+
415+
@Override
416+
public Query prefixQuery(
417+
String value,
418+
@Nullable MultiTermQuery.RewriteMethod method,
419+
boolean caseInsensitive,
420+
QueryShardContext context
421+
) {
422+
if (context.allowExpensiveQueries() == false) {
423+
throw new OpenSearchException(
424+
"[prefix] queries cannot be executed when '"
425+
+ ALLOW_EXPENSIVE_QUERIES.getKey()
426+
+ "' is set to false. For optimised prefix queries on text "
427+
+ "fields please enable [index_prefixes]."
428+
);
429+
}
430+
failIfNotIndexedAndNoDocValues();
431+
if (isSearchable() && hasDocValues()) {
432+
Query indexQuery = super.prefixQuery(value, method, caseInsensitive, context);
433+
Query dvQuery = super.prefixQuery(value, MultiTermQuery.DOC_VALUES_REWRITE, caseInsensitive, context);
434+
return new IndexOrDocValuesQuery(indexQuery, dvQuery);
435+
}
436+
if (hasDocValues()) {
437+
if (caseInsensitive) {
438+
return AutomatonQueries.caseInsensitivePrefixQuery(
439+
(new Term(name(), indexedValueForSearch(value))),
440+
MultiTermQuery.DOC_VALUES_REWRITE
441+
);
442+
}
443+
return new PrefixQuery(new Term(name(), indexedValueForSearch(value)), MultiTermQuery.DOC_VALUES_REWRITE);
444+
}
445+
return super.prefixQuery(value, method, caseInsensitive, context);
446+
}
447+
448+
@Override
449+
public Query regexpQuery(
450+
String value,
451+
int syntaxFlags,
452+
int matchFlags,
453+
int maxDeterminizedStates,
454+
@Nullable MultiTermQuery.RewriteMethod method,
455+
QueryShardContext context
456+
) {
457+
if (context.allowExpensiveQueries() == false) {
458+
throw new OpenSearchException(
459+
"[regexp] queries cannot be executed when '" + ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to " + "false."
460+
);
461+
}
462+
failIfNotIndexedAndNoDocValues();
463+
if (isSearchable() && hasDocValues()) {
464+
Query indexQuery = super.regexpQuery(value, syntaxFlags, matchFlags, maxDeterminizedStates, method, context);
465+
Query dvQuery = super.regexpQuery(
466+
value,
467+
syntaxFlags,
468+
matchFlags,
469+
maxDeterminizedStates,
470+
MultiTermQuery.DOC_VALUES_REWRITE,
471+
context
472+
);
473+
return new IndexOrDocValuesQuery(indexQuery, dvQuery);
474+
}
475+
if (hasDocValues()) {
476+
return new RegexpQuery(
477+
new Term(name(), indexedValueForSearch(value)),
478+
syntaxFlags,
479+
matchFlags,
480+
RegexpQuery.DEFAULT_PROVIDER,
481+
maxDeterminizedStates,
482+
MultiTermQuery.DOC_VALUES_REWRITE
483+
);
484+
}
485+
return super.regexpQuery(value, syntaxFlags, matchFlags, maxDeterminizedStates, method, context);
486+
}
487+
488+
@Override
489+
public Query rangeQuery(Object lowerTerm, Object upperTerm, boolean includeLower, boolean includeUpper, QueryShardContext context) {
490+
if (context.allowExpensiveQueries() == false) {
491+
throw new OpenSearchException(
492+
"[range] queries on [text] or [keyword] fields cannot be executed when '"
493+
+ ALLOW_EXPENSIVE_QUERIES.getKey()
494+
+ "' is set to false."
495+
);
496+
}
497+
failIfNotIndexedAndNoDocValues();
498+
if (isSearchable() && hasDocValues()) {
499+
Query indexQuery = new TermRangeQuery(
500+
name(),
501+
lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
502+
upperTerm == null ? null : indexedValueForSearch(upperTerm),
503+
includeLower,
504+
includeUpper
505+
);
506+
Query dvQuery = new TermRangeQuery(
507+
name(),
508+
lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
509+
upperTerm == null ? null : indexedValueForSearch(upperTerm),
510+
includeLower,
511+
includeUpper,
512+
MultiTermQuery.DOC_VALUES_REWRITE
513+
);
514+
return new IndexOrDocValuesQuery(indexQuery, dvQuery);
515+
}
516+
if (hasDocValues()) {
517+
return new TermRangeQuery(
518+
name(),
519+
lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
520+
upperTerm == null ? null : indexedValueForSearch(upperTerm),
521+
includeLower,
522+
includeUpper,
523+
MultiTermQuery.DOC_VALUES_REWRITE
524+
);
525+
}
526+
return new TermRangeQuery(
527+
name(),
528+
lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
529+
upperTerm == null ? null : indexedValueForSearch(upperTerm),
530+
includeLower,
531+
includeUpper
532+
);
533+
}
534+
535+
@Override
536+
public Query fuzzyQuery(
537+
Object value,
538+
Fuzziness fuzziness,
539+
int prefixLength,
540+
int maxExpansions,
541+
boolean transpositions,
542+
@Nullable MultiTermQuery.RewriteMethod method,
543+
QueryShardContext context
544+
) {
545+
failIfNotIndexedAndNoDocValues();
546+
if (context.allowExpensiveQueries() == false) {
547+
throw new OpenSearchException(
548+
"[fuzzy] queries cannot be executed when '" + ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to " + "false."
549+
);
550+
}
551+
if (isSearchable() && hasDocValues()) {
552+
Query indexQuery = super.fuzzyQuery(value, fuzziness, prefixLength, maxExpansions, transpositions, context);
553+
Query dvQuery = super.fuzzyQuery(
554+
value,
555+
fuzziness,
556+
prefixLength,
557+
maxExpansions,
558+
transpositions,
559+
MultiTermQuery.DOC_VALUES_REWRITE,
560+
context
561+
);
562+
return new IndexOrDocValuesQuery(indexQuery, dvQuery);
563+
}
564+
if (hasDocValues()) {
565+
return new FuzzyQuery(
566+
new Term(name(), indexedValueForSearch(value)),
567+
fuzziness.asDistance(BytesRefs.toString(value)),
568+
prefixLength,
569+
maxExpansions,
570+
transpositions,
571+
MultiTermQuery.DOC_VALUES_REWRITE
572+
);
573+
}
574+
return super.fuzzyQuery(value, fuzziness, prefixLength, maxExpansions, transpositions, context);
575+
}
576+
375577
@Override
376578
public Query wildcardQuery(
377579
String value,
378580
@Nullable MultiTermQuery.RewriteMethod method,
379-
boolean caseInsensitve,
581+
boolean caseInsensitive,
380582
QueryShardContext context
381583
) {
382-
// keyword field types are always normalized, so ignore case sensitivity and force normalize the wildcard
584+
if (context.allowExpensiveQueries() == false) {
585+
throw new OpenSearchException(
586+
"[wildcard] queries cannot be executed when '" + ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to " + "false."
587+
);
588+
}
589+
failIfNotIndexedAndNoDocValues();
590+
// keyword field types are always normalized, so ignore case sensitivity and force normalize the
591+
// wildcard
383592
// query text
384-
return super.wildcardQuery(value, method, caseInsensitve, true, context);
593+
if (isSearchable() && hasDocValues()) {
594+
Query indexQuery = super.wildcardQuery(value, method, caseInsensitive, true, context);
595+
Query dvQuery = super.wildcardQuery(value, MultiTermQuery.DOC_VALUES_REWRITE, caseInsensitive, true, context);
596+
return new IndexOrDocValuesQuery(indexQuery, dvQuery);
597+
}
598+
if (hasDocValues()) {
599+
Term term;
600+
value = normalizeWildcardPattern(name(), value, getTextSearchInfo().getSearchAnalyzer());
601+
term = new Term(name(), value);
602+
if (caseInsensitive) {
603+
return AutomatonQueries.caseInsensitiveWildcardQuery(term, method);
604+
}
605+
return new WildcardQuery(term, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT, MultiTermQuery.DOC_VALUES_REWRITE);
606+
}
607+
return super.wildcardQuery(value, method, caseInsensitive, true, context);
385608
}
609+
386610
}
387611

388612
private final boolean indexed;
@@ -422,8 +646,10 @@ protected KeywordFieldMapper(
422646
this.indexAnalyzers = builder.indexAnalyzers;
423647
}
424648

425-
/** Values that have more chars than the return value of this method will
426-
* be skipped at parsing time. */
649+
/**
650+
* Values that have more chars than the return value of this method will
651+
* be skipped at parsing time.
652+
*/
427653
public int ignoreAbove() {
428654
return ignoreAbove;
429655
}

server/src/main/java/org/opensearch/index/mapper/MappedFieldType.java

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,21 @@ public Query fuzzyQuery(
269269
);
270270
}
271271

272+
// Fuzzy Query with re-write method
273+
public Query fuzzyQuery(
274+
Object value,
275+
Fuzziness fuzziness,
276+
int prefixLength,
277+
int maxExpansions,
278+
boolean transpositions,
279+
@Nullable MultiTermQuery.RewriteMethod method,
280+
QueryShardContext context
281+
) {
282+
throw new IllegalArgumentException(
283+
"Can only use fuzzy queries on keyword and text fields - not on [" + name + "] which is of type [" + typeName() + "]"
284+
);
285+
}
286+
272287
// Case sensitive form of prefix query
273288
public final Query prefixQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, QueryShardContext context) {
274289
return prefixQuery(value, method, false, context);
@@ -433,6 +448,15 @@ protected final void failIfNotIndexed() {
433448
}
434449
}
435450

451+
protected final void failIfNotIndexedAndNoDocValues() {
452+
// we fail if a field is both not indexed and does not have doc_values enabled
453+
if (isIndexed == false && hasDocValues() == false) {
454+
throw new IllegalArgumentException(
455+
"Cannot search on field [" + name() + "] since it is both not indexed," + " and does not have doc_values enabled."
456+
);
457+
}
458+
}
459+
436460
public boolean eagerGlobalOrdinals() {
437461
return eagerGlobalOrdinals;
438462
}

server/src/main/java/org/opensearch/search/fetch/subphase/highlight/CustomQueryScorer.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
package org.opensearch.search.fetch.subphase.highlight;
3434

3535
import org.apache.lucene.index.IndexReader;
36+
import org.apache.lucene.search.IndexOrDocValuesQuery;
3637
import org.apache.lucene.search.Query;
3738
import org.apache.lucene.search.highlight.QueryScorer;
3839
import org.apache.lucene.search.highlight.WeightedSpanTerm;
@@ -104,6 +105,8 @@ protected void extract(Query query, float boost, Map<String, WeightedSpanTerm> t
104105
super.extract(((FunctionScoreQuery) query).getSubQuery(), boost, terms);
105106
} else if (query instanceof OpenSearchToParentBlockJoinQuery) {
106107
super.extract(((OpenSearchToParentBlockJoinQuery) query).getChildQuery(), boost, terms);
108+
} else if (query instanceof IndexOrDocValuesQuery) {
109+
super.extract(((IndexOrDocValuesQuery) query).getIndexQuery(), boost, terms);
107110
} else {
108111
super.extract(query, boost, terms);
109112
}

0 commit comments

Comments
 (0)