Skip to content

Commit

Permalink
Allow doc-values only search on keyword fields (#82846)
Browse files Browse the repository at this point in the history
Allows searching on keyword fields when those fields are not indexed (index: false) but just doc values are enabled.

This enables searches on archive data, which has access to doc values but not index structures. When combined with
searchable snapshots, it allows downloading only data for a given (doc value) field to quickly filter down to a select set
of documents.

Relates #81210 and #52728
  • Loading branch information
ywelsch authored Jan 24, 2022
1 parent 93d041d commit fd7f69c
Show file tree
Hide file tree
Showing 10 changed files with 154 additions and 39 deletions.
4 changes: 2 additions & 2 deletions docs/reference/mapping/params/doc-values.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ makes this data access pattern possible. They store the same values as the
sorting and aggregations. Doc values are supported on almost all field types,
with the __notable exception of `text` and `annotated_text` fields__.

<<number,Numeric types>>, such as `long` and `double`, and <<date,Date types>>
can also be queried
<<number,Numeric types>>, <<date,date types>>, and the <<keyword, keyword type>>
can also be queried using term or range-based queries
when they are not <<mapping-index,indexed>> but only have doc values enabled.
Query performance on doc values is much slower than on index structures, but
offers an interesting tradeoff between disk usage and query performance for
Expand Down
5 changes: 4 additions & 1 deletion docs/reference/mapping/types/keyword.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,10 @@ The following parameters are accepted by `keyword` fields:

<<mapping-index,`index`>>::

Should the field be searchable? Accepts `true` (default) or `false`.
Should the field be quickly searchable? Accepts `true` (default) and
`false`. `keyword` fields that only have <<doc-values,`doc_values`>>
enabled can still be queried using term or range-based queries,
albeit slower.

<<index-options,`index_options`>>::

Expand Down
2 changes: 1 addition & 1 deletion docs/reference/query-dsl.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ the stability of the cluster. Those queries can be categorised as follows:

* Queries that need to do linear scans to identify matches:
** <<query-dsl-script-query,`script` queries>>
** queries on <<number,numeric>> and <<date,date>> fields that are not indexed
** queries on <<number,numeric>>, <<date,date>>, or <<keyword,keyword>> fields that are not indexed
but have <<doc-values,doc values>> enabled

* Queries that have a high up-front cost:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ setup:
non_indexed_date:
type: date
index: false
non_indexed_keyword:
type: keyword
index: false
geo:
type: keyword
object:
Expand Down Expand Up @@ -225,6 +228,18 @@ setup:

- match: {fields.non_indexed_date.date.searchable: true}

---
"Field caps for keyword field with only doc values":
- skip:
version: " - 8.0.99"
reason: "doc values search was added in 8.1.0"
- do:
field_caps:
index: 'test1,test2,test3'
fields: non_indexed_keyword

- match: {fields.non_indexed_keyword.keyword.searchable: true}

---
"Get object and nested field caps":

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ setup:
type: date
format: yyyy/MM/dd
index: false
keyword:
type: keyword
index: false

- do:
index:
Expand All @@ -50,6 +53,7 @@ setup:
long: 1
short: 1
date: "2017/01/01"
keyword: "key1"

- do:
index:
Expand All @@ -64,6 +68,7 @@ setup:
long: 2
short: 2
date: "2017/01/02"
keyword: "key2"

- do:
indices.refresh: {}
Expand Down Expand Up @@ -220,3 +225,30 @@ setup:
index: test
body: { query: { range: { date: { gte: "2017/01/01" } } } }
- length: { hits.hits: 2 }

---
"Test match query on keyword field where only doc values are enabled":

- do:
search:
index: test
body: { query: { match: { keyword: { query: "key1" } } } }
- length: { hits.hits: 1 }

---
"Test terms query on keyword field where only doc values are enabled":

- do:
search:
index: test
body: { query: { terms: { keyword: [ "key1", "key2" ] } } }
- length: { hits.hits: 2 }

---
"Test range query on keyword field where only doc values are enabled":

- do:
search:
index: test
body: { query: { range: { keyword: { gte: "key1" } } } }
- length: { hits.hits: 2 }
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,16 @@ public KeywordFieldType(String name, NamedAnalyzer analyzer) {
this.isDimension = false;
}

@Override
protected boolean allowDocValueBasedQueries() {
return true;
}

@Override
public boolean isSearchable() {
return isIndexed() || hasDocValues();
}

@Override
public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext, String searchAfter)
throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
package org.elasticsearch.index.mapper;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.FuzzyQuery;
Expand Down Expand Up @@ -210,13 +211,27 @@ public Query rangeQuery(
+ "' is set to false."
);
}
failIfNotIndexed();
return new TermRangeQuery(
name(),
lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
upperTerm == null ? null : indexedValueForSearch(upperTerm),
includeLower,
includeUpper
);
if (allowDocValueBasedQueries()) {
failIfNotIndexedNorDocValuesFallback(context);
} else {
failIfNotIndexed();
}
if (isIndexed()) {
return new TermRangeQuery(
name(),
lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
upperTerm == null ? null : indexedValueForSearch(upperTerm),
includeLower,
includeUpper
);
} else {
return SortedSetDocValuesField.newSlowRangeQuery(
name(),
lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
upperTerm == null ? null : indexedValueForSearch(upperTerm),
includeLower,
includeUpper
);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@

package org.elasticsearch.index.mapper;

import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.Term;
import org.apache.lucene.sandbox.search.DocValuesTermsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TermQuery;
Expand All @@ -35,6 +37,10 @@ public TermBasedFieldType(
super(name, isIndexed, isStored, hasDocValues, textSearchInfo, meta);
}

protected boolean allowDocValueBasedQueries() {
return false;
}

/** Returns the indexed value used to construct search "values".
* This method is used for the default implementations of most
* query factory methods such as {@link #termQuery}. */
Expand All @@ -55,15 +61,31 @@ public boolean mayExistInIndex(SearchExecutionContext context) {

@Override
public Query termQuery(Object value, SearchExecutionContext context) {
failIfNotIndexed();
return new TermQuery(new Term(name(), indexedValueForSearch(value)));
if (allowDocValueBasedQueries()) {
failIfNotIndexedNorDocValuesFallback(context);
} else {
failIfNotIndexed();
}
if (isIndexed()) {
return new TermQuery(new Term(name(), indexedValueForSearch(value)));
} else {
return SortedSetDocValuesField.newSlowExactQuery(name(), indexedValueForSearch(value));
}
}

@Override
public Query termsQuery(Collection<?> values, SearchExecutionContext context) {
failIfNotIndexed();
if (allowDocValueBasedQueries()) {
failIfNotIndexedNorDocValuesFallback(context);
} else {
failIfNotIndexed();
}
BytesRef[] bytesRefs = values.stream().map(this::indexedValueForSearch).toArray(BytesRef[]::new);
return new TermInSetQuery(name(), bytesRefs);
if (isIndexed()) {
return new TermInSetQuery(name(), bytesRefs);
} else {
return new DocValuesTermsQuery(name(), bytesRefs);
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.Term;
import org.apache.lucene.sandbox.search.DocValuesTermsQuery;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.NormsFieldExistsQuery;
Expand Down Expand Up @@ -52,7 +54,7 @@
public class KeywordFieldTypeTests extends FieldTypeTestCase {

public void testIsFieldWithinQuery() throws IOException {
KeywordFieldType ft = new KeywordFieldType("field");
KeywordFieldType ft = new KeywordFieldType("field", randomBoolean(), randomBoolean(), Map.of());
// current impl ignores args and should always return INTERSECTS
assertEquals(
Relation.INTERSECTS,
Expand All @@ -64,18 +66,21 @@ public void testIsFieldWithinQuery() throws IOException {
randomBoolean(),
null,
null,
null
MOCK_CONTEXT
)
);
}

public void testTermQuery() {
MappedFieldType ft = new KeywordFieldType("field");
assertEquals(new TermQuery(new Term("field", "foo")), ft.termQuery("foo", null));
assertEquals(new TermQuery(new Term("field", "foo")), ft.termQuery("foo", MOCK_CONTEXT));

MappedFieldType unsearchable = new KeywordFieldType("field", false, true, Collections.emptyMap());
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> unsearchable.termQuery("bar", null));
assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage());
MappedFieldType ft2 = new KeywordFieldType("field", false, true, Map.of());
assertEquals(SortedSetDocValuesField.newSlowExactQuery("field", new BytesRef("foo")), ft2.termQuery("foo", MOCK_CONTEXT));

MappedFieldType unsearchable = new KeywordFieldType("field", false, false, Collections.emptyMap());
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> unsearchable.termQuery("bar", MOCK_CONTEXT));
assertEquals("Cannot search on field [field] since it is not indexed nor has doc values.", e.getMessage());
}

public void testTermQueryWithNormalizer() {
Expand All @@ -93,38 +98,45 @@ protected TokenStream normalize(String fieldName, TokenStream in) {
}
};
MappedFieldType ft = new KeywordFieldType("field", new NamedAnalyzer("my_normalizer", AnalyzerScope.INDEX, normalizer));
assertEquals(new TermQuery(new Term("field", "foo bar")), ft.termQuery("fOo BaR", null));
assertEquals(new TermQuery(new Term("field", "foo bar")), ft.termQuery("fOo BaR", MOCK_CONTEXT));
}

public void testTermsQuery() {
MappedFieldType ft = new KeywordFieldType("field");
List<BytesRef> terms = new ArrayList<>();
terms.add(new BytesRef("foo"));
terms.add(new BytesRef("bar"));
assertEquals(new TermInSetQuery("field", terms), ft.termsQuery(Arrays.asList("foo", "bar"), null));
assertEquals(new TermInSetQuery("field", terms), ft.termsQuery(Arrays.asList("foo", "bar"), MOCK_CONTEXT));

MappedFieldType unsearchable = new KeywordFieldType("field", false, true, Collections.emptyMap());
MappedFieldType ft2 = new KeywordFieldType("field", false, true, Map.of());
assertEquals(new DocValuesTermsQuery("field", terms), ft2.termsQuery(Arrays.asList("foo", "bar"), MOCK_CONTEXT));

MappedFieldType unsearchable = new KeywordFieldType("field", false, false, Collections.emptyMap());
IllegalArgumentException e = expectThrows(
IllegalArgumentException.class,
() -> unsearchable.termsQuery(Arrays.asList("foo", "bar"), null)
() -> unsearchable.termsQuery(Arrays.asList("foo", "bar"), MOCK_CONTEXT)
);
assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage());
assertEquals("Cannot search on field [field] since it is not indexed nor has doc values.", e.getMessage());
}

public void testExistsQuery() {
{
KeywordFieldType ft = new KeywordFieldType("field");
assertEquals(new DocValuesFieldExistsQuery("field"), ft.existsQuery(null));
assertEquals(new DocValuesFieldExistsQuery("field"), ft.existsQuery(MOCK_CONTEXT));
}
{
KeywordFieldType ft = new KeywordFieldType("field", false, true, Map.of());
assertEquals(new DocValuesFieldExistsQuery("field"), ft.existsQuery(MOCK_CONTEXT));
}
{
FieldType fieldType = new FieldType();
fieldType.setOmitNorms(false);
KeywordFieldType ft = new KeywordFieldType("field", fieldType);
assertEquals(new NormsFieldExistsQuery("field"), ft.existsQuery(null));
assertEquals(new NormsFieldExistsQuery("field"), ft.existsQuery(MOCK_CONTEXT));
}
{
KeywordFieldType ft = new KeywordFieldType("field", true, false, Collections.emptyMap());
assertEquals(new TermQuery(new Term(FieldNamesFieldMapper.NAME, "field")), ft.existsQuery(null));
assertEquals(new TermQuery(new Term(FieldNamesFieldMapper.NAME, "field")), ft.existsQuery(MOCK_CONTEXT));
}
}

Expand All @@ -135,6 +147,12 @@ public void testRangeQuery() {
ft.rangeQuery("foo", "bar", true, false, null, null, null, MOCK_CONTEXT)
);

MappedFieldType ft2 = new KeywordFieldType("field", false, true, Map.of());
assertEquals(
SortedSetDocValuesField.newSlowRangeQuery("field", BytesRefs.toBytesRef("foo"), BytesRefs.toBytesRef("bar"), true, false),
ft2.rangeQuery("foo", "bar", true, false, null, null, null, MOCK_CONTEXT)
);

ElasticsearchException ee = expectThrows(
ElasticsearchException.class,
() -> ft.rangeQuery("foo", "bar", true, false, null, null, null, MOCK_CONTEXT_DISALLOW_EXPENSIVE)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,22 +50,22 @@ public void testAliasWithIncompatibleTypes() throws Exception {

public void testAliasWithIncompatibleSearchableProperty() throws Exception {
createIndexWithMapping("test1", builder -> {
builder.startObject("id").field("type", "keyword").endObject();
builder.startObject("id").field("type", "text").endObject();
builder.startObject("value").field("type", "boolean").endObject();
});

createIndexWithMapping("test2", builder -> {
builder.startObject("id").field("type", "keyword").field("index", false).endObject();
builder.startObject("id").field("type", "text").field("index", false).endObject();
builder.startObject("value").field("type", "boolean").endObject();
});

createIndexWithMapping("test3", builder -> {
builder.startObject("id").field("type", "keyword").field("index", false).endObject();
builder.startObject("id").field("type", "text").field("index", false).endObject();
builder.startObject("value").field("type", "boolean").endObject();
});

createIndexWithMapping("test4", builder -> {
builder.startObject("id").field("type", "keyword").field("index", false).endObject();
builder.startObject("id").field("type", "text").field("index", false).endObject();
builder.startObject("value").field("type", "boolean").endObject();
});

Expand All @@ -79,16 +79,16 @@ public void testAliasWithIncompatibleSearchableProperty() throws Exception {
assertResultsForQuery(
"SYS COLUMNS",
new String[][] {
{ "test1", "id", "KEYWORD" },
{ "test1", "id", "TEXT" },
{ "test1", "value", "BOOLEAN" },
{ "test2", "id", "KEYWORD" },
{ "test2", "id", "TEXT" },
{ "test2", "value", "BOOLEAN" },
{ "test3", "id", "KEYWORD" },
{ "test3", "id", "TEXT" },
{ "test3", "value", "BOOLEAN" },
{ "test4", "id", "KEYWORD" },
{ "test4", "id", "TEXT" },
{ "test4", "value", "BOOLEAN" },
{ "test_alias", "value", "BOOLEAN" },
{ "test_alias2", "id", "KEYWORD" },
{ "test_alias2", "id", "TEXT" },
{ "test_alias2", "value", "BOOLEAN" } }
);
}
Expand Down

0 comments on commit fd7f69c

Please sign in to comment.