From fd7f69cea613dcdd63de8ba277cd0212ce584377 Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Mon, 24 Jan 2022 08:57:11 +0100 Subject: [PATCH] Allow doc-values only search on keyword fields (#82846) Allows searching on keyword fields when those fields are not indexed (index: false) but just doc values are enabled. This enables searches on archive data, which has access to doc values but not index structures. When combined with searchable snapshots, it allows downloading only data for a given (doc value) field to quickly filter down to a select set of documents. Relates #81210 and #52728 --- .../mapping/params/doc-values.asciidoc | 4 +- docs/reference/mapping/types/keyword.asciidoc | 5 +- docs/reference/query-dsl.asciidoc | 2 +- .../test/field_caps/10_basic.yml | 15 ++++++ .../test/search/390_doc_values_search.yml | 32 +++++++++++++ .../index/mapper/KeywordFieldMapper.java | 10 ++++ .../index/mapper/StringFieldType.java | 31 +++++++++---- .../index/mapper/TermBasedFieldType.java | 30 ++++++++++-- .../index/mapper/KeywordFieldTypeTests.java | 46 +++++++++++++------ .../xpack/sql/qa/jdbc/SysColumnsTestCase.java | 18 ++++---- 10 files changed, 154 insertions(+), 39 deletions(-) diff --git a/docs/reference/mapping/params/doc-values.asciidoc b/docs/reference/mapping/params/doc-values.asciidoc index 962097984033a..7d3b4170fc9da 100644 --- a/docs/reference/mapping/params/doc-values.asciidoc +++ b/docs/reference/mapping/params/doc-values.asciidoc @@ -17,8 +17,8 @@ makes this data access pattern possible. They store the same values as the sorting and aggregations. Doc values are supported on almost all field types, with the __notable exception of `text` and `annotated_text` fields__. -<>, such as `long` and `double`, and <> -can also be queried +<>, <>, and the <> +can also be queried using term or range-based queries when they are not <> but only have doc values enabled. Query performance on doc values is much slower than on index structures, but offers an interesting tradeoff between disk usage and query performance for diff --git a/docs/reference/mapping/types/keyword.asciidoc b/docs/reference/mapping/types/keyword.asciidoc index deb2e5e49a1da..df77e897761c0 100644 --- a/docs/reference/mapping/types/keyword.asciidoc +++ b/docs/reference/mapping/types/keyword.asciidoc @@ -80,7 +80,10 @@ The following parameters are accepted by `keyword` fields: <>:: - Should the field be searchable? Accepts `true` (default) or `false`. + Should the field be quickly searchable? Accepts `true` (default) and + `false`. `keyword` fields that only have <> + enabled can still be queried using term or range-based queries, + albeit slower. <>:: diff --git a/docs/reference/query-dsl.asciidoc b/docs/reference/query-dsl.asciidoc index 8cab06a03e19f..39d60c0116f95 100644 --- a/docs/reference/query-dsl.asciidoc +++ b/docs/reference/query-dsl.asciidoc @@ -33,7 +33,7 @@ the stability of the cluster. Those queries can be categorised as follows: * Queries that need to do linear scans to identify matches: ** <> -** queries on <> and <> fields that are not indexed +** queries on <>, <>, or <> fields that are not indexed but have <> enabled * Queries that have a high up-front cost: diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/field_caps/10_basic.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/field_caps/10_basic.yml index 5fd1d9a2f133a..ff6643ad72f7f 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/field_caps/10_basic.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/field_caps/10_basic.yml @@ -86,6 +86,9 @@ setup: non_indexed_date: type: date index: false + non_indexed_keyword: + type: keyword + index: false geo: type: keyword object: @@ -225,6 +228,18 @@ setup: - match: {fields.non_indexed_date.date.searchable: true} +--- +"Field caps for keyword field with only doc values": + - skip: + version: " - 8.0.99" + reason: "doc values search was added in 8.1.0" + - do: + field_caps: + index: 'test1,test2,test3' + fields: non_indexed_keyword + + - match: {fields.non_indexed_keyword.keyword.searchable: true} + --- "Get object and nested field caps": diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/390_doc_values_search.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/390_doc_values_search.yml index e2859dfbedb17..0e8aec31e85d1 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/390_doc_values_search.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/390_doc_values_search.yml @@ -36,6 +36,9 @@ setup: type: date format: yyyy/MM/dd index: false + keyword: + type: keyword + index: false - do: index: @@ -50,6 +53,7 @@ setup: long: 1 short: 1 date: "2017/01/01" + keyword: "key1" - do: index: @@ -64,6 +68,7 @@ setup: long: 2 short: 2 date: "2017/01/02" + keyword: "key2" - do: indices.refresh: {} @@ -220,3 +225,30 @@ setup: index: test body: { query: { range: { date: { gte: "2017/01/01" } } } } - length: { hits.hits: 2 } + +--- +"Test match query on keyword field where only doc values are enabled": + + - do: + search: + index: test + body: { query: { match: { keyword: { query: "key1" } } } } + - length: { hits.hits: 1 } + +--- +"Test terms query on keyword field where only doc values are enabled": + + - do: + search: + index: test + body: { query: { terms: { keyword: [ "key1", "key2" ] } } } + - length: { hits.hits: 2 } + +--- +"Test range query on keyword field where only doc values are enabled": + + - do: + search: + index: test + body: { query: { range: { keyword: { gte: "key1" } } } } + - length: { hits.hits: 2 } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java index f79c49a7b73f6..13ca733666a52 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java @@ -338,6 +338,16 @@ public KeywordFieldType(String name, NamedAnalyzer analyzer) { this.isDimension = false; } + @Override + protected boolean allowDocValueBasedQueries() { + return true; + } + + @Override + public boolean isSearchable() { + return isIndexed() || hasDocValues(); + } + @Override public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext, String searchAfter) throws IOException { diff --git a/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java index 791a394e832aa..3979ebf218dd4 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java @@ -9,6 +9,7 @@ package org.elasticsearch.index.mapper; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.index.Term; import org.apache.lucene.search.AutomatonQuery; import org.apache.lucene.search.FuzzyQuery; @@ -210,13 +211,27 @@ public Query rangeQuery( + "' is set to false." ); } - failIfNotIndexed(); - return new TermRangeQuery( - name(), - lowerTerm == null ? null : indexedValueForSearch(lowerTerm), - upperTerm == null ? null : indexedValueForSearch(upperTerm), - includeLower, - includeUpper - ); + if (allowDocValueBasedQueries()) { + failIfNotIndexedNorDocValuesFallback(context); + } else { + failIfNotIndexed(); + } + if (isIndexed()) { + return new TermRangeQuery( + name(), + lowerTerm == null ? null : indexedValueForSearch(lowerTerm), + upperTerm == null ? null : indexedValueForSearch(upperTerm), + includeLower, + includeUpper + ); + } else { + return SortedSetDocValuesField.newSlowRangeQuery( + name(), + lowerTerm == null ? null : indexedValueForSearch(lowerTerm), + upperTerm == null ? null : indexedValueForSearch(upperTerm), + includeLower, + includeUpper + ); + } } } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TermBasedFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/TermBasedFieldType.java index 80e6d04d967d5..02db08072039f 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TermBasedFieldType.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TermBasedFieldType.java @@ -8,7 +8,9 @@ package org.elasticsearch.index.mapper; +import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.index.Term; +import org.apache.lucene.sandbox.search.DocValuesTermsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermInSetQuery; import org.apache.lucene.search.TermQuery; @@ -35,6 +37,10 @@ public TermBasedFieldType( super(name, isIndexed, isStored, hasDocValues, textSearchInfo, meta); } + protected boolean allowDocValueBasedQueries() { + return false; + } + /** Returns the indexed value used to construct search "values". * This method is used for the default implementations of most * query factory methods such as {@link #termQuery}. */ @@ -55,15 +61,31 @@ public boolean mayExistInIndex(SearchExecutionContext context) { @Override public Query termQuery(Object value, SearchExecutionContext context) { - failIfNotIndexed(); - return new TermQuery(new Term(name(), indexedValueForSearch(value))); + if (allowDocValueBasedQueries()) { + failIfNotIndexedNorDocValuesFallback(context); + } else { + failIfNotIndexed(); + } + if (isIndexed()) { + return new TermQuery(new Term(name(), indexedValueForSearch(value))); + } else { + return SortedSetDocValuesField.newSlowExactQuery(name(), indexedValueForSearch(value)); + } } @Override public Query termsQuery(Collection values, SearchExecutionContext context) { - failIfNotIndexed(); + if (allowDocValueBasedQueries()) { + failIfNotIndexedNorDocValuesFallback(context); + } else { + failIfNotIndexed(); + } BytesRef[] bytesRefs = values.stream().map(this::indexedValueForSearch).toArray(BytesRef[]::new); - return new TermInSetQuery(name(), bytesRefs); + if (isIndexed()) { + return new TermInSetQuery(name(), bytesRefs); + } else { + return new DocValuesTermsQuery(name(), bytesRefs); + } } } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java index 5ecd015fef613..3037d7d3c9703 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java @@ -17,7 +17,9 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.index.Term; +import org.apache.lucene.sandbox.search.DocValuesTermsQuery; import org.apache.lucene.search.DocValuesFieldExistsQuery; import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.NormsFieldExistsQuery; @@ -52,7 +54,7 @@ public class KeywordFieldTypeTests extends FieldTypeTestCase { public void testIsFieldWithinQuery() throws IOException { - KeywordFieldType ft = new KeywordFieldType("field"); + KeywordFieldType ft = new KeywordFieldType("field", randomBoolean(), randomBoolean(), Map.of()); // current impl ignores args and should always return INTERSECTS assertEquals( Relation.INTERSECTS, @@ -64,18 +66,21 @@ public void testIsFieldWithinQuery() throws IOException { randomBoolean(), null, null, - null + MOCK_CONTEXT ) ); } public void testTermQuery() { MappedFieldType ft = new KeywordFieldType("field"); - assertEquals(new TermQuery(new Term("field", "foo")), ft.termQuery("foo", null)); + assertEquals(new TermQuery(new Term("field", "foo")), ft.termQuery("foo", MOCK_CONTEXT)); - MappedFieldType unsearchable = new KeywordFieldType("field", false, true, Collections.emptyMap()); - IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> unsearchable.termQuery("bar", null)); - assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage()); + MappedFieldType ft2 = new KeywordFieldType("field", false, true, Map.of()); + assertEquals(SortedSetDocValuesField.newSlowExactQuery("field", new BytesRef("foo")), ft2.termQuery("foo", MOCK_CONTEXT)); + + MappedFieldType unsearchable = new KeywordFieldType("field", false, false, Collections.emptyMap()); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> unsearchable.termQuery("bar", MOCK_CONTEXT)); + assertEquals("Cannot search on field [field] since it is not indexed nor has doc values.", e.getMessage()); } public void testTermQueryWithNormalizer() { @@ -93,7 +98,7 @@ protected TokenStream normalize(String fieldName, TokenStream in) { } }; MappedFieldType ft = new KeywordFieldType("field", new NamedAnalyzer("my_normalizer", AnalyzerScope.INDEX, normalizer)); - assertEquals(new TermQuery(new Term("field", "foo bar")), ft.termQuery("fOo BaR", null)); + assertEquals(new TermQuery(new Term("field", "foo bar")), ft.termQuery("fOo BaR", MOCK_CONTEXT)); } public void testTermsQuery() { @@ -101,30 +106,37 @@ public void testTermsQuery() { List terms = new ArrayList<>(); terms.add(new BytesRef("foo")); terms.add(new BytesRef("bar")); - assertEquals(new TermInSetQuery("field", terms), ft.termsQuery(Arrays.asList("foo", "bar"), null)); + assertEquals(new TermInSetQuery("field", terms), ft.termsQuery(Arrays.asList("foo", "bar"), MOCK_CONTEXT)); - MappedFieldType unsearchable = new KeywordFieldType("field", false, true, Collections.emptyMap()); + MappedFieldType ft2 = new KeywordFieldType("field", false, true, Map.of()); + assertEquals(new DocValuesTermsQuery("field", terms), ft2.termsQuery(Arrays.asList("foo", "bar"), MOCK_CONTEXT)); + + MappedFieldType unsearchable = new KeywordFieldType("field", false, false, Collections.emptyMap()); IllegalArgumentException e = expectThrows( IllegalArgumentException.class, - () -> unsearchable.termsQuery(Arrays.asList("foo", "bar"), null) + () -> unsearchable.termsQuery(Arrays.asList("foo", "bar"), MOCK_CONTEXT) ); - assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage()); + assertEquals("Cannot search on field [field] since it is not indexed nor has doc values.", e.getMessage()); } public void testExistsQuery() { { KeywordFieldType ft = new KeywordFieldType("field"); - assertEquals(new DocValuesFieldExistsQuery("field"), ft.existsQuery(null)); + assertEquals(new DocValuesFieldExistsQuery("field"), ft.existsQuery(MOCK_CONTEXT)); + } + { + KeywordFieldType ft = new KeywordFieldType("field", false, true, Map.of()); + assertEquals(new DocValuesFieldExistsQuery("field"), ft.existsQuery(MOCK_CONTEXT)); } { FieldType fieldType = new FieldType(); fieldType.setOmitNorms(false); KeywordFieldType ft = new KeywordFieldType("field", fieldType); - assertEquals(new NormsFieldExistsQuery("field"), ft.existsQuery(null)); + assertEquals(new NormsFieldExistsQuery("field"), ft.existsQuery(MOCK_CONTEXT)); } { KeywordFieldType ft = new KeywordFieldType("field", true, false, Collections.emptyMap()); - assertEquals(new TermQuery(new Term(FieldNamesFieldMapper.NAME, "field")), ft.existsQuery(null)); + assertEquals(new TermQuery(new Term(FieldNamesFieldMapper.NAME, "field")), ft.existsQuery(MOCK_CONTEXT)); } } @@ -135,6 +147,12 @@ public void testRangeQuery() { ft.rangeQuery("foo", "bar", true, false, null, null, null, MOCK_CONTEXT) ); + MappedFieldType ft2 = new KeywordFieldType("field", false, true, Map.of()); + assertEquals( + SortedSetDocValuesField.newSlowRangeQuery("field", BytesRefs.toBytesRef("foo"), BytesRefs.toBytesRef("bar"), true, false), + ft2.rangeQuery("foo", "bar", true, false, null, null, null, MOCK_CONTEXT) + ); + ElasticsearchException ee = expectThrows( ElasticsearchException.class, () -> ft.rangeQuery("foo", "bar", true, false, null, null, null, MOCK_CONTEXT_DISALLOW_EXPENSIVE) diff --git a/x-pack/plugin/sql/qa/server/src/main/java/org/elasticsearch/xpack/sql/qa/jdbc/SysColumnsTestCase.java b/x-pack/plugin/sql/qa/server/src/main/java/org/elasticsearch/xpack/sql/qa/jdbc/SysColumnsTestCase.java index 3c9dacc905819..44a882cde21fe 100644 --- a/x-pack/plugin/sql/qa/server/src/main/java/org/elasticsearch/xpack/sql/qa/jdbc/SysColumnsTestCase.java +++ b/x-pack/plugin/sql/qa/server/src/main/java/org/elasticsearch/xpack/sql/qa/jdbc/SysColumnsTestCase.java @@ -50,22 +50,22 @@ public void testAliasWithIncompatibleTypes() throws Exception { public void testAliasWithIncompatibleSearchableProperty() throws Exception { createIndexWithMapping("test1", builder -> { - builder.startObject("id").field("type", "keyword").endObject(); + builder.startObject("id").field("type", "text").endObject(); builder.startObject("value").field("type", "boolean").endObject(); }); createIndexWithMapping("test2", builder -> { - builder.startObject("id").field("type", "keyword").field("index", false).endObject(); + builder.startObject("id").field("type", "text").field("index", false).endObject(); builder.startObject("value").field("type", "boolean").endObject(); }); createIndexWithMapping("test3", builder -> { - builder.startObject("id").field("type", "keyword").field("index", false).endObject(); + builder.startObject("id").field("type", "text").field("index", false).endObject(); builder.startObject("value").field("type", "boolean").endObject(); }); createIndexWithMapping("test4", builder -> { - builder.startObject("id").field("type", "keyword").field("index", false).endObject(); + builder.startObject("id").field("type", "text").field("index", false).endObject(); builder.startObject("value").field("type", "boolean").endObject(); }); @@ -79,16 +79,16 @@ public void testAliasWithIncompatibleSearchableProperty() throws Exception { assertResultsForQuery( "SYS COLUMNS", new String[][] { - { "test1", "id", "KEYWORD" }, + { "test1", "id", "TEXT" }, { "test1", "value", "BOOLEAN" }, - { "test2", "id", "KEYWORD" }, + { "test2", "id", "TEXT" }, { "test2", "value", "BOOLEAN" }, - { "test3", "id", "KEYWORD" }, + { "test3", "id", "TEXT" }, { "test3", "value", "BOOLEAN" }, - { "test4", "id", "KEYWORD" }, + { "test4", "id", "TEXT" }, { "test4", "value", "BOOLEAN" }, { "test_alias", "value", "BOOLEAN" }, - { "test_alias2", "id", "KEYWORD" }, + { "test_alias2", "id", "TEXT" }, { "test_alias2", "value", "BOOLEAN" } } ); }