Allow docvalues-only search on number types (#82409)

Allows searching on number field types (long, short, int, float, double, byte, half_float) when those fields are not indexed (index: false) but have doc values enabled. This enables searches on archive data, which has access to doc values but not to index structures. When combined with searchable snapshots, it allows downloading only the data for a given (doc value) field to quickly filter down to a select set of documents. Note to reviewers: I have split isSearchable into two separate methods, isIndexed and isSearchable, on MappedFieldType. The former indicates whether actual indexing data structures (postings or points) have been used, and the latter whether you can run queries on the given field (e.g. as used by field caps). For number field types, queries are now allowed whenever points are available or when doc values are available (i.e. searchability is expanded). Relates #81210 and #52728
elastic · Jan 13, 2022 · e421477 · e421477
1 parent 42afe10
commit e421477
Show file tree

Hide file tree

Showing 50 changed files with 619 additions and 189 deletions.
diff --git a/docs/reference/mapping/params/doc-values.asciidoc b/docs/reference/mapping/params/doc-values.asciidoc
@@ -17,6 +17,13 @@ makes this data access pattern possible. They store the same values as the
 sorting and aggregations. Doc values are supported on almost all field types,
 with the __notable exception of `text` and `annotated_text` fields__.
 
+<<number,Numeric types>>, such as `long` and `double`, can also be queried
+when they are not <<mapping-index,indexed>> but only have doc values enabled.
+Querying doc values is much slower than querying index structures, but it
+offers an interesting tradeoff between disk usage and query performance for
+fields that are only rarely queried and where query performance is not as
+important.
+
 All fields which support doc values have them enabled by default. If you are
 sure that you don't need to sort or aggregate on a field, or access the field
 value from a script, you can disable doc values in order to save disk space:

diff --git a/docs/reference/mapping/params/index.asciidoc b/docs/reference/mapping/params/index.asciidoc
@@ -2,5 +2,6 @@
 === `index`
 
 The `index` option controls whether field values are indexed. It accepts `true`
-or `false` and defaults to `true`. Fields that are not indexed are not queryable.
+or `false` and defaults to `true`. Fields that are not indexed are typically
+not queryable.
 
diff --git a/docs/reference/mapping/types/numeric.asciidoc b/docs/reference/mapping/types/numeric.asciidoc
@@ -131,7 +131,9 @@ The following parameters are accepted by numeric types:
 
 <<mapping-index,`index`>>::
 
-    Should the field be searchable? Accepts `true` (default) and `false`.
+    Should the field be quickly searchable? Accepts `true` (default) and
+    `false`. Numeric fields that only have <<doc-values,`doc_values`>>
+    enabled can also be queried, albeit more slowly.
 
 <<mapping-field-meta,`meta`>>::
 

diff --git a/docs/reference/query-dsl.asciidoc b/docs/reference/query-dsl.asciidoc
@@ -33,6 +33,7 @@ the stability of the cluster. Those queries can be categorised as follows:
 
 * Queries that need to do linear scans to identify matches:
 ** <<query-dsl-script-query,`script` queries>>
+** queries on <<number,numeric fields>> that are not indexed but have <<doc-values,doc values>> enabled
 
 * Queries that have a high up-front cost:
 ** <<query-dsl-fuzzy-query,`fuzzy` queries>> (except on

diff --git a/...er-extras/src/main/java/org/elasticsearch/index/mapper/extras/ScaledFloatFieldMapper.java b/...er-extras/src/main/java/org/elasticsearch/index/mapper/extras/ScaledFloatFieldMapper.java
@@ -198,7 +198,11 @@ public ScaledFloatFieldType(
         }
 
         public ScaledFloatFieldType(String name, double scalingFactor) {
-            this(name, true, false, true, Collections.emptyMap(), scalingFactor, null, null);
+            this(name, scalingFactor, true);
+        }
+
+        public ScaledFloatFieldType(String name, double scalingFactor, boolean indexed) {
+            this(name, indexed, false, true, Collections.emptyMap(), scalingFactor, null, null);
         }
 
         public double getScalingFactor() {
@@ -212,20 +216,24 @@ public String typeName() {
 
         @Override
         public Query termQuery(Object value, SearchExecutionContext context) {
-            failIfNotIndexed();
+            failIfNotIndexedNorDocValuesFallback(context);
             long scaledValue = Math.round(scale(value));
-            return NumberFieldMapper.NumberType.LONG.termQuery(name(), scaledValue);
+            return NumberFieldMapper.NumberType.LONG.termQuery(name(), scaledValue, isIndexed());
         }
 
         @Override
         public Query termsQuery(Collection<?> values, SearchExecutionContext context) {
-            failIfNotIndexed();
-            List<Long> scaledValues = new ArrayList<>(values.size());
-            for (Object value : values) {
-                long scaledValue = Math.round(scale(value));
-                scaledValues.add(scaledValue);
+            failIfNotIndexedNorDocValuesFallback(context);
+            if (isIndexed()) {
+                List<Long> scaledValues = new ArrayList<>(values.size());
+                for (Object value : values) {
+                    long scaledValue = Math.round(scale(value));
+                    scaledValues.add(scaledValue);
+                }
+                return NumberFieldMapper.NumberType.LONG.termsQuery(name(), Collections.unmodifiableList(scaledValues));
+            } else {
+                return super.termsQuery(values, context);
             }
-            return NumberFieldMapper.NumberType.LONG.termsQuery(name(), Collections.unmodifiableList(scaledValues));
         }
 
         @Override
@@ -236,7 +244,7 @@ public Query rangeQuery(
             boolean includeUpper,
             SearchExecutionContext context
         ) {
-            failIfNotIndexed();
+            failIfNotIndexedNorDocValuesFallback(context);
             Long lo = null;
             if (lowerTerm != null) {
                 double dValue = scale(lowerTerm);
@@ -253,7 +261,7 @@ public Query rangeQuery(
                 }
                 hi = Math.round(Math.floor(dValue));
             }
-            return NumberFieldMapper.NumberType.LONG.rangeQuery(name(), lo, hi, true, true, hasDocValues(), context);
+            return NumberFieldMapper.NumberType.LONG.rangeQuery(name(), lo, hi, true, true, hasDocValues(), context, isIndexed());
         }
 
         @Override

diff --git a/...extras/src/test/java/org/elasticsearch/index/mapper/extras/ScaledFloatFieldTypeTests.java b/...extras/src/test/java/org/elasticsearch/index/mapper/extras/ScaledFloatFieldTypeTests.java
@@ -18,6 +18,8 @@
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.NumericUtils;
+import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.core.internal.io.IOUtils;
 import org.elasticsearch.index.fielddata.IndexNumericFieldData;
 import org.elasticsearch.index.fielddata.LeafNumericFieldData;
@@ -41,7 +43,14 @@ public void testTermQuery() {
         );
         double value = (randomDouble() * 2 - 1) * 10000;
         long scaledValue = Math.round(value * ft.getScalingFactor());
-        assertEquals(LongPoint.newExactQuery("scaled_float", scaledValue), ft.termQuery(value, null));
+        assertEquals(LongPoint.newExactQuery("scaled_float", scaledValue), ft.termQuery(value, MOCK_CONTEXT));
+
+        MappedFieldType ft2 = new ScaledFloatFieldMapper.ScaledFloatFieldType("scaled_float", 0.1 + randomDouble() * 100, false);
+        ElasticsearchException e2 = expectThrows(ElasticsearchException.class, () -> ft2.termQuery("42", MOCK_CONTEXT_DISALLOW_EXPENSIVE));
+        assertEquals(
+            "Cannot search on field [scaled_float] since it is not indexed and 'search.allow_expensive_queries' is set to false.",
+            e2.getMessage()
+        );
     }
 
     public void testTermsQuery() {
@@ -53,7 +62,20 @@ public void testTermsQuery() {
         long scaledValue1 = Math.round(value1 * ft.getScalingFactor());
         double value2 = (randomDouble() * 2 - 1) * 10000;
         long scaledValue2 = Math.round(value2 * ft.getScalingFactor());
-        assertEquals(LongPoint.newSetQuery("scaled_float", scaledValue1, scaledValue2), ft.termsQuery(Arrays.asList(value1, value2), null));
+        assertEquals(
+            LongPoint.newSetQuery("scaled_float", scaledValue1, scaledValue2),
+            ft.termsQuery(Arrays.asList(value1, value2), MOCK_CONTEXT)
+        );
+
+        MappedFieldType ft2 = new ScaledFloatFieldMapper.ScaledFloatFieldType("scaled_float", 0.1 + randomDouble() * 100, false);
+        ElasticsearchException e2 = expectThrows(
+            ElasticsearchException.class,
+            () -> ft2.termsQuery(Arrays.asList(value1, value2), MOCK_CONTEXT_DISALLOW_EXPENSIVE)
+        );
+        assertEquals(
+            "Cannot search on field [scaled_float] since it is not indexed and 'search.allow_expensive_queries' is set to false.",
+            e2.getMessage()
+        );
     }
 
     public void testRangeQuery() throws IOException {
@@ -62,9 +84,9 @@ public void testRangeQuery() throws IOException {
         // searching doubles that are rounded to the closest half float
         ScaledFloatFieldMapper.ScaledFloatFieldType ft = new ScaledFloatFieldMapper.ScaledFloatFieldType(
             "scaled_float",
-            true,
-            false,
+            randomBoolean(),
             false,
+            true,
             Collections.emptyMap(),
             0.1 + randomDouble() * 100,
             null,
@@ -79,7 +101,9 @@ public void testRangeQuery() throws IOException {
             long scaledValue = Math.round(value * ft.getScalingFactor());
             double rounded = scaledValue / ft.getScalingFactor();
             doc.add(new LongPoint("scaled_float", scaledValue));
+            doc.add(new SortedNumericDocValuesField("scaled_float", scaledValue));
             doc.add(new DoublePoint("double", rounded));
+            doc.add(new SortedNumericDocValuesField("double", NumericUtils.doubleToSortableLong(rounded)));
             w.addDocument(doc);
         }
         final DirectoryReader reader = DirectoryReader.open(w);
@@ -91,7 +115,16 @@ public void testRangeQuery() throws IOException {
             Double u = randomBoolean() ? null : (randomDouble() * 2 - 1) * 10000;
             boolean includeLower = randomBoolean();
             boolean includeUpper = randomBoolean();
-            Query doubleQ = NumberFieldMapper.NumberType.DOUBLE.rangeQuery("double", l, u, includeLower, includeUpper, false, MOCK_CONTEXT);
+            Query doubleQ = NumberFieldMapper.NumberType.DOUBLE.rangeQuery(
+                "double",
+                l,
+                u,
+                includeLower,
+                includeUpper,
+                false,
+                MOCK_CONTEXT,
+                randomBoolean()
+            );
             Query scaledFloatQ = ft.rangeQuery(l, u, includeLower, includeUpper, MOCK_CONTEXT);
             assertEquals(searcher.count(doubleQ), searcher.count(scaledFloatQ));
         }

diff --git a/modules/percolator/src/test/java/org/elasticsearch/percolator/CandidateQueryTests.java b/modules/percolator/src/test/java/org/elasticsearch/percolator/CandidateQueryTests.java
@@ -257,7 +257,7 @@ public void testDuel() throws Exception {
         // many iterations with boolean queries, which are the most complex queries to deal with when nested
         int numRandomBoolQueries = 1000;
         for (int i = 0; i < numRandomBoolQueries; i++) {
-            queryFunctions.add(() -> createRandomBooleanQuery(1, stringFields, stringContent, intFieldType, intValues));
+            queryFunctions.add(() -> createRandomBooleanQuery(1, stringFields, stringContent, intFieldType, intValues, context));
         }
         queryFunctions.add(() -> {
             int numClauses = randomIntBetween(1, 1 << randomIntBetween(2, 4));
@@ -312,7 +312,8 @@ private BooleanQuery createRandomBooleanQuery(
         List<String> fields,
         Map<String, List<String>> content,
         MappedFieldType intFieldType,
-        List<Integer> intValues
+        List<Integer> intValues,
+        SearchExecutionContext context
     ) {
         BooleanQuery.Builder builder = new BooleanQuery.Builder();
         int numClauses = randomIntBetween(1, 1 << randomIntBetween(2, 4)); // use low numbers of clauses more often
@@ -326,24 +327,24 @@ private BooleanQuery createRandomBooleanQuery(
                     String field = randomFrom(fields);
                     builder.add(new TermQuery(new Term(field, randomFrom(content.get(field)))), occur);
                 } else {
-                    builder.add(intFieldType.termQuery(randomFrom(intValues), null), occur);
+                    builder.add(intFieldType.termQuery(randomFrom(intValues), context), occur);
                 }
             } else if (rarely() && depth <= 3) {
                 occur = randomFrom(Arrays.asList(Occur.FILTER, Occur.MUST, Occur.SHOULD));
-                builder.add(createRandomBooleanQuery(depth + 1, fields, content, intFieldType, intValues), occur);
+                builder.add(createRandomBooleanQuery(depth + 1, fields, content, intFieldType, intValues, context), occur);
             } else if (rarely()) {
                 if (randomBoolean()) {
                     occur = randomFrom(Arrays.asList(Occur.FILTER, Occur.MUST, Occur.SHOULD));
                     if (randomBoolean()) {
                         builder.add(new TermQuery(new Term("unknown_field", randomAlphaOfLength(8))), occur);
                     } else {
-                        builder.add(intFieldType.termQuery(randomFrom(intValues), null), occur);
+                        builder.add(intFieldType.termQuery(randomFrom(intValues), context), occur);
                     }
                 } else if (randomBoolean()) {
                     String field = randomFrom(fields);
                     builder.add(new TermQuery(new Term(field, randomFrom(content.get(field)))), occur = Occur.MUST_NOT);
                 } else {
-                    builder.add(intFieldType.termQuery(randomFrom(intValues), null), occur = Occur.MUST_NOT);
+                    builder.add(intFieldType.termQuery(randomFrom(intValues), context), occur = Occur.MUST_NOT);
                 }
             } else {
                 if (randomBoolean()) {
@@ -352,7 +353,7 @@ private BooleanQuery createRandomBooleanQuery(
                         String field = randomFrom(fields);
                         builder.add(new TermQuery(new Term(field, randomFrom(content.get(field)))), occur);
                     } else {
-                        builder.add(intFieldType.termQuery(randomFrom(intValues), null), occur);
+                        builder.add(intFieldType.termQuery(randomFrom(intValues), context), occur);
                     }
                 } else {
                     builder.add(new TermQuery(new Term("unknown_field", randomAlphaOfLength(8))), occur = Occur.MUST_NOT);

diff --git a/rest-api-spec/build.gradle b/rest-api-spec/build.gradle
@@ -218,6 +218,8 @@ tasks.named("yamlRestTestV7CompatTransform").configure { task ->
   // sync_id is no longer available in SegmentInfos.userData // "indices.flush/10_basic/Index synced flush rest test"
   task.replaceIsTrue("indices.testing.shards.0.0.commit.user_data.sync_id", "indices.testing.shards.0.0.commit.user_data")
 
+  // we can now search using doc values only
+  task.replaceValueInMatch("fields.object\\.nested1.long.searchable", true)
 }
 
 tasks.register('enforceYamlTestConvention').configure {

diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/field_caps/10_basic.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/field_caps/10_basic.yml
@@ -178,7 +178,6 @@ setup:
         index: 'test1,test2,test3'
         fields: object*
 
-  - match: {fields.object\.nested1.long.searchable:                       false}
   - match: {fields.object\.nested1.long.aggregatable:                     true}
   - match: {fields.object\.nested1.long.indices:                          ["test3"]}
   - is_false: fields.object\.nested1.long.non_searchable_indices
@@ -198,6 +197,19 @@ setup:
   - match: {fields.object\.nested2.keyword.indices:                       ["test3"]}
   - is_false: fields.object\.nested2.keyword.non_aggregatable_indices
   - is_false: fields.object\.nested2.keyword.non_searchable_indices
+
+---
+"Field caps for number field with only doc values":
+  - skip:
+      version: " - 8.0.99"
+      reason: "doc values search was added in 8.1.0"
+  - do:
+      field_caps:
+        index: 'test1,test2,test3'
+        fields: object*
+
+  - match: {fields.object\.nested1.long.searchable:                       true}
+
 ---
 "Get object and nested field caps":