Add checks in term and terms queries that input terms are not too long (elastic#99818)

Lucene indexes do not allow terms longer than 32k bytes. Any query that contains
a term exceeding this length will by definition match nothing, yet it can still
cause cluster instability by consuming large amounts of heap. Such terms are also
almost always a user error (for example, a terms query that concatenates all its
inputs into a single string rather than splitting them into a JSON array).
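
As a rough illustration of that user error (the field name, values, and demo class below are invented, not taken from this change): joining every input into one string produces a single term whose byte length grows with the input and can blow past Lucene's 32k-byte limit, while the intended query sends one term per value.

import java.util.List;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;

public class TermsQueryUserError {
    public static void main(String[] args) {
        // Imagine tens of thousands of ids instead of three.
        List<String> ids = List.of("id-0001", "id-0002", "id-0003");
        // Wrong: all values collapsed into ONE term, whose length grows with the input.
        QueryBuilder wrong = QueryBuilders.termsQuery("user_id", String.join(",", ids));
        // Right: one term per value, each comfortably under the limit.
        QueryBuilder right = QueryBuilders.termsQuery("user_id", ids);
    }
}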

This commit adds checks to the Term and Terms query builders that throw an
exception if any of their input terms is longer than the maximum length allowed
by the Lucene IndexWriter (IndexWriter.MAX_TERM_LENGTH).

Fixes elastic#99802
romseygeek committed Sep 25, 2023
1 parent 06aad5a commit 23099a4
Showing 6 changed files with 72 additions and 8 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/99818.yaml
@@ -0,0 +1,6 @@
pr: 99818
summary: Add checks in term and terms queries that input terms are not too long
area: Search
type: enhancement
issues:
- 99802
org/elasticsearch/common/lucene/BytesRefs.java
@@ -8,6 +8,7 @@

package org.elasticsearch.common.lucene;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.util.BytesRef;

public class BytesRefs {
@@ -39,4 +40,34 @@ public static BytesRef toBytesRef(Object value) {
return new BytesRef(value.toString());
}

/**
* Checks that the input is not longer than {@link IndexWriter#MAX_TERM_LENGTH}
* @param input a BytesRef
* @return the same BytesRef, if no exception has been thrown
* @throws IllegalArgumentException if the input is too long
*/
public static BytesRef checkIndexableLength(BytesRef input) {
if (input.length > IndexWriter.MAX_TERM_LENGTH) {
throw new IllegalArgumentException(
"Term is longer than maximum indexable length, term starting with [" + safeStringPrefix(input, 10)
);
}
return input;
}

/**
* Produces a UTF-8 string prefix of the input BytesRef. If cutting the prefix at
* {@code prefixLength} bytes would produce ill-formed UTF-8, it falls back to the
* hexadecimal representation of the prefix.
* @param input an input BytesRef
* @param prefixLength the length of the prefix, in bytes
* @return a String prefix
*/
private static String safeStringPrefix(BytesRef input, int prefixLength) {
BytesRef prefix = new BytesRef(input.bytes, input.offset, prefixLength);
try {
return prefix.utf8ToString();
} catch (Exception e) {
return prefix.toString();
}
}

}
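
A minimal usage sketch of the new helper (not part of the diff; the oversized term is built with String.repeat purely for illustration):

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.lucene.BytesRefs;

public class CheckIndexableLengthDemo {
    public static void main(String[] args) {
        // A term of ordinary length is returned unchanged.
        BytesRef ok = BytesRefs.checkIndexableLength(new BytesRef("elasticsearch"));

        // A term longer than IndexWriter.MAX_TERM_LENGTH is rejected up front.
        BytesRef tooLong = new BytesRef("a".repeat(IndexWriter.MAX_TERM_LENGTH + 1));
        try {
            BytesRefs.checkIndexableLength(tooLong);
        } catch (IllegalArgumentException e) {
            // The message contains a short, safely truncated prefix of the offending term.
            System.out.println(e.getMessage());
        }
    }
}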
org/elasticsearch/index/query/AbstractQueryBuilder.java
@@ -211,9 +211,9 @@ public final int hashCode() {
*/
static Object maybeConvertToBytesRef(Object obj) {
if (obj instanceof String) {
- return BytesRefs.toBytesRef(obj);
+ return BytesRefs.checkIndexableLength(BytesRefs.toBytesRef(obj));
} else if (obj instanceof CharBuffer) {
- return new BytesRef((CharBuffer) obj);
+ return BytesRefs.checkIndexableLength(new BytesRef((CharBuffer) obj));
} else if (obj instanceof BigInteger) {
return BytesRefs.toBytesRef(obj);
}
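
Because query builders funnel string terms through maybeConvertToBytesRef, an oversized term should now be rejected when the query is built or parsed rather than at search time. A minimal sketch under that assumption (the constructor path is implied by the parse-time tests further down, not shown in this diff):

import org.apache.lucene.index.IndexWriter;
import org.elasticsearch.index.query.TermQueryBuilder;

public class LongTermRejected {
    public static void main(String[] args) {
        String longTerm = "a".repeat(IndexWriter.MAX_TERM_LENGTH + 1);
        try {
            // Assumed to throw because the builder converts its value via maybeConvertToBytesRef.
            new TermQueryBuilder("foo", longTerm);
        } catch (IllegalArgumentException e) {
            System.out.println(e.getMessage()); // "... term starting with [aaaaaaaaaa"
        }
    }
}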
org/elasticsearch/index/query/AbstractQueryBuilderTests.java
@@ -8,6 +8,7 @@

package org.elasticsearch.index.query;

import org.apache.lucene.index.IndexWriter;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.search.SearchModule;
@@ -22,6 +23,7 @@

import static java.util.Collections.emptyList;
import static org.elasticsearch.index.query.AbstractQueryBuilder.parseInnerQueryBuilder;
import static org.hamcrest.Matchers.containsString;

public class AbstractQueryBuilderTests extends ESTestCase {

@@ -84,4 +86,10 @@ protected NamedXContentRegistry xContentRegistry() {
return xContentRegistry;
}

public void testMaybeConvertToBytesRefLongTerm() {
String longTerm = "a".repeat(IndexWriter.MAX_TERM_LENGTH + 1);
Exception e = expectThrows(IllegalArgumentException.class, () -> AbstractQueryBuilder.maybeConvertToBytesRef(longTerm));
assertThat(e.getMessage(), containsString("term starting with [aaaaa"));
}

}
org/elasticsearch/index/query/TermQueryBuilderTests.java
@@ -8,6 +8,7 @@

package org.elasticsearch.index.query;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
@@ -20,9 +21,11 @@
import org.elasticsearch.xcontent.json.JsonStringEncoder;

import java.io.IOException;
import java.util.Locale;

import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.CoreMatchers.instanceOf;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.either;

public class TermQueryBuilderTests extends AbstractTermQueryTestCase<TermQueryBuilder> {
@@ -227,4 +230,11 @@ public void testMustRewrite() throws IOException {
IllegalStateException e = expectThrows(IllegalStateException.class, () -> queryBuilder.toQuery(context));
assertEquals("Rewrite first", e.getMessage());
}

public void testLongTerm() throws IOException {
String longTerm = "a".repeat(IndexWriter.MAX_TERM_LENGTH + 1);
Exception e = expectThrows(IllegalArgumentException.class, () -> parseQuery(String.format(Locale.ROOT, """
{ "term" : { "foo" : "%s" } }""", longTerm)));
assertThat(e.getMessage(), containsString("term starting with [aaaaa"));
}
}
org/elasticsearch/index/query/TermsQueryBuilderTests.java
@@ -8,6 +8,7 @@

package org.elasticsearch.index.query;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
@@ -35,6 +36,7 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Objects;

import static org.hamcrest.Matchers.containsString;
@@ -71,7 +73,7 @@ protected TermsQueryBuilder doCreateTestQueryBuilder() {
|| choice.equals(INT_RANGE_FIELD_NAME)
|| choice.equals(DATE_RANGE_FIELD_NAME)
|| choice.equals(DATE_NANOS_FIELD_NAME), // TODO: needs testing for date_nanos type
- () -> getRandomFieldName()
+ AbstractQueryTestCase::getRandomFieldName
);
Object[] values = new Object[randomInt(5)];
for (int i = 0; i < values.length; i++) {
@@ -95,7 +97,7 @@ private TermsLookup randomTermsLookup() {
protected void doAssertLuceneQuery(TermsQueryBuilder queryBuilder, Query query, SearchExecutionContext context) throws IOException {
if (queryBuilder.termsLookup() == null && (queryBuilder.values() == null || queryBuilder.values().isEmpty())) {
assertThat(query, instanceOf(MatchNoDocsQuery.class));
- } else if (queryBuilder.termsLookup() != null && randomTerms.size() == 0) {
+ } else if (queryBuilder.termsLookup() != null && randomTerms.isEmpty()) {
assertThat(query, instanceOf(MatchNoDocsQuery.class));
} else {
assertThat(
@@ -138,14 +140,14 @@ protected void doAssertLuceneQuery(TermsQueryBuilder queryBuilder, Query query,
}
}

- public void testEmtpyFieldName() {
+ public void testEmptyFieldName() {
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> new TermsQueryBuilder(null, "term"));
assertEquals("field name cannot be null.", e.getMessage());
e = expectThrows(IllegalArgumentException.class, () -> new TermsQueryBuilder("", "term"));
assertEquals("field name cannot be null.", e.getMessage());
}

- public void testEmtpyTermsLookup() {
+ public void testEmptyTermsLookup() {
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> new TermsQueryBuilder("field", (TermsLookup) null));
assertEquals("No value or termsLookup specified for terms query", e.getMessage());
}
@@ -194,7 +196,7 @@ public GetResponse executeGet(GetRequest getRequest) {
try {
XContentBuilder builder = XContentFactory.jsonBuilder().prettyPrint();
builder.startObject();
- builder.array(termsPath, randomTerms.toArray(new Object[randomTerms.size()]));
+ builder.array(termsPath, randomTerms.toArray(Object[]::new));
builder.endObject();
json = Strings.toString(builder);
} catch (IOException ex) {
@@ -262,7 +264,7 @@ public void testMustRewrite() throws IOException {
assertEquals("query must be rewritten first", e.getMessage());

// terms lookup removes null values
- List<Object> nonNullTerms = randomTerms.stream().filter(x -> x != null).toList();
+ List<Object> nonNullTerms = randomTerms.stream().filter(Objects::nonNull).toList();
QueryBuilder expected;
if (nonNullTerms.isEmpty()) {
expected = new MatchNoneQueryBuilder();
@@ -308,6 +310,13 @@ public void testRewriteIndexQueryToNotMatchNone() throws IOException {
}
}

public void testLongTerm() throws IOException {
String longTerm = "a".repeat(IndexWriter.MAX_TERM_LENGTH + 1);
Exception e = expectThrows(IllegalArgumentException.class, () -> parseQuery(String.format(Locale.getDefault(), """
{ "terms" : { "foo" : [ "q", "%s" ] } }""", longTerm)));
assertThat(e.getMessage(), containsString("term starting with [aaaaa"));
}

@Override
protected QueryBuilder parseQuery(XContentParser parser) throws IOException {
QueryBuilder query = super.parseQuery(parser);
