diff --git a/server/src/main/java/org/elasticsearch/index/search/MatchQuery.java b/server/src/main/java/org/elasticsearch/index/search/MatchQuery.java index 0635bbd08a756..597666a7c6012 100644 --- a/server/src/main/java/org/elasticsearch/index/search/MatchQuery.java +++ b/server/src/main/java/org/elasticsearch/index/search/MatchQuery.java @@ -44,6 +44,7 @@ import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.QueryBuilder; +import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; @@ -57,11 +58,14 @@ import org.elasticsearch.index.query.support.QueryParsers; import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; import static org.elasticsearch.common.lucene.search.Queries.newLenientFieldQuery; import static org.elasticsearch.common.lucene.search.Queries.newUnmappedFieldQuery; -public class MatchQuery { +public class MatchQuery { public enum Type implements Writeable { /** @@ -503,6 +507,82 @@ private Query boolToExtendedCommonTermsQuery(BooleanQuery bq, } return query; } + + /** + * Overrides {@link QueryBuilder#analyzeGraphPhrase(TokenStream, String, int)} to add + * a limit (see {@link BooleanQuery#getMaxClauseCount()}) to the number of {@link SpanQuery} + * that this method can create. + * + * TODO Remove when https://issues.apache.org/jira/browse/LUCENE-8479 is fixed. 
+         */
+        @Override
+        protected SpanQuery analyzeGraphPhrase(TokenStream source, String field, int phraseSlop) throws IOException {
+            source.reset();
+            GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);
+            List<SpanQuery> clauses = new ArrayList<>(); // receives at most one query per loop iteration
+            int[] articulationPoints = graph.articulationPoints();
+            int lastState = 0;
+            int maxBooleanClause = BooleanQuery.getMaxClauseCount(); // single cap applied to every list built below
+            for (int i = 0; i <= articulationPoints.length; i++) { // "<=": one extra pass after the last point
+                int start = lastState;
+                int end = -1; // stays -1 on the extra pass, when no articulation point is left
+                if (i < articulationPoints.length) {
+                    end = articulationPoints[i];
+                }
+                lastState = end;
+                final SpanQuery queryPos;
+                if (graph.hasSidePath(start)) { // several paths leave start: OR one span query per path
+                    List<SpanQuery> queries = new ArrayList<>();
+                    Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
+                    while (it.hasNext()) {
+                        TokenStream ts = it.next();
+                        SpanQuery q = createSpanQuery(ts, field);
+                        if (q != null) {
+                            if (queries.size() >= maxBooleanClause) { // checked before add, so the list never exceeds the cap
+                                throw new BooleanQuery.TooManyClauses();
+                            }
+                            queries.add(q);
+                        }
+                    }
+                    if (!queries.isEmpty()) {
+                        queryPos = new SpanOrQuery(queries.toArray(new SpanQuery[0]));
+                    } else {
+                        queryPos = null; // every path produced a null query: contribute nothing for this pass
+                    }
+                } else { // no side path: OR the terms reported for start
+                    Term[] terms = graph.getTerms(field, start);
+                    assert terms.length > 0;
+                    if (terms.length >= maxBooleanClause) { // NOTE(review): rejects exactly-cap too, unlike the size checks
+                        throw new BooleanQuery.TooManyClauses();
+                    }
+                    if (terms.length == 1) {
+                        queryPos = new SpanTermQuery(terms[0]);
+                    } else {
+                        SpanTermQuery[] orClauses = new SpanTermQuery[terms.length];
+                        for (int idx = 0; idx < terms.length; idx++) {
+                            orClauses[idx] = new SpanTermQuery(terms[idx]);
+                        }
+
+                        queryPos = new SpanOrQuery(orClauses);
+                    }
+                }
+
+                if (queryPos != null) {
+                    if (clauses.size() >= maxBooleanClause) {
+                        throw new BooleanQuery.TooManyClauses();
+                    }
+                    clauses.add(queryPos);
+                }
+            }
+
+            if (clauses.isEmpty()) {
+                return null; // nothing was produced for any pass
+            } else if (clauses.size() == 1) {
+                return clauses.get(0); // a single clause needs no SpanNearQuery wrapper
+            } else {
+                return new SpanNearQuery(clauses.toArray(new SpanQuery[0]), phraseSlop, true);
+            }
+        }
     }
 
     /**
diff --git 
a/server/src/test/java/org/elasticsearch/index/query/MatchQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/MatchQueryBuilderTests.java index 496d8512d4e28..69bb5943a7c98 100644 --- a/server/src/test/java/org/elasticsearch/index/query/MatchQueryBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/MatchQueryBuilderTests.java @@ -19,6 +19,11 @@ package org.elasticsearch.index.query; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CannedBinaryTokenStream; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.index.Term; import org.apache.lucene.queries.ExtendedCommonTermsQuery; import org.apache.lucene.search.BooleanClause; @@ -30,6 +35,7 @@ import org.apache.lucene.search.PointRangeQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.util.BytesRef; import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest; import org.elasticsearch.common.ParsingException; import org.elasticsearch.common.Strings; @@ -46,6 +52,8 @@ import org.hamcrest.Matcher; import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Locale; @@ -392,4 +400,72 @@ public void testLenientPhraseQuery() throws Exception { assertThat(query.toString(), containsString("field:[string_no_pos] was indexed without position data; cannot run PhraseQuery")); } + + public void testMaxBooleanClause() { + MatchQuery query = new MatchQuery(createShardContext()); + query.setAnalyzer(new MockGraphAnalyzer(createGiantGraph(40))); + expectThrows(BooleanQuery.TooManyClauses.class, () -> query.parse(Type.PHRASE, STRING_FIELD_NAME, "")); + query.setAnalyzer(new MockGraphAnalyzer(createGiantGraphMultiTerms())); + expectThrows(BooleanQuery.TooManyClauses.class, () -> 
query.parse(Type.PHRASE, STRING_FIELD_NAME, ""));
+    }
+
+    private static class MockGraphAnalyzer extends Analyzer { // test analyzer that always emits a fixed token array
+        final CannedBinaryTokenStream.BinaryToken[] tokens;
+
+        private MockGraphAnalyzer(CannedBinaryTokenStream.BinaryToken[] tokens) {
+            this.tokens = tokens;
+        }
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
+            return new TokenStreamComponents(tokenizer) {
+                @Override
+                public TokenStream getTokenStream() {
+                    return new CannedBinaryTokenStream(tokens); // canned tokens are returned, not the tokenizer's
+                }
+
+                @Override
+                protected void setReader(final Reader reader) { // deliberately empty: the reader is never used
+                }
+            };
+        }
+    }
+
+    /**
+     * Creates a graph token stream with 2 side paths at each position.
+     **/
+    private static CannedBinaryTokenStream.BinaryToken[] createGiantGraph(int numPos) {
+        List<CannedBinaryTokenStream.BinaryToken> tokens = new ArrayList<>();
+        BytesRef term1 = new BytesRef("foo");
+        BytesRef term2 = new BytesRef("bar");
+        for (int i = 0; i < numPos;) { // i is advanced inside the body
+            if (i % 2 == 0) {
+                tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
+                tokens.add(new CannedBinaryTokenStream.BinaryToken(term1, 0, 2));
+                i += 2;
+            } else { // NOTE(review): never taken, i starts at 0 and only grows by 2
+                tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
+                i++;
+            }
+        }
+        return tokens.toArray(new CannedBinaryTokenStream.BinaryToken[0]);
+    }
+
+    /**
+     * Creates a graph token stream with {@link BooleanQuery#getMaxClauseCount()}
+     * expansions at the last position.
+     **/
+    private static CannedBinaryTokenStream.BinaryToken[] createGiantGraphMultiTerms() {
+        List<CannedBinaryTokenStream.BinaryToken> tokens = new ArrayList<>();
+        BytesRef term1 = new BytesRef("foo");
+        BytesRef term2 = new BytesRef("bar");
+        tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
+        tokens.add(new CannedBinaryTokenStream.BinaryToken(term1, 0, 2));
+        tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
+        tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
+        for (int i = 0; i < BooleanQuery.getMaxClauseCount(); i++) { // appends getMaxClauseCount() more "foo" tokens
+            tokens.add(new CannedBinaryTokenStream.BinaryToken(term1, 0, 1));
+        }
+        return tokens.toArray(new CannedBinaryTokenStream.BinaryToken[0]);
+    }
 }