Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a limit for graph phrase query expansion #34031

Merged
merged 3 commits into from
Sep 25, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
Expand All @@ -57,11 +58,14 @@
import org.elasticsearch.index.query.support.QueryParsers;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import static org.elasticsearch.common.lucene.search.Queries.newLenientFieldQuery;
import static org.elasticsearch.common.lucene.search.Queries.newUnmappedFieldQuery;

public class MatchQuery {

public enum Type implements Writeable {
/**
Expand Down Expand Up @@ -503,6 +507,82 @@ private Query boolToExtendedCommonTermsQuery(BooleanQuery bq,
}
return query;
}

/**
 * Overrides {@link QueryBuilder#analyzeGraphPhrase(TokenStream, String, int)} to add
 * a limit (see {@link BooleanQuery#getMaxClauseCount()}) to the number of {@link SpanQuery}
 * that this method can create.
 *
 * The token graph is split at its articulation points into independent segments;
 * each segment contributes one clause to the final {@link SpanNearQuery}. Segments
 * that contain a side path are expanded into every finite string through the
 * segment (a {@link SpanOrQuery} over one {@link SpanQuery} per path); segments
 * without side paths become a single term (or a {@link SpanOrQuery} over the
 * stacked terms at that position). Each expansion site is capped at
 * {@link BooleanQuery#getMaxClauseCount()} and throws
 * {@link BooleanQuery.TooManyClauses} when the cap is reached.
 *
 * TODO Remove when https://issues.apache.org/jira/browse/LUCENE-8479 is fixed.
 */
@Override
protected SpanQuery analyzeGraphPhrase(TokenStream source, String field, int phraseSlop) throws IOException {
    source.reset();
    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);
    List<SpanQuery> clauses = new ArrayList<>();
    int[] articulationPoints = graph.articulationPoints();
    int lastState = 0;
    // Snapshot the global limit once; every size check below enforces it.
    int maxBooleanClause = BooleanQuery.getMaxClauseCount();
    // Iterate one past the articulation points so the trailing segment
    // (from the last articulation point to the end of the graph) is included;
    // end == -1 signals "to the end" on that final iteration.
    for (int i = 0; i <= articulationPoints.length; i++) {
        int start = lastState;
        int end = -1;
        if (i < articulationPoints.length) {
            end = articulationPoints[i];
        }
        lastState = end;
        final SpanQuery queryPos;
        if (graph.hasSidePath(start)) {
            // Segment with a side path: expand every finite string through it,
            // bailing out before the expansion exceeds the clause limit.
            List<SpanQuery> queries = new ArrayList<>();
            Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
            while (it.hasNext()) {
                TokenStream ts = it.next();
                SpanQuery q = createSpanQuery(ts, field);
                if (q != null) {
                    if (queries.size() >= maxBooleanClause) {
                        throw new BooleanQuery.TooManyClauses();
                    }
                    queries.add(q);
                }
            }
            if (queries.size() > 0) {
                queryPos = new SpanOrQuery(queries.toArray(new SpanQuery[0]));
            } else {
                // createSpanQuery returned null for every path (e.g. all tokens
                // filtered out); skip this segment entirely.
                queryPos = null;
            }
        } else {
            // Linear segment: one position, possibly with stacked terms.
            Term[] terms = graph.getTerms(field, start);
            assert terms.length > 0;
            if (terms.length >= maxBooleanClause) {
                throw new BooleanQuery.TooManyClauses();
            }
            if (terms.length == 1) {
                queryPos = new SpanTermQuery(terms[0]);
            } else {
                // Stacked terms (synonyms) become a disjunction at this position.
                SpanTermQuery[] orClauses = new SpanTermQuery[terms.length];
                for (int idx = 0; idx < terms.length; idx++) {
                    orClauses[idx] = new SpanTermQuery(terms[idx]);
                }

                queryPos = new SpanOrQuery(orClauses);
            }
        }

        if (queryPos != null) {
            // The outer clause list is bounded by the same limit.
            if (clauses.size() >= maxBooleanClause) {
                throw new BooleanQuery.TooManyClauses();
            }
            clauses.add(queryPos);
        }
    }

    if (clauses.isEmpty()) {
        return null;
    } else if (clauses.size() == 1) {
        return clauses.get(0);
    } else {
        // true => in-order matching, as required for phrase semantics.
        return new SpanNearQuery(clauses.toArray(new SpanQuery[0]), phraseSlop, true);
    }
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@

package org.elasticsearch.index.query;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedBinaryTokenStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.ExtendedCommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
Expand All @@ -30,6 +35,7 @@
import org.apache.lucene.search.PointRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.common.Strings;
Expand All @@ -46,6 +52,8 @@
import org.hamcrest.Matcher;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
Expand Down Expand Up @@ -392,4 +400,72 @@ public void testLenientPhraseQuery() throws Exception {
assertThat(query.toString(),
containsString("field:[string_no_pos] was indexed without position data; cannot run PhraseQuery"));
}

/**
 * Phrase analysis over a huge token graph must fail fast with
 * {@link BooleanQuery.TooManyClauses} rather than expanding without bound.
 */
public void testMaxBooleanClause() {
    final MatchQuery matchQuery = new MatchQuery(createShardContext());
    // Case 1: combinatorial explosion from side paths at every other position.
    matchQuery.setAnalyzer(new MockGraphAnalyzer(createGiantGraph(40)));
    expectThrows(BooleanQuery.TooManyClauses.class,
        () -> matchQuery.parse(Type.PHRASE, STRING_FIELD_NAME, ""));
    // Case 2: max-clause-count stacked terms at a single position.
    matchQuery.setAnalyzer(new MockGraphAnalyzer(createGiantGraphMultiTerms()));
    expectThrows(BooleanQuery.TooManyClauses.class,
        () -> matchQuery.parse(Type.PHRASE, STRING_FIELD_NAME, ""));
}

/**
 * Analyzer whose token stream is replaced by a canned binary token graph,
 * regardless of the text being analyzed.
 */
private static class MockGraphAnalyzer extends Analyzer {
    private final CannedBinaryTokenStream.BinaryToken[] tokens;

    private MockGraphAnalyzer(CannedBinaryTokenStream.BinaryToken[] tokens) {
        this.tokens = tokens;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
        return new TokenStreamComponents(tokenizer) {
            @Override
            public TokenStream getTokenStream() {
                // Discard the tokenizer's output and emit the canned graph instead.
                return new CannedBinaryTokenStream(tokens);
            }

            @Override
            protected void setReader(final Reader reader) {
                // Intentionally empty: the input text is irrelevant.
            }
        };
    }
}

/**
 * Creates a graph token stream with 2 side paths at each position.
 *
 * @param numPos number of positions to cover (side paths span two positions each)
 **/
private static CannedBinaryTokenStream.BinaryToken[] createGiantGraph(int numPos) {
    final BytesRef skipTerm = new BytesRef("foo");  // length-2 side path
    final BytesRef stepTerm = new BytesRef("bar");  // length-1 step
    final List<CannedBinaryTokenStream.BinaryToken> tokens = new ArrayList<>();
    int pos = 0;
    while (pos < numPos) {
        if (pos % 2 == 0) {
            // Even position: a one-step token plus a side path skipping two positions.
            tokens.add(new CannedBinaryTokenStream.BinaryToken(stepTerm, 1, 1));
            tokens.add(new CannedBinaryTokenStream.BinaryToken(skipTerm, 0, 2));
            pos += 2;
        } else {
            tokens.add(new CannedBinaryTokenStream.BinaryToken(stepTerm, 1, 1));
            pos += 1;
        }
    }
    return tokens.toArray(new CannedBinaryTokenStream.BinaryToken[0]);
}

/**
 * Creates a graph token stream with {@link BooleanQuery#getMaxClauseCount()}
 * expansions at the last position.
 **/
private static CannedBinaryTokenStream.BinaryToken[] createGiantGraphMultiTerms() {
    final BytesRef fooTerm = new BytesRef("foo");
    final BytesRef barTerm = new BytesRef("bar");
    final List<CannedBinaryTokenStream.BinaryToken> tokens = new ArrayList<>();
    // Short prefix graph: one side path over the first two positions, then two steps.
    tokens.add(new CannedBinaryTokenStream.BinaryToken(barTerm, 1, 1));
    tokens.add(new CannedBinaryTokenStream.BinaryToken(fooTerm, 0, 2));
    tokens.add(new CannedBinaryTokenStream.BinaryToken(barTerm, 1, 1));
    tokens.add(new CannedBinaryTokenStream.BinaryToken(barTerm, 1, 1));
    // Stack getMaxClauseCount() terms on the final position to hit the limit.
    for (int i = 0; i < BooleanQuery.getMaxClauseCount(); i++) {
        tokens.add(new CannedBinaryTokenStream.BinaryToken(fooTerm, 0, 1));
    }
    return tokens.toArray(new CannedBinaryTokenStream.BinaryToken[0]);
}
}