From 216f13541217b8505fe25838aa49d9dbbae9bf48 Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 11 Oct 2021 14:42:17 +0100 Subject: [PATCH] Wildcard field regex query fix (#78839) Fix for wildcard field query optimisation that rewrites to a match all for regexes like .* A bug was found in this complex rewrite logic so we have simplified the detection of .* type regexes by examining the Automaton for the regex rather than our parsed form of it which is expressed as a Lucene BooleanQuery. The old logic relied on a recursive "simplify" function on the BooleanQuery which has now been removed. We now rely on Lucene's query rewrite logic to simplify expressions at query time and consequently some of the tests had to change to do some of this rewriting before running test comparisons. Closes #78391 --- .../MatchAllButRequireVerificationQuery.java | 51 -------- .../wildcard/mapper/WildcardFieldMapper.java | 117 +++++------------- .../mapper/WildcardFieldMapperTests.java | 107 ++++++++++++++-- 3 files changed, 125 insertions(+), 150 deletions(-) delete mode 100644 x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/MatchAllButRequireVerificationQuery.java diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/MatchAllButRequireVerificationQuery.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/MatchAllButRequireVerificationQuery.java deleted file mode 100644 index 053d36c65c38e..0000000000000 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/MatchAllButRequireVerificationQuery.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License - * 2.0; you may not use this file except in compliance with the Elastic License - * 2.0. - */ - -package org.elasticsearch.xpack.wildcard.mapper; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.search.MatchAllDocsQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.QueryVisitor; - -import java.io.IOException; - -/** - * A query that matches all documents. The class is more of a marker - * that we encountered something that will need verification. - * (A MatchAllDocs query is used to indicate we can match all - * _without_ verification) - */ -public final class MatchAllButRequireVerificationQuery extends Query { - - @Override - public Query rewrite(IndexReader reader) throws IOException { - return new MatchAllDocsQuery(); - } - - @Override - public String toString(String field) { - return "*:* (tbc)"; - } - - @Override - public boolean equals(Object o) { - return sameClassAs(o); - } - - @Override - public int hashCode() { - return classHash(); - } - - @Override - public void visit(QueryVisitor visitor) { - visitor.visitLeaf(this); - } - - -} diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index b53f72c22e35f..8f9be13101abe 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -37,6 +37,8 @@ import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.MinimizationOperations; +import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.RegExp; import org.apache.lucene.util.automaton.RegExp.Kind; import org.elasticsearch.ElasticsearchParseException; @@ -348,26 +350,25 @@ public Query regexpQuery(String value, int syntaxFlags, int matchFlags, int maxD if (value.length() == 0) { return new MatchNoDocsQuery(); } + + //Check for simple "match all expressions e.g. .* + RegExp regExp = new RegExp(value, syntaxFlags, matchFlags); + Automaton a = regExp.toAutomaton(); + a = Operations.determinize(a, maxDeterminizedStates); + a = MinimizationOperations.minimize(a, maxDeterminizedStates); + if (Operations.isTotal(a)) { // Will match all + return existsQuery(context); + } RegExp ngramRegex = new RegExp(addLineEndChars(value), syntaxFlags, matchFlags); + Query approxBooleanQuery = toApproximationQuery(ngramRegex); Query approxNgramQuery = rewriteBoolToNgramQuery(approxBooleanQuery); - // MatchAll is a special case meaning the regex is known to match everything .* and - // there is no need for verification. - if (approxNgramQuery instanceof MatchAllDocsQuery) { - return existsQuery(context); - } RegExp regex = new RegExp(value, syntaxFlags, matchFlags); Automaton automaton = regex.toAutomaton(maxDeterminizedStates); - // MatchAllButRequireVerificationQuery is a special case meaning the regex is reduced to a single - // clause which we can't accelerate at all and needs verification. Example would be ".." - if (approxNgramQuery instanceof MatchAllButRequireVerificationQuery) { - return new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(), name(), value, automaton); - } - // We can accelerate execution with the ngram query return new BinaryDvConfirmedAutomatonQuery(approxNgramQuery, name(), value, automaton); } @@ -414,12 +415,12 @@ public static Query toApproximationQuery(RegExp r) throws IllegalArgumentExcepti // plain TermQuery objects together. Boolean queries are interpreted as a black box and not // concatenated. BooleanQuery.Builder wrapper = new BooleanQuery.Builder(); - wrapper.add(result, Occur.MUST); + wrapper.add(result, Occur.FILTER); result = wrapper.build(); } } else { // Expressions like (a){0,3} match empty string or up to 3 a's. - result = new MatchAllButRequireVerificationQuery(); + result = new MatchAllDocsQuery(); } break; case REGEXP_ANYSTRING: @@ -436,7 +437,7 @@ public static Query toApproximationQuery(RegExp r) throws IllegalArgumentExcepti case REGEXP_INTERVAL: case REGEXP_EMPTY: case REGEXP_AUTOMATON: - result = new MatchAllButRequireVerificationQuery(); + result = new MatchAllDocsQuery(); break; } assert result != null; // All regex types are understood and translated to a query. @@ -456,21 +457,21 @@ private static Query createConcatenationQuery(RegExp r) { sequence.append(tq.getTerm().text()); } else { if (sequence.length() > 0) { - bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.MUST); + bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.FILTER); sequence = new StringBuilder(); } - bAnd.add(query, Occur.MUST); + bAnd.add(query, Occur.FILTER); } } if (sequence.length() > 0) { - bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.MUST); + bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.FILTER); } BooleanQuery combined = bAnd.build(); if (combined.clauses().size() > 0) { return combined; } // There's something in the regex we couldn't represent as a query - resort to a match all with verification - return new MatchAllButRequireVerificationQuery(); + return new MatchAllDocsQuery(); } @@ -498,7 +499,7 @@ private static Query createUnionQuery(RegExp r) { } } // There's something in the regex we couldn't represent as a query - resort to a match all with verification - return new MatchAllButRequireVerificationQuery(); + return new MatchAllDocsQuery(); } private static void findLeaves(RegExp exp, Kind kind, List queries) { @@ -529,7 +530,7 @@ private Query rewriteBoolToNgramQuery(Query approxQuery) { for (BooleanClause clause : bq) { Query q = rewriteBoolToNgramQuery(clause.getQuery()); if (q != null) { - if (clause.getOccur().equals(Occur.MUST)) { + if (clause.getOccur().equals(Occur.FILTER)) { // Can't drop "should" clauses because it can elevate a sibling optional item // to mandatory (shoulds with 1 clause) causing false negatives // Dropping MUSTs increase false positives which are OK because are verified anyway. @@ -541,7 +542,7 @@ private Query rewriteBoolToNgramQuery(Query approxQuery) { rewritten.add(q, clause.getOccur()); } } - return simplify(rewritten.build()); + return rewritten.build(); } if (approxQuery instanceof TermQuery) { TermQuery tq = (TermQuery) approxQuery; @@ -549,7 +550,7 @@ private Query rewriteBoolToNgramQuery(Query approxQuery) { //Remove simple terms that are only string beginnings or ends. String s = tq.getTerm().text(); if (s.equals(WildcardFieldMapper.TOKEN_START_STRING) || s.equals(WildcardFieldMapper.TOKEN_END_STRING)) { - return new MatchAllButRequireVerificationQuery(); + return new MatchAllDocsQuery(); } // Break term into tokens @@ -557,75 +558,15 @@ private Query rewriteBoolToNgramQuery(Query approxQuery) { getNgramTokens(tokens, s); BooleanQuery.Builder rewritten = new BooleanQuery.Builder(); for (String string : tokens) { - addClause(string, rewritten, Occur.MUST); + addClause(string, rewritten, Occur.FILTER); } - return simplify(rewritten.build()); + return rewritten.build(); } - if (isMatchAll(approxQuery)) { + if (approxQuery instanceof MatchAllDocsQuery) { return approxQuery; } throw new IllegalStateException("Invalid query type found parsing regex query:" + approxQuery); } - - static Query simplify(Query input) { - if (input instanceof BooleanQuery == false) { - return input; - } - BooleanQuery result = (BooleanQuery) input; - if (result.clauses().size() == 0) { - // A ".*" clause can produce zero clauses in which case we return MatchAll - return new MatchAllDocsQuery(); - } - if (result.clauses().size() == 1) { - return simplify(result.clauses().get(0).getQuery()); - } - - // We may have a mix of MatchAll and concrete queries - assess if we can simplify - int matchAllCount = 0; - int verifyCount = 0; - boolean allConcretesAreOptional = true; - for (BooleanClause booleanClause : result.clauses()) { - Query q = booleanClause.getQuery(); - if (q instanceof MatchAllDocsQuery) { - matchAllCount++; - } else if (q instanceof MatchAllButRequireVerificationQuery) { - verifyCount++; - } else { - // Concrete query - if (booleanClause.getOccur() != Occur.SHOULD) { - allConcretesAreOptional = false; - } - } - } - - if ((allConcretesAreOptional && matchAllCount > 0)) { - // Any match all expression takes precedence over all optional concrete queries. - return new MatchAllDocsQuery(); - } - - if ((allConcretesAreOptional && verifyCount > 0)) { - // Any match all expression that needs verification takes precedence over all optional concrete queries. - return new MatchAllButRequireVerificationQuery(); - } - - // We have some mandatory concrete queries - strip out the superfluous match all expressions - if (allConcretesAreOptional == false && matchAllCount + verifyCount > 0) { - BooleanQuery.Builder rewritten = new BooleanQuery.Builder(); - for (BooleanClause booleanClause : result.clauses()) { - if (isMatchAll(booleanClause.getQuery()) == false) { - rewritten.add(booleanClause); - } - } - return simplify(rewritten.build()); - } - return result; - } - - - static boolean isMatchAll(Query q) { - return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery; - } - protected void getNgramTokens(Set tokens, String fragment) { if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) { // If a regex is a form of match-all e.g. ".*" we only produce the token start/end markers as search @@ -666,7 +607,7 @@ private void addClause(String token, BooleanQuery.Builder bqBuilder, Occur occur if (tokenSize < 2 || token.equals(WildcardFieldMapper.TOKEN_END_STRING)) { // there's something concrete to be searched but it's too short // Require verification. - bqBuilder.add(new BooleanClause(new MatchAllButRequireVerificationQuery(), occur)); + bqBuilder.add(new BooleanClause(new MatchAllDocsQuery(), occur)); return; } if (tokenSize == NGRAM_SIZE) { @@ -723,11 +664,11 @@ public Query rangeQuery( if (tokenSize == NGRAM_SIZE) { TermQuery tq = new TermQuery(new Term(name(), token)); - bqBuilder.add(new BooleanClause(tq, Occur.MUST)); + bqBuilder.add(new BooleanClause(tq, Occur.FILTER)); } else { PrefixQuery wq = new PrefixQuery(new Term(name(), token)); wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE); - bqBuilder.add(new BooleanClause(wq, Occur.MUST)); + bqBuilder.add(new BooleanClause(wq, Occur.FILTER)); } } BooleanQuery bq = bqBuilder.build(); diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java index 737bce58c9e4a..d1dfb9f95ec3b 100644 --- a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java @@ -19,8 +19,10 @@ import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.DocValuesFieldExistsQuery; import org.apache.lucene.search.IndexSearcher; @@ -34,6 +36,7 @@ import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.store.BaseDirectoryWrapper; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; @@ -97,6 +100,8 @@ static SearchExecutionContext createMockSearchExecutionContext(boolean allowExpe static WildcardFieldMapper wildcardFieldType; static WildcardFieldMapper wildcardFieldType79; static KeywordFieldMapper keywordFieldType; + private DirectoryReader rewriteReader; + private BaseDirectoryWrapper rewriteDir; @Override protected Collection getPlugins() { @@ -120,8 +125,38 @@ public void setUp() throws Exception { org.elasticsearch.index.mapper.KeywordFieldMapper.Builder kwBuilder = new KeywordFieldMapper.Builder(KEYWORD_FIELD_NAME); keywordFieldType = kwBuilder.build(MapperBuilderContext.ROOT); + + rewriteDir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER_7_10); + RandomIndexWriter iw = new RandomIndexWriter(random(), rewriteDir, iwc); + + // Create a string that is too large and will not be indexed + String docContent = "a"; + Document doc = new Document(); + LuceneDocument parseDoc = new LuceneDocument(); + addFields(parseDoc, doc, docContent); + indexDoc(parseDoc, doc, iw); + + iw.forceMerge(1); + rewriteReader = iw.getReader(); + iw.close(); + super.setUp(); } + + + + @Override + public void tearDown() throws Exception { + try { + rewriteReader.close(); + rewriteDir.close(); + } catch (Exception ignoreCloseFailure) { + // allow any superclass tear down logic to continue + } + // TODO Auto-generated method stub + super.tearDown(); + } public void testTooBigKeywordField() throws IOException { Directory dir = newDirectory(); @@ -484,10 +519,10 @@ public void testRangeQueryVersusKeywordField() throws IOException { public void testRegexAcceleration() throws IOException, ParseException { // All these expressions should rewrite to a match all with no verification step required at all - String superfastRegexes[]= { ".*", "...*..", "(foo|bar|.*)", "@"}; + String superfastRegexes[]= { ".*", "(foo|bar|.*)", "@"}; for (String regex : superfastRegexes) { Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 0, 20000, null, MOCK_CONTEXT); - assertTrue(wildcardFieldQuery instanceof DocValuesFieldExistsQuery); + assertTrue(regex + "should have been accelerated", wildcardFieldQuery instanceof DocValuesFieldExistsQuery); } String matchNoDocsRegexes[]= { ""}; for (String regex : matchNoDocsRegexes) { @@ -518,12 +553,14 @@ public void testRegexAcceleration() throws IOException, ParseException { // All these expressions should rewrite to just the verification query (there's no ngram acceleration) // TODO we can possibly improve on some of these - String matchAllButVerifyTests[]= { "..", "(a)?","(a|b){0,3}", "((foo)?|(foo|bar)?)", "@&~(abc.+)", "aaa.+&.+bbb"}; + String matchAllButVerifyTests[]= { "..", "(a)?","(a|b){0,3}", "((foo)?|(foo|bar)?)", "@&~(abc.+)", "aaa.+&.+bbb", "a*", "...*.."}; for (String regex : matchAllButVerifyTests) { Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 0, 20000, null, MOCK_CONTEXT); BinaryDvConfirmedAutomatonQuery q = (BinaryDvConfirmedAutomatonQuery)wildcardFieldQuery; + Query approximationQuery = unwrapAnyBoost(q.getApproximationQuery()); + approximationQuery = getSimplifiedApproximationQuery(q.getApproximationQuery()); assertTrue(regex +" was not a pure verify query " +formatQuery(wildcardFieldQuery), - q.getApproximationQuery() instanceof MatchAllDocsQuery); + approximationQuery instanceof MatchAllDocsQuery); } @@ -532,8 +569,8 @@ public void testRegexAcceleration() throws IOException, ParseException { String suboptimalTests[][] = { // TODO short wildcards like a* OR b* aren't great so we just drop them. // Ideally we would attach to successors to create (acd OR bcd) - { "[ab]cd", "+cc_ +c__"} - }; + { "[ab]cd", "+(+cc_ +c__) +*:*"} + }; for (String[] test : suboptimalTests) { String regex = test[0]; String expectedAccelerationQueryString = test[1].replaceAll("_", ""+WildcardFieldMapper.TOKEN_START_OR_END_CHAR); @@ -569,7 +606,7 @@ public void testWildcardAcceleration() throws IOException, ParseException { { "foo*bar", "+_eo +eoo +aaq +aq_ +q__" }, { "foo?bar", "+_eo +eoo +aaq +aq_ +q__" }, { "?foo*bar?", "+eoo +aaq" }, - { "*c", "+c__" } }; + { "*c", "c__" } }; for (String[] test : tests) { String pattern = test[0]; String expectedAccelerationQueryString = test[1].replaceAll("_", "" + WildcardFieldMapper.TOKEN_START_OR_END_CHAR); @@ -714,7 +751,7 @@ public void testFuzzyAcceleration() throws IOException, ParseException { }; for (FuzzyTest test : tests) { Query wildcardFieldQuery = test.getFuzzyQuery(); - testExpectedAccelerationQuery(test.pattern, wildcardFieldQuery, test.getExpectedApproxQuery()); + testExpectedAccelerationQuery(test.pattern, wildcardFieldQuery, getSimplifiedApproximationQuery(test.getExpectedApproxQuery())); } } @@ -765,7 +802,8 @@ public void testRangeAcceleration() throws IOException, ParseException { } } - void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException { + void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException, + IOException { QueryParser qsp = new QueryParser(WILDCARD_FIELD_NAME, new KeywordAnalyzer()); Query expectedAccelerationQuery = qsp.parse(expectedAccelerationQueryString); @@ -780,16 +818,63 @@ private Query unwrapAnyConstantScore(Query q) { return q; } } + private Query unwrapAnyBoost(Query q) { + if (q instanceof BoostQuery) { + BoostQuery csq = (BoostQuery) q; + return csq.getQuery(); + } else { + return q; + } + } + - void testExpectedAccelerationQuery(String regex, Query combinedQuery, Query expectedAccelerationQuery) throws ParseException { + void testExpectedAccelerationQuery(String regex, Query combinedQuery, Query expectedAccelerationQuery) throws ParseException, + IOException { BinaryDvConfirmedAutomatonQuery cq = (BinaryDvConfirmedAutomatonQuery) unwrapAnyConstantScore(combinedQuery); Query approximationQuery = cq.getApproximationQuery(); - + approximationQuery = getSimplifiedApproximationQuery(approximationQuery); String message = "regex: "+ regex +"\nactual query: " + formatQuery(approximationQuery) + "\nexpected query: " + formatQuery(expectedAccelerationQuery) + "\n"; assertEquals(message, expectedAccelerationQuery, approximationQuery); } + // For comparison purposes rewrite and unwrap various superfluous parts to get to raw logic + protected Query getSimplifiedApproximationQuery(Query approximationQuery) throws IOException { + int numRewrites = 0; + int maxNumRewrites = 100; + for (; numRewrites < maxNumRewrites; numRewrites++) { + Query newApprox = approximationQuery.rewrite(rewriteReader); + if (newApprox == approximationQuery) { + break; + } + approximationQuery = newApprox; + + } + assertTrue(numRewrites < maxNumRewrites); + approximationQuery = rewriteFiltersToMustsForComparisonPurposes(approximationQuery); + return approximationQuery; + } + + private Query rewriteFiltersToMustsForComparisonPurposes(Query q) { + q = unwrapAnyBoost(q); + q = unwrapAnyConstantScore(q); + if (q instanceof BooleanQuery) { + BooleanQuery.Builder result = new BooleanQuery.Builder(); + BooleanQuery bq = (BooleanQuery)q; + for (BooleanClause cq : bq.clauses()){ + Query rewritten = rewriteFiltersToMustsForComparisonPurposes(cq.getQuery()); + if(cq.getOccur() == Occur.FILTER) { + result.add(rewritten, Occur.MUST); + } else { + result.add(rewritten, cq.getOccur()); + } + } + return result.build(); + } + + return q; + } + private String getRandomFuzzyPattern(HashSet values, int edits, int prefixLength) { assert edits >=0 && edits <=2; // Pick one of the indexed document values to focus our queries on.