Skip to content

Commit

Permalink
Mask wildcard query special characters on keyword queries (#53127) (#…
Browse files Browse the repository at this point in the history
…53512)

Wildcard queries on keyword fields get normalized, however this normalization
step should exclude the two special characters * and ? in order to keep the
wildcard query itself intact.

Closes #46300
  • Loading branch information
Christoph Büscher authored Mar 24, 2020
1 parent 6b457ab commit 1c1730f
Show file tree
Hide file tree
Showing 6 changed files with 185 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ protected KeywordFieldType(KeywordFieldType ref) {
this.splitQueriesOnWhitespace = ref.splitQueriesOnWhitespace;
}

@Override
public KeywordFieldType clone() {
return new KeywordFieldType(this);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@

import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
Expand All @@ -31,13 +29,16 @@
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.support.QueryParsers;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES;

Expand All @@ -47,6 +48,8 @@
* can be implemented. */
public abstract class StringFieldType extends TermBasedFieldType {

private static final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)");

public StringFieldType() {}

protected StringFieldType(MappedFieldType ref) {
Expand Down Expand Up @@ -92,16 +95,41 @@ public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, Quer

@Override
public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
Query termQuery = termQuery(value, context);
if (termQuery instanceof MatchNoDocsQuery || termQuery instanceof MatchAllDocsQuery) {
return termQuery;
}

failIfNotIndexed();
if (context.allowExpensiveQueries() == false) {
throw new ElasticsearchException("[wildcard] queries cannot be executed when '" +
ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false.");
}
Term term = MappedFieldType.extractTerm(termQuery);

Term term;
if (searchAnalyzer() != null) {
// we want to normalize everything except wildcard characters, e.g. F?o Ba* to f?o ba*, even if e.g there
// is a char_filter that would otherwise remove them
Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value);
BytesRefBuilder sb = new BytesRefBuilder();
int last = 0;

while (wildcardMatcher.find()) {
if (wildcardMatcher.start() > 0) {
String chunk = value.substring(last, wildcardMatcher.start());

BytesRef normalized = searchAnalyzer().normalize(name(), chunk);
sb.append(normalized);
}
// append the matched group - without normalizing
sb.append(new BytesRef(wildcardMatcher.group()));

last = wildcardMatcher.end();
}
if (last < value.length()) {
String chunk = value.substring(last);
BytesRef normalized = searchAnalyzer().normalize(name(), chunk);
sb.append(normalized);
}
term = new Term(name(), sb.toBytesRef());
} else {
term = new Term(name(), indexedValueForSearch(value));
}

WildcardQuery query = new WildcardQuery(term);
QueryParsers.setRewriteMethod(query, method);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,21 @@
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.search.Queries;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.plain.ConstantIndexFieldData;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.support.QueryParsers;

import java.io.IOException;
import java.util.Arrays;
Expand All @@ -51,6 +55,8 @@
import java.util.Set;
import java.util.function.Function;

import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES;

public class TypeFieldMapper extends MetadataFieldMapper {

public static final String NAME = "_type";
Expand Down Expand Up @@ -170,6 +176,25 @@ public Query rangeQuery(Object lowerTerm, Object upperTerm, boolean includeLower
}
return result;
}

@Override
public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
Query termQuery = termQuery(value, context);
if (termQuery instanceof MatchNoDocsQuery || termQuery instanceof MatchAllDocsQuery) {
return termQuery;
}

if (context.allowExpensiveQueries() == false) {
throw new ElasticsearchException("[wildcard] queries cannot be executed when '" +
ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false.");
}
Term term = MappedFieldType.extractTerm(termQuery);

WildcardQuery query = new WildcardQuery(term);
QueryParsers.setRewriteMethod(query, method);
return query;
}

}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -560,13 +560,6 @@ public void testParseRelation() {
assertEquals(ShapeRelation.INTERSECTS, builder.relation());
}

public void testTypeField() throws IOException {
RangeQueryBuilder builder = QueryBuilders.rangeQuery("_type")
.from("value1");
builder.doToQuery(createShardContext());
assertWarnings(QueryShardContext.TYPES_DEPRECATION_MESSAGE);
}

/**
* Range queries should generally be cacheable, at least the ones we create randomly.
* This test makes sure we also test the non-cacheable cases regularly.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

import java.io.IOException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

import static org.hamcrest.Matchers.equalTo;
Expand Down Expand Up @@ -75,7 +76,9 @@ protected void doAssertLuceneQuery(WildcardQueryBuilder queryBuilder, Query quer

assertThat(wildcardQuery.getField(), equalTo(expectedFieldName));
assertThat(wildcardQuery.getTerm().field(), equalTo(expectedFieldName));
assertThat(wildcardQuery.getTerm().text(), equalTo(queryBuilder.value()));
// wildcard queries get normalized
String text = wildcardQuery.getTerm().text().toLowerCase(Locale.ROOT);
assertThat(text, equalTo(text));
} else {
Query expected = new MatchNoDocsQuery("unknown field [" + expectedFieldName + "]");
assertEquals(expected, query);
Expand Down Expand Up @@ -138,14 +141,14 @@ public void testTypeField() throws IOException {
builder.doToQuery(createShardContext());
assertWarnings(QueryShardContext.TYPES_DEPRECATION_MESSAGE);
}

public void testRewriteIndexQueryToMatchNone() throws IOException {
WildcardQueryBuilder query = new WildcardQueryBuilder("_index", "does_not_exist");
QueryShardContext queryShardContext = createShardContext();
QueryBuilder rewritten = query.rewrite(queryShardContext);
assertThat(rewritten, instanceOf(MatchNoneQueryBuilder.class));
}
}

public void testRewriteIndexQueryNotMatchNone() throws IOException {
String fullIndexName = getIndex().getName();
String firstHalfOfIndexName = fullIndexName.substring(0,fullIndexName.length()/2);
Expand Down
118 changes: 116 additions & 2 deletions server/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

package org.elasticsearch.search.query;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.join.ScoreMode;
Expand All @@ -32,11 +34,15 @@
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.document.DocumentField;
import org.elasticsearch.common.lucene.search.SpanBooleanQueryRewriteWithMaxClause;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.time.DateFormatter;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.NormalizingCharFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MatchQueryBuilder;
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
Expand All @@ -45,11 +51,14 @@
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.RangeQueryBuilder;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.index.query.WildcardQueryBuilder;
import org.elasticsearch.index.query.WrapperQueryBuilder;
import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders;
import org.elasticsearch.index.search.MatchQuery;
import org.elasticsearch.indices.IndicesService;
import org.elasticsearch.indices.TermsLookup;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.search.SearchHit;
Expand All @@ -60,16 +69,20 @@
import org.elasticsearch.test.junit.annotations.TestIssueLogging;

import java.io.IOException;
import java.io.Reader;
import java.time.Instant;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.ExecutionException;
import java.util.regex.Pattern;

import static java.util.Collections.singletonMap;
import static org.elasticsearch.action.support.WriteRequest.RefreshPolicy.IMMEDIATE;
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
Expand Down Expand Up @@ -120,7 +133,7 @@ public class SearchQueryIT extends ESIntegTestCase {

@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
return Collections.singleton(InternalSettingsPlugin.class);
return Arrays.asList(InternalSettingsPlugin.class, MockAnalysisPlugin.class);
}

@Override
Expand Down Expand Up @@ -1897,6 +1910,107 @@ public void testFieldAliasesForMetaFields() throws Exception {

}

/**
* Test that wildcard queries on keyword fields get normalized
*/
public void testWildcardQueryNormalizationOnKeywordField() {
assertAcked(prepareCreate("test")
.setSettings(Settings.builder()
.put("index.analysis.normalizer.lowercase_normalizer.type", "custom")
.putList("index.analysis.normalizer.lowercase_normalizer.filter", "lowercase")
.build())
.addMapping("_doc", "field1", "type=keyword,normalizer=lowercase_normalizer"));
client().prepareIndex("test", "_doc", "1").setSource("field1", "Bbb Aaa").get();
refresh();

{
WildcardQueryBuilder wildCardQuery = wildcardQuery("field1", "Bb*");
SearchResponse searchResponse = client().prepareSearch().setQuery(wildCardQuery).get();
assertHitCount(searchResponse, 1L);

wildCardQuery = wildcardQuery("field1", "bb*");
searchResponse = client().prepareSearch().setQuery(wildCardQuery).get();
assertHitCount(searchResponse, 1L);
}
}

/**
* Test that wildcard queries on text fields get normalized
*/
public void testWildcardQueryNormalizationOnTextField() {
assertAcked(prepareCreate("test")
.setSettings(Settings.builder()
.put("index.analysis.analyzer.lowercase_analyzer.type", "custom")
.put("index.analysis.analyzer.lowercase_analyzer.tokenizer", "standard")
.putList("index.analysis.analyzer.lowercase_analyzer.filter", "lowercase")
.build())
.addMapping("_doc", "field1", "type=text,analyzer=lowercase_analyzer"));
client().prepareIndex("test", "_doc", "1").setSource("field1", "Bbb Aaa").get();
refresh();

{
WildcardQueryBuilder wildCardQuery = wildcardQuery("field1", "Bb*");
SearchResponse searchResponse = client().prepareSearch().setQuery(wildCardQuery).get();
assertHitCount(searchResponse, 1L);

wildCardQuery = wildcardQuery("field1", "bb*");
searchResponse = client().prepareSearch().setQuery(wildCardQuery).get();
assertHitCount(searchResponse, 1L);
}
}

/**
* Reserved characters should be excluded when the normalization is applied for keyword fields.
* See https://github.com/elastic/elasticsearch/issues/46300 for details.
*/
public void testWildcardQueryNormalizationKeywordSpecialCharacters() {
assertAcked(prepareCreate("test")
.setSettings(Settings.builder().put("index.analysis.char_filter.no_wildcard.type", "mock_pattern_replace")
.put("index.analysis.normalizer.no_wildcard.type", "custom")
.put("index.analysis.normalizer.no_wildcard.char_filter", "no_wildcard").build())
.addMapping("_doc", "field", "type=keyword,normalizer=no_wildcard"));
client().prepareIndex("test", "_doc", "1").setSource("field", "label-1").get();
refresh();

WildcardQueryBuilder wildCardQuery = wildcardQuery("field", "la*");
SearchResponse searchResponse = client().prepareSearch().setQuery(wildCardQuery).get();
assertHitCount(searchResponse, 1L);

wildCardQuery = wildcardQuery("field", "la*el-?");
searchResponse = client().prepareSearch().setQuery(wildCardQuery).get();
assertHitCount(searchResponse, 1L);
}

public static class MockAnalysisPlugin extends Plugin implements AnalysisPlugin {

@Override
public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
return singletonMap("mock_pattern_replace", (indexSettings, env, name, settings) -> {
class Factory implements NormalizingCharFilterFactory {

private final Pattern pattern = Regex.compile("[\\*\\?]", null);

@Override
public String name() {
return name;
}

@Override
public Reader create(Reader reader) {
return new PatternReplaceCharFilter(pattern, "", reader);
}
}
return new Factory();
});
}

@Override
public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
return singletonMap("keyword", (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory(name,
() -> new MockTokenizer(MockTokenizer.KEYWORD, false)));
}
}

/**
* Test correct handling {@link SpanBooleanQueryRewriteWithMaxClause#rewrite(IndexReader, MultiTermQuery)}. That rewrite method is e.g.
* set for fuzzy queries with "constant_score" rewrite nested inside a `span_multi` query and would cause NPEs due to an unset
Expand Down

0 comments on commit 1c1730f

Please sign in to comment.