Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bug] fix case sensitivity for wildcard queries #5462

Merged
merged 5 commits into from
Dec 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
### Removed
### Fixed
- Fix 1.x compatibility bug with stored Tasks ([#5412](https://github.com/opensearch-project/OpenSearch/pull/5412))
- Fix case sensitivity for wildcard queries ([#5462](https://github.com/opensearch-project/OpenSearch/pull/5462))
### Security

[Unreleased 3.0]: https://github.com/opensearch-project/OpenSearch/compare/2.4...HEAD
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,39 @@ public void testKeywordWithWhitespace() throws Exception {
assertHitCount(resp, 3L);
}

/**
 * Indexes two documents and runs regex ({@code /.../}) query string queries against the
 * {@code message} field, checking that upper-case and lower-case patterns return the same hits.
 */
public void testRegexCaseInsensitivity() throws Exception {
    createIndex("messages");
    // doc 1 holds the upper-case token "TLS", doc 2 the lower-case token "tcp"; both contain "handshake"
    List<IndexRequestBuilder> docs = new ArrayList<>();
    docs.add(client().prepareIndex("messages").setId("1").setSource("message", "message: this is a TLS handshake"));
    docs.add(client().prepareIndex("messages").setId("2").setSource("message", "message: this is a tcp handshake"));
    indexRandom(true, false, docs);

    // each row is { regex query string, id of the single document expected to match }
    String[][] singleHitCases = { { "/TLS/", "1" }, { "/tls/", "1" }, { "/TCP/", "2" }, { "/tcp/", "2" } };
    for (String[] singleHitCase : singleHitCases) {
        SearchResponse single = client().prepareSearch("messages")
            .setQuery(queryStringQuery(singleHitCase[0]).defaultField("message"))
            .get();
        assertNoFailures(single);
        assertHitCount(single, 1);
        assertHits(single.getHits(), singleHitCase[1]);
    }

    // an upper-case pattern for the token shared by both documents is expected to return both
    SearchResponse both = client().prepareSearch("messages")
        .setQuery(queryStringQuery("/HANDSHAKE/").defaultField("message"))
        .get();
    assertNoFailures(both);
    assertHitCount(both, 2);
    assertHits(both.getHits(), "1", "2");
}

public void testAllFields() throws Exception {
String indexBody = copyToStringFromClasspath("/org/opensearch/search/query/all-query-index.json");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,15 @@
import java.time.format.DateTimeFormatter;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.regex.Pattern;

import static java.util.Collections.singletonMap;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.opensearch.action.support.WriteRequest.RefreshPolicy.IMMEDIATE;
import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS;
import static org.opensearch.common.xcontent.XContentFactory.jsonBuilder;
Expand Down Expand Up @@ -2089,8 +2092,14 @@ public void testWildcardQueryNormalizationOnTextField() {
refresh();

{
// test default case insensitivity: false
WildcardQueryBuilder wildCardQuery = wildcardQuery("field1", "Bb*");
SearchResponse searchResponse = client().prepareSearch().setQuery(wildCardQuery).get();
assertHitCount(searchResponse, 0L);

// test case insensitivity set to true
wildCardQuery = wildcardQuery("field1", "Bb*").caseInsensitive(true);
searchResponse = client().prepareSearch().setQuery(wildCardQuery).get();
assertHitCount(searchResponse, 1L);

wildCardQuery = wildcardQuery("field1", "bb*");
Expand All @@ -2099,6 +2108,24 @@ public void testWildcardQueryNormalizationOnTextField() {
}
}

/**
 * Checks that a wildcard query on a {@code text} field honours the {@code caseInsensitive} flag:
 * the capitalized pattern misses the lower-case document when the flag is {@code false} and
 * matches it when the flag is {@code true}.
 */
public void testWildcardCaseSensitivity() {
    assertAcked(prepareCreate("test").setMapping("field", "type=text"));
    client().prepareIndex("test").setId("1").setSource("field", "lowercase text").get();
    refresh();

    // capitalized on purpose: the indexed document only contains lower-case text
    final String pattern = "Text";

    // case sensitive: no hit expected
    SearchResponse caseSensitive = client().prepareSearch("test")
        .setQuery(wildcardQuery("field", pattern).caseInsensitive(false))
        .get();
    assertNoFailures(caseSensitive);
    assertHitCount(caseSensitive, 0);

    // case insensitive: the single document is expected
    SearchResponse caseInsensitive = client().prepareSearch("test")
        .setQuery(wildcardQuery("field", pattern).caseInsensitive(true))
        .get();
    assertNoFailures(caseInsensitive);
    assertHitCount(caseInsensitive, 1);
    assertHits(caseInsensitive.getHits(), "1");
}

/**
* Reserved characters should be excluded when the normalization is applied for keyword fields.
* See https://github.com/elastic/elasticsearch/issues/46300 for details.
Expand Down Expand Up @@ -2175,4 +2202,16 @@ public void testIssueFuzzyInsideSpanMulti() {
SearchResponse response = client().prepareSearch("test").setQuery(query).get();
assertHitCount(response, 1);
}

/**
 * Asserts that the total hit count equals the number of expected ids and that the ids of the
 * returned hits are exactly the expected ones, in any order.
 *
 * @param hits the search hits to inspect
 * @param ids  the document ids expected in the hits
 */
private void assertHits(SearchHits hits, String... ids) {
    final long expectedTotal = ids.length;
    assertThat(hits.getTotalHits().value, equalTo(expectedTotal));

    final SearchHit[] returned = hits.getHits();
    final Set<String> returnedIds = new HashSet<>();
    for (SearchHit searchHit : returned) {
        returnedIds.add(searchHit.getId());
    }
    assertThat(returnedIds, containsInAnyOrder(ids));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.opensearch.common.Nullable;
import org.opensearch.common.lucene.Lucene;
import org.opensearch.common.xcontent.XContentParser;
import org.opensearch.index.analysis.IndexAnalyzers;
Expand Down Expand Up @@ -368,6 +371,18 @@ protected BytesRef indexedValueForSearch(Object value) {
}
return getTextSearchInfo().getSearchAnalyzer().normalize(name(), value.toString());
}

/**
 * Builds a wildcard query for a keyword field by delegating to the parent implementation with
 * normalization of the pattern forced on.
 *
 * @param value           the wildcard pattern
 * @param method          the rewrite method, may be {@code null}
 * @param caseInsensitive case-insensitivity hint, passed through unchanged to the parent
 * @param context         the query shard context
 * @return the query produced by the parent implementation
 */
@Override
public Query wildcardQuery(
    String value,
    @Nullable MultiTermQuery.RewriteMethod method,
    boolean caseInsensitive,
    QueryShardContext context
) {
    // keyword field types are always normalized, so ignore case sensitivity and force normalize the wildcard
    // query text
    final boolean normalizeIfAnalyzed = true;
    return super.wildcardQuery(value, method, caseInsensitive, normalizeIfAnalyzed, context);
}
}

private final boolean indexed;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ public Query prefixQuery(
) {
throw new QueryShardException(
context,
"Can only use prefix queries on keyword, text and wildcard fields - not on [" + name + "] which is of type [" + typeName() + "]"
"Can only use prefix queries on keyword and text fields - not on [" + name + "] which is of type [" + typeName() + "]"
);
}

Expand All @@ -290,6 +290,7 @@ public final Query wildcardQuery(String value, @Nullable MultiTermQuery.RewriteM
return wildcardQuery(value, method, false, context);
}

/** optionally normalize the wildcard pattern based on the value of {@code caseInsensitive} */
public Query wildcardQuery(
String value,
@Nullable MultiTermQuery.RewriteMethod method,
Expand All @@ -298,11 +299,15 @@ public Query wildcardQuery(
) {
throw new QueryShardException(
context,
"Can only use wildcard queries on keyword, text and wildcard fields - not on ["
+ name
+ "] which is of type ["
+ typeName()
+ "]"
"Can only use wildcard queries on keyword and text fields - not on [" + name + "] which is of type [" + typeName() + "]"
);
}

/**
 * Wildcard query variant that always normalizes the wildcard pattern to lowercase.
 * This base implementation does not support it and always throws.
 *
 * @throws QueryShardException always, naming this field and its type
 */
public Query normalizedWildcardQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, QueryShardContext context) {
    final String message = "Can only use wildcard queries on keyword and text fields - not on ["
        + name
        + "] which is of type ["
        + typeName()
        + "]";
    throw new QueryShardException(context, message);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,34 @@ public static final String normalizeWildcardPattern(String fieldname, String val
return sb.toBytesRef().utf8ToString();
}

/**
 * Optionally normalizes the wildcard pattern based on the value of {@code caseInsensitive};
 * delegates to the five-argument overload without forcing normalization.
 */
@Override
public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, QueryShardContext context) {
    final boolean normalizeIfAnalyzed = false;
    return wildcardQuery(value, method, caseInsensitive, normalizeIfAnalyzed, context);
}

/**
 * Always normalizes the wildcard pattern to lowercase; delegates to the five-argument
 * {@code wildcardQuery} overload with normalization forced on.
 */
@Override
public Query normalizedWildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
    final boolean caseInsensitive = false;
    final boolean normalizeIfAnalyzed = true;
    return wildcardQuery(value, method, caseInsensitive, normalizeIfAnalyzed, context);
}

/**
* return a wildcard query
*
* @param value the pattern
* @param method rewrite method
* @param caseInsensitive should ignore case; note, only used if there is no analyzer, else we use the analyzer rules
* @param normalizeIfAnalyzed force normalize casing if an analyzer is used
* @param context the query shard context
*/
public Query wildcardQuery(
String value,
MultiTermQuery.RewriteMethod method,
boolean caseInsensitive,
boolean normalizeIfAnalyzed,
QueryShardContext context
) {
failIfNotIndexed();
if (context.allowExpensiveQueries() == false) {
throw new OpenSearchException(
Expand All @@ -162,7 +188,7 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, bo
}

Term term;
if (getTextSearchInfo().getSearchAnalyzer() != null) {
if (getTextSearchInfo().getSearchAnalyzer() != null && normalizeIfAnalyzed) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nknize I think we still have a conflict here: it makes sense to apply the caseInsensitive hint when the user does not specify the analyzer to be used (in this case getTextSearchInfo().getSearchAnalyzer() is the default one), but I think we should not do that if the analyzer and/or analyze_wildcard are provided

[1] https://www.elastic.co/guide/en/elasticsearch/reference/7.10/query-dsl-query-string-query.html

Copy link
Collaborator Author

@nknize nknize Dec 6, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that's why QueryStringQueryParser calls normalizedWildcardQuery method. It ignores the caseInsensitive parameter for query string queries.

Copy link
Collaborator

@reta reta Dec 6, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that is what caused my confusion: we already have 2 variants of the wildcardQuery method + now a new one called normalizedWildcardQuery. Could we drop normalizedWildcardQuery, since it just delegates to wildcardQuery?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We sure can.. it's just syntactic sugar. I just thought it was a more descriptive call in the QueryStringQueryParser than calling wildcardQuery(value, method, false, true, context) , but I can throw javadocs there to describe the logic.

Copy link
Collaborator Author

@nknize nknize Dec 6, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah wait.. nvm..I did it this way because the delegation for normalizedWildcardQuery is specific to StringFieldType only... It's not the same for keyword and constant fields.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is a non-issue: since you have this new method

public Query wildcardQuery(
        String value,
        MultiTermQuery.RewriteMethod method,
        boolean caseInsensitive,
        boolean normalizeIfAnalyzed,
        QueryShardContext context
    )

only StringFieldType could implement it, others won't. Another option I was thinking of: maybe we could embed this into TextSearchInfo / MappedFieldType as a property, since as you mentioned it is specific to StringFieldType only?

Copy link
Collaborator Author

@nknize nknize Dec 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I mean is QueryStringQueryParser calls the generic MappedFieldType.wildcardQuery method. I can eliminate the normalized version of this method call by bumping the new method up the class hierarchy but I'd prefer not to do that since normalizeIfAnalyzed is only relevant to the StringFieldType. (and I don't like the idea of having to keep track of what the caseInsensitive and normalizeIfAnalyzed boolean logic means if we add another field type that doesn't care).

I think this API is cleaner? But I should add javadocs to describe that normalizedWildcardQuery normalizes to lower case whereas wildcardQuery optionally lowers case.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Javadocs would help for sure, I don't think my suggestion (to have overloaded wildcardQuery) is any better than new normalizedWildcardQuery - removes confusion in one place but hurts another one. What do you think about enriching TextSearchInfo for StringFieldType? (then we don't need this normalizeIfAnalyzed argument at all) I believe this could be exactly the right place to do so:

/**
 * Encapsulates information about how to perform text searches over a field
 *
 * @opensearch.internal
 */
public class TextSearchInfo {
  ...
}

Copy link
Collaborator Author

@nknize nknize Dec 8, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you think about enriching TextSearchInfo for StringFieldType?

The problem is QueryStringQueryParser needs to change the normalization behavior at search runtime and TextSearchInfo is defined at index creation time for encapsulating Lucene fieldType parameters (e.g., norms, offsets, positions). This could be hacked around by adding a setter to dynamically change TextSearchInfo based on the query but that changes the purpose of TextSearchInfo and I think adds confusion on when to do this and for what type of queries. It might be worth exploring but I think in a follow up enhancement issue since the surface area impacted is larger than the scope of this bug fix? This may also highlight that TextSearchInfo isn't the best classname.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see thanks @nknize

value = normalizeWildcardPattern(name(), value, getTextSearchInfo().getSearchAnalyzer());
term = new Term(name(), value);
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -729,7 +729,8 @@ private Query getWildcardQuerySingle(String field, String termStr) throws ParseE
if (getAllowLeadingWildcard() == false && (termStr.startsWith("*") || termStr.startsWith("?"))) {
throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery");
}
return currentFieldType.wildcardQuery(termStr, getMultiTermRewriteMethod(), context);
// query string query is always normalized
return currentFieldType.normalizedWildcardQuery(termStr, getMultiTermRewriteMethod(), context);
} catch (RuntimeException e) {
if (lenient) {
return newLenientFieldQuery(field, e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ public void testNumeric() throws Exception {
QueryShardContext context = createShardContext();
QueryShardException e = expectThrows(QueryShardException.class, () -> query.toQuery(context));
assertEquals(
"Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]",
"Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]",
e.getMessage()
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -873,7 +873,7 @@ public void testPrefixNumeric() throws Exception {
QueryShardContext context = createShardContext();
QueryShardException e = expectThrows(QueryShardException.class, () -> query.toQuery(context));
assertEquals(
"Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]",
"Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]",
e.getMessage()
);
query.lenient(true);
Expand Down