Skip to content

Commit

Permalink
Case Insensitive Support in Regexp Interval (#2237)
Browse files Browse the repository at this point in the history
Add a `case_insensitive` flag to regexp interval source.

Signed-off-by: Matt Weber <[email protected]>
  • Loading branch information
mattweber authored Feb 24, 2022
1 parent 788ba99 commit 37235fa
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,56 @@ setup:
- '{"index": {"_index": "test", "_id": "6"}}'
- '{"text" : "that is some cold cold rain"}'

---
# Baseline regexp interval query: a lower-case pattern with no case_insensitive
# option is expected to match exactly one document in the "test" index.
"Test regexp":
- skip:
version: " - 1.2.99"
reason: "regexp introduced in 1.3"
- do:
search:
index: test
body:
query:
intervals:
text:
regexp:
pattern: "at[a-z]{2,}here"
- match: { hits.total.value: 1 }

---
# Upper-case literals in the pattern with case_insensitive explicitly set to false:
# the query is expected to return no hits.
"Test regexp, explicit case sensitive":
- skip:
version: " - 1.99.99"
reason: "case_insensitive introduced in 2.0"
- do:
search:
index: test
body:
query:
intervals:
text:
regexp:
pattern: "AT[a-z]{2,}HERE"
case_insensitive: false
- match: { hits.total.value: 0 }

---
# Same upper-case pattern as the case-sensitive test, but with case_insensitive: true:
# the query is expected to return exactly one hit.
"Test regexp, explicit case insensitive":
- skip:
version: " - 1.99.99"
reason: "case_insensitive introduced in 2.0"
- do:
search:
index: test
body:
query:
intervals:
text:
regexp:
pattern: "AT[a-z]{2,}HERE"
case_insensitive: true
- match: { hits.total.value: 1 }

---
"Test ordered matching with via mode":
- skip:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.opensearch.LegacyESVersion;
import org.opensearch.Version;
import org.opensearch.common.ParseField;
Expand Down Expand Up @@ -687,24 +688,41 @@ public static class Regexp extends IntervalsSourceProvider {
private final int flags;
private final String useField;
private final Integer maxExpansions;

public Regexp(String pattern, int flags, String useField, Integer maxExpansions) {
private final boolean caseInsensitive;

/**
 * Creates a regexp interval source provider from already-resolved option values.
 *
 * @param pattern         the regular expression pattern
 * @param flags           Lucene's <a href="https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java#L391-L411">syntax flags</a>
 * @param useField        value of the {@code use_field} option; may be {@code null}
 * @param maxExpansions   value of the {@code max_expansions} option; {@code null}, zero and
 *                        negative values are all stored as {@code null}
 * @param caseInsensitive enables Lucene's only <a href="https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java#L416">matching flag</a>
 */
public Regexp(String pattern, int flags, String useField, Integer maxExpansions, boolean caseInsensitive) {
    this.pattern = pattern;
    this.flags = flags;
    this.useField = useField;
    this.caseInsensitive = caseInsensitive;
    // Only a strictly positive expansion limit is retained; anything else is normalized to null.
    if (maxExpansions == null || maxExpansions <= 0) {
        this.maxExpansions = null;
    } else {
        this.maxExpansions = maxExpansions;
    }
}

/**
 * Deserializing constructor.
 *
 * Reads the pattern, flags, optional {@code use_field} and optional {@code max_expansions}
 * in that order. The case-insensitive flag is read only from streams whose version is
 * {@link Version#V_2_0_0} or later; for earlier stream versions it is set to {@code false}.
 *
 * @param in the stream to read from
 * @throws IOException if reading from the stream fails
 */
public Regexp(StreamInput in) throws IOException {
    this.pattern = in.readString();
    this.flags = in.readVInt();
    this.useField = in.readOptionalString();
    this.maxExpansions = in.readOptionalVInt();
    // Short-circuit: the boolean is consumed only when the stream version carries it.
    this.caseInsensitive = in.getVersion().onOrAfter(Version.V_2_0_0) && in.readBoolean();
}

@Override
public IntervalsSource getSource(QueryShardContext context, MappedFieldType fieldType) {
final org.apache.lucene.util.automaton.RegExp regexp = new org.apache.lucene.util.automaton.RegExp(pattern, flags);
final org.apache.lucene.util.automaton.RegExp regexp = new org.apache.lucene.util.automaton.RegExp(
pattern,
flags,
caseInsensitive ? RegExp.ASCII_CASE_INSENSITIVE : 0
);
final CompiledAutomaton automaton = new CompiledAutomaton(regexp.toAutomaton());

if (useField != null) {
Expand Down Expand Up @@ -745,12 +763,13 @@ public boolean equals(Object o) {
return Objects.equals(pattern, regexp.pattern)
&& Objects.equals(flags, regexp.flags)
&& Objects.equals(useField, regexp.useField)
&& Objects.equals(maxExpansions, regexp.maxExpansions);
&& Objects.equals(maxExpansions, regexp.maxExpansions)
&& Objects.equals(caseInsensitive, regexp.caseInsensitive);
}

@Override
public int hashCode() {
return Objects.hash(pattern, flags, useField, maxExpansions);
return Objects.hash(pattern, flags, useField, maxExpansions, caseInsensitive);
}

@Override
Expand All @@ -764,6 +783,9 @@ public void writeTo(StreamOutput out) throws IOException {
out.writeVInt(flags);
out.writeOptionalString(useField);
out.writeOptionalVInt(maxExpansions);
if (out.getVersion().onOrAfter(Version.V_2_0_0)) {
out.writeBoolean(caseInsensitive);
}
}

@Override
Expand All @@ -779,6 +801,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
if (maxExpansions != null) {
builder.field("max_expansions", maxExpansions);
}
if (caseInsensitive) {
builder.field("case_insensitive", caseInsensitive);
}
builder.endObject();
return builder;
}
Expand All @@ -789,13 +814,14 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
Integer flagsValue = (Integer) args[2];
String useField = (String) args[3];
Integer maxExpansions = (Integer) args[4];
boolean caseInsensitive = args[5] != null && (boolean) args[5];

if (flagsValue != null) {
return new Regexp(pattern, flagsValue, useField, maxExpansions);
return new Regexp(pattern, flagsValue, useField, maxExpansions, caseInsensitive);
} else if (flags != null) {
return new Regexp(pattern, RegexpFlag.resolveValue(flags), useField, maxExpansions);
return new Regexp(pattern, RegexpFlag.resolveValue(flags), useField, maxExpansions, caseInsensitive);
} else {
return new Regexp(pattern, DEFAULT_FLAGS_VALUE, useField, maxExpansions);
return new Regexp(pattern, DEFAULT_FLAGS_VALUE, useField, maxExpansions, caseInsensitive);
}
});
static {
Expand All @@ -804,6 +830,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
PARSER.declareInt(optionalConstructorArg(), new ParseField("flags_value"));
PARSER.declareString(optionalConstructorArg(), new ParseField("use_field"));
PARSER.declareInt(optionalConstructorArg(), new ParseField("max_expansions"));
PARSER.declareBoolean(optionalConstructorArg(), new ParseField("case_insensitive"));
}

public static Regexp fromXContent(XContentParser parser) throws IOException {
Expand All @@ -825,6 +852,10 @@ String getUseField() {
Integer getMaxExpansions() {
return maxExpansions;
}

/**
 * @return the value of the {@code case_insensitive} option held by this provider
 */
boolean isCaseInsensitive() {
    return this.caseInsensitive;
}
}

public static class Wildcard extends IntervalsSourceProvider {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -846,7 +846,11 @@ public void testWildcard() throws IOException {
}

private static IntervalsSource buildRegexpSource(String pattern, int flags, Integer maxExpansions) {
final RegExp regexp = new RegExp(pattern, flags);
return buildRegexpSource(pattern, flags, 0, maxExpansions);
}

private static IntervalsSource buildRegexpSource(String pattern, int flags, int matchFlags, Integer maxExpansions) {
final RegExp regexp = new RegExp(pattern, flags, matchFlags);
CompiledAutomaton automaton = new CompiledAutomaton(regexp.toAutomaton());

if (maxExpansions != null) {
Expand Down Expand Up @@ -922,6 +926,15 @@ public void testRegexp() throws IOException {
expected = new IntervalQuery(TEXT_FIELD_NAME, buildRegexpSource("te.m", DEFAULT_FLAGS, 500));
assertEquals(expected, builder.toQuery(createShardContext()));

String regexp_case_insensitive_json = "{ \"intervals\" : { \""
+ TEXT_FIELD_NAME
+ "\": { "
+ "\"regexp\" : { \"pattern\" : \"TE.M\", \"case_insensitive\" : true } } } }";

builder = (IntervalQueryBuilder) parseQuery(regexp_case_insensitive_json);
expected = new IntervalQuery(TEXT_FIELD_NAME, buildRegexpSource("TE.M", DEFAULT_FLAGS, RegExp.ASCII_CASE_INSENSITIVE, null));
assertEquals(expected, builder.toQuery(createShardContext()));

String regexp_neg_max_expand_json = "{ \"intervals\" : { \""
+ TEXT_FIELD_NAME
+ "\": { "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ static Regexp createRandomRegexp() {
randomAlphaOfLengthBetween(0, 3) + (randomBoolean() ? ".*?" : "." + randomAlphaOfLength(4)) + randomAlphaOfLengthBetween(0, 5),
randomBoolean() ? RegexpFlag.resolveValue(randomFrom(FLAGS)) : RegexpFlag.ALL.value(),
randomBoolean() ? randomAlphaOfLength(10) : null,
randomBoolean() ? randomIntBetween(-1, Integer.MAX_VALUE) : null
randomBoolean() ? randomIntBetween(-1, Integer.MAX_VALUE) : null,
randomBoolean()
);
}

Expand All @@ -42,7 +43,9 @@ protected Regexp mutateInstance(Regexp instance) throws IOException {
int flags = instance.getFlags();
String useField = instance.getUseField();
Integer maxExpansions = instance.getMaxExpansions();
int ran = between(0, 3);
boolean caseInsensitive = instance.isCaseInsensitive();

int ran = between(0, 4);
switch (ran) {
case 0:
pattern += randomBoolean() ? ".*?" : randomAlphaOfLength(5);
Expand All @@ -56,10 +59,13 @@ protected Regexp mutateInstance(Regexp instance) throws IOException {
case 3:
maxExpansions = maxExpansions == null ? randomIntBetween(1, Integer.MAX_VALUE) : null;
break;
case 4:
caseInsensitive = !caseInsensitive;
break;
default:
throw new AssertionError("Illegal randomisation branch");
}
return new Regexp(pattern, flags, useField, maxExpansions);
return new Regexp(pattern, flags, useField, maxExpansions, caseInsensitive);
}

@Override
Expand Down

0 comments on commit 37235fa

Please sign in to comment.