Skip to content

Commit

Permalink
Add fuzzy intervals source (#49762)
Browse files Browse the repository at this point in the history
This intervals source will return terms that are similar to an input term, up to
an edit distance defined by fuzziness, similar to FuzzyQuery.

Closes #49595
  • Loading branch information
romseygeek authored Jan 3, 2020
1 parent fd50169 commit 32730cf
Show file tree
Hide file tree
Showing 6 changed files with 272 additions and 1 deletion.
39 changes: 38 additions & 1 deletion docs/reference/query-dsl/intervals-query.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ Valid rules include:
* <<intervals-match,`match`>>
* <<intervals-prefix,`prefix`>>
* <<intervals-wildcard,`wildcard`>>
* <<intervals-fuzzy,`fuzzy`>>
* <<intervals-all_of,`all_of`>>
* <<intervals-any_of,`any_of`>>
--
Expand All @@ -97,7 +98,7 @@ set to `0`, the terms must appear next to each other.
--

`ordered`::
(Optional, boolean)
(Optional, boolean)
If `true`, matching terms must appear in their specified order. Defaults to
`false`.

Expand Down Expand Up @@ -177,6 +178,42 @@ The `pattern` is normalized using the search analyzer from this field, unless
`analyzer` is specified separately.
--

[[intervals-fuzzy]]
==== `fuzzy` rule parameters

The `fuzzy` rule matches terms that are similar to the provided term, within an
edit distance defined by <<fuzziness>>. If the fuzzy expansion matches more than
128 terms, {es} returns an error.

`term`::
(Required, string) The term to match

`prefix_length`::
(Optional, string) Number of beginning characters left unchanged when creating
expansions. Defaults to `0`.

`transpositions`::
(Optional, boolean) Indicates whether edits include transpositions of two
adjacent characters (ab → ba). Defaults to `true`.

`fuzziness`::
(Optional, string) Maximum edit distance allowed for matching. See <<fuzziness>>
for valid values and more information. Defaults to `auto`.

`analyzer`::
(Optional, string) <<analysis, analyzer>> used to normalize the `term`.
Defaults to the top-level `<field>` 's analyzer.

`use_field`::
+
--
(Optional, string) If specified, match intervals from this field rather than the
top-level `<field>`.

The `term` is normalized using the search analyzer from this field, unless
`analyzer` is specified separately.
--

[[intervals-all_of]]
==== `all_of` rule parameters

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -424,3 +424,24 @@ setup:
pattern: out?ide
- match: { hits.total.value: 3 }

---
"Test fuzzy match":
- skip:
version: " - 8.0.0"
reason: "TODO: change to 7.6 in backport"
- do:
search:
index: test
body:
query:
intervals:
text:
all_of:
intervals:
- fuzzy:
query: cald
- prefix:
prefix: out
- match: { hits.total.value: 3 }


Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ public static IntervalsSource prefix(BytesRef prefix) {
return new MultiTermIntervalsSource(ca, 128, prefix.utf8ToString());
}

public static IntervalsSource multiterm(CompiledAutomaton ca, String label) {
return new MultiTermIntervalsSource(ca, 128, label);
}

static class MultiTermIntervalsSource extends IntervalsSource {

private final CompiledAutomaton automaton;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,25 @@
package org.elasticsearch.index.query;

import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.XIntervals;
import org.apache.lucene.queries.intervals.FilteredIntervalsSource;
import org.apache.lucene.queries.intervals.IntervalIterator;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.elasticsearch.Version;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.common.io.stream.NamedWriteable;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.ObjectParser;
import org.elasticsearch.common.xcontent.ToXContentFragment;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;
Expand Down Expand Up @@ -85,6 +90,8 @@ public static IntervalsSourceProvider fromXContent(XContentParser parser) throws
return Prefix.fromXContent(parser);
case "wildcard":
return Wildcard.fromXContent(parser);
case "fuzzy":
return Fuzzy.fromXContent(parser);
}
throw new ParsingException(parser.getTokenLocation(),
"Unknown interval type [" + parser.currentName() + "], expecting one of [match, any_of, all_of, prefix, wildcard]");
Expand Down Expand Up @@ -691,6 +698,148 @@ String getUseField() {
}
}

public static class Fuzzy extends IntervalsSourceProvider {

public static final String NAME = "fuzzy";

private final String term;
private final int prefixLength;
private final boolean transpositions;
private final Fuzziness fuzziness;
private final String analyzer;
private final String useField;

public Fuzzy(String term, int prefixLength, boolean transpositions, Fuzziness fuzziness, String analyzer, String useField) {
this.term = term;
this.prefixLength = prefixLength;
this.transpositions = transpositions;
this.fuzziness = fuzziness;
this.analyzer = analyzer;
this.useField = useField;
}

public Fuzzy(StreamInput in) throws IOException {
this.term = in.readString();
this.prefixLength = in.readVInt();
this.transpositions = in.readBoolean();
this.fuzziness = new Fuzziness(in);
this.analyzer = in.readOptionalString();
this.useField = in.readOptionalString();
}

@Override
public IntervalsSource getSource(QueryShardContext context, MappedFieldType fieldType) {
NamedAnalyzer analyzer = fieldType.searchAnalyzer();
if (this.analyzer != null) {
analyzer = context.getMapperService().getIndexAnalyzers().get(this.analyzer);
}
IntervalsSource source;
if (useField != null) {
fieldType = context.fieldMapper(useField);
assert fieldType != null;
checkPositions(fieldType);
if (this.analyzer == null) {
analyzer = fieldType.searchAnalyzer();
}
}
checkPositions(fieldType);
BytesRef normalizedTerm = analyzer.normalize(fieldType.name(), term);
FuzzyQuery fq = new FuzzyQuery(new Term(fieldType.name(), normalizedTerm),
fuzziness.asDistance(term), prefixLength, 128, transpositions);
CompiledAutomaton ca = new CompiledAutomaton(fq.toAutomaton());
source = XIntervals.multiterm(ca, term);
if (useField != null) {
source = Intervals.fixField(useField, source);
}
return source;
}

private void checkPositions(MappedFieldType type) {
if (type.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
throw new IllegalArgumentException("Cannot create intervals over field [" + type.name() + "] with no positions indexed");
}
}

@Override
public void extractFields(Set<String> fields) {
if (useField != null) {
fields.add(useField);
}
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Fuzzy fuzzy = (Fuzzy) o;
return prefixLength == fuzzy.prefixLength &&
transpositions == fuzzy.transpositions &&
Objects.equals(term, fuzzy.term) &&
Objects.equals(fuzziness, fuzzy.fuzziness) &&
Objects.equals(analyzer, fuzzy.analyzer) &&
Objects.equals(useField, fuzzy.useField);
}

@Override
public int hashCode() {
return Objects.hash(term, prefixLength, transpositions, fuzziness, analyzer, useField);
}

@Override
public String getWriteableName() {
return NAME;
}

@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(term);
out.writeVInt(prefixLength);
out.writeBoolean(transpositions);
fuzziness.writeTo(out);
out.writeOptionalString(analyzer);
out.writeOptionalString(useField);
}

@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject(NAME);
builder.field("term", term);
builder.field("prefix_length", prefixLength);
builder.field("transpositions", transpositions);
fuzziness.toXContent(builder, params);
if (analyzer != null) {
builder.field("analyzer", analyzer);
}
if (useField != null) {
builder.field("use_field", useField);
}
builder.endObject();
return builder;
}

private static final ConstructingObjectParser<Fuzzy, Void> PARSER = new ConstructingObjectParser<>(NAME, args -> {
String term = (String) args[0];
int prefixLength = (args[1] == null) ? FuzzyQueryBuilder.DEFAULT_PREFIX_LENGTH : (int) args[1];
boolean transpositions = (args[2] == null) ? FuzzyQueryBuilder.DEFAULT_TRANSPOSITIONS : (boolean) args[2];
Fuzziness fuzziness = (args[3] == null) ? FuzzyQueryBuilder.DEFAULT_FUZZINESS : (Fuzziness) args[3];
String analyzer = (String) args[4];
String useField = (String) args[5];
return new Fuzzy(term, prefixLength, transpositions, fuzziness, analyzer, useField);
});
static {
PARSER.declareString(constructorArg(), new ParseField("term"));
PARSER.declareInt(optionalConstructorArg(), new ParseField("prefix_length"));
PARSER.declareBoolean(optionalConstructorArg(), new ParseField("transpositions"));
PARSER.declareField(optionalConstructorArg(), (p, c) -> Fuzziness.parse(p), Fuzziness.FIELD, ObjectParser.ValueType.VALUE);
PARSER.declareString(optionalConstructorArg(), new ParseField("analyzer"));
PARSER.declareString(optionalConstructorArg(), new ParseField("use_field"));
}

public static Fuzzy fromXContent(XContentParser parser) throws IOException {
return PARSER.parse(parser, null);
}
}

static class ScriptFilterSource extends FilteredIntervalsSource {

final IntervalFilterScript script;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -804,6 +804,8 @@ private void registerIntervalsSourceProviders() {
IntervalsSourceProvider.Prefix.NAME, IntervalsSourceProvider.Prefix::new));
namedWriteables.add(new NamedWriteableRegistry.Entry(IntervalsSourceProvider.class,
IntervalsSourceProvider.Wildcard.NAME, IntervalsSourceProvider.Wildcard::new));
namedWriteables.add(new NamedWriteableRegistry.Entry(IntervalsSourceProvider.class,
IntervalsSourceProvider.Fuzzy.NAME, IntervalsSourceProvider.Fuzzy::new));
}

private void registerQuery(QuerySpec<?> spec) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,22 @@

package org.elasticsearch.index.query;

import org.apache.lucene.index.Term;
import org.apache.lucene.queries.XIntervals;
import org.apache.lucene.queries.intervals.IntervalQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.mapper.MapperService;
Expand Down Expand Up @@ -529,4 +534,57 @@ public void testWildcard() throws IOException {
assertEquals(expected, builder.toQuery(createShardContext()));
}

private static IntervalsSource buildFuzzySource(String term, String label, int prefixLength, boolean transpositions, int editDistance) {
FuzzyQuery fq = new FuzzyQuery(new Term("field", term), editDistance, prefixLength, 128, transpositions);
return XIntervals.multiterm(new CompiledAutomaton(fq.toAutomaton()), label);
}

public void testFuzzy() throws IOException {

String json = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\" } } } }";
IntervalQueryBuilder builder = (IntervalQueryBuilder) parseQuery(json);

Query expected = new IntervalQuery(STRING_FIELD_NAME,
buildFuzzySource("term", "Term", FuzzyQueryBuilder.DEFAULT_PREFIX_LENGTH, true, Fuzziness.AUTO.asDistance("term")));
assertEquals(expected, builder.toQuery(createShardContext()));

String json_with_prefix = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2 } } } }";
builder = (IntervalQueryBuilder) parseQuery(json_with_prefix);
expected = new IntervalQuery(STRING_FIELD_NAME,
buildFuzzySource("term", "Term", 2, true, Fuzziness.AUTO.asDistance("term")));
assertEquals(expected, builder.toQuery(createShardContext()));

String json_with_fuzziness = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"fuzziness\" : \"1\" } } } }";
builder = (IntervalQueryBuilder) parseQuery(json_with_fuzziness);
expected = new IntervalQuery(STRING_FIELD_NAME,
buildFuzzySource("term", "Term", 2, true, Fuzziness.ONE.asDistance("term")));
assertEquals(expected, builder.toQuery(createShardContext()));

String json_no_transpositions = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"transpositions\" : false } } } }";
builder = (IntervalQueryBuilder) parseQuery(json_no_transpositions);
expected = new IntervalQuery(STRING_FIELD_NAME,
buildFuzzySource("term", "Term", 2, false, Fuzziness.AUTO.asDistance("term")));
assertEquals(expected, builder.toQuery(createShardContext()));

String json_with_analyzer = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"analyzer\" : \"keyword\" } } } }";
builder = (IntervalQueryBuilder) parseQuery(json_with_analyzer);
expected = new IntervalQuery(STRING_FIELD_NAME,
buildFuzzySource("Term", "Term", 2, true, Fuzziness.AUTO.asDistance("term")));
assertEquals(expected, builder.toQuery(createShardContext()));

String json_with_fixfield = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"fuzziness\" : \"1\", " +
"\"use_field\" : \"" + MASKED_FIELD + "\" } } } }";
builder = (IntervalQueryBuilder) parseQuery(json_with_fixfield);
expected = new IntervalQuery(STRING_FIELD_NAME, Intervals.fixField(MASKED_FIELD,
buildFuzzySource("term", "Term", 2, true, Fuzziness.ONE.asDistance("term"))));
assertEquals(expected, builder.toQuery(createShardContext()));

}

}

0 comments on commit 32730cf

Please sign in to comment.