Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add fuzzy intervals source #49762

Merged
merged 3 commits into from
Jan 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion docs/reference/query-dsl/intervals-query.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ Valid rules include:
* <<intervals-match,`match`>>
* <<intervals-prefix,`prefix`>>
* <<intervals-wildcard,`wildcard`>>
* <<intervals-fuzzy,`fuzzy`>>
* <<intervals-all_of,`all_of`>>
* <<intervals-any_of,`any_of`>>
--
Expand All @@ -97,7 +98,7 @@ set to `0`, the terms must appear next to each other.
--

`ordered`::
(Optional, boolean)
(Optional, boolean)
If `true`, matching terms must appear in their specified order. Defaults to
`false`.

Expand Down Expand Up @@ -177,6 +178,42 @@ The `pattern` is normalized using the search analyzer from this field, unless
`analyzer` is specified separately.
--

[[intervals-fuzzy]]
==== `fuzzy` rule parameters

The `fuzzy` rule matches terms that are similar to the provided term, within an
edit distance defined by <<fuzziness>>. If the fuzzy expansion matches more than
128 terms, {es} returns an error.

`term`::
(Required, string) The term to match

`prefix_length`::
(Optional, string) Number of beginning characters left unchanged when creating
expansions. Defaults to `0`.

`transpositions`::
(Optional, boolean) Indicates whether edits include transpositions of two
adjacent characters (ab → ba). Defaults to `true`.

`fuzziness`::
(Optional, string) Maximum edit distance allowed for matching. See <<fuzziness>>
for valid values and more information. Defaults to `auto`.

`analyzer`::
(Optional, string) <<analysis, analyzer>> used to normalize the `term`.
Defaults to the top-level `<field>` 's analyzer.

`use_field`::
+
--
(Optional, string) If specified, match intervals from this field rather than the
top-level `<field>`.

The `term` is normalized using the search analyzer from this field, unless
`analyzer` is specified separately.
--

[[intervals-all_of]]
==== `all_of` rule parameters

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -424,3 +424,24 @@ setup:
pattern: out?ide
- match: { hits.total.value: 3 }

---
"Test fuzzy match":
- skip:
version: " - 8.0.0"
reason: "TODO: change to 7.6 in backport"
- do:
search:
index: test
body:
query:
intervals:
text:
all_of:
intervals:
- fuzzy:
query: cald
- prefix:
prefix: out
- match: { hits.total.value: 3 }


Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ public static IntervalsSource prefix(BytesRef prefix) {
return new MultiTermIntervalsSource(ca, 128, prefix.utf8ToString());
}

public static IntervalsSource multiterm(CompiledAutomaton ca, String label) {
return new MultiTermIntervalsSource(ca, 128, label);
}

static class MultiTermIntervalsSource extends IntervalsSource {

private final CompiledAutomaton automaton;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,25 @@
package org.elasticsearch.index.query;

import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.XIntervals;
import org.apache.lucene.queries.intervals.FilteredIntervalsSource;
import org.apache.lucene.queries.intervals.IntervalIterator;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.elasticsearch.Version;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.common.io.stream.NamedWriteable;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.ObjectParser;
import org.elasticsearch.common.xcontent.ToXContentFragment;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;
Expand Down Expand Up @@ -85,6 +90,8 @@ public static IntervalsSourceProvider fromXContent(XContentParser parser) throws
return Prefix.fromXContent(parser);
case "wildcard":
return Wildcard.fromXContent(parser);
case "fuzzy":
return Fuzzy.fromXContent(parser);
}
throw new ParsingException(parser.getTokenLocation(),
"Unknown interval type [" + parser.currentName() + "], expecting one of [match, any_of, all_of, prefix, wildcard]");
Expand Down Expand Up @@ -691,6 +698,148 @@ String getUseField() {
}
}

public static class Fuzzy extends IntervalsSourceProvider {

public static final String NAME = "fuzzy";

private final String term;
private final int prefixLength;
private final boolean transpositions;
private final Fuzziness fuzziness;
private final String analyzer;
private final String useField;

public Fuzzy(String term, int prefixLength, boolean transpositions, Fuzziness fuzziness, String analyzer, String useField) {
this.term = term;
this.prefixLength = prefixLength;
this.transpositions = transpositions;
this.fuzziness = fuzziness;
this.analyzer = analyzer;
this.useField = useField;
}

public Fuzzy(StreamInput in) throws IOException {
this.term = in.readString();
this.prefixLength = in.readVInt();
this.transpositions = in.readBoolean();
this.fuzziness = new Fuzziness(in);
this.analyzer = in.readOptionalString();
this.useField = in.readOptionalString();
}

@Override
public IntervalsSource getSource(QueryShardContext context, MappedFieldType fieldType) {
NamedAnalyzer analyzer = fieldType.searchAnalyzer();
if (this.analyzer != null) {
analyzer = context.getMapperService().getIndexAnalyzers().get(this.analyzer);
}
IntervalsSource source;
if (useField != null) {
fieldType = context.fieldMapper(useField);
assert fieldType != null;
checkPositions(fieldType);
if (this.analyzer == null) {
analyzer = fieldType.searchAnalyzer();
}
}
checkPositions(fieldType);
BytesRef normalizedTerm = analyzer.normalize(fieldType.name(), term);
FuzzyQuery fq = new FuzzyQuery(new Term(fieldType.name(), normalizedTerm),
fuzziness.asDistance(term), prefixLength, 128, transpositions);
CompiledAutomaton ca = new CompiledAutomaton(fq.toAutomaton());
source = XIntervals.multiterm(ca, term);
if (useField != null) {
source = Intervals.fixField(useField, source);
}
return source;
}

private void checkPositions(MappedFieldType type) {
if (type.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
throw new IllegalArgumentException("Cannot create intervals over field [" + type.name() + "] with no positions indexed");
}
}

@Override
public void extractFields(Set<String> fields) {
if (useField != null) {
fields.add(useField);
}
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Fuzzy fuzzy = (Fuzzy) o;
return prefixLength == fuzzy.prefixLength &&
transpositions == fuzzy.transpositions &&
Objects.equals(term, fuzzy.term) &&
Objects.equals(fuzziness, fuzzy.fuzziness) &&
Objects.equals(analyzer, fuzzy.analyzer) &&
Objects.equals(useField, fuzzy.useField);
}

@Override
public int hashCode() {
return Objects.hash(term, prefixLength, transpositions, fuzziness, analyzer, useField);
}

@Override
public String getWriteableName() {
return NAME;
}

@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(term);
out.writeVInt(prefixLength);
out.writeBoolean(transpositions);
fuzziness.writeTo(out);
out.writeOptionalString(analyzer);
out.writeOptionalString(useField);
}

@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject(NAME);
builder.field("term", term);
builder.field("prefix_length", prefixLength);
builder.field("transpositions", transpositions);
fuzziness.toXContent(builder, params);
if (analyzer != null) {
builder.field("analyzer", analyzer);
}
if (useField != null) {
builder.field("use_field", useField);
}
builder.endObject();
return builder;
}

private static final ConstructingObjectParser<Fuzzy, Void> PARSER = new ConstructingObjectParser<>(NAME, args -> {
String term = (String) args[0];
int prefixLength = (args[1] == null) ? FuzzyQueryBuilder.DEFAULT_PREFIX_LENGTH : (int) args[1];
boolean transpositions = (args[2] == null) ? FuzzyQueryBuilder.DEFAULT_TRANSPOSITIONS : (boolean) args[2];
Fuzziness fuzziness = (args[3] == null) ? FuzzyQueryBuilder.DEFAULT_FUZZINESS : (Fuzziness) args[3];
String analyzer = (String) args[4];
String useField = (String) args[5];
return new Fuzzy(term, prefixLength, transpositions, fuzziness, analyzer, useField);
});
static {
PARSER.declareString(constructorArg(), new ParseField("term"));
PARSER.declareInt(optionalConstructorArg(), new ParseField("prefix_length"));
PARSER.declareBoolean(optionalConstructorArg(), new ParseField("transpositions"));
PARSER.declareField(optionalConstructorArg(), (p, c) -> Fuzziness.parse(p), Fuzziness.FIELD, ObjectParser.ValueType.VALUE);
PARSER.declareString(optionalConstructorArg(), new ParseField("analyzer"));
PARSER.declareString(optionalConstructorArg(), new ParseField("use_field"));
}

public static Fuzzy fromXContent(XContentParser parser) throws IOException {
return PARSER.parse(parser, null);
}
}

static class ScriptFilterSource extends FilteredIntervalsSource {

final IntervalFilterScript script;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -804,6 +804,8 @@ private void registerIntervalsSourceProviders() {
IntervalsSourceProvider.Prefix.NAME, IntervalsSourceProvider.Prefix::new));
namedWriteables.add(new NamedWriteableRegistry.Entry(IntervalsSourceProvider.class,
IntervalsSourceProvider.Wildcard.NAME, IntervalsSourceProvider.Wildcard::new));
namedWriteables.add(new NamedWriteableRegistry.Entry(IntervalsSourceProvider.class,
IntervalsSourceProvider.Fuzzy.NAME, IntervalsSourceProvider.Fuzzy::new));
}

private void registerQuery(QuerySpec<?> spec) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,22 @@

package org.elasticsearch.index.query;

import org.apache.lucene.index.Term;
import org.apache.lucene.queries.XIntervals;
import org.apache.lucene.queries.intervals.IntervalQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.mapper.MapperService;
Expand Down Expand Up @@ -529,4 +534,57 @@ public void testWildcard() throws IOException {
assertEquals(expected, builder.toQuery(createShardContext()));
}

private static IntervalsSource buildFuzzySource(String term, String label, int prefixLength, boolean transpositions, int editDistance) {
FuzzyQuery fq = new FuzzyQuery(new Term("field", term), editDistance, prefixLength, 128, transpositions);
return XIntervals.multiterm(new CompiledAutomaton(fq.toAutomaton()), label);
}

public void testFuzzy() throws IOException {

String json = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\" } } } }";
IntervalQueryBuilder builder = (IntervalQueryBuilder) parseQuery(json);

Query expected = new IntervalQuery(STRING_FIELD_NAME,
buildFuzzySource("term", "Term", FuzzyQueryBuilder.DEFAULT_PREFIX_LENGTH, true, Fuzziness.AUTO.asDistance("term")));
assertEquals(expected, builder.toQuery(createShardContext()));

String json_with_prefix = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2 } } } }";
builder = (IntervalQueryBuilder) parseQuery(json_with_prefix);
expected = new IntervalQuery(STRING_FIELD_NAME,
buildFuzzySource("term", "Term", 2, true, Fuzziness.AUTO.asDistance("term")));
assertEquals(expected, builder.toQuery(createShardContext()));

String json_with_fuzziness = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"fuzziness\" : \"1\" } } } }";
builder = (IntervalQueryBuilder) parseQuery(json_with_fuzziness);
expected = new IntervalQuery(STRING_FIELD_NAME,
buildFuzzySource("term", "Term", 2, true, Fuzziness.ONE.asDistance("term")));
assertEquals(expected, builder.toQuery(createShardContext()));

String json_no_transpositions = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"transpositions\" : false } } } }";
builder = (IntervalQueryBuilder) parseQuery(json_no_transpositions);
expected = new IntervalQuery(STRING_FIELD_NAME,
buildFuzzySource("term", "Term", 2, false, Fuzziness.AUTO.asDistance("term")));
assertEquals(expected, builder.toQuery(createShardContext()));

String json_with_analyzer = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"analyzer\" : \"keyword\" } } } }";
builder = (IntervalQueryBuilder) parseQuery(json_with_analyzer);
expected = new IntervalQuery(STRING_FIELD_NAME,
buildFuzzySource("Term", "Term", 2, true, Fuzziness.AUTO.asDistance("term")));
assertEquals(expected, builder.toQuery(createShardContext()));

String json_with_fixfield = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"fuzziness\" : \"1\", " +
"\"use_field\" : \"" + MASKED_FIELD + "\" } } } }";
builder = (IntervalQueryBuilder) parseQuery(json_with_fixfield);
expected = new IntervalQuery(STRING_FIELD_NAME, Intervals.fixField(MASKED_FIELD,
buildFuzzySource("term", "Term", 2, true, Fuzziness.ONE.asDistance("term"))));
assertEquals(expected, builder.toQuery(createShardContext()));

}

}