diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search/230_interval_query.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search/230_interval_query.yml index 82aa0883008a8..f7360f5fb1afb 100644 --- a/rest-api-spec/src/main/resources/rest-api-spec/test/search/230_interval_query.yml +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search/230_interval_query.yml @@ -21,6 +21,10 @@ setup: - '{"text" : "Baby its cold there outside"}' - '{"index": {"_index": "test", "_id": "4"}}' - '{"text" : "Outside it is cold and wet"}' + - '{"index": {"_index": "test", "_id": "5"}}' + - '{"text" : "To be or not to be, that is the question"}' + - '{"index": {"_index": "test", "_id": "6"}}' + - '{"text" : "Id like to be under the sea or not"}' --- "Test ordered matching": @@ -49,6 +53,22 @@ setup: query: "cold outside" - match: { hits.total.value: 3 } +--- +"Test unordered matching with duplicates": + - skip: + version: "- 8.0.0" + reason: Added in v7.6 + - do: + search: + index: test + body: + query: + intervals: + text: + match: + query: "to be or not to be" + - match: { hits.total.value: 1 } + --- "Test explicit unordered matching": - do: @@ -185,6 +205,27 @@ setup: ordered: false - match: { hits.total.value: 2 } +--- +"Test unordered combination with duplicates": + - skip: + version: "- 8.0.0" + reason: Added in v7.6 + - do: + search: + index: test + body: + query: + intervals: + text: + all_of: + intervals: + - match: + query: "to be" + - match: + query: "to be" + ordered: false + - match: { hits.total.value: 1 } + --- "Test block combination": - do: diff --git a/server/src/main/java/org/apache/lucene/queries/XIntervals.java b/server/src/main/java/org/apache/lucene/queries/XIntervals.java index b389a29c21115..0c645b96936eb 100644 --- a/server/src/main/java/org/apache/lucene/queries/XIntervals.java +++ b/server/src/main/java/org/apache/lucene/queries/XIntervals.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.Term; import 
org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.queries.intervals.FilteredIntervalsSource; import org.apache.lucene.queries.intervals.IntervalIterator; import org.apache.lucene.queries.intervals.IntervalQuery; import org.apache.lucene.queries.intervals.Intervals; @@ -48,6 +49,8 @@ import java.util.Iterator; import java.util.List; import java.util.Objects; +import java.util.function.ToIntFunction; +import java.util.stream.Collectors; /** * Replacement for {@link Intervals#wildcard(BytesRef)} and {@link Intervals#prefix(BytesRef)} @@ -67,6 +70,28 @@ public static IntervalsSource prefix(BytesRef prefix) { return new MultiTermIntervalsSource(ca, 128, prefix.utf8ToString()); } + public static IntervalsSource maxWidth(IntervalsSource in, ToIntFunction<IntervalsSource> widthFunction) { + return Intervals.or(in.pullUpDisjunctions().stream() + .map(s -> new MaxWidth(s, widthFunction.applyAsInt(s))) + .collect(Collectors.toList())); + } + + private static class MaxWidth extends FilteredIntervalsSource { + + private final int maxWidth; + + MaxWidth(IntervalsSource in, int maxWidth) { + super("MAXWIDTH/" + maxWidth, in); + this.maxWidth = maxWidth; + } + + @Override + protected boolean accept(IntervalIterator it) { + return (it.end() - it.start()) + 1 <= maxWidth; + } + + } + static class MultiTermIntervalsSource extends IntervalsSource { private final CompiledAutomaton automaton; diff --git a/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java b/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java index a1970759460a3..75c8ae7c2ccdd 100644 --- a/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java +++ b/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java @@ -26,12 +26,13 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import
org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.MatchesIterator; -import org.apache.lucene.search.QueryVisitor; +import org.apache.lucene.queries.XIntervals; import org.apache.lucene.queries.intervals.IntervalIterator; import org.apache.lucene.queries.intervals.Intervals; import org.apache.lucene.queries.intervals.IntervalsSource; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.MatchesIterator; +import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings; @@ -40,7 +41,9 @@ import java.util.Collection; import java.util.Collections; import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; /** * Constructs an IntervalsSource based on analyzed text @@ -102,7 +105,7 @@ protected IntervalsSource analyzeText(CachingTokenFilter stream, int maxGaps, bo return analyzeTerm(stream); } else if (isGraph) { // graph - return combineSources(analyzeGraph(stream), maxGaps, ordered); + return combineSources(analyzeGraph(stream), maxGaps, ordered, true); } else { // phrase if (hasSynonyms) { @@ -110,7 +113,7 @@ protected IntervalsSource analyzeText(CachingTokenFilter stream, int maxGaps, bo return analyzeSynonyms(stream, maxGaps, ordered); } else { // simple phrase - return combineSources(analyzeTerms(stream), maxGaps, ordered); + return combineSources(analyzeTerms(stream), maxGaps, ordered, true); } } @@ -123,13 +126,16 @@ protected IntervalsSource analyzeTerm(TokenStream ts) throws IOException { return Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef())); } - protected static IntervalsSource combineSources(List<IntervalsSource> sources, int maxGaps, boolean ordered) { + protected static IntervalsSource combineSources(List<IntervalsSource> sources, int maxGaps, boolean ordered, boolean fixedWidth) { if (sources.size() == 0) { return NO_INTERVALS; } if (sources.size()
== 1) { return sources.get(0); } + if (ordered == false) { + sources = deduplicate(sources); + } IntervalsSource[] sourcesArray = sources.toArray(new IntervalsSource[0]); if (maxGaps == 0 && ordered) { return Intervals.phrase(sourcesArray); @@ -138,9 +144,36 @@ protected static IntervalsSource combineSources(List<IntervalsSource> sources, i if (maxGaps == -1) { return inner; } + if (fixedWidth) { + return XIntervals.maxWidth(inner, s -> maxGaps + s.minExtent()); + } return Intervals.maxgaps(maxGaps, inner); } + protected static List<IntervalsSource> deduplicate(List<IntervalsSource> sources) { + Map<IntervalsSource, Integer> counts = new LinkedHashMap<>(); // preserve order for testing + for (IntervalsSource source : sources) { + counts.compute(source, (k, v) -> v == null ? 1 : v + 1); + } + if (counts.size() == sources.size()) { + return sources; + } + sources = new ArrayList<>(); + for (IntervalsSource source : counts.keySet()) { + int count = counts.get(source); + if (count == 1) { + sources.add(source); + } else { + IntervalsSource[] multiples = new IntervalsSource[count]; + for (int i = 0; i < count; i++) { + multiples[i] = source; + } + sources.add(Intervals.ordered(multiples)); + } + } + return sources; + } + protected List<IntervalsSource> analyzeTerms(TokenStream ts) throws IOException { List<IntervalsSource> terms = new ArrayList<>(); TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class); @@ -189,7 +222,7 @@ else if (synonyms.size() > 1) { else { terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces)); } - return combineSources(terms, maxGaps, ordered); + return combineSources(terms, maxGaps, ordered, true); } protected List<IntervalsSource> analyzeGraph(TokenStream source) throws IOException { @@ -212,7 +245,7 @@ protected List<IntervalsSource> analyzeGraph(TokenStream source) throws IOExcept Iterator<TokenStream> it = graph.getFiniteStrings(start, end); while (it.hasNext()) { TokenStream ts = it.next(); - IntervalsSource phrase = combineSources(analyzeTerms(ts), 0, true); + IntervalsSource phrase = combineSources(analyzeTerms(ts), 0, true, true); if
(paths.size() >= maxClauseCount) { throw new BooleanQuery.TooManyClauses(); } diff --git a/server/src/main/java/org/elasticsearch/index/query/IntervalsSourceProvider.java b/server/src/main/java/org/elasticsearch/index/query/IntervalsSourceProvider.java index 167266e0892fd..49a5053b0385a 100644 --- a/server/src/main/java/org/elasticsearch/index/query/IntervalsSourceProvider.java +++ b/server/src/main/java/org/elasticsearch/index/query/IntervalsSourceProvider.java @@ -366,7 +366,7 @@ public IntervalsSource getSource(QueryShardContext ctx, MappedFieldType fieldTyp for (IntervalsSourceProvider provider : subSources) { ss.add(provider.getSource(ctx, fieldType)); } - IntervalsSource source = IntervalBuilder.combineSources(ss, maxGaps, ordered); + IntervalsSource source = IntervalBuilder.combineSources(ss, maxGaps, ordered, false); if (filter != null) { return filter.filter(source, ctx, fieldType); } diff --git a/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java index 8c14fead82429..858ca1a46396e 100644 --- a/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java @@ -201,6 +201,21 @@ public void testGraphSynonymsWithGaps() throws IOException { assertEquals(expected, source); + source = BUILDER.analyzeText(new CachingTokenFilter(ts), 2, false); + expected = Intervals.or( + Intervals.maxwidth(6, Intervals.unordered( + Intervals.term("term1"), Intervals.extend(Intervals.term("term2"), 1, 0), Intervals.term("term5") + )), + Intervals.maxwidth(9, Intervals.unordered( + Intervals.term("term1"), + Intervals.phrase( + Intervals.extend(Intervals.term("term3"), 1, 0), + Intervals.extend(Intervals.term("term4"), 2, 0)), + Intervals.term("term5") + )) + ); + assertEquals(expected, source); + } public void testGraphTerminatesOnGap() throws IOException { @@ -222,4 +237,26 @@ 
public void testGraphTerminatesOnGap() throws IOException { assertEquals(expected, source); } + public void testUnorderedWithRepeats() throws IOException { + IntervalsSource source = BUILDER.analyzeText("to be or not to be", -1, false); + IntervalsSource expected = Intervals.unordered( + Intervals.ordered(Intervals.term("to"), Intervals.term("to")), + Intervals.ordered(Intervals.term("be"), Intervals.term("be")), + Intervals.term("or"), + Intervals.term("not") + ); + assertEquals(expected, source); + } + + public void testUnorderedWithRepeatsAndMaxGaps() throws IOException { + IntervalsSource source = BUILDER.analyzeText("to be or not to be", 2, false); + IntervalsSource expected = Intervals.maxwidth(8, Intervals.unordered( + Intervals.ordered(Intervals.term("to"), Intervals.term("to")), + Intervals.ordered(Intervals.term("be"), Intervals.term("be")), + Intervals.term("or"), + Intervals.term("not") + )); + assertEquals(expected, source); + } + } diff --git a/server/src/test/java/org/elasticsearch/index/query/IntervalQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/IntervalQueryBuilderTests.java index ed28a06500d5a..27be5e84633eb 100644 --- a/server/src/test/java/org/elasticsearch/index/query/IntervalQueryBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/IntervalQueryBuilderTests.java @@ -192,7 +192,7 @@ public void testMatchInterval() throws IOException { builder = (IntervalQueryBuilder) parseQuery(json); expected = new IntervalQuery(STRING_FIELD_NAME, - Intervals.maxgaps(40, Intervals.unordered(Intervals.term("hello"), Intervals.term("world")))); + XIntervals.maxWidth(Intervals.unordered(Intervals.term("hello"), Intervals.term("world")), s -> 42)); assertEquals(expected, builder.toQuery(createShardContext())); json = "{ \"intervals\" : " + @@ -217,7 +217,7 @@ public void testMatchInterval() throws IOException { builder = (IntervalQueryBuilder) parseQuery(json); expected = new IntervalQuery(STRING_FIELD_NAME, - 
Intervals.maxgaps(10, Intervals.ordered(Intervals.term("Hello"), Intervals.term("world")))); + XIntervals.maxWidth(Intervals.ordered(Intervals.term("Hello"), Intervals.term("world")), s -> 12)); assertEquals(expected, builder.toQuery(createShardContext())); json = "{ \"intervals\" : " + @@ -232,7 +232,7 @@ public void testMatchInterval() throws IOException { builder = (IntervalQueryBuilder) parseQuery(json); expected = new IntervalQuery(STRING_FIELD_NAME, Intervals.fixField(MASKED_FIELD, - Intervals.maxgaps(10, Intervals.ordered(Intervals.term("Hello"), Intervals.term("world"))))); + XIntervals.maxWidth(Intervals.ordered(Intervals.term("Hello"), Intervals.term("world")), s -> 12))); assertEquals(expected, builder.toQuery(createShardContext())); json = "{ \"intervals\" : " + @@ -248,7 +248,7 @@ public void testMatchInterval() throws IOException { builder = (IntervalQueryBuilder) parseQuery(json); expected = new IntervalQuery(STRING_FIELD_NAME, - Intervals.containing(Intervals.maxgaps(10, Intervals.ordered(Intervals.term("Hello"), Intervals.term("world"))), + Intervals.containing(XIntervals.maxWidth(Intervals.ordered(Intervals.term("Hello"), Intervals.term("world")), s -> 12), Intervals.term("blah"))); assertEquals(expected, builder.toQuery(createShardContext())); }