Skip to content

Commit

Permalink
LUCENE-9099: Correctly handle repeats in ORDERED and UNORDERED interv…
Browse files Browse the repository at this point in the history
…als (#1097)

If you have repeating intervals in an ordered or unordered interval source, you currently 
get somewhat confusing behaviour:

* `ORDERED(a, a, b)` will return an extra interval over just a b if it first matches a a b, meaning
that you can get incorrect results if used in a `CONTAINING` filter - 
`CONTAINING(ORDERED(x, y), ORDERED(a, a, b))` will match on the document `a x a b y`
* `UNORDERED(a, a)` will match on documents that contain just a single a.

This commit adds a RepeatingIntervalsSource that correctly handles repeats within 
ordered and unordered sources. It also changes the way that gaps are calculated within 
ordered and unordered sources, by using a new width() method on IntervalIterator. The 
default implementation just returns end() - start() + 1, but RepeatingIntervalsSource 
instead returns the sum of the widths of its child iterators. This preserves maxgaps filtering 
on ordered and unordered sources that contain repeats.

In order to correctly handle matches in this scenario, IntervalsSource#matches now always 
returns an explicit IntervalsMatchesIterator rather than a plain MatchesIterator, which adds 
gaps() and width() methods so that submatches can be combined in the same way that 
subiterators are. Extra checks have been added to checkIntervals() to ensure that the same 
intervals are returned by both iterator and matches, and a fix to 
DisjunctionIntervalIterator#matches() is also included - DisjunctionIntervalIterator minimizes 
its intervals, while MatchesUtils.disjunction does not, so there was a discrepancy between 
the two methods.
  • Loading branch information
romseygeek committed Feb 6, 2020
1 parent 3246b26 commit aa916ba
Show file tree
Hide file tree
Showing 23 changed files with 747 additions and 120 deletions.
7 changes: 7 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,13 @@ Bug Fixes
* LUCENE-9200: Fix TieredMergePolicy to use double (not float) math to make its merging decisions, fixing
a corner-case bug uncovered by fun randomized tests (Robert Muir, Mike McCandless)

* LUCENE-9099: Unordered and Ordered interval queries now correctly handle
repeated subterms - ordered intervals could supply an 'extra' minimized
interval, resulting in odd matches when combined with eg CONTAINS queries;
and unordered intervals would match duplicate subterms on the same position,
so a query for UNORDERED(foo, foo) would match a document containing 'foo'
only once. (Alan Woodward)

Other
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@
import org.apache.lucene.search.Query;
import org.apache.lucene.util.ArrayUtil;

class CachingMatchesIterator extends FilterMatchesIterator {
class CachingMatchesIterator extends FilterMatchesIterator implements IntervalMatchesIterator {

private boolean positioned = false;
private int[] posAndOffsets = new int[4*4];
private Query[] matchingQueries = new Query[4];
private int count = 0;

CachingMatchesIterator(MatchesIterator in) {
CachingMatchesIterator(IntervalMatchesIterator in) {
super(in);
}

Expand Down Expand Up @@ -133,4 +133,13 @@ public Query getQuery() {
};
}

/**
 * Returns the gap count reported by the wrapped iterator for its current match.
 *
 * NOTE(review): the cast presumes {@code in} always holds an IntervalMatchesIterator,
 * which appears to be guaranteed by the constructor's parameter type -- confirm that
 * {@code in} is never reassigned by the superclass.
 */
@Override
public int gaps() {
return ((IntervalMatchesIterator)in).gaps();
}

/**
 * Returns the width reported by the wrapped iterator for its current match.
 *
 * NOTE(review): the cast presumes {@code in} always holds an IntervalMatchesIterator,
 * which appears to be guaranteed by the constructor's parameter type -- confirm that
 * {@code in} is never reassigned by the superclass.
 */
@Override
public int width() {
return ((IntervalMatchesIterator)in).width();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,10 @@ public final IntervalIterator intervals(String field, LeafReaderContext ctx) thr
protected abstract IntervalIterator combine(List<IntervalIterator> iterators);

@Override
public final MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
List<MatchesIterator> subs = new ArrayList<>();
public final IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
List<IntervalMatchesIterator> subs = new ArrayList<>();
for (IntervalsSource source : subSources) {
MatchesIterator mi = source.matches(field, ctx, doc);
IntervalMatchesIterator mi = source.matches(field, ctx, doc);
if (mi == null) {
return null;
}
Expand All @@ -87,13 +87,13 @@ public final MatchesIterator matches(String field, LeafReaderContext ctx, int do
return isMinimizing ? new MinimizingConjunctionMatchesIterator(it, subs) : new ConjunctionMatchesIterator(it, subs);
}

private static class ConjunctionMatchesIterator implements MatchesIterator {
private static class ConjunctionMatchesIterator implements IntervalMatchesIterator {

final IntervalIterator iterator;
final List<MatchesIterator> subs;
final List<IntervalMatchesIterator> subs;
boolean cached = true;

private ConjunctionMatchesIterator(IntervalIterator iterator, List<MatchesIterator> subs) {
private ConjunctionMatchesIterator(IntervalIterator iterator, List<IntervalMatchesIterator> subs) {
this.iterator = iterator;
this.subs = subs;
}
Expand Down Expand Up @@ -152,9 +152,19 @@ public MatchesIterator getSubMatches() throws IOException {
public Query getQuery() {
throw new UnsupportedOperationException();
}

/**
 * Returns the gap count of the current interval, taken directly from the
 * combined {@code iterator} rather than from the individual sub-matches.
 */
@Override
public int gaps() {
return iterator.gaps();
}

/**
 * Returns the width of the current interval, taken directly from the
 * combined {@code iterator} rather than from the individual sub-matches.
 */
@Override
public int width() {
return iterator.width();
}
}

private static class SingletonMatchesIterator extends FilterMatchesIterator {
static class SingletonMatchesIterator extends FilterMatchesIterator {

boolean exhausted = false;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.QueryVisitor;

abstract class DifferenceIntervalsSource extends IntervalsSource {
Expand All @@ -48,12 +47,12 @@ public final IntervalIterator intervals(String field, LeafReaderContext ctx) thr
}

@Override
public final MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
MatchesIterator minIt = minuend.matches(field, ctx, doc);
public final IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
IntervalMatchesIterator minIt = minuend.matches(field, ctx, doc);
if (minIt == null) {
return null;
}
MatchesIterator subIt = subtrahend.matches(field, ctx, doc);
IntervalMatchesIterator subIt = subtrahend.matches(field, ctx, doc);
if (subIt == null) {
return minIt;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.MatchesUtils;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.util.PriorityQueue;
Expand Down Expand Up @@ -82,15 +81,24 @@ public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IO
}

@Override
public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
List<MatchesIterator> subMatches = new ArrayList<>();
public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
List<IntervalMatchesIterator> subMatches = new ArrayList<>();
for (IntervalsSource subSource : subSources) {
MatchesIterator mi = subSource.matches(field, ctx, doc);
IntervalMatchesIterator mi = subSource.matches(field, ctx, doc);
if (mi != null) {
subMatches.add(mi);
}
}
return MatchesUtils.disjunction(subMatches);
if (subMatches.size() == 0) {
return null;
}
DisjunctionIntervalIterator it = new DisjunctionIntervalIterator(
subMatches.stream().map(m -> IntervalMatches.wrapMatches(m, doc)).collect(Collectors.toList())
);
if (it.advance(doc) != doc) {
return null;
}
return new DisjunctionMatchesIterator(it, subMatches);
}

@Override
Expand Down Expand Up @@ -196,6 +204,16 @@ private void reset() throws IOException {
current = EMPTY;
}

/**
 * Finds the position within {@code iterators} of the iterator that is
 * currently selected as {@code current}.
 *
 * The comparison is by reference identity, and the lowest matching index wins.
 * Must not be called while {@code current} is one of the EMPTY / EXHAUSTED
 * sentinels (checked by assertion only).
 *
 * @return the index of {@code current} in {@code iterators}
 * @throws IllegalStateException if {@code current} is not found in {@code iterators}
 */
int currentOrd() {
  assert !(current == EMPTY || current == EXHAUSTED);
  final int total = iterators.size();
  int ord = 0;
  while (ord < total) {
    if (current == iterators.get(ord)) {
      return ord;
    }
    ord++;
  }
  // current should always be one of the sub-iterators; reaching here means state is corrupt
  throw new IllegalStateException();
}

@Override
public int nextInterval() throws IOException {
if (current == EMPTY || current == EXHAUSTED) {
Expand Down Expand Up @@ -344,4 +362,64 @@ public float matchCost() {
}
};

/**
 * Presents the intervals produced by a {@link DisjunctionIntervalIterator} as an
 * {@link IntervalMatchesIterator}.
 *
 * Positions, gaps and width come from the interval iterator itself; offsets,
 * sub-matches and the query come from whichever entry of {@code subs} sits at the
 * ordinal the interval iterator reports as current.
 */
private static class DisjunctionMatchesIterator implements IntervalMatchesIterator {

  final DisjunctionIntervalIterator it;
  final List<IntervalMatchesIterator> subs;

  private DisjunctionMatchesIterator(DisjunctionIntervalIterator it, List<IntervalMatchesIterator> subs) {
    this.it = it;
    this.subs = subs;
  }

  /** Looks up the sub-match at the ordinal of the interval iterator's current sub-iterator. */
  private IntervalMatchesIterator currentSub() {
    return subs.get(it.currentOrd());
  }

  /** Advances the interval iterator; returns false once it reports NO_MORE_INTERVALS. */
  @Override
  public boolean next() throws IOException {
    final int start = it.nextInterval();
    return start != IntervalIterator.NO_MORE_INTERVALS;
  }

  @Override
  public int startPosition() {
    return it.start();
  }

  @Override
  public int endPosition() {
    return it.end();
  }

  @Override
  public int startOffset() throws IOException {
    return currentSub().startOffset();
  }

  @Override
  public int endOffset() throws IOException {
    return currentSub().endOffset();
  }

  @Override
  public MatchesIterator getSubMatches() throws IOException {
    return currentSub().getSubMatches();
  }

  @Override
  public Query getQuery() {
    return currentSub().getQuery();
  }

  @Override
  public int gaps() {
    return it.gaps();
  }

  @Override
  public int width() {
    return it.width();
  }
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import java.util.stream.Collectors;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.QueryVisitor;

class ExtendedIntervalsSource extends IntervalsSource {
Expand All @@ -49,8 +48,8 @@ public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IO
}

@Override
public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
MatchesIterator in = source.matches(field, ctx, doc);
public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
IntervalMatchesIterator in = source.matches(field, ctx, doc);
if (in == null) {
return null;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import java.util.stream.Collectors;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.QueryVisitor;

/**
Expand Down Expand Up @@ -108,8 +107,8 @@ protected boolean accept() {
}

@Override
public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
MatchesIterator mi = in.matches(field, ctx, doc);
public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
IntervalMatchesIterator mi = in.matches(field, ctx, doc);
if (mi == null) {
return null;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import java.util.stream.Collectors;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.QueryVisitor;

class FixedFieldIntervalsSource extends IntervalsSource {
Expand All @@ -43,7 +42,7 @@ public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IO
}

@Override
public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
return source.matches(this.field, ctx, doc);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,13 @@ public abstract class IntervalIterator extends DocIdSetIterator {
*/
public abstract int gaps();

/**
 * The width of the current interval.
 *
 * This default implementation returns {@code end() - start() + 1}, i.e. the number
 * of positions covered when both {@code start()} and {@code end()} are counted.
 * The method is not final, so subclasses may supply a different calculation.
 */
public int width() {
return end() - start() + 1;
}

/**
* Advance the iterator to the next interval
*
Expand Down
Loading

0 comments on commit aa916ba

Please sign in to comment.