Skip to content

Commit

Permalink
LUCENE-9999: CombinedFieldQuery can fail with an exception when document is missing fields (apache#185)
Browse files Browse the repository at this point in the history

This change fixes a bug in `MultiNormsLeafSimScorer` that assumes that each
field should have a norm for every term/document.

As part of the fix, it adds validation that the fields have consistent norms
settings.
  • Loading branch information
jtibshirani committed Jul 17, 2021
1 parent 2ad2867 commit 215e4a5
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 6 deletions.
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ Bug Fixes

* LUCENE-9988: Fix DrillSideways correctness bug introduced in LUCENE-9944 (Greg Miller)

* LUCENE-9999: CombinedFieldQuery can fail with an exception when document
is missing some fields. (Jim Ferenczi, Julie Tibshirani)

* LUCENE-10020: DocComparator should not skip docs with the same docID on
multiple sorts with search after (Mayya Sharipova, Julie Tibshirani)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
Expand Down Expand Up @@ -62,6 +64,9 @@
* compute norms the same way as {@link SimilarityBase#computeNorm}, which includes {@link
* BM25Similarity} and {@link DFRSimilarity}. Per-field similarities are not supported.
*
* <p>The query also requires that either all fields or no fields have norms enabled. Having only
* some fields with norms enabled can result in errors.
*
* <p>The scoring is based on BM25F's simple formula described in:
* http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf. This query implements the
* same approach but allows other similarities besides {@link
Expand Down Expand Up @@ -271,6 +276,7 @@ private BooleanQuery rewriteToBoolean() {
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throws IOException {
validateConsistentNorms(searcher.getIndexReader());
if (scoreMode.needsScores()) {
return new CombinedFieldWeight(this, searcher, scoreMode, boost);
} else {
Expand All @@ -280,6 +286,29 @@ public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float bo
}
}

/**
 * Validates that the queried fields use norms consistently: across all leaves, every queried
 * field that exists in the index must either have norms enabled or omit norms. Mixing the two
 * would make the summed per-document norm in {@code MultiNormsLeafSimScorer} meaningless.
 *
 * <p>Fields that are absent from every segment are ignored.
 *
 * @param reader the top-level reader the query will run against
 * @throws IllegalArgumentException if some queried fields have norms while others omit them
 */
private void validateConsistentNorms(IndexReader reader) {
  boolean allFieldsHaveNorms = true;
  boolean noFieldsHaveNorms = true;

  for (LeafReaderContext context : reader.leaves()) {
    FieldInfos fieldInfos = context.reader().getFieldInfos();
    for (String field : fieldAndWeights.keySet()) {
      FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
      // A field missing from this segment says nothing about its norms settings.
      if (fieldInfo != null) {
        allFieldsHaveNorms &= fieldInfo.hasNorms();
        noFieldsHaveNorms &= fieldInfo.omitsNorms();
      }
    }
  }

  if (allFieldsHaveNorms == false && noFieldsHaveNorms == false) {
    // NOTE: message rewritten — the previous concatenation produced a double space
    // ("cannot  have") and described the inconsistency incorrectly.
    throw new IllegalArgumentException(
        getClass().getSimpleName()
            + " requires norms to be consistent across fields: some fields have norms enabled,"
            + " while others have norms disabled");
  }
}

class CombinedFieldWeight extends Weight {
private final IndexSearcher searcher;
private final TermStates termStates[];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,13 @@
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.SmallFloat;

/** Copy of {@link LeafSimScorer} that sums document's norms from multiple fields. */
/**
* Copy of {@link LeafSimScorer} that sums document's norms from multiple fields.
*
* <p>For all fields, norms must be encoded using {@link SmallFloat#intToByte4}. This scorer also
* requires that either all fields or no fields have norms enabled. Having only some fields with
* norms enabled can result in errors or undefined behavior.
*/
final class MultiNormsLeafSimScorer {
/** Cache of decoded norms. */
private static final float[] LENGTH_TABLE = new float[256];
Expand Down Expand Up @@ -60,6 +66,7 @@ final class MultiNormsLeafSimScorer {
weightList.add(field.weight);
}
}

if (normsList.isEmpty()) {
norms = null;
} else if (normsList.size() == 1) {
Expand Down Expand Up @@ -126,14 +133,16 @@ public long longValue() {
@Override
public boolean advanceExact(int target) throws IOException {
  // Sum the weighted, decoded norms of every field that has a norm value for this document.
  // A document may legitimately be missing some of the queried fields, so a failed advance
  // is simply skipped rather than asserted (the LUCENE-9999 bug was assuming every field
  // has a norm for every document). This body also removes the stale pre-fix lines that the
  // rendered diff left interleaved here, which made the method declare `found` twice.
  float normValue = 0;
  boolean found = false;
  for (int i = 0; i < normsArr.length; i++) {
    if (normsArr[i].advanceExact(target)) {
      normValue +=
          weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())];
      found = true;
    }
  }
  // Re-encode the summed length with the same 4-bit float encoding used at index time,
  // so downstream decoding via LENGTH_TABLE stays consistent.
  current = SmallFloat.intToByte4(Math.round(normValue));
  return found;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,59 @@ public void testSameScore() throws IOException {
dir.close();
}

public void testNormsDisabled() throws IOException {
  Directory dir = newDirectory();
  Similarity similarity = randomCompatibleSimilarity();

  IndexWriterConfig config = new IndexWriterConfig();
  config.setSimilarity(similarity);
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);

  // First document: "a" and "b" are StringFields (norms omitted), "c" is a TextField (norms on).
  Document firstDoc = new Document();
  firstDoc.add(new StringField("a", "value", Store.NO));
  firstDoc.add(new StringField("b", "value", Store.NO));
  firstDoc.add(new TextField("c", "value", Store.NO));
  writer.addDocument(firstDoc);
  writer.commit();

  // Second document omits field "b" entirely.
  Document secondDoc = new Document();
  secondDoc.add(new StringField("a", "value", Store.NO));
  secondDoc.add(new TextField("c", "value", Store.NO));
  writer.addDocument(secondDoc);

  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);

  Similarity searchSimilarity = randomCompatibleSimilarity();
  searcher.setSimilarity(searchSimilarity);
  TopScoreDocCollector collector = TopScoreDocCollector.create(10, null, 10);

  // Combining only norms-free fields is allowed and should match both documents.
  CombinedFieldQuery query =
      new CombinedFieldQuery.Builder()
          .addField("a", 1.0f)
          .addField("b", 1.0f)
          .addTerm(new BytesRef("value"))
          .build();
  searcher.search(query, collector);
  TopDocs topDocs = collector.topDocs();
  assertEquals(new TotalHits(2, TotalHits.Relation.EQUAL_TO), topDocs.totalHits);

  // Mixing a norms-free field ("b") with a norms-enabled field ("c") must be rejected.
  CombinedFieldQuery invalidQuery =
      new CombinedFieldQuery.Builder()
          .addField("b", 1.0f)
          .addField("c", 1.0f)
          .addTerm(new BytesRef("value"))
          .build();
  IllegalArgumentException e =
      expectThrows(
          IllegalArgumentException.class, () -> searcher.search(invalidQuery, collector));
  assertTrue(e.getMessage().contains("requires norms to be consistent across fields"));

  reader.close();
  writer.close();
  dir.close();
}

public void testCopyField() throws IOException {
Directory dir = newDirectory();
Similarity similarity = randomCompatibleSimilarity();
Expand Down Expand Up @@ -266,6 +319,55 @@ public void testCopyFieldWithMultipleTerms() throws IOException {
dir.close();
}

public void testCopyFieldWithMissingFields() throws IOException {
  Directory dir = newDirectory();
  Similarity similarity = randomCompatibleSimilarity();

  IndexWriterConfig config = new IndexWriterConfig();
  config.setSimilarity(similarity);
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);

  int weightA = Math.max(1, random().nextInt(5));
  int weightB = Math.max(1, random().nextInt(5));
  int numDocs = atLeast(10);
  for (int d = 0; d < numDocs; d++) {
    Document document = new Document();

    int countA = random().nextInt(5) + 1;
    for (int k = 0; k < countA; k++) {
      document.add(new TextField("a", "foo", Store.NO));
    }

    // countB may be zero, so some documents are missing field "b" entirely.
    int countB = random().nextInt(3);
    for (int k = 0; k < countB; k++) {
      document.add(new TextField("b", "foo", Store.NO));
    }

    // "ab" mirrors the weighted combination of "a" and "b" so the two queries score alike.
    int countAB = countA * weightA + countB * weightB;
    for (int k = 0; k < countAB; k++) {
      document.add(new TextField("ab", "foo", Store.NO));
    }

    writer.addDocument(document);
  }

  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);
  searcher.setSimilarity(similarity);
  CombinedFieldQuery query =
      new CombinedFieldQuery.Builder()
          .addField("a", (float) weightA)
          .addField("b", (float) weightB)
          .addTerm(new BytesRef("foo"))
          .build();

  checkExpectedHits(searcher, numDocs, query, new TermQuery(new Term("ab", "foo")));

  reader.close();
  writer.close();
  dir.close();
}

private static Similarity randomCompatibleSimilarity() {
return RandomPicks.randomFrom(
random(),
Expand Down

0 comments on commit 215e4a5

Please sign in to comment.