Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LUCENE-9999: CombinedFieldQuery can fail with an exception when document is missing fields #185

Merged
merged 12 commits into from
Jul 17, 2021
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,9 @@ Bug Fixes
* LUCENE-9964: Duplicate long values in a document field should only be counted once when using SortedNumericDocValuesFields.
(Gautam Worah)

* LUCENE-9999: CombinedFieldQuery can fail with an exception when document
is missing some fields. (Jim Ferenczi, Julie Tibshirani)

* LUCENE-10020: DocComparator should not skip docs with the same docID on
multiple sorts with search after (Mayya Sharipova, Julie Tibshirani)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@
* compute norms the same way as {@link SimilarityBase#computeNorm}, which includes {@link
* BM25Similarity} and {@link DFRSimilarity}. Per-field similarities are not supported.
*
* <p>The query also requires that either all fields or no fields have norms enabled. Having only
* some fields with norms enabled can result in errors or undefined behavior.
*
* <p>The scoring is based on BM25F's simple formula described in:
* http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf. This query implements the
* same approach but allows other similarities besides {@link
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,13 @@
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.SmallFloat;

/** Copy of {@link LeafSimScorer} that sums document's norms from multiple fields. */
/**
* Copy of {@link LeafSimScorer} that sums document's norms from multiple fields.
*
* <p>For all fields, norms must be encoded using {@link SmallFloat#intToByte4}. This scorer also
* requires that either all fields or no fields have norms enabled. Having only some fields with
* norms enabled can result in errors or undefined behavior.
*/
final class MultiNormsLeafSimScorer {
/** Cache of decoded norms. */
private static final float[] LENGTH_TABLE = new float[256];
Expand Down Expand Up @@ -62,6 +68,7 @@ final class MultiNormsLeafSimScorer {
weightList.add(field.weight);
}
}

if (normsList.isEmpty()) {
norms = null;
} else if (normsList.size() == 1) {
Expand Down Expand Up @@ -128,14 +135,16 @@ public long longValue() {
@Override
public boolean advanceExact(int target) throws IOException {
float normValue = 0;
boolean found = false;
for (int i = 0; i < normsArr.length; i++) {
boolean found = normsArr[i].advanceExact(target);
assert found;
normValue +=
weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())];
if (normsArr[i].advanceExact(target)) {
normValue +=
weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())];
found = true;
}
}
current = SmallFloat.intToByte4(Math.round(normValue));
return true;
return found;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;

Expand Down Expand Up @@ -128,6 +129,42 @@ public void testSameScore() throws IOException {
dir.close();
}

public void testNormsDisabled() throws IOException {
Directory dir = newDirectory();
Similarity similarity = randomCompatibleSimilarity();

IndexWriterConfig iwc = new IndexWriterConfig();
iwc.setSimilarity(similarity);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);

Document doc = new Document();
doc.add(new StringField("a", "value", Store.NO));
doc.add(new StringField("b", "value", Store.NO));
doc.add(new TextField("c", "value", Store.NO));
w.addDocument(doc);

IndexReader reader = w.getReader();
IndexSearcher searcher = newSearcher(reader);

Similarity searchSimilarity = randomCompatibleSimilarity();
searcher.setSimilarity(searchSimilarity);
TopScoreDocCollector collector = TopScoreDocCollector.create(10, null, 10);

CombinedFieldQuery query =
new CombinedFieldQuery.Builder()
.addField("a", 1.0f)
.addField("b", 1.0f)
.addTerm(new BytesRef("value"))
.build();
searcher.search(query, collector);
TopDocs topDocs = collector.topDocs();
assertEquals(new TotalHits(1, TotalHits.Relation.EQUAL_TO), topDocs.totalHits);

reader.close();
w.close();
dir.close();
}

public void testCopyField() throws IOException {
Directory dir = newDirectory();
Similarity similarity = randomCompatibleSimilarity();
Expand Down Expand Up @@ -229,6 +266,55 @@ public void testCopyFieldWithMultipleTerms() throws IOException {
dir.close();
}

public void testCopyFieldWithMissingFields() throws IOException {
Directory dir = new MMapDirectory(createTempDir());
Similarity similarity = randomCompatibleSimilarity();

IndexWriterConfig iwc = new IndexWriterConfig();
iwc.setSimilarity(similarity);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);

int boost1 = Math.max(1, random().nextInt(5));
int boost2 = Math.max(1, random().nextInt(5));
int numMatch = atLeast(10);
for (int i = 0; i < numMatch; i++) {
Document doc = new Document();
int freqA = random().nextInt(5) + 1;
for (int j = 0; j < freqA; j++) {
doc.add(new TextField("a", "foo", Store.NO));
}

// Choose frequencies such that sometimes we don't add field B
int freqB = random().nextInt(3);
for (int j = 0; j < freqB; j++) {
doc.add(new TextField("b", "foo", Store.NO));
}

int freqAB = freqA * boost1 + freqB * boost2;
for (int j = 0; j < freqAB; j++) {
doc.add(new TextField("ab", "foo", Store.NO));
}

w.addDocument(doc);
}

IndexReader reader = w.getReader();
IndexSearcher searcher = newSearcher(reader);
searcher.setSimilarity(similarity);
CombinedFieldQuery query =
new CombinedFieldQuery.Builder()
.addField("a", (float) boost1)
.addField("b", (float) boost2)
.addTerm(new BytesRef("foo"))
.build();

checkExpectedHits(searcher, numMatch, query, new TermQuery(new Term("ab", "foo")));

reader.close();
w.close();
dir.close();
}

private static Similarity randomCompatibleSimilarity() {
return RandomPicks.randomFrom(
random(),
Expand Down