Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/apache/lucene into issue-10674
Browse files Browse the repository at this point in the history
  • Loading branch information
jmazanec15 committed Sep 12, 2022
2 parents 13d7669 + 30b72ec commit 220089e
Show file tree
Hide file tree
Showing 40 changed files with 459 additions and 366 deletions.
7 changes: 6 additions & 1 deletion lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ Improvements

* LUCENE-10614: Properly support getTopChildren in RangeFacetCounts. (Yuting Gan)

* LUCENE-10652: Add a top-n range faceting example to RangeFacetsExample. (Yuting Gan)

Optimizations
---------------------
(No changes)
Expand Down Expand Up @@ -85,8 +87,11 @@ Other

Bug Fixes
---------------------
* GITHUB#1068, LUCENE-10674: Ensure BitSetConjDISI returns NO_MORE_DOCS when sub-iterator exhausts. (Jack Mazanec)
* GITHUB#11726: Indexing term vectors on large documents could fail due to
trying to apply a dictionary whose size is greater than the maximum supported
window size for LZ4. (Adrien Grand)

* LUCENE-10674: Ensure BitSetConjDISI returns NO_MORE_DOCS when sub-iterator exhausts. (Jack Mazanec)

Other
---------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
package org.apache.lucene.backward_codecs.lucene90;

import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import static org.apache.lucene.search.TopDocsCollector.EMPTY_TOPDOCS;

import java.io.IOException;
import java.nio.ByteBuffer;
Expand All @@ -36,7 +35,6 @@
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHits;
Expand Down Expand Up @@ -278,21 +276,6 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
return new TopDocs(new TotalHits(results.visitedCount(), relation), scoreDocs);
}

/**
 * Exact nearest-neighbor search for {@code field}: looks up the field's entry in {@code fields}
 * and, when one exists, hands its vector values and similarity function to
 * {@code exhaustiveSearch} together with {@code acceptDocs}, {@code target} and {@code k}.
 *
 * @return the result of {@code exhaustiveSearch}, or {@code EMPTY_TOPDOCS} when no entry is
 *     registered for {@code field}
 */
@Override
public TopDocs searchExhaustively(
    String field, float[] target, int k, DocIdSetIterator acceptDocs) throws IOException {
  final FieldEntry entry = fields.get(field);
  if (entry != null) {
    final VectorSimilarityFunction similarity = entry.similarityFunction;
    final VectorValues values = getVectorValues(field);
    return exhaustiveSearch(values, acceptDocs, similarity, target, k);
  }
  // The field does not exist or does not index vectors
  return EMPTY_TOPDOCS;
}

private OffHeapVectorValues getOffHeapVectorValues(FieldEntry fieldEntry) throws IOException {
IndexInput bytesSlice =
vectorData.slice("vector-data", fieldEntry.vectorDataOffset, fieldEntry.vectorDataLength);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
package org.apache.lucene.backward_codecs.lucene91;

import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import static org.apache.lucene.search.TopDocsCollector.EMPTY_TOPDOCS;

import java.io.IOException;
import java.nio.ByteBuffer;
Expand All @@ -37,7 +36,6 @@
import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHits;
Expand Down Expand Up @@ -268,21 +266,6 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
return new TopDocs(new TotalHits(results.visitedCount(), relation), scoreDocs);
}

/**
 * Scores the documents in {@code acceptDocs} against {@code target} via
 * {@code exhaustiveSearch}, using the similarity function stored on the field's entry.
 *
 * @return {@code EMPTY_TOPDOCS} if {@code fields} has no entry for {@code field}; otherwise
 *     whatever {@code exhaustiveSearch} returns
 */
@Override
public TopDocs searchExhaustively(
    String field, float[] target, int k, DocIdSetIterator acceptDocs) throws IOException {
  final FieldEntry entry = fields.get(field);
  if (null == entry) {
    // Unknown field, or a field that does not index vectors: nothing to score.
    return EMPTY_TOPDOCS;
  }

  final VectorSimilarityFunction scorer = entry.similarityFunction;
  return exhaustiveSearch(getVectorValues(field), acceptDocs, scorer, target, k);
}

private OffHeapVectorValues getOffHeapVectorValues(FieldEntry fieldEntry) throws IOException {
IndexInput bytesSlice =
vectorData.slice("vector-data", fieldEntry.vectorDataOffset, fieldEntry.vectorDataLength);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
package org.apache.lucene.backward_codecs.lucene92;

import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import static org.apache.lucene.search.TopDocsCollector.EMPTY_TOPDOCS;

import java.io.IOException;
import java.util.Arrays;
Expand All @@ -34,7 +33,6 @@
import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHits;
Expand Down Expand Up @@ -262,21 +260,6 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
return new TopDocs(new TotalHits(results.visitedCount(), relation), scoreDocs);
}

/**
 * Exact search entry point: resolves the entry for {@code field} and forwards to
 * {@code exhaustiveSearch} with that entry's similarity function and the field's vector values.
 * A missing entry (field absent, or it does not index vectors) yields {@code EMPTY_TOPDOCS}.
 */
@Override
public TopDocs searchExhaustively(
    String field, float[] target, int k, DocIdSetIterator acceptDocs) throws IOException {
  final FieldEntry entry = fields.get(field);
  return entry == null
      ? EMPTY_TOPDOCS
      : exhaustiveSearch(
          getVectorValues(field), acceptDocs, entry.similarityFunction, target, k);
}

/** Get knn graph values; used for testing */
public HnswGraph getGraph(String field) throws IOException {
FieldInfo info = fieldInfos.fieldInfo(field);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,14 +183,6 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
return new TopDocs(new TotalHits(numVisited, relation), topScoreDocs);
}

/**
 * Exact search for {@code field}: reads the field's similarity function from
 * {@code readState.fieldInfos} and delegates to {@code exhaustiveSearch} over the field's vector
 * values.
 *
 * <p>NOTE(review): the looked-up field info is dereferenced without a null check — presumably
 * callers only pass fields that exist; confirm.
 */
@Override
public TopDocs searchExhaustively(
    String field, float[] target, int k, DocIdSetIterator acceptDocs) throws IOException {
  final VectorSimilarityFunction similarity =
      readState.fieldInfos.fieldInfo(field).getVectorSimilarityFunction();
  return exhaustiveSearch(getVectorValues(field), acceptDocs, similarity, target, k);
}

@Override
public void checkIntegrity() throws IOException {
IndexInput clone = dataIn.clone();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.util.Bits;
Expand Down Expand Up @@ -105,12 +104,6 @@ public TopDocs search(
return TopDocsCollector.EMPTY_TOPDOCS;
}

/**
 * Always returns {@link TopDocsCollector#EMPTY_TOPDOCS}; none of the arguments are consulted.
 */
@Override
public TopDocs searchExhaustively(
String field, float[] target, int k, DocIdSetIterator acceptDocs) {
return TopDocsCollector.EMPTY_TOPDOCS;
}

/** Intentionally a no-op: the body is empty, so nothing is released here. */
@Override
public void close() {}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,12 @@
import java.io.Closeable;
import java.io.IOException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.HitQueue;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

/** Reads vectors from an index. */
public abstract class KnnVectorsReader implements Closeable, Accountable {
Expand Down Expand Up @@ -84,34 +80,6 @@ protected KnnVectorsReader() {}
public abstract TopDocs search(
String field, float[] target, int k, Bits acceptDocs, int visitedLimit) throws IOException;

/**
 * Return the k nearest neighbor documents as determined by comparison of their vector values for
 * this field, to the given vector, by the field's similarity function. The score of each document
 * is derived from the vector similarity in a way that ensures scores are positive and that a
 * larger score corresponds to a higher ranking.
 *
 * <p>The search is exact, guaranteeing the true k closest neighbors will be returned. Typically
 * this requires an exhaustive scan of the entire index. It is intended to be used when the number
 * of potential matches is limited.
 *
 * <p>The returned {@link TopDocs} will contain a {@link ScoreDoc} for each nearest neighbor, in
 * order of their similarity to the query vector (decreasing scores). The {@link TotalHits}
 * contains the number of documents visited during the search. Note that this method has no
 * {@code visitedLimit} parameter, so the candidate set is bounded only by {@code acceptDocs}.
 *
 * <p>The behavior is undefined if the given field doesn't have KNN vectors enabled on its {@link
 * FieldInfo}. The return value is never {@code null}.
 *
 * @param field the vector field to search
 * @param target the vector-valued query
 * @param k the number of docs to return
 * @param acceptDocs {@link DocIdSetIterator} that represents the allowed documents to match.
 * @return the k nearest neighbor documents, along with their (similarity-specific) scores.
 * @throws IOException declared for implementations that read vector data while searching
 */
public abstract TopDocs searchExhaustively(
String field, float[] target, int k, DocIdSetIterator acceptDocs) throws IOException;

/**
* Returns an instance optimized for merging. This instance may only be consumed in the thread
* that called {@link #getMergeInstance()}.
Expand All @@ -121,67 +89,4 @@ public abstract TopDocs searchExhaustively(
public KnnVectorsReader getMergeInstance() {
// Base implementation performs no specialization: it hands back this same reader.
return this;
}

/**
 * Float-vector helper behind {@link #searchExhaustively}: walks every document produced by
 * {@code acceptDocs}, scores its vector against {@code target} with {@code similarityFunction},
 * and keeps the best {@code k} in a {@link HitQueue}.
 *
 * @param vectorValues vector source; it is advanced to each accepted document in turn
 * @param acceptDocs iterator over the documents to score; its {@code cost()} is reported as the
 *     hit count of the result
 * @param similarityFunction function used to score a stored vector against {@code target}
 * @param target the query vector
 * @param k capacity of the result queue
 * @return the collected hits, ordered by {@code topDocsFromHitQueue}
 */
protected static TopDocs exhaustiveSearch(
    VectorValues vectorValues,
    DocIdSetIterator acceptDocs,
    VectorSimilarityFunction similarityFunction,
    float[] target,
    int k)
    throws IOException {
  final HitQueue hits = new HitQueue(k, true);
  // Always points at the queue's current top, i.e. the entry to overwrite next.
  ScoreDoc weakest = hits.top();
  for (int docId = acceptDocs.nextDoc();
      docId != DocIdSetIterator.NO_MORE_DOCS;
      docId = acceptDocs.nextDoc()) {
    final int landedOn = vectorValues.advance(docId);
    assert landedOn == docId;
    final float similarity = similarityFunction.compare(vectorValues.vectorValue(), target);
    if (similarity >= weakest.score) {
      // Reuse the top entry in place, then let the queue re-establish its order.
      weakest.doc = docId;
      weakest.score = similarity;
      weakest = hits.updateTop();
    }
  }
  final long visited = acceptDocs.cost();
  return topDocsFromHitQueue(hits, visited);
}

/**
 * Binary-vector helper behind {@link #searchExhaustively}: identical in shape to the
 * {@code float[]} overload, but compares each document's {@code binaryValue()} against a
 * {@link BytesRef} target.
 *
 * @param vectorValues vector source; it is advanced to each accepted document in turn
 * @param acceptDocs iterator over the documents to score; its {@code cost()} is reported as the
 *     hit count of the result
 * @param similarityFunction function used to score a stored vector against {@code target}
 * @param target the query vector in binary form
 * @param k capacity of the result queue
 * @return the collected hits, ordered by {@code topDocsFromHitQueue}
 */
protected static TopDocs exhaustiveSearch(
    VectorValues vectorValues,
    DocIdSetIterator acceptDocs,
    VectorSimilarityFunction similarityFunction,
    BytesRef target,
    int k)
    throws IOException {
  final HitQueue best = new HitQueue(k, true);
  // Tracks the queue's current top, which is the slot replaced by the next qualifying hit.
  ScoreDoc lowest = best.top();
  while (true) {
    final int candidate = acceptDocs.nextDoc();
    if (candidate == DocIdSetIterator.NO_MORE_DOCS) {
      break;
    }
    final int positioned = vectorValues.advance(candidate);
    assert positioned == candidate;
    final float similarity = similarityFunction.compare(vectorValues.binaryValue(), target);
    if (similarity >= lowest.score) {
      lowest.doc = candidate;
      lowest.score = similarity;
      lowest = best.updateTop();
    }
  }
  return topDocsFromHitQueue(best, acceptDocs.cost());
}

/**
 * Drains {@code queue} into a {@link TopDocs}. Entries whose score is below zero are discarded
 * from the top first; the remaining entries are popped into an array filled from the last slot
 * to the first, so the first element popped ends up at the end of the array.
 *
 * @param queue the hit queue to drain; it is emptied by this call
 * @param numHits value reported as the total hit count, with relation {@code EQUAL_TO}
 */
private static TopDocs topDocsFromHitQueue(HitQueue queue, long numHits) {
  // Remove any remaining sentinel values
  while (queue.size() > 0) {
    if (!(queue.top().score < 0)) {
      break;
    }
    queue.pop();
  }

  final int remaining = queue.size();
  final ScoreDoc[] ranked = new ScoreDoc[remaining];
  int slot = remaining;
  while (slot > 0) {
    slot--;
    ranked[slot] = queue.pop();
  }

  return new TopDocs(new TotalHits(numHits, TotalHits.Relation.EQUAL_TO), ranked);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ private void doCompress(byte[] bytes, int dictLen, int len, DataOutput out) thro
@Override
public void compress(ByteBuffersDataInput buffersInput, DataOutput out) throws IOException {
final int len = (int) (buffersInput.size() - buffersInput.position());
final int dictLength = len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR);
final int dictLength = Math.min(LZ4.MAX_DISTANCE, len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR));
final int blockLength = (len - dictLength + NUM_SUB_BLOCKS - 1) / NUM_SUB_BLOCKS;
buffer = ArrayUtil.growNoCopy(buffer, dictLength + blockLength);
out.writeVInt(dictLength);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
package org.apache.lucene.codecs.lucene94;

import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import static org.apache.lucene.search.TopDocsCollector.EMPTY_TOPDOCS;
import static org.apache.lucene.util.VectorUtil.toBytesRef;

import java.io.IOException;
import java.util.Arrays;
Expand All @@ -35,7 +33,6 @@
import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHits;
Expand Down Expand Up @@ -284,25 +281,6 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
return new TopDocs(new TotalHits(results.visitedCount(), relation), scoreDocs);
}

/**
 * Exact search for {@code field}. Chooses the {@code exhaustiveSearch} overload from the field
 * entry's vector encoding: {@code BYTE} fields are searched with {@code target} converted via
 * {@code toBytesRef}, {@code FLOAT32} fields with {@code target} as given.
 *
 * @return the chosen overload's result, or {@code EMPTY_TOPDOCS} when {@code fields} holds no
 *     entry for {@code field}
 */
@Override
public TopDocs searchExhaustively(
    String field, float[] target, int k, DocIdSetIterator acceptDocs) throws IOException {
  final FieldEntry entry = fields.get(field);
  if (entry != null) {
    final VectorSimilarityFunction similarity = entry.similarityFunction;
    final VectorValues values = getVectorValues(field);
    final TopDocs hits =
        switch (entry.vectorEncoding) {
          case BYTE -> exhaustiveSearch(values, acceptDocs, similarity, toBytesRef(target), k);
          case FLOAT32 -> exhaustiveSearch(values, acceptDocs, similarity, target, k);
        };
    return hits;
  }
  // The field does not exist or does not index vectors
  return EMPTY_TOPDOCS;
}

/** Get knn graph values; used for testing */
public HnswGraph getGraph(String field) throws IOException {
FieldInfo info = fieldInfos.fieldInfo(field);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,6 @@ private void writeField(FieldWriter<?> fieldData, int maxDoc) throws IOException
case BYTE -> writeByteVectors(fieldData);
case FLOAT32 -> writeFloat32Vectors(fieldData);
}
;
long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;

// write graph
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Sorter;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHits;
Expand Down Expand Up @@ -268,17 +267,6 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
}
}

/**
 * Forwards the exact search to the reader registered for {@code field} in {@code fields}. When
 * no reader is registered, returns a freshly built empty {@link TopDocs} (zero hits, relation
 * {@code EQUAL_TO}, no score docs).
 */
@Override
public TopDocs searchExhaustively(
    String field, float[] target, int k, DocIdSetIterator acceptDocs) throws IOException {
  final KnnVectorsReader delegate = fields.get(field);
  if (delegate != null) {
    return delegate.searchExhaustively(field, target, k, acceptDocs);
  }
  final TotalHits noHits = new TotalHits(0, TotalHits.Relation.EQUAL_TO);
  return new TopDocs(noHits, new ScoreDoc[0]);
}

@Override
public void close() throws IOException {
IOUtils.close(fields.values());
Expand Down
Loading

0 comments on commit 220089e

Please sign in to comment.