Merge branch 'main' of https://github.com/apache/lucene into issue-10674

apache · Sep 12, 2022 · 220089e
2 parents 13d7669 + 30b72ec
commit 220089e
Show file tree

Hide file tree

Showing 40 changed files with 459 additions and 366 deletions.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -52,6 +52,8 @@ Improvements
 
 * LUCENE-10614: Properly support getTopChildren in RangeFacetCounts. (Yuting Gan)
 
+* LUCENE-10652: Add a top-n range faceting example to RangeFacetsExample. (Yuting Gan)
+
 Optimizations
 ---------------------
 (No changes)
@@ -85,8 +87,11 @@ Other
 
 Bug Fixes
 ---------------------
-* GITHUB#1068, LUCENE-10674: Ensure BitSetConjDISI returns NO_MORE_DOCS when sub-iterator exhausts. (Jack Mazanec)
+* GITHUB#11726: Indexing term vectors on large documents could fail due to
+  trying to apply a dictionary whose size is greater than the maximum supported
+  window size for LZ4. (Adrien Grand)
 
+* LUCENE-10674: Ensure BitSetConjDISI returns NO_MORE_DOCS when sub-iterator exhausts. (Jack Mazanec)
 
 Other
 ---------------------

diff --git a/...codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java b/...codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java
@@ -18,7 +18,6 @@
 package org.apache.lucene.backward_codecs.lucene90;
 
 import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
-import static org.apache.lucene.search.TopDocsCollector.EMPTY_TOPDOCS;
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
@@ -36,7 +35,6 @@
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.index.VectorValues;
-import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.TotalHits;
@@ -278,21 +276,6 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
     return new TopDocs(new TotalHits(results.visitedCount(), relation), scoreDocs);
   }
 
-  @Override
-  public TopDocs searchExhaustively(
-      String field, float[] target, int k, DocIdSetIterator acceptDocs) throws IOException {
-    FieldEntry fieldEntry = fields.get(field);
-    if (fieldEntry == null) {
-      // The field does not exist or does not index vectors
-      return EMPTY_TOPDOCS;
-    }
-
-    VectorSimilarityFunction similarityFunction = fieldEntry.similarityFunction;
-    VectorValues vectorValues = getVectorValues(field);
-
-    return exhaustiveSearch(vectorValues, acceptDocs, similarityFunction, target, k);
-  }
-
   private OffHeapVectorValues getOffHeapVectorValues(FieldEntry fieldEntry) throws IOException {
     IndexInput bytesSlice =
         vectorData.slice("vector-data", fieldEntry.vectorDataOffset, fieldEntry.vectorDataLength);

diff --git a/...codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java b/...codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java
@@ -18,7 +18,6 @@
 package org.apache.lucene.backward_codecs.lucene91;
 
 import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
-import static org.apache.lucene.search.TopDocsCollector.EMPTY_TOPDOCS;
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
@@ -37,7 +36,6 @@
 import org.apache.lucene.index.VectorEncoding;
 import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.index.VectorValues;
-import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.TotalHits;
@@ -268,21 +266,6 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
     return new TopDocs(new TotalHits(results.visitedCount(), relation), scoreDocs);
   }
 
-  @Override
-  public TopDocs searchExhaustively(
-      String field, float[] target, int k, DocIdSetIterator acceptDocs) throws IOException {
-    FieldEntry fieldEntry = fields.get(field);
-    if (fieldEntry == null) {
-      // The field does not exist or does not index vectors
-      return EMPTY_TOPDOCS;
-    }
-
-    VectorSimilarityFunction similarityFunction = fieldEntry.similarityFunction;
-    VectorValues vectorValues = getVectorValues(field);
-
-    return exhaustiveSearch(vectorValues, acceptDocs, similarityFunction, target, k);
-  }
-
   private OffHeapVectorValues getOffHeapVectorValues(FieldEntry fieldEntry) throws IOException {
     IndexInput bytesSlice =
         vectorData.slice("vector-data", fieldEntry.vectorDataOffset, fieldEntry.vectorDataLength);

diff --git a/...codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsReader.java b/...codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsReader.java
@@ -18,7 +18,6 @@
 package org.apache.lucene.backward_codecs.lucene92;
 
 import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
-import static org.apache.lucene.search.TopDocsCollector.EMPTY_TOPDOCS;
 
 import java.io.IOException;
 import java.util.Arrays;
@@ -34,7 +33,6 @@
 import org.apache.lucene.index.VectorEncoding;
 import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.index.VectorValues;
-import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.TotalHits;
@@ -262,21 +260,6 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
     return new TopDocs(new TotalHits(results.visitedCount(), relation), scoreDocs);
   }
 
-  @Override
-  public TopDocs searchExhaustively(
-      String field, float[] target, int k, DocIdSetIterator acceptDocs) throws IOException {
-    FieldEntry fieldEntry = fields.get(field);
-    if (fieldEntry == null) {
-      // The field does not exist or does not index vectors
-      return EMPTY_TOPDOCS;
-    }
-
-    VectorSimilarityFunction similarityFunction = fieldEntry.similarityFunction;
-    VectorValues vectorValues = getVectorValues(field);
-
-    return exhaustiveSearch(vectorValues, acceptDocs, similarityFunction, target, k);
-  }
-
   /** Get knn graph values; used for testing */
   public HnswGraph getGraph(String field) throws IOException {
     FieldInfo info = fieldInfos.fieldInfo(field);

diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java
@@ -183,14 +183,6 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
     return new TopDocs(new TotalHits(numVisited, relation), topScoreDocs);
   }
 
-  @Override
-  public TopDocs searchExhaustively(
-      String field, float[] target, int k, DocIdSetIterator acceptDocs) throws IOException {
-    FieldInfo info = readState.fieldInfos.fieldInfo(field);
-    VectorSimilarityFunction vectorSimilarity = info.getVectorSimilarityFunction();
-    return exhaustiveSearch(getVectorValues(field), acceptDocs, vectorSimilarity, target, k);
-  }
-
   @Override
   public void checkIntegrity() throws IOException {
     IndexInput clone = dataIn.clone();

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsFormat.java
@@ -21,7 +21,6 @@
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.VectorValues;
-import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.TopDocsCollector;
 import org.apache.lucene.util.Bits;
@@ -105,12 +104,6 @@ public TopDocs search(
               return TopDocsCollector.EMPTY_TOPDOCS;
             }
 
-            @Override
-            public TopDocs searchExhaustively(
-                String field, float[] target, int k, DocIdSetIterator acceptDocs) {
-              return TopDocsCollector.EMPTY_TOPDOCS;
-            }
-
             @Override
             public void close() {}
 

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsReader.java
@@ -20,16 +20,12 @@
 import java.io.Closeable;
 import java.io.IOException;
 import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.index.VectorValues;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.HitQueue;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.TotalHits;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.BytesRef;
 
 /** Reads vectors from an index. */
 public abstract class KnnVectorsReader implements Closeable, Accountable {
@@ -84,34 +80,6 @@ protected KnnVectorsReader() {}
   public abstract TopDocs search(
       String field, float[] target, int k, Bits acceptDocs, int visitedLimit) throws IOException;
 
-  /**
-   * Return the k nearest neighbor documents as determined by comparison of their vector values for
-   * this field, to the given vector, by the field's similarity function. The score of each document
-   * is derived from the vector similarity in a way that ensures scores are positive and that a
-   * larger score corresponds to a higher ranking.
-   *
-   * <p>The search is exact, guaranteeing the true k closest neighbors will be returned. Typically
-   * this requires an exhaustive scan of the entire index. It is intended to be used when the number
-   * of potential matches is limited.
-   *
-   * <p>The returned {@link TopDocs} will contain a {@link ScoreDoc} for each nearest neighbor, in
-   * order of their similarity to the query vector (decreasing scores). The {@link TotalHits}
-   * contains the number of documents visited during the search. If the search stopped early because
-   * it hit {@code visitedLimit}, it is indicated through the relation {@code
-   * TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO}.
-   *
-   * <p>The behavior is undefined if the given field doesn't have KNN vectors enabled on its {@link
-   * FieldInfo}. The return value is never {@code null}.
-   *
-   * @param field the vector field to search
-   * @param target the vector-valued query
-   * @param k the number of docs to return
-   * @param acceptDocs {@link DocIdSetIterator} that represents the allowed documents to match.
-   * @return the k nearest neighbor documents, along with their (similarity-specific) scores.
-   */
-  public abstract TopDocs searchExhaustively(
-      String field, float[] target, int k, DocIdSetIterator acceptDocs) throws IOException;
-
   /**
    * Returns an instance optimized for merging. This instance may only be consumed in the thread
    * that called {@link #getMergeInstance()}.
@@ -121,67 +89,4 @@ public abstract TopDocs searchExhaustively(
   public KnnVectorsReader getMergeInstance() {
     return this;
   }
-
-  /** {@link #searchExhaustively} */
-  protected static TopDocs exhaustiveSearch(
-      VectorValues vectorValues,
-      DocIdSetIterator acceptDocs,
-      VectorSimilarityFunction similarityFunction,
-      float[] target,
-      int k)
-      throws IOException {
-    HitQueue queue = new HitQueue(k, true);
-    ScoreDoc topDoc = queue.top();
-    int doc;
-    while ((doc = acceptDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
-      int vectorDoc = vectorValues.advance(doc);
-      assert vectorDoc == doc;
-      float score = similarityFunction.compare(vectorValues.vectorValue(), target);
-      if (score >= topDoc.score) {
-        topDoc.score = score;
-        topDoc.doc = doc;
-        topDoc = queue.updateTop();
-      }
-    }
-    return topDocsFromHitQueue(queue, acceptDocs.cost());
-  }
-
-  /** {@link #searchExhaustively} */
-  protected static TopDocs exhaustiveSearch(
-      VectorValues vectorValues,
-      DocIdSetIterator acceptDocs,
-      VectorSimilarityFunction similarityFunction,
-      BytesRef target,
-      int k)
-      throws IOException {
-    HitQueue queue = new HitQueue(k, true);
-    ScoreDoc topDoc = queue.top();
-    int doc;
-    while ((doc = acceptDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
-      int vectorDoc = vectorValues.advance(doc);
-      assert vectorDoc == doc;
-      float score = similarityFunction.compare(vectorValues.binaryValue(), target);
-      if (score >= topDoc.score) {
-        topDoc.score = score;
-        topDoc.doc = doc;
-        topDoc = queue.updateTop();
-      }
-    }
-    return topDocsFromHitQueue(queue, acceptDocs.cost());
-  }
-
-  private static TopDocs topDocsFromHitQueue(HitQueue queue, long numHits) {
-    // Remove any remaining sentinel values
-    while (queue.size() > 0 && queue.top().score < 0) {
-      queue.pop();
-    }
-
-    ScoreDoc[] topScoreDocs = new ScoreDoc[queue.size()];
-    for (int i = topScoreDocs.length - 1; i >= 0; i--) {
-      topScoreDocs[i] = queue.pop();
-    }
-
-    TotalHits totalHits = new TotalHits(numHits, TotalHits.Relation.EQUAL_TO);
-    return new TopDocs(totalHits, topScoreDocs);
-  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java
@@ -170,7 +170,7 @@ private void doCompress(byte[] bytes, int dictLen, int len, DataOutput out) thro
     @Override
     public void compress(ByteBuffersDataInput buffersInput, DataOutput out) throws IOException {
       final int len = (int) (buffersInput.size() - buffersInput.position());
-      final int dictLength = len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR);
+      final int dictLength = Math.min(LZ4.MAX_DISTANCE, len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR));
       final int blockLength = (len - dictLength + NUM_SUB_BLOCKS - 1) / NUM_SUB_BLOCKS;
       buffer = ArrayUtil.growNoCopy(buffer, dictLength + blockLength);
       out.writeVInt(dictLength);

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsReader.java
@@ -18,8 +18,6 @@
 package org.apache.lucene.codecs.lucene94;
 
 import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
-import static org.apache.lucene.search.TopDocsCollector.EMPTY_TOPDOCS;
-import static org.apache.lucene.util.VectorUtil.toBytesRef;
 
 import java.io.IOException;
 import java.util.Arrays;
@@ -35,7 +33,6 @@
 import org.apache.lucene.index.VectorEncoding;
 import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.index.VectorValues;
-import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.TotalHits;
@@ -284,25 +281,6 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
     return new TopDocs(new TotalHits(results.visitedCount(), relation), scoreDocs);
   }
 
-  @Override
-  public TopDocs searchExhaustively(
-      String field, float[] target, int k, DocIdSetIterator acceptDocs) throws IOException {
-    FieldEntry fieldEntry = fields.get(field);
-    if (fieldEntry == null) {
-      // The field does not exist or does not index vectors
-      return EMPTY_TOPDOCS;
-    }
-
-    VectorSimilarityFunction similarityFunction = fieldEntry.similarityFunction;
-    VectorValues vectorValues = getVectorValues(field);
-
-    return switch (fieldEntry.vectorEncoding) {
-      case BYTE -> exhaustiveSearch(
-          vectorValues, acceptDocs, similarityFunction, toBytesRef(target), k);
-      case FLOAT32 -> exhaustiveSearch(vectorValues, acceptDocs, similarityFunction, target, k);
-    };
-  }
-
   /** Get knn graph values; used for testing */
   public HnswGraph getGraph(String field) throws IOException {
     FieldInfo info = fieldInfos.fieldInfo(field);

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsWriter.java
@@ -173,7 +173,6 @@ private void writeField(FieldWriter<?> fieldData, int maxDoc) throws IOException
       case BYTE -> writeByteVectors(fieldData);
       case FLOAT32 -> writeFloat32Vectors(fieldData);
     }
-    ;
     long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
 
     // write graph

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldKnnVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldKnnVectorsFormat.java
@@ -33,7 +33,6 @@
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.Sorter;
 import org.apache.lucene.index.VectorValues;
-import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.TotalHits;
@@ -268,17 +267,6 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
       }
     }
 
-    @Override
-    public TopDocs searchExhaustively(
-        String field, float[] target, int k, DocIdSetIterator acceptDocs) throws IOException {
-      KnnVectorsReader knnVectorsReader = fields.get(field);
-      if (knnVectorsReader == null) {
-        return new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[0]);
-      } else {
-        return knnVectorsReader.searchExhaustively(field, target, k, acceptDocs);
-      }
-    }
-
     @Override
     public void close() throws IOException {
       IOUtils.close(fields.values());