Skip to content

Commit

Permalink
Add support for index sorting with document blocks (#12829)
Browse files Browse the repository at this point in the history
Today index sorting will most likely break document blocks added with `IndexWriter#addDocuments(...)` and `#updateDocuments(...)` since the index sorter has no indication of what documents are part of a block. This change automatically adds a marker field to parent documents if configured in `IWC`. These marker documents are optional unless document blocks are indexed and index sorting is configured. In this case indexing blocks will fail unless a parent field is configured. Index sorting will preserve document blocks during sort. Documents within a block will not be reordered by the sorting algorithm and will sort alongside their parent documents.

Relates to #12711
  • Loading branch information
s1monw committed Jan 17, 2024
1 parent 00e2fe6 commit 0aa8891
Show file tree
Hide file tree
Showing 47 changed files with 1,106 additions and 91 deletions.
6 changes: 6 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ New Features
better-scoring nodes are available, or the best candidate is below a score of `traversalSimilarity` in the lowest
level. (Aditya Prakash, Kaival Parikh)

* GITHUB#12829: For indices newly created as of 9.10.0 onwards, IndexWriter preserves document blocks indexed via
IndexWriter#addDocuments or IndexWriter#updateDocuments also when index sorting is configured. Document blocks are
maintained alongside their parent documents during sort and merge. IndexWriterConfig accepts a parent field that is used
  to maintain block orders if index sorting is used. Note, this is fully optional in Lucene 9.x, while it will be mandatory for
indices that use document blocks together with index sorting as of 10.0.0. (Simon Willnauer)

Improvements
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,8 @@ private FieldInfo[] readFieldInfos(IndexInput input, int version) throws IOExcep
0,
VectorEncoding.FLOAT32,
VectorSimilarityFunction.EUCLIDEAN,
isSoftDeletesField);
isSoftDeletesField,
false);
} catch (IllegalStateException e) {
throw new CorruptIndexException(
"invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,8 @@ public FieldInfos read(
vectorDimension,
VectorEncoding.FLOAT32,
vectorDistFunc,
isSoftDeletesField);
isSoftDeletesField,
false);
infos[i].checkConsistency();
} catch (IllegalStateException e) {
throw new CorruptIndexException(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,9 @@ protected Version[] getVersions() {
/** Returns the read/write back-compat codec under test. */
protected Codec getCodec() {
return new Lucene84RWCodec();
}

// Disable document-block support for this back-compat test — presumably the
// Lucene84 format predates the hasBlocks flag; confirm against the codec.
@Override
protected boolean supportsHasBlocks() {
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,9 @@ protected Version[] getVersions() {
/** Returns the read/write back-compat codec under test. */
protected Codec getCodec() {
return new Lucene87RWCodec();
}

// Disable document-block support for this back-compat test — presumably the
// Lucene87 format predates the hasBlocks flag; confirm against the codec.
@Override
protected boolean supportsHasBlocks() {
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,9 @@ protected Version[] getVersions() {
/** Returns the read/write back-compat codec under test. */
protected Codec getCodec() {
return new Lucene90RWCodec();
}

// Disable document-block support for this back-compat test — presumably the
// Lucene90 format predates the hasBlocks flag; confirm against the codec.
@Override
protected boolean supportsHasBlocks() {
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.FieldExistsQuery;
Expand Down Expand Up @@ -2187,6 +2189,83 @@ public void testSortedIndex() throws Exception {
}
}

/**
 * Verifies that document blocks appended via {@link IndexWriter#addDocuments} to a
 * pre-existing sorted index are kept intact: after random flushes and an optional
 * force-merge, the two child documents of each block must remain adjacent and be
 * immediately followed by their parent document in docID order.
 */
public void testSortedIndexAddDocBlocks() throws Exception {
  for (String name : oldSortedNames) {
    Path path = createTempDir("sorted");
    InputStream resource = TestBackwardsCompatibility.class.getResourceAsStream(name + ".zip");
    // Fixed duplicated word in the failure message ("Sorted index index").
    assertNotNull("Sorted index " + name + " not found", resource);
    TestUtil.unzip(resource, path);

    try (Directory dir = newFSDirectory(path)) {
      // Grab the index sort from the existing single-segment index so the
      // appending writer preserves it.
      final Sort sort;
      try (DirectoryReader reader = DirectoryReader.open(dir)) {
        assertEquals(1, reader.leaves().size());
        sort = reader.leaves().get(0).reader().getMetaData().getSort();
        assertNotNull(sort);
        searchExampleIndex(reader);
      }
      // open writer
      try (IndexWriter writer =
          new IndexWriter(
              dir,
              newIndexWriterConfig(new MockAnalyzer(random()))
                  .setOpenMode(OpenMode.APPEND)
                  .setIndexSort(sort)
                  .setMergePolicy(newLogMergePolicy()))) {
        // Index 10 blocks of [child, child, parent]; random flushes exercise
        // block preservation across segment boundaries.
        for (int i = 0; i < 10; i++) {
          Document child = new Document();
          child.add(new StringField("relation", "child", Field.Store.NO));
          child.add(new StringField("bid", "" + i, Field.Store.NO));
          child.add(new NumericDocValuesField("dateDV", i));
          Document parent = new Document();
          parent.add(new StringField("relation", "parent", Field.Store.NO));
          parent.add(new StringField("bid", "" + i, Field.Store.NO));
          parent.add(new NumericDocValuesField("dateDV", i));
          // The same child Document instance is deliberately added twice to
          // produce two identical children per block.
          writer.addDocuments(Arrays.asList(child, child, parent));
          if (random().nextBoolean()) {
            writer.flush();
          }
        }
        if (random().nextBoolean()) {
          writer.forceMerge(1);
        }
        writer.commit();
        try (IndexReader reader = DirectoryReader.open(dir)) {
          IndexSearcher searcher = new IndexSearcher(reader);
          for (int i = 0; i < 10; i++) {
            TopDocs children =
                searcher.search(
                    new BooleanQuery.Builder()
                        .add(
                            new TermQuery(new Term("relation", "child")),
                            BooleanClause.Occur.MUST)
                        .add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST)
                        .build(),
                    2);
            TopDocs parents =
                searcher.search(
                    new BooleanQuery.Builder()
                        .add(
                            new TermQuery(new Term("relation", "parent")),
                            BooleanClause.Occur.MUST)
                        .add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST)
                        .build(),
                    2);
            assertEquals(2, children.totalHits.value);
            assertEquals(1, parents.totalHits.value);
            // make sure it's sorted: children are adjacent and directly precede
            // their parent in docID order.
            assertEquals(children.scoreDocs[0].doc + 1, children.scoreDocs[1].doc);
            assertEquals(children.scoreDocs[1].doc + 1, parents.scoreDocs[0].doc);
          }
        }
      }
      // This will confirm the docs are really sorted
      TestUtil.checkIndex(dir);
    }
  }
}

private void searchExampleIndex(DirectoryReader reader) throws IOException {
IndexSearcher searcher = newSearcher(reader);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat {
static final BytesRef VECTOR_ENCODING = new BytesRef(" vector encoding ");
static final BytesRef VECTOR_SIMILARITY = new BytesRef(" vector similarity ");
static final BytesRef SOFT_DELETES = new BytesRef(" soft-deletes ");
static final BytesRef PARENT = new BytesRef(" parent ");

@Override
public FieldInfos read(
Expand Down Expand Up @@ -170,6 +171,9 @@ public FieldInfos read(
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), SOFT_DELETES);
boolean isSoftDeletesField = Boolean.parseBoolean(readString(SOFT_DELETES.length, scratch));
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), PARENT);
boolean isParentField = Boolean.parseBoolean(readString(PARENT.length, scratch));

infos[i] =
new FieldInfo(
Expand All @@ -188,7 +192,8 @@ public FieldInfos read(
vectorNumDimensions,
vectorEncoding,
vectorDistFunc,
isSoftDeletesField);
isSoftDeletesField,
isParentField);
}

SimpleTextUtil.checkFooter(input);
Expand Down Expand Up @@ -320,6 +325,10 @@ public void write(
SimpleTextUtil.write(out, SOFT_DELETES);
SimpleTextUtil.write(out, Boolean.toString(fi.isSoftDeletesField()), scratch);
SimpleTextUtil.writeNewline(out);

SimpleTextUtil.write(out, PARENT);
SimpleTextUtil.write(out, Boolean.toString(fi.isParentField()), scratch);
SimpleTextUtil.writeNewline(out);
}
SimpleTextUtil.writeChecksum(out, scratch);
success = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,13 @@ public SegmentInfo read(
sortField[i] = SortFieldProvider.forName(provider).readSortField(bytes);
assert bytes.eof();
}
Sort indexSort = sortField.length == 0 ? null : new Sort(sortField);

final Sort indexSort;
if (sortField.length == 0) {
indexSort = null;
} else {
indexSort = new Sort(sortField);
}

SimpleTextUtil.checkFooter(input);

Expand Down Expand Up @@ -335,7 +341,6 @@ public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOE
SimpleTextUtil.write(output, b.bytes.get().toString(), scratch);
SimpleTextUtil.writeNewline(output);
}

SimpleTextUtil.writeChecksum(output, scratch);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ private static FieldInfo getMockFieldInfo(String fieldName, int number) {
0,
VectorEncoding.FLOAT32,
VectorSimilarityFunction.EUCLIDEAN,
true);
true,
false);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ private static FieldInfo mockFieldInfo(String fieldName, int number) {
0,
VectorEncoding.FLOAT32,
VectorSimilarityFunction.EUCLIDEAN,
false,
false);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,14 @@ public FieldInfos read(
Throwable priorE = null;
FieldInfo[] infos = null;
try {
CodecUtil.checkIndexHeader(
input,
Lucene94FieldInfosFormat.CODEC_NAME,
Lucene94FieldInfosFormat.FORMAT_START,
Lucene94FieldInfosFormat.FORMAT_CURRENT,
segmentInfo.getId(),
segmentSuffix);
int format =
CodecUtil.checkIndexHeader(
input,
Lucene94FieldInfosFormat.CODEC_NAME,
Lucene94FieldInfosFormat.FORMAT_START,
Lucene94FieldInfosFormat.FORMAT_CURRENT,
segmentInfo.getId(),
segmentSuffix);

final int size = input.readVInt(); // read in the size
infos = new FieldInfo[size];
Expand All @@ -157,6 +158,18 @@ public FieldInfos read(
boolean omitNorms = (bits & OMIT_NORMS) != 0;
boolean storePayloads = (bits & STORE_PAYLOADS) != 0;
boolean isSoftDeletesField = (bits & SOFT_DELETES_FIELD) != 0;
boolean isParentField =
format >= FORMAT_PARENT_FIELD ? (bits & PARENT_FIELD_FIELD) != 0 : false;

if ((bits & 0xE0) != 0) {
throw new CorruptIndexException(
"unused bits are set \"" + Integer.toBinaryString(bits) + "\"", input);
}
if (format < FORMAT_PARENT_FIELD && (bits & 0xF0) != 0) {
throw new CorruptIndexException(
"parent field bit is set but shouldn't \"" + Integer.toBinaryString(bits) + "\"",
input);
}

final IndexOptions indexOptions = getIndexOptions(input, input.readByte());

Expand Down Expand Up @@ -200,7 +213,8 @@ public FieldInfos read(
vectorDimension,
vectorEncoding,
vectorDistFunc,
isSoftDeletesField);
isSoftDeletesField,
isParentField);
infos[i].checkConsistency();
} catch (IllegalStateException e) {
throw new CorruptIndexException(
Expand Down Expand Up @@ -348,6 +362,7 @@ public void write(
if (fi.omitsNorms()) bits |= OMIT_NORMS;
if (fi.hasPayloads()) bits |= STORE_PAYLOADS;
if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD;
if (fi.isParentField()) bits |= PARENT_FIELD_FIELD;
output.writeByte(bits);

output.writeByte(indexOptionsByte(fi.getIndexOptions()));
Expand Down Expand Up @@ -375,11 +390,14 @@ public void write(
// Codec header
static final String CODEC_NAME = "Lucene94FieldInfos";
static final int FORMAT_START = 0;
static final int FORMAT_CURRENT = FORMAT_START;
  // this doesn't actually change the file format but uses up one more bit in an existing bit
  // pattern
static final int FORMAT_PARENT_FIELD = 1;
static final int FORMAT_CURRENT = FORMAT_PARENT_FIELD;

// Field flags
static final byte STORE_TERMVECTOR = 0x1;
static final byte OMIT_NORMS = 0x2;
static final byte STORE_PAYLOADS = 0x4;
static final byte SOFT_DELETES_FIELD = 0x8;
static final byte PARENT_FIELD_FIELD = 0x10;
}
29 changes: 17 additions & 12 deletions lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -1196,34 +1196,39 @@ public static Status.IndexSortStatus testSort(
comparators[i] = fields[i].getComparator(1, Pruning.NONE).getLeafComparator(readerContext);
}

int maxDoc = reader.maxDoc();

try {

for (int docID = 1; docID < maxDoc; docID++) {

LeafMetaData metaData = reader.getMetaData();
FieldInfos fieldInfos = reader.getFieldInfos();
final DocIdSetIterator iter;
if (metaData.hasBlocks() && fieldInfos.getParentField() != null) {
iter = reader.getNumericDocValues(fieldInfos.getParentField());
} else {
iter = DocIdSetIterator.all(reader.maxDoc());
}
int prevDoc = iter.nextDoc();
int nextDoc;
while ((nextDoc = iter.nextDoc()) != NO_MORE_DOCS) {
int cmp = 0;

for (int i = 0; i < comparators.length; i++) {
// TODO: would be better if copy() didnt cause a term lookup in TermOrdVal & co,
// TODO: would be better if copy() didn't cause a term lookup in TermOrdVal & co,
// the segments are always the same here...
comparators[i].copy(0, docID - 1);
comparators[i].copy(0, prevDoc);
comparators[i].setBottom(0);
cmp = reverseMul[i] * comparators[i].compareBottom(docID);
cmp = reverseMul[i] * comparators[i].compareBottom(nextDoc);
if (cmp != 0) {
break;
}
}

if (cmp > 0) {
throw new CheckIndexException(
"segment has indexSort="
+ sort
+ " but docID="
+ (docID - 1)
+ (prevDoc)
+ " sorts after docID="
+ docID);
+ nextDoc);
}
prevDoc = nextDoc;
}
msg(
infoStream,
Expand Down
Loading

0 comments on commit 0aa8891

Please sign in to comment.