From 193d87abed655df6806cb04df82a71c99380b6ed Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 16 Nov 2023 12:46:12 +0100 Subject: [PATCH 01/44] first cut at sorting only the parent documents --- .../org/apache/lucene/index/CheckIndex.java | 23 +++-- .../org/apache/lucene/index/IndexSorter.java | 6 ++ .../apache/lucene/index/IndexingChain.java | 27 +++--- .../org/apache/lucene/index/LeafMetaData.java | 4 + .../org/apache/lucene/index/MultiSorter.java | 12 +++ .../java/org/apache/lucene/index/Sorter.java | 14 +++- .../apache/lucene/index/TestIndexSorting.java | 83 +++++++++++++++++++ 7 files changed, 148 insertions(+), 21 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index cd601743f294..f1ac129574e7 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -70,7 +70,6 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.Lock; -import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil.ByteArrayComparator; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; @@ -89,6 +88,7 @@ import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.Operations; + /** * Basic tool and API to check the health of an index and write a new segments file that removes * reference to problematic segments. @@ -1183,20 +1183,25 @@ public static Status.IndexSortStatus testSort( int maxDoc = reader.maxDoc(); try { - - for (int docID = 1; docID < maxDoc; docID++) { - + LeafMetaData metaData = reader.getMetaData(); + DocIdSetIterator iter = DocIdSetIterator.all(reader.maxDoc()); + if (metaData.hasBlocks()) { + iter = reader.getNumericDocValues("parent"); // NOCOMMIT hard coded + } + int prevDoc = iter.nextDoc(); + int nextDoc; + while ((nextDoc = iter.nextDoc()) != NO_MORE_DOCS) { int cmp = 0; - for (int i = 0; i < comparators.length; i++) { // TODO: would be better if copy() didnt cause a term lookup in TermOrdVal & co, // the segments are always the same here... - comparators[i].copy(0, docID - 1); + comparators[i].copy(0, prevDoc); comparators[i].setBottom(0); - cmp = reverseMul[i] * comparators[i].compareBottom(docID); + cmp = reverseMul[i] * comparators[i].compareBottom(nextDoc); if (cmp != 0) { break; } + prevDoc = nextDoc; } if (cmp > 0) { @@ -1204,9 +1209,9 @@ public static Status.IndexSortStatus testSort( "segment has indexSort=" + sort + " but docID=" - + (docID - 1) + + (prevDoc) + " sorts after docID=" - + docID); + + nextDoc); } } msg( diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexSorter.java b/lucene/core/src/java/org/apache/lucene/index/IndexSorter.java index 10fc196232d6..ac625830c9ea 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexSorter.java @@ -102,6 +102,12 @@ interface SortedDocValuesProvider { SortedDocValues get(LeafReader reader) throws IOException; } + /** Provide a TermsEnum instance for a LeafReader */ + interface TermsEnumProvider { + /** Returns the TermsEnum instance for this LeafReader */ + TermsEnum get(LeafReader reader) throws IOException; + } + /** Sorts documents based on integer values from a NumericDocValues instance */ final class IntSorter implements IndexSorter { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index cb2b1d1c1a87..0ac367eb0983 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -27,6 +27,7 @@ import java.util.Objects; import java.util.concurrent.TimeUnit; import java.util.function.Consumer; +import java.util.function.Function; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.codecs.DocValuesConsumer; @@ -47,16 +48,8 @@ import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; -import org.apache.lucene.util.Accountable; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.ByteBlockPool; -import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.*; import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException; -import org.apache.lucene.util.Counter; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.InfoStream; -import org.apache.lucene.util.IntBlockPool; -import org.apache.lucene.util.RamUsageEstimator; /** Default general purpose indexing chain, which handles indexing all types of fields. */ final class IndexingChain implements Accountable { @@ -219,7 +212,16 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti } LeafReader docValuesReader = getDocValuesLeafReader(); - + Function comparatorWrapper = in -> in; + if (state.segmentInfo.getHasBlocks()) { + final NumericDocValues readerValues = + docValuesReader.getNumericDocValues("parent"); // NOCOMMIT hard coded + BitSet parents = BitSet.of(readerValues, state.segmentInfo.maxDoc()); + comparatorWrapper = + in -> + (docID1, docID2) -> + in.compare(parents.nextSetBit(docID1), parents.nextSetBit(docID2)); + } List comparators = new ArrayList<>(); for (int i = 0; i < indexSort.getSort().length; i++) { SortField sortField = indexSort.getSort()[i]; @@ -227,7 +229,10 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti if (sorter == null) { throw new UnsupportedOperationException("Cannot sort index using sort field " + sortField); } - comparators.add(sorter.getDocComparator(docValuesReader, state.segmentInfo.maxDoc())); + + IndexSorter.DocComparator docComparator = + sorter.getDocComparator(docValuesReader, state.segmentInfo.maxDoc()); + comparators.add(comparatorWrapper.apply(docComparator)); } Sorter sorter = new Sorter(indexSort); // returns null if the documents are already sorted diff --git a/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java b/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java index 77a9f2a84b64..61511eb0d9b0 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java +++ b/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java @@ -91,4 +91,8 @@ public Sort getSort() { public boolean hasBlocks() { return hasBlocks; } + + public String getRootField() { + return "parent"; // nocommit + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index 2b1fd2ee6b14..46447223ed92 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -22,6 +22,7 @@ import org.apache.lucene.index.MergeState.DocMap; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; +import org.apache.lucene.util.BitSet; import org.apache.lucene.util.Bits; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.packed.PackedInts; @@ -50,6 +51,17 @@ static MergeState.DocMap[] sort(Sort sort, List readers) throws IOE "Cannot use sort field " + fields[i] + " for index sorting"); } comparables[i] = sorter.getComparableProviders(readers); + for (int j = 0; j < readers.size(); j++) { + CodecReader codecReader = readers.get(j); + if (codecReader.getMetaData().hasBlocks()) { + final NumericDocValues readerValues = + codecReader.getNumericDocValues(codecReader.getMetaData().getRootField()); + BitSet parents = BitSet.of(readerValues, codecReader.maxDoc()); + IndexSorter.ComparableProvider[] providers = comparables[i]; + IndexSorter.ComparableProvider provider = providers[j]; + providers[j] = docId -> provider.getAsComparableLong(parents.nextSetBit(docId)); + } + } reverseMuls[i] = fields[i].getReverse() ? -1 : 1; } int leafCount = readers.size(); diff --git a/lucene/core/src/java/org/apache/lucene/index/Sorter.java b/lucene/core/src/java/org/apache/lucene/index/Sorter.java index 7945505be92f..254b210c4ba6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/Sorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/Sorter.java @@ -17,8 +17,10 @@ package org.apache.lucene.index; import java.io.IOException; +import java.util.function.Function; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; +import org.apache.lucene.util.BitSet; import org.apache.lucene.util.TimSorter; import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PackedLongValues; @@ -206,13 +208,23 @@ DocMap sort(LeafReader reader) throws IOException { SortField[] fields = sort.getSort(); final IndexSorter.DocComparator[] comparators = new IndexSorter.DocComparator[fields.length]; + Function comparatorWrapper = in -> in; + if (reader.getMetaData().hasBlocks()) { + final NumericDocValues readerValues = + reader.getNumericDocValues(reader.getMetaData().getRootField()); + BitSet parents = BitSet.of(readerValues, reader.maxDoc()); + comparatorWrapper = + in -> + (docID1, docID2) -> + in.compare(parents.nextSetBit(docID1), parents.nextSetBit(docID2)); + } for (int i = 0; i < fields.length; i++) { IndexSorter sorter = fields[i].getIndexSorter(); if (sorter == null) { throw new IllegalArgumentException( "Cannot use sortfield + " + fields[i] + " to sort indexes"); } - comparators[i] = sorter.getDocComparator(reader, reader.maxDoc()); + comparators[i] = comparatorWrapper.apply(sorter.getDocComparator(reader, reader.maxDoc())); } return sort(reader.maxDoc(), comparators); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index e54e76875103..5d93af27f794 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -20,6 +20,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import static org.hamcrest.core.StringContains.containsString; +import com.carrotsearch.randomizedtesting.annotations.Repeat; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; @@ -3171,4 +3172,86 @@ public void testSortDocsAndFreqsAndPositionsAndOffsets() throws IOException { reader.close(); dir.close(); } + + @Repeat(iterations = 100) + public void testIndexSortWithBlocks() throws IOException { + try (Directory dir = newDirectory()) { + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + AssertingNeedsIndexSortCodec codec = new AssertingNeedsIndexSortCodec(); + iwc.setCodec(codec); + String parentField = "parent"; + Sort indexSort = new Sort(new SortField("foo", SortField.Type.INT)); + iwc.setIndexSort(indexSort); + LogMergePolicy policy = newLogMergePolicy(); + // make sure that merge factor is always > 2 + if (policy.getMergeFactor() <= 2) { + policy.setMergeFactor(3); + } + iwc.setMergePolicy(policy); + + // add already sorted documents + codec.numCalls = 0; + codec.needsIndexSort = false; + try (IndexWriter w = new IndexWriter(dir, iwc)) { + int numDocs = random().nextInt(50, 100); + for (int i = 0; i < numDocs; i++) { + Document child1 = new Document(); + child1.add(new StringField("id", Integer.toString(i), Store.YES)); + child1.add(new NumericDocValuesField("id", i)); + child1.add(new NumericDocValuesField("child", 1)); + child1.add(new NumericDocValuesField("foo", random().nextInt())); + Document child2 = new Document(); + child2.add(new StringField("id", Integer.toString(i), Store.YES)); + child2.add(new NumericDocValuesField("id", i)); + child2.add(new NumericDocValuesField("child", 2)); + child2.add(new NumericDocValuesField("foo", random().nextInt())); + Document parent = new Document(); + parent.add(new StringField("id", Integer.toString(i), Store.YES)); + parent.add(new NumericDocValuesField("id", i)); + parent.add(new NumericDocValuesField("foo", random().nextInt())); + parent.add(new NumericDocValuesField(parentField, 0)); + w.addDocuments(Arrays.asList(child1, child2, parent)); + if (rarely()) { + w.commit(); + } + } + w.commit(); + if (random().nextBoolean()) { + w.forceMerge(1, true); + } + } + + try (DirectoryReader reader = DirectoryReader.open(dir)) { + for (LeafReaderContext ctx : reader.leaves()) { + LeafReader leaf = ctx.reader(); + NumericDocValues parentDV = leaf.getNumericDocValues(parentField); + NumericDocValues ids = leaf.getNumericDocValues("id"); + NumericDocValues children = leaf.getNumericDocValues("child"); + int doc = -1; + int expectedDocID = 2; + while ((doc = parentDV.nextDoc()) != NO_MORE_DOCS) { + assertEquals(expectedDocID, doc); + int id = ids.nextDoc(); + long child1ID = ids.longValue(); + assertEquals(id, children.nextDoc()); + long child1 = children.longValue(); + assertEquals(1, child1); + + id = ids.nextDoc(); + long child2ID = ids.longValue(); + assertEquals(id, children.nextDoc()); + long child2 = children.longValue(); + assertEquals(2, child2); + + int idParent = ids.nextDoc(); + assertEquals(id + 1, idParent); + long parent = ids.longValue(); + assertEquals(child1ID, parent); + assertEquals(child2ID, parent); + expectedDocID += 3; + } + } + } + } + } } From f5eb4f8a79aa02aca7b2522a8b48fdc1e978fc31 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Fri, 17 Nov 2023 23:00:08 +0100 Subject: [PATCH 02/44] progress --- .../lucene99/Lucene99SegmentInfoFormat.java | 10 ++++++++- .../org/apache/lucene/index/CheckIndex.java | 14 ++++++------- .../index/DocumentsWriterPerThread.java | 5 +++++ .../org/apache/lucene/index/IndexSorter.java | 6 ------ .../apache/lucene/index/IndexingChain.java | 3 +-- .../org/apache/lucene/index/LeafMetaData.java | 4 ---- .../org/apache/lucene/index/MultiSorter.java | 5 ++--- .../java/org/apache/lucene/index/Sorter.java | 5 ++--- .../java/org/apache/lucene/search/Sort.java | 21 +++++++++++++++---- .../apache/lucene/index/TestIndexSorting.java | 17 +++++++++------ 10 files changed, 53 insertions(+), 37 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99SegmentInfoFormat.java index 9341c312299e..fd3ef9344d32 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99SegmentInfoFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99SegmentInfoFormat.java @@ -142,7 +142,8 @@ private SegmentInfo parseSegmentInfo( String name = input.readString(); sortFields[i] = SortFieldProvider.forName(name).readSortField(input); } - indexSort = new Sort(sortFields); + String rootDocfield = input.readByte() == SegmentInfo.YES ? input.readString() : null; + indexSort = new Sort(rootDocfield, sortFields); } else if (numSortFields < 0) { throw new CorruptIndexException("invalid index sort field count: " + numSortFields, input); } else { @@ -232,5 +233,12 @@ private void writeSegmentInfo(DataOutput output, SegmentInfo si) throws IOExcept output.writeString(sorter.getProviderName()); SortFieldProvider.write(sortField, output); } + if (numSortFields > 0) { + String rootDocField = indexSort.getRootDocField(); + output.writeByte((byte) (rootDocField != null ? SegmentInfo.YES : SegmentInfo.NO)); + if (rootDocField != null) { + output.writeString(rootDocField); + } + } } } diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index f1ac129574e7..c5a79725869b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -1180,20 +1180,18 @@ public static Status.IndexSortStatus testSort( comparators[i] = fields[i].getComparator(1, Pruning.NONE).getLeafComparator(readerContext); } - int maxDoc = reader.maxDoc(); - try { LeafMetaData metaData = reader.getMetaData(); - DocIdSetIterator iter = DocIdSetIterator.all(reader.maxDoc()); - if (metaData.hasBlocks()) { - iter = reader.getNumericDocValues("parent"); // NOCOMMIT hard coded - } + final DocIdSetIterator iter = + metaData.hasBlocks() + ? reader.getNumericDocValues(sort.getRootDocField()) + : DocIdSetIterator.all(reader.maxDoc()); int prevDoc = iter.nextDoc(); int nextDoc; while ((nextDoc = iter.nextDoc()) != NO_MORE_DOCS) { int cmp = 0; for (int i = 0; i < comparators.length; i++) { - // TODO: would be better if copy() didnt cause a term lookup in TermOrdVal & co, + // TODO: would be better if copy() didn't cause a term lookup in TermOrdVal & co, // the segments are always the same here... comparators[i].copy(0, prevDoc); comparators[i].setBottom(0); @@ -1221,7 +1219,7 @@ public static Status.IndexSortStatus testSort( if (failFast) { throw IOUtils.rethrowAlways(e); } - msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]"); + msg(infoStream, "ERROR [" + e.getMessage() + "]"); status.error = e; if (infoStream != null) { e.printStackTrace(infoStream); diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 57ada3a56022..4e8f7d648c94 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -247,6 +247,11 @@ long updateDocuments( } allDocsIndexed = true; if (numDocsInRAM - docsInRamBefore > 1) { + if (segmentInfo.getIndexSort() != null) { + if (segmentInfo.getIndexSort().getRootDocField() == null) { + throw new IllegalArgumentException("A root doc field must be set"); + } + } segmentInfo.setHasBlocks(); } return finishDocuments(deleteNode, docsInRamBefore); diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexSorter.java b/lucene/core/src/java/org/apache/lucene/index/IndexSorter.java index ac625830c9ea..10fc196232d6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexSorter.java @@ -102,12 +102,6 @@ interface SortedDocValuesProvider { SortedDocValues get(LeafReader reader) throws IOException; } - /** Provide a TermsEnum instance for a LeafReader */ - interface TermsEnumProvider { - /** Returns the TermsEnum instance for this LeafReader */ - TermsEnum get(LeafReader reader) throws IOException; - } - /** Sorts documents based on integer values from a NumericDocValues instance */ final class IntSorter implements IndexSorter { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index 0ac367eb0983..1c77b83acf6b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -214,8 +214,7 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti LeafReader docValuesReader = getDocValuesLeafReader(); Function comparatorWrapper = in -> in; if (state.segmentInfo.getHasBlocks()) { - final NumericDocValues readerValues = - docValuesReader.getNumericDocValues("parent"); // NOCOMMIT hard coded + final DocIdSetIterator readerValues = docValuesReader.getNumericDocValues(indexSort.getRootDocField()); BitSet parents = BitSet.of(readerValues, state.segmentInfo.maxDoc()); comparatorWrapper = in -> diff --git a/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java b/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java index 61511eb0d9b0..77a9f2a84b64 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java +++ b/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java @@ -91,8 +91,4 @@ public Sort getSort() { public boolean hasBlocks() { return hasBlocks; } - - public String getRootField() { - return "parent"; // nocommit - } } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index 46447223ed92..ea0e852422e8 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -54,9 +54,8 @@ static MergeState.DocMap[] sort(Sort sort, List readers) throws IOE for (int j = 0; j < readers.size(); j++) { CodecReader codecReader = readers.get(j); if (codecReader.getMetaData().hasBlocks()) { - final NumericDocValues readerValues = - codecReader.getNumericDocValues(codecReader.getMetaData().getRootField()); - BitSet parents = BitSet.of(readerValues, codecReader.maxDoc()); + BitSet parents = + BitSet.of(sort.getRootDocs().rootDocs(codecReader), codecReader.maxDoc()); IndexSorter.ComparableProvider[] providers = comparables[i]; IndexSorter.ComparableProvider provider = providers[j]; providers[j] = docId -> provider.getAsComparableLong(parents.nextSetBit(docId)); diff --git a/lucene/core/src/java/org/apache/lucene/index/Sorter.java b/lucene/core/src/java/org/apache/lucene/index/Sorter.java index 254b210c4ba6..8a3a7174a862 100644 --- a/lucene/core/src/java/org/apache/lucene/index/Sorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/Sorter.java @@ -210,9 +210,8 @@ DocMap sort(LeafReader reader) throws IOException { Function comparatorWrapper = in -> in; if (reader.getMetaData().hasBlocks()) { - final NumericDocValues readerValues = - reader.getNumericDocValues(reader.getMetaData().getRootField()); - BitSet parents = BitSet.of(readerValues, reader.maxDoc()); + assert sort.getRootDocField() != null; + BitSet parents = BitSet.of(reader.getNumericDocValues(sort.getRootDocField()), reader.maxDoc()); comparatorWrapper = in -> (docID1, docID2) -> diff --git a/lucene/core/src/java/org/apache/lucene/search/Sort.java b/lucene/core/src/java/org/apache/lucene/search/Sort.java index 868fbf16360e..661c5d958b78 100644 --- a/lucene/core/src/java/org/apache/lucene/search/Sort.java +++ b/lucene/core/src/java/org/apache/lucene/search/Sort.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.util.Arrays; +import java.util.Objects; /** * Encapsulates sort criteria for returned hits. @@ -43,6 +44,7 @@ public final class Sort { // internal representation of the sort criteria private final SortField[] fields; + private final String rootDocField; /** * Sorts by computed relevance. This is the same sort criteria as calling {@link @@ -59,10 +61,15 @@ public Sort() { * is still a tie after all SortFields are checked, the internal Lucene docid is used to break it. */ public Sort(SortField... fields) { + this(null, fields); + } + + public Sort(String rootDocField, SortField... fields) { if (fields.length == 0) { throw new IllegalArgumentException("There must be at least 1 sort field"); } this.fields = fields; + this.rootDocField = rootDocField; } /** @@ -74,6 +81,10 @@ public SortField[] getSort() { return fields; } + public String getRootDocField() { + return rootDocField; + } + /** * Rewrites the SortFields in this Sort, returning a new Sort if any of the fields changes during * their rewriting. @@ -85,7 +96,7 @@ public SortField[] getSort() { */ public Sort rewrite(IndexSearcher searcher) throws IOException { boolean changed = false; - + assert rootDocField == null; SortField[] rewrittenSortFields = new SortField[fields.length]; for (int i = 0; i < fields.length; i++) { rewrittenSortFields[i] = fields[i].rewrite(searcher); @@ -100,7 +111,9 @@ public Sort rewrite(IndexSearcher searcher) throws IOException { @Override public String toString() { StringBuilder buffer = new StringBuilder(); - + if (rootDocField != null) { + buffer.append("RootDocField: ").append(rootDocField).append(' '); + } for (int i = 0; i < fields.length; i++) { buffer.append(fields[i].toString()); if ((i + 1) < fields.length) buffer.append(','); @@ -115,13 +128,13 @@ public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof Sort)) return false; final Sort other = (Sort) o; - return Arrays.equals(this.fields, other.fields); + return Objects.equals(rootDocField, other.rootDocField) && Arrays.equals(this.fields, other.fields); } /** Returns a hash code value for this object. */ @Override public int hashCode() { - return 0x45aaf665 + Arrays.hashCode(fields); + return 0x45aaf665 + Arrays.hashCode(fields) + Objects.hashCode(rootDocField); } /** Returns true if the relevance score is needed to sort documents. */ diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 5d93af27f794..7f9720bc20db 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -20,7 +20,6 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import static org.hamcrest.core.StringContains.containsString; -import com.carrotsearch.randomizedtesting.annotations.Repeat; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; @@ -3173,14 +3172,16 @@ public void testSortDocsAndFreqsAndPositionsAndOffsets() throws IOException { dir.close(); } - @Repeat(iterations = 100) public void testIndexSortWithBlocks() throws IOException { try (Directory dir = newDirectory()) { IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); AssertingNeedsIndexSortCodec codec = new AssertingNeedsIndexSortCodec(); iwc.setCodec(codec); String parentField = "parent"; - Sort indexSort = new Sort(new SortField("foo", SortField.Type.INT)); + Sort.RootDocs rootDocs = new Sort.RootDocs(parentField, DocValuesType.NUMERIC); + Sort indexSort = + new Sort(rootDocs, + new SortField("foo", SortField.Type.INT)); iwc.setIndexSort(indexSort); LogMergePolicy policy = newLogMergePolicy(); // make sure that merge factor is always > 2 @@ -3209,7 +3210,11 @@ public void testIndexSortWithBlocks() throws IOException { parent.add(new StringField("id", Integer.toString(i), Store.YES)); parent.add(new NumericDocValuesField("id", i)); parent.add(new NumericDocValuesField("foo", random().nextInt())); - parent.add(new NumericDocValuesField(parentField, 0)); + if (rootDocs.type() == DocValuesType.NONE) { + parent.add(new NumericDocValuesField(parentField, 0)); + } else { + parent.add(new StringField(parentField, parentField, Store.NO)); + } w.addDocuments(Arrays.asList(child1, child2, parent)); if (rarely()) { w.commit(); @@ -3224,12 +3229,12 @@ public void testIndexSortWithBlocks() throws IOException { try (DirectoryReader reader = DirectoryReader.open(dir)) { for (LeafReaderContext ctx : reader.leaves()) { LeafReader leaf = ctx.reader(); - NumericDocValues parentDV = leaf.getNumericDocValues(parentField); + DocIdSetIterator parentDISI = rootDocs.rootDocs(leaf); NumericDocValues ids = leaf.getNumericDocValues("id"); NumericDocValues children = leaf.getNumericDocValues("child"); int doc = -1; int expectedDocID = 2; - while ((doc = parentDV.nextDoc()) != NO_MORE_DOCS) { + while ((doc = parentDISI.nextDoc()) != NO_MORE_DOCS) { assertEquals(expectedDocID, doc); int id = ids.nextDoc(); long child1ID = ids.longValue(); From f3b90c028fe856500bc75f4962b11114712a3897 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Sat, 18 Nov 2023 12:32:12 +0100 Subject: [PATCH 03/44] add validator --- .../lucene99/Lucene99SegmentInfoFormat.java | 12 +-- .../org/apache/lucene/index/CheckIndex.java | 2 +- .../index/DocumentsWriterPerThread.java | 98 +++++++++++++++++-- .../apache/lucene/index/IndexingChain.java | 7 +- .../org/apache/lucene/index/MultiSorter.java | 5 +- .../java/org/apache/lucene/index/Sorter.java | 4 +- .../java/org/apache/lucene/search/Sort.java | 24 +++-- .../apache/lucene/index/TestIndexSorting.java | 11 +-- 8 files changed, 123 insertions(+), 40 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99SegmentInfoFormat.java index fd3ef9344d32..bbf771408432 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99SegmentInfoFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99SegmentInfoFormat.java @@ -142,8 +142,8 @@ private SegmentInfo parseSegmentInfo( String name = input.readString(); sortFields[i] = SortFieldProvider.forName(name).readSortField(input); } - String rootDocfield = input.readByte() == SegmentInfo.YES ? input.readString() : null; - indexSort = new Sort(rootDocfield, sortFields); + String parentField = input.readByte() == SegmentInfo.YES ? input.readString() : null; + indexSort = new Sort(parentField, sortFields); } else if (numSortFields < 0) { throw new CorruptIndexException("invalid index sort field count: " + numSortFields, input); } else { @@ -234,10 +234,10 @@ private void writeSegmentInfo(DataOutput output, SegmentInfo si) throws IOExcept SortFieldProvider.write(sortField, output); } if (numSortFields > 0) { - String rootDocField = indexSort.getRootDocField(); - output.writeByte((byte) (rootDocField != null ? SegmentInfo.YES : SegmentInfo.NO)); - if (rootDocField != null) { - output.writeString(rootDocField); + String parentField = indexSort.getParentField(); + output.writeByte((byte) (parentField != null ? SegmentInfo.YES : SegmentInfo.NO)); + if (parentField != null) { + output.writeString(parentField); } } } diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index c5a79725869b..246ef341cb93 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -1184,7 +1184,7 @@ public static Status.IndexSortStatus testSort( LeafMetaData metaData = reader.getMetaData(); final DocIdSetIterator iter = metaData.hasBlocks() - ? reader.getNumericDocValues(sort.getRootDocField()) + ? reader.getNumericDocValues(sort.getParentField()) : DocIdSetIterator.all(reader.maxDoc()); int prevDoc = iter.nextDoc(); int nextDoc; diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 4e8f7d648c94..8a5797225751 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -28,9 +28,12 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.ReentrantLock; +import java.util.function.Consumer; + import org.apache.lucene.codecs.Codec; import org.apache.lucene.index.DocumentsWriterDeleteQueue.DeleteSlice; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Sort; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FlushInfo; import org.apache.lucene.store.IOContext; @@ -135,6 +138,8 @@ void abort() throws IOException { private int[] deleteDocIDs = new int[0]; private int numDeletedDocIds = 0; + private final DocValidator validator; + DocumentsWriterPerThread( int indexVersionCreated, String segmentName, @@ -189,6 +194,8 @@ void abort() throws IOException { fieldInfos, indexWriterConfig, this::onAbortingException); + validator = indexWriterConfig.getIndexSort() != null ? new ParentBlockValidator(indexWriterConfig.getIndexSort()) + : DocValidator.EMPTY; } final void testPoint(String message) { @@ -230,6 +237,7 @@ long updateDocuments( } final int docsInRamBefore = numDocsInRAM; boolean allDocsIndexed = false; + validator.reset(); try { for (Iterable doc : docs) { // Even on exception, the document is still added (but marked @@ -239,21 +247,22 @@ long updateDocuments( // it's very hard to fix (we can't easily distinguish aborting // vs non-aborting exceptions): reserveOneDoc(); + try { - indexingChain.processDocument(numDocsInRAM++, doc); + validator.beforeDocument(); + indexingChain.processDocument(numDocsInRAM++, doc, validator); + validator.afterDocument(); } finally { onNewDocOnRAM.run(); } } - allDocsIndexed = true; - if (numDocsInRAM - docsInRamBefore > 1) { - if (segmentInfo.getIndexSort() != null) { - if (segmentInfo.getIndexSort().getRootDocField() == null) { - throw new IllegalArgumentException("A root doc field must be set"); - } - } + + final int numDocs = numDocsInRAM - docsInRamBefore; + validator.afterDocuments(numDocs); + if (numDocs > 1) { segmentInfo.setHasBlocks(); } + allDocsIndexed = true; return finishDocuments(deleteNode, docsInRamBefore); } finally { if (!allDocsIndexed && !aborted) { @@ -267,6 +276,79 @@ long updateDocuments( } } + private interface DocValidator extends Consumer { + + DocValidator EMPTY = indexableField -> {}; + default void afterDocument() {} + default void beforeDocument() {} + default void afterDocuments(int numDocs) {} + default void reset() {} + } + + private static class ParentBlockValidator implements DocValidator { + private boolean lastDocHasParentField = false; + private boolean childDocHasParentField = false; + private final String parentFieldName; + + @Override + public void reset() { + lastDocHasParentField = false; + childDocHasParentField = false; + } + + private ParentBlockValidator(Sort sort) { + this.parentFieldName = sort.getParentField(); + } + + @Override + public void beforeDocument() { + if (childDocHasParentField == false && lastDocHasParentField) { + childDocHasParentField = true; + } + lastDocHasParentField = false; + } + @Override + public void accept(IndexableField field) { + if (parentFieldName != null) { + if (parentFieldName.equals(field.name()) && DocValuesType.NUMERIC == field.fieldType().docValuesType()) { + lastDocHasParentField = true; + } + } + } + + @Override + public void afterDocument() { + if (childDocHasParentField) { + throw new IllegalArgumentException("only the last document in the block must contain a numeric doc values field named: " + parentFieldName); + } + } + + @Override + public void afterDocuments(int numDocs) { + if (numDocs > 1 && parentFieldName == null) { + throw new IllegalArgumentException("A parent field must be set in order to use document blocks with index sorting"); + } + if (parentFieldName != null && lastDocHasParentField == false) { + throw new IllegalArgumentException("the last document in the block must contain a numeric doc values field named: " + parentFieldName); + } + } + } + + private void validateIndexSortWithBlocks(Iterable lastDoc) { + if (segmentInfo.getIndexSort() != null) { + String parentField = segmentInfo.getIndexSort().getParentField(); + + boolean valid = false; + for (IndexableField field : lastDoc) { + if (parentField.equals(field.name()) && DocValuesType.NUMERIC == field.fieldType().docValuesType()) { + valid = true; + break; + } + } + + } + } + private long finishDocuments(DocumentsWriterDeleteQueue.Node deleteNode, int docIdUpTo) { /* * here we actually finish the document in two steps 1. push the delete into diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index 1c77b83acf6b..047549d4c50d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -214,7 +214,7 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti LeafReader docValuesReader = getDocValuesLeafReader(); Function comparatorWrapper = in -> in; if (state.segmentInfo.getHasBlocks()) { - final DocIdSetIterator readerValues = docValuesReader.getNumericDocValues(indexSort.getRootDocField()); + final DocIdSetIterator readerValues = docValuesReader.getNumericDocValues(indexSort.getParentField()); BitSet parents = BitSet.of(readerValues, state.segmentInfo.maxDoc()); comparatorWrapper = in -> @@ -530,7 +530,7 @@ private void finishStoredFields() throws IOException { } } - void processDocument(int docID, Iterable document) throws IOException { + void processDocument(int docID, Iterable document, Consumer fieldConsumer) throws IOException { // number of unique fields by names (collapses multiple field instances by the same name) int fieldCount = 0; int indexedFieldCount = 0; // number of unique fields indexed with postings @@ -549,6 +549,7 @@ void processDocument(int docID, Iterable document) thr // 1st pass over doc fields – verify that doc schema matches the index schema // build schema for each unique doc field for (IndexableField field : document) { + fieldConsumer.accept(field); IndexableFieldType fieldType = field.fieldType(); PerField pf = getOrAddPerField(field.name()); if (pf.fieldGen != fieldGen) { // first time we see this field in this document @@ -560,7 +561,7 @@ void processDocument(int docID, Iterable document) thr docFields[docFieldIdx++] = pf; updateDocFieldSchema(field.name(), pf.schema, fieldType); } - // For each field, if it the first time we see this field in this segment, + // For each field, if it's the first time we see this field in this segment, // initialize its FieldInfo. // If we have already seen this field, verify that its schema // within the current doc matches its schema in the index. diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index ea0e852422e8..af657d753158 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -54,8 +54,9 @@ static MergeState.DocMap[] sort(Sort sort, List readers) throws IOE for (int j = 0; j < readers.size(); j++) { CodecReader codecReader = readers.get(j); if (codecReader.getMetaData().hasBlocks()) { - BitSet parents = - BitSet.of(sort.getRootDocs().rootDocs(codecReader), codecReader.maxDoc()); + NumericDocValues parentDocs = codecReader.getNumericDocValues(sort.getParentField()); + assert parentDocs != null : "parent field must be present if index sorting is used with blocks"; + BitSet parents = BitSet.of(parentDocs, codecReader.maxDoc()); IndexSorter.ComparableProvider[] providers = comparables[i]; IndexSorter.ComparableProvider provider = providers[j]; providers[j] = docId -> provider.getAsComparableLong(parents.nextSetBit(docId)); diff --git a/lucene/core/src/java/org/apache/lucene/index/Sorter.java b/lucene/core/src/java/org/apache/lucene/index/Sorter.java index 8a3a7174a862..27b358b4d72e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/Sorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/Sorter.java @@ -210,8 +210,8 @@ DocMap sort(LeafReader reader) throws IOException { Function comparatorWrapper = in -> in; if (reader.getMetaData().hasBlocks()) { - assert sort.getRootDocField() != null; - BitSet parents = BitSet.of(reader.getNumericDocValues(sort.getRootDocField()), reader.maxDoc()); + assert sort.getParentField() != null : "a parent field must be present when segment has blocks and index sorting"; + BitSet parents = BitSet.of(reader.getNumericDocValues(sort.getParentField()), reader.maxDoc()); comparatorWrapper = in -> (docID1, docID2) -> diff --git a/lucene/core/src/java/org/apache/lucene/search/Sort.java b/lucene/core/src/java/org/apache/lucene/search/Sort.java index 661c5d958b78..78442cff1254 100644 --- a/lucene/core/src/java/org/apache/lucene/search/Sort.java +++ b/lucene/core/src/java/org/apache/lucene/search/Sort.java @@ -44,7 +44,7 @@ public final class Sort { // internal representation of the sort criteria private final SortField[] fields; - private final String rootDocField; + private final String parentField; /** * Sorts by computed relevance. This is the same sort criteria as calling {@link @@ -64,12 +64,12 @@ public Sort(SortField... fields) { this(null, fields); } - public Sort(String rootDocField, SortField... fields) { + public Sort(String parentField, SortField... fields) { if (fields.length == 0) { throw new IllegalArgumentException("There must be at least 1 sort field"); } this.fields = fields; - this.rootDocField = rootDocField; + this.parentField = parentField; } /** @@ -81,8 +81,12 @@ public SortField[] getSort() { return fields; } - public String getRootDocField() { - return rootDocField; + /** + * Returns the field name that marks a document as a parent in a document block. + * + */ + public String getParentField() { + return parentField; } /** @@ -96,7 +100,7 @@ public String getRootDocField() { */ public Sort rewrite(IndexSearcher searcher) throws IOException { boolean changed = false; - assert rootDocField == null; + assert parentField == null; SortField[] rewrittenSortFields = new SortField[fields.length]; for (int i = 0; i < fields.length; i++) { rewrittenSortFields[i] = fields[i].rewrite(searcher); @@ -111,8 +115,8 @@ public Sort rewrite(IndexSearcher searcher) throws IOException { @Override public String toString() { StringBuilder buffer = new StringBuilder(); - if (rootDocField != null) { - buffer.append("RootDocField: ").append(rootDocField).append(' '); + if (parentField != null) { + buffer.append("RootDocField: ").append(parentField).append(' '); } for (int i = 0; i < fields.length; i++) { buffer.append(fields[i].toString()); @@ -128,13 +132,13 @@ public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof Sort)) return false; final Sort other = (Sort) o; - return Objects.equals(rootDocField, other.rootDocField) && Arrays.equals(this.fields, other.fields); + return Objects.equals(parentField, other.parentField) && Arrays.equals(this.fields, other.fields); } /** Returns a hash code value for this object. */ @Override public int hashCode() { - return 0x45aaf665 + Arrays.hashCode(fields) + Objects.hashCode(rootDocField); + return 0x45aaf665 + Arrays.hashCode(fields) + Objects.hashCode(parentField); } /** Returns true if the relevance score is needed to sort documents. */ diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 7f9720bc20db..a0184dbaf2bf 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -3178,9 +3178,8 @@ public void testIndexSortWithBlocks() throws IOException { AssertingNeedsIndexSortCodec codec = new AssertingNeedsIndexSortCodec(); iwc.setCodec(codec); String parentField = "parent"; - Sort.RootDocs rootDocs = new Sort.RootDocs(parentField, DocValuesType.NUMERIC); Sort indexSort = - new Sort(rootDocs, + new Sort(parentField, new SortField("foo", SortField.Type.INT)); iwc.setIndexSort(indexSort); LogMergePolicy policy = newLogMergePolicy(); @@ -3210,11 +3209,7 @@ public void testIndexSortWithBlocks() throws IOException { parent.add(new StringField("id", Integer.toString(i), Store.YES)); parent.add(new NumericDocValuesField("id", i)); parent.add(new NumericDocValuesField("foo", random().nextInt())); - if (rootDocs.type() == DocValuesType.NONE) { - parent.add(new NumericDocValuesField(parentField, 0)); - } else { - parent.add(new StringField(parentField, parentField, Store.NO)); - } + parent.add(new NumericDocValuesField(parentField, 0)); w.addDocuments(Arrays.asList(child1, child2, parent)); if (rarely()) { w.commit(); @@ -3229,7 +3224,7 @@ public void testIndexSortWithBlocks() throws IOException { try (DirectoryReader reader = DirectoryReader.open(dir)) { for (LeafReaderContext ctx : reader.leaves()) { LeafReader leaf = ctx.reader(); - DocIdSetIterator parentDISI = rootDocs.rootDocs(leaf); + DocIdSetIterator parentDISI = leaf.getNumericDocValues(parentField); NumericDocValues ids = leaf.getNumericDocValues("id"); NumericDocValues children = leaf.getNumericDocValues("child"); int doc = -1; From 97b35e00761bb0f559030eb12a827043ab25de7f Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Sat, 18 Nov 2023 12:33:25 +0100 Subject: [PATCH 04/44] tidy --- .../index/DocumentsWriterPerThread.java | 32 ++++++++++++------- .../apache/lucene/index/IndexingChain.java | 9 ++++-- .../org/apache/lucene/index/MultiSorter.java | 3 +- .../java/org/apache/lucene/index/Sorter.java | 6 ++-- .../java/org/apache/lucene/search/Sort.java | 8 ++--- .../apache/lucene/index/TestIndexSorting.java | 4 +-- 6 files changed, 38 insertions(+), 24 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 8a5797225751..010d09bbcc0f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -29,7 +29,6 @@ import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.ReentrantLock; import java.util.function.Consumer; - import org.apache.lucene.codecs.Codec; import org.apache.lucene.index.DocumentsWriterDeleteQueue.DeleteSlice; import org.apache.lucene.search.DocIdSetIterator; @@ -194,8 +193,10 @@ void abort() throws IOException { fieldInfos, indexWriterConfig, this::onAbortingException); - validator = indexWriterConfig.getIndexSort() != null ? new ParentBlockValidator(indexWriterConfig.getIndexSort()) - : DocValidator.EMPTY; + validator = + indexWriterConfig.getIndexSort() != null + ? new ParentBlockValidator(indexWriterConfig.getIndexSort()) + : indexableField -> {}; } final void testPoint(String message) { @@ -276,12 +277,14 @@ long updateDocuments( } } - private interface DocValidator extends Consumer { + private interface DocValidator extends Consumer { - DocValidator EMPTY = indexableField -> {}; default void afterDocument() {} + default void beforeDocument() {} + default void afterDocuments(int numDocs) {} + default void reset() {} } @@ -307,10 +310,12 @@ public void beforeDocument() { } lastDocHasParentField = false; } + @Override public void accept(IndexableField field) { if (parentFieldName != null) { - if (parentFieldName.equals(field.name()) && DocValuesType.NUMERIC == field.fieldType().docValuesType()) { + if (parentFieldName.equals(field.name()) + && DocValuesType.NUMERIC == field.fieldType().docValuesType()) { lastDocHasParentField = true; } } @@ -319,17 +324,22 @@ public void accept(IndexableField field) { @Override public void afterDocument() { if (childDocHasParentField) { - throw new IllegalArgumentException("only the last document in the block must contain a numeric doc values field named: " + parentFieldName); + throw new IllegalArgumentException( + "only the last document in the block must contain a numeric doc values field named: " + + parentFieldName); } } @Override public void afterDocuments(int numDocs) { if (numDocs > 1 && parentFieldName == null) { - throw new IllegalArgumentException("A parent field must be set in order to use document blocks with index sorting"); + throw new IllegalArgumentException( + "A parent field must be set in order to use document blocks with index sorting"); } if (parentFieldName != null && lastDocHasParentField == false) { - throw new IllegalArgumentException("the last document in the block must contain a numeric doc values field named: " + parentFieldName); + throw new IllegalArgumentException( + "the last document in the block must contain a numeric doc values field named: " + + parentFieldName); } } } @@ -340,12 +350,12 @@ private void validateIndexSortWithBlocks(Iterable last boolean valid = false; for (IndexableField field : lastDoc) { - if (parentField.equals(field.name()) && DocValuesType.NUMERIC == field.fieldType().docValuesType()) { + if (parentField.equals(field.name()) + && DocValuesType.NUMERIC == field.fieldType().docValuesType()) { valid = true; break; } } - } } diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index 047549d4c50d..f84614beff81 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -214,7 +214,8 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti LeafReader docValuesReader = getDocValuesLeafReader(); Function comparatorWrapper = in -> in; if (state.segmentInfo.getHasBlocks()) { - final DocIdSetIterator readerValues = docValuesReader.getNumericDocValues(indexSort.getParentField()); + final DocIdSetIterator readerValues = + docValuesReader.getNumericDocValues(indexSort.getParentField()); BitSet parents = BitSet.of(readerValues, state.segmentInfo.maxDoc()); comparatorWrapper = in -> @@ -530,7 +531,11 @@ private void finishStoredFields() throws IOException { } } - void processDocument(int docID, Iterable document, Consumer fieldConsumer) throws IOException { + void processDocument( + int docID, + Iterable document, + Consumer fieldConsumer) + throws IOException { // number of unique fields by names (collapses multiple field instances by the same name) int fieldCount = 0; int indexedFieldCount = 0; // number of unique fields indexed with postings diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index af657d753158..914ec144f3b6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -55,7 +55,8 @@ static MergeState.DocMap[] sort(Sort sort, List readers) throws IOE CodecReader codecReader = readers.get(j); if (codecReader.getMetaData().hasBlocks()) { NumericDocValues parentDocs = codecReader.getNumericDocValues(sort.getParentField()); - assert parentDocs != null : "parent field must be present if index sorting is used with blocks"; + assert parentDocs != null + : "parent field must be present if index sorting is used with blocks"; BitSet parents = BitSet.of(parentDocs, codecReader.maxDoc()); IndexSorter.ComparableProvider[] providers = comparables[i]; IndexSorter.ComparableProvider provider = providers[j]; diff --git a/lucene/core/src/java/org/apache/lucene/index/Sorter.java b/lucene/core/src/java/org/apache/lucene/index/Sorter.java index 27b358b4d72e..55f13e8d8000 100644 --- a/lucene/core/src/java/org/apache/lucene/index/Sorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/Sorter.java @@ -210,8 +210,10 @@ DocMap sort(LeafReader reader) throws IOException { Function comparatorWrapper = in -> in; if (reader.getMetaData().hasBlocks()) { - assert sort.getParentField() != null : "a parent field must be present when segment has blocks and index sorting"; - BitSet parents = BitSet.of(reader.getNumericDocValues(sort.getParentField()), reader.maxDoc()); + assert sort.getParentField() != null + : "a parent field must be present when segment has blocks and index sorting"; + BitSet parents = + BitSet.of(reader.getNumericDocValues(sort.getParentField()), reader.maxDoc()); comparatorWrapper = in -> (docID1, docID2) -> diff --git a/lucene/core/src/java/org/apache/lucene/search/Sort.java b/lucene/core/src/java/org/apache/lucene/search/Sort.java index 78442cff1254..e91768d45398 100644 --- a/lucene/core/src/java/org/apache/lucene/search/Sort.java +++ b/lucene/core/src/java/org/apache/lucene/search/Sort.java @@ -81,10 +81,7 @@ public SortField[] getSort() { return fields; } - /** - * Returns the field name that marks a document as a parent in a document block. - * - */ + /** Returns the field name that marks a document as a parent in a document block. */ public String getParentField() { return parentField; } @@ -132,7 +129,8 @@ public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof Sort)) return false; final Sort other = (Sort) o; - return Objects.equals(parentField, other.parentField) && Arrays.equals(this.fields, other.fields); + return Objects.equals(parentField, other.parentField) + && Arrays.equals(this.fields, other.fields); } /** Returns a hash code value for this object. */ diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index a0184dbaf2bf..38f65826632e 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -3178,9 +3178,7 @@ public void testIndexSortWithBlocks() throws IOException { AssertingNeedsIndexSortCodec codec = new AssertingNeedsIndexSortCodec(); iwc.setCodec(codec); String parentField = "parent"; - Sort indexSort = - new Sort(parentField, - new SortField("foo", SortField.Type.INT)); + Sort indexSort = new Sort(parentField, new SortField("foo", SortField.Type.INT)); iwc.setIndexSort(indexSort); LogMergePolicy policy = newLogMergePolicy(); // make sure that merge factor is always > 2 From b65eda0d43c4155da532ac255a37292b3131ed25 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 20 Nov 2023 11:22:13 +0100 Subject: [PATCH 05/44] fix tests --- .../core/src/java/org/apache/lucene/index/CheckIndex.java | 3 +-- .../org/apache/lucene/search/TestTopFieldCollector.java | 8 ++++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 246ef341cb93..8dc18dca1376 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -1199,9 +1199,8 @@ public static Status.IndexSortStatus testSort( if (cmp != 0) { break; } - prevDoc = nextDoc; } - + prevDoc = nextDoc; if (cmp > 0) { throw new CheckIndexException( "segment has indexSort=" diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java b/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java index 3b85d99c850e..47c0d50a55a6 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java @@ -183,9 +183,13 @@ public void testTotalHits() throws Exception { dir, newIndexWriterConfig().setMergePolicy(NoMergePolicy.INSTANCE).setIndexSort(sort)); Document doc = new Document(); doc.add(new NumericDocValuesField("foo", 3)); - w.addDocuments(Arrays.asList(doc, doc, doc, doc)); + for (Document d : Arrays.asList(doc, doc, doc, doc)) { + w.addDocument(d); + } w.flush(); - w.addDocuments(Arrays.asList(doc, doc, doc, doc, doc, doc)); + for (Document d : Arrays.asList(doc, doc, doc, doc, doc, doc)) { + w.addDocument(d); + } w.flush(); IndexReader reader = DirectoryReader.open(w); assertEquals(2, reader.leaves().size()); From 191ebaf7c2bd512aa9006ddb2e154855ae43c789 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 20 Nov 2023 11:23:51 +0100 Subject: [PATCH 06/44] remove dead code --- .../lucene/index/DocumentsWriterPerThread.java | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 010d09bbcc0f..6f1c74b16ada 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -344,21 +344,6 @@ public void afterDocuments(int numDocs) { } } - private void validateIndexSortWithBlocks(Iterable lastDoc) { - if (segmentInfo.getIndexSort() != null) { - String parentField = segmentInfo.getIndexSort().getParentField(); - - boolean valid = false; - for (IndexableField field : lastDoc) { - if (parentField.equals(field.name()) - && DocValuesType.NUMERIC == field.fieldType().docValuesType()) { - valid = true; - break; - } - } - } - } - private long finishDocuments(DocumentsWriterDeleteQueue.Node deleteNode, int docIdUpTo) { /* * here we actually finish the document in two steps 1. push the delete into From 345edc5e94ff4e23f8a14a698404b4ab5ff34f13 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Tue, 21 Nov 2023 10:33:47 +0100 Subject: [PATCH 07/44] add more tests --- .../TestLucene70SegmentInfoFormat.java | 5 +++ .../TestLucene86SegmentInfoFormat.java | 5 +++ .../TestLucene90SegmentInfoFormat.java | 5 +++ .../SimpleTextSegmentInfoFormat.java | 32 +++++++++++++- .../TestSimpleTextSegmentInfoFormat.java | 1 + .../org/apache/lucene/index/IndexWriter.java | 3 ++ .../org/apache/lucene/index/MultiSorter.java | 2 +- .../java/org/apache/lucene/search/Sort.java | 13 +++++- .../TestLucene99SegmentInfoFormat.java | 1 + .../apache/lucene/index/TestAddIndexes.java | 42 +++++++++++++++++++ .../index/BaseSegmentInfoFormatTestCase.java | 39 ++++++++++++++++- 11 files changed, 144 insertions(+), 4 deletions(-) diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene70/TestLucene70SegmentInfoFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene70/TestLucene70SegmentInfoFormat.java index f3d777de28bc..81482eff07c9 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene70/TestLucene70SegmentInfoFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene70/TestLucene70SegmentInfoFormat.java @@ -35,4 +35,9 @@ protected Version[] getVersions() { protected Codec getCodec() { return new Lucene84RWCodec(); } + + @Override + protected boolean supportsHasBlocks() { + return false; + } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/TestLucene86SegmentInfoFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/TestLucene86SegmentInfoFormat.java index c89016167636..7388dcf82102 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/TestLucene86SegmentInfoFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/TestLucene86SegmentInfoFormat.java @@ -35,4 +35,9 @@ protected Version[] getVersions() { protected Codec getCodec() { return new Lucene87RWCodec(); } + + @Override + protected boolean supportsHasBlocks() { + return false; + } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90SegmentInfoFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90SegmentInfoFormat.java index 53a8a01a440b..9ecc34906416 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90SegmentInfoFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90SegmentInfoFormat.java @@ -32,4 +32,9 @@ protected Version[] getVersions() { protected Codec getCodec() { return new Lucene90RWCodec(); } + + @Override + protected boolean supportsHasBlocks() { + return false; + } } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java index accdb184df80..90583011135b 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java @@ -66,6 +66,8 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { static final BytesRef SI_FILE = new BytesRef(" file "); static final BytesRef SI_ID = new BytesRef(" id "); static final BytesRef SI_SORT = new BytesRef(" sort "); + static final BytesRef SI_SORT_HAS_PARENT_FIELD = new BytesRef(" has parent "); + static final BytesRef SI_SORT_PARENT_FIELD = new BytesRef(" parent "); static final BytesRef SI_SORT_TYPE = new BytesRef(" type "); static final BytesRef SI_SORT_NAME = new BytesRef(" name "); static final BytesRef SI_SORT_BYTES = new BytesRef(" bytes "); @@ -197,7 +199,21 @@ public SegmentInfo read( sortField[i] = SortFieldProvider.forName(provider).readSortField(bytes); assert bytes.eof(); } - Sort indexSort = sortField.length == 0 ? null : new Sort(sortField); + final Sort indexSort; + if (sortField.length == 0) { + indexSort = null; + } else { + String parentField = null; + SimpleTextUtil.readLine(input, scratch); + assert StringHelper.startsWith(scratch.get(), SI_SORT_HAS_PARENT_FIELD); + final boolean hasParent = Boolean.parseBoolean(readString(SI_SORT_HAS_PARENT_FIELD.length, scratch)); + if (hasParent) { + SimpleTextUtil.readLine(input, scratch); + assert StringHelper.startsWith(scratch.get(), SI_SORT_PARENT_FIELD); + parentField = readString(SI_SORT_PARENT_FIELD.length, scratch); + } + indexSort = new Sort(parentField, sortField); + } SimpleTextUtil.checkFooter(input); @@ -336,7 +352,21 @@ public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOE SimpleTextUtil.write(output, b.bytes.get().toString(), scratch); SimpleTextUtil.writeNewline(output); } + if (numSortFields > 0) { + if (indexSort.getParentField() != null) { + SimpleTextUtil.write(output, SI_SORT_HAS_PARENT_FIELD); + SimpleTextUtil.write(output, "true", scratch); + SimpleTextUtil.writeNewline(output); + SimpleTextUtil.write(output, SI_SORT_PARENT_FIELD); + SimpleTextUtil.write(output, indexSort.getParentField(), scratch); + SimpleTextUtil.writeNewline(output); + } else { + SimpleTextUtil.write(output, SI_SORT_HAS_PARENT_FIELD); + SimpleTextUtil.write(output, "false", scratch); + SimpleTextUtil.writeNewline(output); + } + } SimpleTextUtil.writeChecksum(output, scratch); } } diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java index 610f5a2d7564..58ae714f9b37 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java @@ -33,4 +33,5 @@ protected Version[] getVersions() { protected Codec getCodec() { return codec; } + } diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index 68a2ce1cc26d..a4c960d4d19d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -1225,6 +1225,9 @@ private void validateIndexSort() { /** Returns true if indexSort is a prefix of otherSort. */ static boolean isCongruentSort(Sort indexSort, Sort otherSort) { + if (Objects.equals(indexSort.getParentField(), otherSort.getParentField()) == false) { + return false; + } final SortField[] fields1 = indexSort.getSort(); final SortField[] fields2 = otherSort.getSort(); if (fields1.length > fields2.length) { diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index 914ec144f3b6..ebce56bec112 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -56,7 +56,7 @@ static MergeState.DocMap[] sort(Sort sort, List readers) throws IOE if (codecReader.getMetaData().hasBlocks()) { NumericDocValues parentDocs = codecReader.getNumericDocValues(sort.getParentField()); assert parentDocs != null - : "parent field must be present if index sorting is used with blocks"; + : "parent field: " + sort.getParentField() + " must be present if index sorting is used with blocks"; BitSet parents = BitSet.of(parentDocs, codecReader.maxDoc()); IndexSorter.ComparableProvider[] providers = comparables[i]; IndexSorter.ComparableProvider provider = providers[j]; diff --git a/lucene/core/src/java/org/apache/lucene/search/Sort.java b/lucene/core/src/java/org/apache/lucene/search/Sort.java index e91768d45398..1ce0e5afef4e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/Sort.java +++ b/lucene/core/src/java/org/apache/lucene/search/Sort.java @@ -64,6 +64,17 @@ public Sort(SortField... fields) { this(null, fields); } + /** + * Sets the sort to the given criteria in succession: the first SortField is checked first, but if + * it produces a tie, then the second SortField is used to break the tie, etc. Finally, if there + * is still a tie after all SortFields are checked, the internal Lucene docid is used to break it. + * + * @param parentField the name of a numeric doc values field that marks the last document of a document blocks + * indexed with {@link org.apache.lucene.index.IndexWriter#addDocuments(Iterable)} or it's update relatives. + * This is required for indices that use index sorting in combination with document blocks in order to maintain the document order + * of the blocks documents. Index sorting will effectively compare the parent (last document) of a block in order to stable sort + * all it's adjacent documents that belong to a block. This field must be a numeric doc values field a + */ public Sort(String parentField, SortField... fields) { if (fields.length == 0) { throw new IllegalArgumentException("There must be at least 1 sort field"); @@ -113,7 +124,7 @@ public Sort rewrite(IndexSearcher searcher) throws IOException { public String toString() { StringBuilder buffer = new StringBuilder(); if (parentField != null) { - buffer.append("RootDocField: ").append(parentField).append(' '); + buffer.append("parent field: ").append(parentField).append(' '); } for (int i = 0; i < fields.length; i++) { buffer.append(fields[i].toString()); diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99SegmentInfoFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99SegmentInfoFormat.java index ebb2a0013e28..10d21b6b2324 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99SegmentInfoFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99SegmentInfoFormat.java @@ -33,4 +33,5 @@ protected Version[] getVersions() { protected Codec getCodec() { return TestUtil.getDefaultCodec(); } + } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java index cba815fbde36..143fca9a2cd6 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -1679,6 +1679,48 @@ public void testIllegalIndexSortChange2() throws Exception { IOUtils.close(r1, dir1, w2, dir2); } + public void testIllegalIndexSortChange3() throws Exception { + Directory dir1 = newDirectory(); + IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random())); + iwc1.setIndexSort(new Sort("foobar", new SortField("foo", SortField.Type.INT))); + + RandomIndexWriter w1 = new RandomIndexWriter(random(), dir1, iwc1); + Document parent = new Document(); + parent.add(new NumericDocValuesField("foobar", 0)); + w1.addDocuments(Arrays.asList(new Document(), new Document(), parent)); + w1.commit(); + w1.addDocuments(Arrays.asList(new Document(), new Document(), parent)); + w1.commit(); + // so the index sort is in fact burned into the index: + w1.forceMerge(1); + w1.close(); + + Directory dir2 = newDirectory(); + IndexWriterConfig iwc2 = newIndexWriterConfig(new MockAnalyzer(random())); + iwc2.setIndexSort(new Sort(new SortField("foo", SortField.Type.INT))); + RandomIndexWriter w2 = new RandomIndexWriter(random(), dir2, iwc2); + + IndexReader r1 = DirectoryReader.open(dir1); + String message = + expectThrows( + IllegalArgumentException.class, + () -> { + w2.addIndexes((SegmentReader) getOnlyLeafReader(r1)); + }) + .getMessage(); + assertEquals("cannot change index sort from parent field: foobar to ", message); + + message = + expectThrows( + IllegalArgumentException.class, + () -> { + w2.addIndexes(dir1); + }) + .getMessage(); + assertEquals("cannot change index sort from parent field: foobar to ", message); + IOUtils.close(r1, dir1, w2, dir2); + } + public void testAddIndexesDVUpdateSameSegmentName() throws Exception { Directory dir1 = newDirectory(); IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random())); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseSegmentInfoFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseSegmentInfoFormatTestCase.java index 0ebb3fcffdc4..30afc686531d 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseSegmentInfoFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseSegmentInfoFormatTestCase.java @@ -80,6 +80,33 @@ public void testFiles() throws Exception { dir.close(); } + public void testHasBlocks() throws IOException { + assumeTrue("test requires a codec that can read/write hasBlocks", supportsHasBlocks()); + + Directory dir = newDirectory(); + Codec codec = getCodec(); + byte[] id = StringHelper.randomId(); + SegmentInfo info = + new SegmentInfo( + dir, + getVersions()[0], + getVersions()[0], + "_123", + 1, + false, + random().nextBoolean(), + codec, + Collections.emptyMap(), + id, + Collections.emptyMap(), + null); + info.setFiles(Collections.emptySet()); + codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); + SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT); + assertEquals(info.getHasBlocks(), info2.getHasBlocks()); + dir.close(); + } + /** Tests SI writer adds itself to files... */ public void testAddsSelfToFiles() throws Exception { Directory dir = newDirectory(); @@ -260,6 +287,10 @@ protected boolean supportsIndexSort() { return true; } + protected boolean supportsHasBlocks() { + return true; + } + private SortField randomIndexSortField() { boolean reversed = random().nextBoolean(); SortField sortField; @@ -360,7 +391,13 @@ public void testSort() throws IOException { for (int j = 0; j < numSortFields; ++j) { sortFields[j] = randomIndexSortField(); } - sort = new Sort(sortFields); + if (supportsHasBlocks()) { + String parentField = random().nextBoolean() ? null : + TestUtil.randomSimpleString(random(), 1, 10); + sort = new Sort(parentField, sortFields); + } else { + sort = new Sort(sortFields); + } } Directory dir = newDirectory(); From 16939537b0a6b2db90c746c9b5b0610ce3eb744d Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Tue, 21 Nov 2023 10:34:35 +0100 Subject: [PATCH 08/44] tidy --- .../SimpleTextSegmentInfoFormat.java | 3 +- .../TestSimpleTextSegmentInfoFormat.java | 1 - .../org/apache/lucene/index/MultiSorter.java | 4 ++- .../java/org/apache/lucene/search/Sort.java | 12 ++++--- .../TestLucene99SegmentInfoFormat.java | 1 - .../apache/lucene/index/TestAddIndexes.java | 32 +++++++++++-------- .../index/BaseSegmentInfoFormatTestCase.java | 30 ++++++++--------- 7 files changed, 45 insertions(+), 38 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java index 90583011135b..0cc1496b7437 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java @@ -206,7 +206,8 @@ public SegmentInfo read( String parentField = null; SimpleTextUtil.readLine(input, scratch); assert StringHelper.startsWith(scratch.get(), SI_SORT_HAS_PARENT_FIELD); - final boolean hasParent = Boolean.parseBoolean(readString(SI_SORT_HAS_PARENT_FIELD.length, scratch)); + final boolean hasParent = + Boolean.parseBoolean(readString(SI_SORT_HAS_PARENT_FIELD.length, scratch)); if (hasParent) { SimpleTextUtil.readLine(input, scratch); assert StringHelper.startsWith(scratch.get(), SI_SORT_PARENT_FIELD); diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java index 58ae714f9b37..610f5a2d7564 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java @@ -33,5 +33,4 @@ protected Version[] getVersions() { protected Codec getCodec() { return codec; } - } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index ebce56bec112..bb3772661d26 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -56,7 +56,9 @@ static MergeState.DocMap[] sort(Sort sort, List readers) throws IOE if (codecReader.getMetaData().hasBlocks()) { NumericDocValues parentDocs = codecReader.getNumericDocValues(sort.getParentField()); assert parentDocs != null - : "parent field: " + sort.getParentField() + " must be present if index sorting is used with blocks"; + : "parent field: " + + sort.getParentField() + + " must be present if index sorting is used with blocks"; BitSet parents = BitSet.of(parentDocs, codecReader.maxDoc()); IndexSorter.ComparableProvider[] providers = comparables[i]; IndexSorter.ComparableProvider provider = providers[j]; diff --git a/lucene/core/src/java/org/apache/lucene/search/Sort.java b/lucene/core/src/java/org/apache/lucene/search/Sort.java index 1ce0e5afef4e..286d55653cd0 100644 --- a/lucene/core/src/java/org/apache/lucene/search/Sort.java +++ b/lucene/core/src/java/org/apache/lucene/search/Sort.java @@ -69,11 +69,13 @@ public Sort(SortField... fields) { * it produces a tie, then the second SortField is used to break the tie, etc. Finally, if there * is still a tie after all SortFields are checked, the internal Lucene docid is used to break it. * - * @param parentField the name of a numeric doc values field that marks the last document of a document blocks - * indexed with {@link org.apache.lucene.index.IndexWriter#addDocuments(Iterable)} or it's update relatives. - * This is required for indices that use index sorting in combination with document blocks in order to maintain the document order - * of the blocks documents. Index sorting will effectively compare the parent (last document) of a block in order to stable sort - * all it's adjacent documents that belong to a block. This field must be a numeric doc values field a + * @param parentField the name of a numeric doc values field that marks the last document of a + * document blocks indexed with {@link + * org.apache.lucene.index.IndexWriter#addDocuments(Iterable)} or it's update relatives. This + * is required for indices that use index sorting in combination with document blocks in order + * to maintain the document order of the blocks documents. Index sorting will effectively + * compare the parent (last document) of a block in order to stable sort all it's adjacent + * documents that belong to a block. This field must be a numeric doc values field a */ public Sort(String parentField, SortField... fields) { if (fields.length == 0) { diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99SegmentInfoFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99SegmentInfoFormat.java index 10d21b6b2324..ebb2a0013e28 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99SegmentInfoFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99SegmentInfoFormat.java @@ -33,5 +33,4 @@ protected Version[] getVersions() { protected Codec getCodec() { return TestUtil.getDefaultCodec(); } - } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java index 143fca9a2cd6..9a5fbf2606a2 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -1702,22 +1702,26 @@ public void testIllegalIndexSortChange3() throws Exception { IndexReader r1 = DirectoryReader.open(dir1); String message = - expectThrows( - IllegalArgumentException.class, - () -> { - w2.addIndexes((SegmentReader) getOnlyLeafReader(r1)); - }) - .getMessage(); - assertEquals("cannot change index sort from parent field: foobar to ", message); + expectThrows( + IllegalArgumentException.class, + () -> { + w2.addIndexes((SegmentReader) getOnlyLeafReader(r1)); + }) + .getMessage(); + assertEquals( + "cannot change index sort from parent field: foobar to ", + message); message = - expectThrows( - IllegalArgumentException.class, - () -> { - w2.addIndexes(dir1); - }) - .getMessage(); - assertEquals("cannot change index sort from parent field: foobar to ", message); + expectThrows( + IllegalArgumentException.class, + () -> { + w2.addIndexes(dir1); + }) + .getMessage(); + assertEquals( + "cannot change index sort from parent field: foobar to ", + message); IOUtils.close(r1, dir1, w2, dir2); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseSegmentInfoFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseSegmentInfoFormatTestCase.java index 30afc686531d..91950cd5328b 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseSegmentInfoFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseSegmentInfoFormatTestCase.java @@ -87,19 +87,19 @@ public void testHasBlocks() throws IOException { Codec codec = getCodec(); byte[] id = StringHelper.randomId(); SegmentInfo info = - new SegmentInfo( - dir, - getVersions()[0], - getVersions()[0], - "_123", - 1, - false, - random().nextBoolean(), - codec, - Collections.emptyMap(), - id, - Collections.emptyMap(), - null); + new SegmentInfo( + dir, + getVersions()[0], + getVersions()[0], + "_123", + 1, + false, + random().nextBoolean(), + codec, + Collections.emptyMap(), + id, + Collections.emptyMap(), + null); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT); @@ -392,8 +392,8 @@ public void testSort() throws IOException { sortFields[j] = randomIndexSortField(); } if (supportsHasBlocks()) { - String parentField = random().nextBoolean() ? null : - TestUtil.randomSimpleString(random(), 1, 10); + String parentField = + random().nextBoolean() ? null : TestUtil.randomSimpleString(random(), 1, 10); sort = new Sort(parentField, sortFields); } else { sort = new Sort(sortFields); From baaedcdb6d7e713535467295cf79dc87671c10ce Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Tue, 21 Nov 2023 11:26:14 +0100 Subject: [PATCH 09/44] add test for validations --- .../apache/lucene/index/TestIndexSorting.java | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 38f65826632e..89b5f8641b9c 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -3172,6 +3172,58 @@ public void testSortDocsAndFreqsAndPositionsAndOffsets() throws IOException { dir.close(); } + public void testBlockIsMissingParentField() throws IOException { + try (Directory dir = newDirectory()) { + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + String parentField = "parent"; + Sort indexSort = new Sort(parentField, new SortField("foo", SortField.Type.INT)); + iwc.setIndexSort(indexSort); + try (IndexWriter writer = new IndexWriter(dir, iwc)) { + List runables = + Arrays.asList( + () -> { + IllegalArgumentException ex = + expectThrows( + IllegalArgumentException.class, + () -> { + writer.addDocuments(Arrays.asList(new Document(), new Document())); + }); + assertEquals( + "the last document in the block must contain a numeric doc values field named: parent", + ex.getMessage()); + }, + () -> { + IllegalArgumentException ex = + expectThrows( + IllegalArgumentException.class, + () -> { + Document doc = new Document(); + doc.add(new NumericDocValuesField("parent", 0)); + writer.addDocuments(Arrays.asList(doc, new Document())); + }); + assertEquals( + "only the last document in the block must contain a numeric doc values field named: parent", + ex.getMessage()); + }, + () -> { + IllegalArgumentException ex = + expectThrows( + IllegalArgumentException.class, + () -> { + writer.addDocuments(Arrays.asList(new Document())); + }); + assertEquals( + "the last document in the block must contain a numeric doc values field named: parent", + ex.getMessage()); + }); + Collections.shuffle(runables, random()); + for (Runnable runnable : runables) { + runnable.run(); + } + } + } + } + public void testIndexSortWithBlocks() throws IOException { try (Directory dir = newDirectory()) { IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); From 2162cce31dd96e98987d80c3d9d9749c09ca15da Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Tue, 21 Nov 2023 11:35:12 +0100 Subject: [PATCH 10/44] add test for validations --- .../apache/lucene/index/TestIndexSorting.java | 53 +++++++++++++++++-- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 89b5f8641b9c..904937b9d95b 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -3179,7 +3179,7 @@ public void testBlockIsMissingParentField() throws IOException { Sort indexSort = new Sort(parentField, new SortField("foo", SortField.Type.INT)); iwc.setIndexSort(indexSort); try (IndexWriter writer = new IndexWriter(dir, iwc)) { - List runables = + List runnabels = Arrays.asList( () -> { IllegalArgumentException ex = @@ -3216,14 +3216,61 @@ public void testBlockIsMissingParentField() throws IOException { "the last document in the block must contain a numeric doc values field named: parent", ex.getMessage()); }); - Collections.shuffle(runables, random()); - for (Runnable runnable : runables) { + Collections.shuffle(runnabels, random()); + for (Runnable runnable : runnabels) { runnable.run(); } } } } + public void testIndexWithSortIsCongruent() throws IOException { + try (Directory dir = newDirectory()) { + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + String parentField = "parent"; + Sort indexSort = new Sort(parentField, new SortField("foo", SortField.Type.INT)); + iwc.setIndexSort(indexSort); + try (IndexWriter writer = new IndexWriter(dir, iwc)) { + Document child1 = new Document(); + child1.add(new StringField("id", Integer.toString(1), Store.YES)); + Document child2 = new Document(); + child2.add(new StringField("id", Integer.toString(1), Store.YES)); + Document parent = new Document(); + parent.add(new StringField("id", Integer.toString(1), Store.YES)); + parent.add(new NumericDocValuesField(parentField, 0)); + writer.addDocuments(Arrays.asList(child1, child2, parent)); + writer.flush(); + if (random().nextBoolean()) { + writer.addDocuments(Arrays.asList(child1, child2, parent)); + } + writer.commit(); + } + IllegalArgumentException ex = + expectThrows( + IllegalArgumentException.class, + () -> { + IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random())); + Sort sort = new Sort("someOther", new SortField("foo", SortField.Type.INT)); + config.setIndexSort(sort); + new IndexWriter(dir, config); + }); + assertTrue( + ex.getMessage(), + ex.getMessage().endsWith(" to new indexSort=parent field: someOther ")); + + ex = + expectThrows( + IllegalArgumentException.class, + () -> { + IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random())); + Sort sort = new Sort(new SortField("foo", SortField.Type.INT)); + config.setIndexSort(sort); + new IndexWriter(dir, config); + }); + assertTrue(ex.getMessage(), ex.getMessage().endsWith(" to new indexSort=")); + } + } + public void testIndexSortWithBlocks() throws IOException { try (Directory dir = newDirectory()) { IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); From 56a444490c685f4cc772f71ea83e73a186f14bd4 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Tue, 21 Nov 2023 15:23:40 +0100 Subject: [PATCH 11/44] fix imports --- .../src/java/org/apache/lucene/index/CheckIndex.java | 2 +- .../java/org/apache/lucene/index/IndexingChain.java | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 8dc18dca1376..942dfec8ea14 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -70,6 +70,7 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.Lock; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil.ByteArrayComparator; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; @@ -88,7 +89,6 @@ import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.Operations; - /** * Basic tool and API to check the health of an index and write a new segments file that removes * reference to problematic segments. diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index f84614beff81..f3beca688a25 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -48,8 +48,17 @@ import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; -import org.apache.lucene.util.*; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSet; +import org.apache.lucene.util.ByteBlockPool; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException; +import org.apache.lucene.util.Counter; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.InfoStream; +import org.apache.lucene.util.IntBlockPool; +import org.apache.lucene.util.RamUsageEstimator; /** Default general purpose indexing chain, which handles indexing all types of fields. */ final class IndexingChain implements Accountable { From ff8144a74e5ca0bff380a0adad54161207e3eda9 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Sun, 3 Dec 2023 13:18:51 +0100 Subject: [PATCH 12/44] apply feedback --- .../index/DocumentsWriterPerThread.java | 139 ++++++++---------- .../apache/lucene/index/IndexingChain.java | 7 +- .../java/org/apache/lucene/search/Sort.java | 15 +- .../apache/lucene/index/TestIndexSorting.java | 47 +++--- 4 files changed, 101 insertions(+), 107 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 6f1c74b16ada..88a3988d59f6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -21,18 +21,19 @@ import java.util.Collection; import java.util.Collections; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Locale; +import java.util.NoSuchElementException; import java.util.Objects; import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.ReentrantLock; -import java.util.function.Consumer; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.index.DocumentsWriterDeleteQueue.DeleteSlice; import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.Sort; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FlushInfo; import org.apache.lucene.store.IOContext; @@ -136,8 +137,7 @@ void abort() throws IOException { private final ReentrantLock lock = new ReentrantLock(); private int[] deleteDocIDs = new int[0]; private int numDeletedDocIds = 0; - - private final DocValidator validator; + private final String parentFieldName; DocumentsWriterPerThread( int indexVersionCreated, @@ -193,10 +193,10 @@ void abort() throws IOException { fieldInfos, indexWriterConfig, this::onAbortingException); - validator = - indexWriterConfig.getIndexSort() != null - ? new ParentBlockValidator(indexWriterConfig.getIndexSort()) - : indexableField -> {}; + parentFieldName = + segmentInfo.getIndexSort() != null + ? indexWriterConfig.getIndexSort().getParentField() + : null; } final void testPoint(String message) { @@ -238,9 +238,28 @@ long updateDocuments( } final int docsInRamBefore = numDocsInRAM; boolean allDocsIndexed = false; - validator.reset(); try { - for (Iterable doc : docs) { + final Iterator> iterator = docs.iterator(); + while (iterator.hasNext()) { + Iterable doc = iterator.next(); + if (segmentInfo.getIndexSort() != null) { + if (parentFieldName != null) { + validateNoParentField(doc); + final NumericDocValuesField parentField; + if (iterator.hasNext() == false) { + int numChildren = numDocsInRAM - docsInRamBefore; + parentField = new NumericDocValuesField(parentFieldName, numChildren); + } else { + parentField = null; + } + doc = filterParentDocField(doc, parentField); + } else if (iterator.hasNext()) { + // sort is configured but parent field is missing, yet we have a doc-block + throw new IllegalArgumentException( + "a parent field must be set in order to use document blocks with index sorting; see Sort#getParentField"); + } + } + // Even on exception, the document is still added (but marked // deleted), so we don't need to un-reserve at that point. // Aborting exceptions will actually "lose" more than one @@ -248,18 +267,13 @@ long updateDocuments( // it's very hard to fix (we can't easily distinguish aborting // vs non-aborting exceptions): reserveOneDoc(); - try { - validator.beforeDocument(); - indexingChain.processDocument(numDocsInRAM++, doc, validator); - validator.afterDocument(); + indexingChain.processDocument(numDocsInRAM++, doc); } finally { onNewDocOnRAM.run(); } } - final int numDocs = numDocsInRAM - docsInRamBefore; - validator.afterDocuments(numDocs); if (numDocs > 1) { segmentInfo.setHasBlocks(); } @@ -277,71 +291,44 @@ long updateDocuments( } } - private interface DocValidator extends Consumer { - - default void afterDocument() {} - - default void beforeDocument() {} - - default void afterDocuments(int numDocs) {} - - default void reset() {} + private void validateNoParentField(Iterable doc) { + for (IndexableField field : doc) {} } - private static class ParentBlockValidator implements DocValidator { - private boolean lastDocHasParentField = false; - private boolean childDocHasParentField = false; - private final String parentFieldName; - - @Override - public void reset() { - lastDocHasParentField = false; - childDocHasParentField = false; - } + private Iterable filterParentDocField( + Iterable doc, NumericDocValuesField parentDocMarker) { + return () -> { + final Iterator first = doc.iterator(); + return new Iterator<>() { + NumericDocValuesField additionalField = parentDocMarker; - private ParentBlockValidator(Sort sort) { - this.parentFieldName = sort.getParentField(); - } - - @Override - public void beforeDocument() { - if (childDocHasParentField == false && lastDocHasParentField) { - childDocHasParentField = true; - } - lastDocHasParentField = false; - } - - @Override - public void accept(IndexableField field) { - if (parentFieldName != null) { - if (parentFieldName.equals(field.name()) - && DocValuesType.NUMERIC == field.fieldType().docValuesType()) { - lastDocHasParentField = true; + @Override + public boolean hasNext() { + return first.hasNext() || additionalField != null; } - } - } - - @Override - public void afterDocument() { - if (childDocHasParentField) { - throw new IllegalArgumentException( - "only the last document in the block must contain a numeric doc values field named: " - + parentFieldName); - } - } - @Override - public void afterDocuments(int numDocs) { - if (numDocs > 1 && parentFieldName == null) { - throw new IllegalArgumentException( - "A parent field must be set in order to use document blocks with index sorting"); - } - if (parentFieldName != null && lastDocHasParentField == false) { - throw new IllegalArgumentException( - "the last document in the block must contain a numeric doc values field named: " - + parentFieldName); - } - } + @Override + public IndexableField next() { + if (first.hasNext()) { + IndexableField field = first.next(); + if (parentFieldName.equals(field.name()) + && DocValuesType.NUMERIC == field.fieldType().docValuesType()) { + throw new IllegalArgumentException( + "documents must not contain the parent doc values field \"" + + parentFieldName + + "\""); + } + return field; + } + if (additionalField != null) { + IndexableField field = additionalField; + additionalField = null; + return field; + } + throw new NoSuchElementException(); + } + }; + }; } private long finishDocuments(DocumentsWriterDeleteQueue.Node deleteNode, int docIdUpTo) { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index f3beca688a25..0973fc857e81 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -540,11 +540,7 @@ private void finishStoredFields() throws IOException { } } - void processDocument( - int docID, - Iterable document, - Consumer fieldConsumer) - throws IOException { + void processDocument(int docID, Iterable document) throws IOException { // number of unique fields by names (collapses multiple field instances by the same name) int fieldCount = 0; int indexedFieldCount = 0; // number of unique fields indexed with postings @@ -563,7 +559,6 @@ void processDocument( // 1st pass over doc fields – verify that doc schema matches the index schema // build schema for each unique doc field for (IndexableField field : document) { - fieldConsumer.accept(field); IndexableFieldType fieldType = field.fieldType(); PerField pf = getOrAddPerField(field.name()); if (pf.fieldGen != fieldGen) { // first time we see this field in this document diff --git a/lucene/core/src/java/org/apache/lucene/search/Sort.java b/lucene/core/src/java/org/apache/lucene/search/Sort.java index 286d55653cd0..f3e210be2d6f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/Sort.java +++ b/lucene/core/src/java/org/apache/lucene/search/Sort.java @@ -71,11 +71,12 @@ public Sort(SortField... fields) { * * @param parentField the name of a numeric doc values field that marks the last document of a * document blocks indexed with {@link - * org.apache.lucene.index.IndexWriter#addDocuments(Iterable)} or it's update relatives. This - * is required for indices that use index sorting in combination with document blocks in order - * to maintain the document order of the blocks documents. Index sorting will effectively - * compare the parent (last document) of a block in order to stable sort all it's adjacent - * documents that belong to a block. This field must be a numeric doc values field a + * org.apache.lucene.index.IndexWriter#addDocuments(Iterable)} or it's update relatives at + * index time. This is required for indices that use index sorting in combination with + * document blocks in order to maintain the document order of the blocks documents. Index + * sorting will effectively compare the parent (last document) of a block in order to stable + * sort all it's adjacent documents that belong to a block. This field must be a numeric doc + * values field. */ public Sort(String parentField, SortField... fields) { if (fields.length == 0) { @@ -110,7 +111,9 @@ public String getParentField() { */ public Sort rewrite(IndexSearcher searcher) throws IOException { boolean changed = false; - assert parentField == null; + if (parentField == null) { + throw new IllegalStateException("parentFields must not be used with search time sorting"); + } SortField[] rewrittenSortFields = new SortField[fields.length]; for (int i = 0; i < fields.length; i++) { rewrittenSortFields[i] = fields[i].rewrite(searcher); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 904937b9d95b..07ee24dcd722 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -3172,7 +3172,26 @@ public void testSortDocsAndFreqsAndPositionsAndOffsets() throws IOException { dir.close(); } - public void testBlockIsMissingParentField() throws IOException { + public void testParentFieldNotConfigured() throws IOException { + try (Directory dir = newDirectory()) { + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + Sort indexSort = new Sort(new SortField("foo", SortField.Type.INT)); + iwc.setIndexSort(indexSort); + try (IndexWriter writer = new IndexWriter(dir, iwc)) { + IllegalArgumentException ex = + expectThrows( + IllegalArgumentException.class, + () -> { + writer.addDocuments(Arrays.asList(new Document(), new Document())); + }); + assertEquals( + "a parent field must be set in order to use document blocks with index sorting; see Sort#getParentField", + ex.getMessage()); + } + } + } + + public void testBlockContainsParentField() throws IOException { try (Directory dir = newDirectory()) { IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); String parentField = "parent"; @@ -3181,17 +3200,6 @@ public void testBlockIsMissingParentField() throws IOException { try (IndexWriter writer = new IndexWriter(dir, iwc)) { List runnabels = Arrays.asList( - () -> { - IllegalArgumentException ex = - expectThrows( - IllegalArgumentException.class, - () -> { - writer.addDocuments(Arrays.asList(new Document(), new Document())); - }); - assertEquals( - "the last document in the block must contain a numeric doc values field named: parent", - ex.getMessage()); - }, () -> { IllegalArgumentException ex = expectThrows( @@ -3202,7 +3210,7 @@ public void testBlockIsMissingParentField() throws IOException { writer.addDocuments(Arrays.asList(doc, new Document())); }); assertEquals( - "only the last document in the block must contain a numeric doc values field named: parent", + "documents must not contain the parent doc values field \"parent\"", ex.getMessage()); }, () -> { @@ -3210,10 +3218,12 @@ public void testBlockIsMissingParentField() throws IOException { expectThrows( IllegalArgumentException.class, () -> { - writer.addDocuments(Arrays.asList(new Document())); + Document doc = new Document(); + doc.add(new NumericDocValuesField("parent", 0)); + writer.addDocuments(Arrays.asList(new Document(), doc)); }); assertEquals( - "the last document in the block must contain a numeric doc values field named: parent", + "documents must not contain the parent doc values field \"parent\"", ex.getMessage()); }); Collections.shuffle(runnabels, random()); @@ -3237,7 +3247,6 @@ public void testIndexWithSortIsCongruent() throws IOException { child2.add(new StringField("id", Integer.toString(1), Store.YES)); Document parent = new Document(); parent.add(new StringField("id", Integer.toString(1), Store.YES)); - parent.add(new NumericDocValuesField(parentField, 0)); writer.addDocuments(Arrays.asList(child1, child2, parent)); writer.flush(); if (random().nextBoolean()) { @@ -3306,7 +3315,6 @@ public void testIndexSortWithBlocks() throws IOException { parent.add(new StringField("id", Integer.toString(i), Store.YES)); parent.add(new NumericDocValuesField("id", i)); parent.add(new NumericDocValuesField("foo", random().nextInt())); - parent.add(new NumericDocValuesField(parentField, 0)); w.addDocuments(Arrays.asList(child1, child2, parent)); if (rarely()) { w.commit(); @@ -3321,12 +3329,13 @@ public void testIndexSortWithBlocks() throws IOException { try (DirectoryReader reader = DirectoryReader.open(dir)) { for (LeafReaderContext ctx : reader.leaves()) { LeafReader leaf = ctx.reader(); - DocIdSetIterator parentDISI = leaf.getNumericDocValues(parentField); + NumericDocValues parentDISI = leaf.getNumericDocValues(parentField); NumericDocValues ids = leaf.getNumericDocValues("id"); NumericDocValues children = leaf.getNumericDocValues("child"); - int doc = -1; + int doc; int expectedDocID = 2; while ((doc = parentDISI.nextDoc()) != NO_MORE_DOCS) { + assertEquals(2, parentDISI.longValue()); assertEquals(expectedDocID, doc); int id = ids.nextDoc(); long child1ID = ids.longValue(); From ab4b3bc283427fdfb757960b18ab3bb814028e79 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Sun, 3 Dec 2023 13:22:24 +0100 Subject: [PATCH 13/44] remove dead code --- .../org/apache/lucene/index/DocumentsWriterPerThread.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 88a3988d59f6..0cfec8bc27f1 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -244,7 +244,6 @@ long updateDocuments( Iterable doc = iterator.next(); if (segmentInfo.getIndexSort() != null) { if (parentFieldName != null) { - validateNoParentField(doc); final NumericDocValuesField parentField; if (iterator.hasNext() == false) { int numChildren = numDocsInRAM - docsInRamBefore; @@ -291,10 +290,6 @@ long updateDocuments( } } - private void validateNoParentField(Iterable doc) { - for (IndexableField field : doc) {} - } - private Iterable filterParentDocField( Iterable doc, NumericDocValuesField parentDocMarker) { return () -> { From 224c7df05d66f271ca3534acebfd6ff87f06699e Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Sun, 3 Dec 2023 13:32:58 +0100 Subject: [PATCH 14/44] fix comparison --- lucene/core/src/java/org/apache/lucene/search/Sort.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/Sort.java b/lucene/core/src/java/org/apache/lucene/search/Sort.java index f3e210be2d6f..41ee8b7e1696 100644 --- a/lucene/core/src/java/org/apache/lucene/search/Sort.java +++ b/lucene/core/src/java/org/apache/lucene/search/Sort.java @@ -111,7 +111,7 @@ public String getParentField() { */ public Sort rewrite(IndexSearcher searcher) throws IOException { boolean changed = false; - if (parentField == null) { + if (parentField != null) { throw new IllegalStateException("parentFields must not be used with search time sorting"); } SortField[] rewrittenSortFields = new SortField[fields.length]; From 86d7032b00a8b93367ab58493b910f51a048d49c Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Sun, 3 Dec 2023 14:06:46 +0100 Subject: [PATCH 15/44] add bwc --- .../org/apache/lucene/index/CheckIndex.java | 9 ++++++++- .../lucene/index/DocumentsWriterPerThread.java | 18 ++++++++++-------- .../org/apache/lucene/index/IndexingChain.java | 9 ++++++++- .../org/apache/lucene/index/MultiSorter.java | 9 ++++++++- .../java/org/apache/lucene/index/Sorter.java | 11 ++++++++--- 5 files changed, 42 insertions(+), 14 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 942dfec8ea14..24f56a4f4bb0 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -1182,8 +1182,15 @@ public static Status.IndexSortStatus testSort( try { LeafMetaData metaData = reader.getMetaData(); + if (metaData.hasBlocks() + && sort.getParentField() == null + && metaData.getCreatedVersionMajor() >= Version.LUCENE_10_0_0.major) { + throw new IllegalStateException( + "parent field is not set but the index has block and was created with version: " + + metaData.getCreatedVersionMajor()); + } final DocIdSetIterator iter = - metaData.hasBlocks() + metaData.hasBlocks() && sort.getParentField() != null ? reader.getNumericDocValues(sort.getParentField()) : DocIdSetIterator.all(reader.maxDoc()); int prevDoc = iter.nextDoc(); diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 0cfec8bc27f1..4a76658f901d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -137,7 +137,7 @@ void abort() throws IOException { private final ReentrantLock lock = new ReentrantLock(); private int[] deleteDocIDs = new int[0]; private int numDeletedDocIds = 0; - private final String parentFieldName; + private final int indexVersionCreated; DocumentsWriterPerThread( int indexVersionCreated, @@ -149,6 +149,7 @@ void abort() throws IOException { FieldInfos.Builder fieldInfos, AtomicLong pendingNumDocs, boolean enableTestPoints) { + this.indexVersionCreated = indexVersionCreated; this.directory = new TrackingDirectoryWrapper(directory); this.fieldInfos = fieldInfos; this.indexWriterConfig = indexWriterConfig; @@ -193,10 +194,6 @@ void abort() throws IOException { fieldInfos, indexWriterConfig, this::onAbortingException); - parentFieldName = - segmentInfo.getIndexSort() != null - ? indexWriterConfig.getIndexSort().getParentField() - : null; } final void testPoint(String message) { @@ -243,6 +240,7 @@ long updateDocuments( while (iterator.hasNext()) { Iterable doc = iterator.next(); if (segmentInfo.getIndexSort() != null) { + String parentFieldName = segmentInfo.getIndexSort().getParentField(); if (parentFieldName != null) { final NumericDocValuesField parentField; if (iterator.hasNext() == false) { @@ -251,9 +249,11 @@ long updateDocuments( } else { parentField = null; } - doc = filterParentDocField(doc, parentField); - } else if (iterator.hasNext()) { + doc = filterParentDocField(doc, parentFieldName, parentField); + } else if (iterator.hasNext() && indexVersionCreated >= Version.LUCENE_10_0_0.major) { // sort is configured but parent field is missing, yet we have a doc-block + // yet we must not fail if this index was created in an earlier version where this + // behavior was permitted. throw new IllegalArgumentException( "a parent field must be set in order to use document blocks with index sorting; see Sort#getParentField"); } @@ -291,7 +291,9 @@ long updateDocuments( } private Iterable filterParentDocField( - Iterable doc, NumericDocValuesField parentDocMarker) { + Iterable doc, + String parentFieldName, + NumericDocValuesField parentDocMarker) { return () -> { final Iterator first = doc.iterator(); return new Iterator<>() { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index 0973fc857e81..26010c278e2d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -59,6 +59,7 @@ import org.apache.lucene.util.InfoStream; import org.apache.lucene.util.IntBlockPool; import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.Version; /** Default general purpose indexing chain, which handles indexing all types of fields. */ final class IndexingChain implements Accountable { @@ -222,7 +223,8 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti LeafReader docValuesReader = getDocValuesLeafReader(); Function comparatorWrapper = in -> in; - if (state.segmentInfo.getHasBlocks()) { + + if (state.segmentInfo.getHasBlocks() && indexSort.getParentField() != null) { final DocIdSetIterator readerValues = docValuesReader.getNumericDocValues(indexSort.getParentField()); BitSet parents = BitSet.of(readerValues, state.segmentInfo.maxDoc()); @@ -231,6 +233,11 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti (docID1, docID2) -> in.compare(parents.nextSetBit(docID1), parents.nextSetBit(docID2)); } + assert state.segmentInfo.getHasBlocks() == false + || indexSort.getParentField() != null + || indexCreatedVersionMajor < Version.LUCENE_10_0_0.major + : "parent field is not set but the index has blocks. indexCreatedVersionMajor: " + + indexCreatedVersionMajor; List comparators = new ArrayList<>(); for (int i = 0; i < indexSort.getSort().length; i++) { SortField sortField = indexSort.getSort()[i]; diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index bb3772661d26..9effcc1df490 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -25,6 +25,7 @@ import org.apache.lucene.util.BitSet; import org.apache.lucene.util.Bits; import org.apache.lucene.util.PriorityQueue; +import org.apache.lucene.util.Version; import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PackedLongValues; @@ -53,7 +54,8 @@ static MergeState.DocMap[] sort(Sort sort, List readers) throws IOE comparables[i] = sorter.getComparableProviders(readers); for (int j = 0; j < readers.size(); j++) { CodecReader codecReader = readers.get(j); - if (codecReader.getMetaData().hasBlocks()) { + LeafMetaData metaData = codecReader.getMetaData(); + if (metaData.hasBlocks() && sort.getParentField() != null) { NumericDocValues parentDocs = codecReader.getNumericDocValues(sort.getParentField()); assert parentDocs != null : "parent field: " @@ -64,6 +66,11 @@ static MergeState.DocMap[] sort(Sort sort, List readers) throws IOE IndexSorter.ComparableProvider provider = providers[j]; providers[j] = docId -> provider.getAsComparableLong(parents.nextSetBit(docId)); } + assert metaData.hasBlocks() == false + || sort.getParentField() != null + || metaData.getCreatedVersionMajor() < Version.LUCENE_10_0_0.major + : "parent field is not set but the index has blocks. indexCreatedVersionMajor: " + + metaData.getCreatedVersionMajor(); } reverseMuls[i] = fields[i].getReverse() ? -1 : 1; } diff --git a/lucene/core/src/java/org/apache/lucene/index/Sorter.java b/lucene/core/src/java/org/apache/lucene/index/Sorter.java index 55f13e8d8000..2d8a69f1df8a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/Sorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/Sorter.java @@ -22,6 +22,7 @@ import org.apache.lucene.search.SortField; import org.apache.lucene.util.BitSet; import org.apache.lucene.util.TimSorter; +import org.apache.lucene.util.Version; import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PackedLongValues; @@ -209,9 +210,8 @@ DocMap sort(LeafReader reader) throws IOException { final IndexSorter.DocComparator[] comparators = new IndexSorter.DocComparator[fields.length]; Function comparatorWrapper = in -> in; - if (reader.getMetaData().hasBlocks()) { - assert sort.getParentField() != null - : "a parent field must be present when segment has blocks and index sorting"; + LeafMetaData metaData = reader.getMetaData(); + if (metaData.hasBlocks() && sort.getParentField() != null) { BitSet parents = BitSet.of(reader.getNumericDocValues(sort.getParentField()), reader.maxDoc()); comparatorWrapper = @@ -219,6 +219,11 @@ DocMap sort(LeafReader reader) throws IOException { (docID1, docID2) -> in.compare(parents.nextSetBit(docID1), parents.nextSetBit(docID2)); } + assert metaData.hasBlocks() == false + || sort.getParentField() != null + || metaData.getCreatedVersionMajor() < Version.LUCENE_10_0_0.major + : "parent field is not set but the index has blocks. indexCreatedVersionMajor: " + + metaData.getCreatedVersionMajor(); for (int i = 0; i < fields.length; i++) { IndexSorter sorter = fields[i].getIndexSorter(); if (sorter == null) { From 5a15f589ac704970fbc1970b584dd5d794d608f0 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Sun, 3 Dec 2023 14:17:15 +0100 Subject: [PATCH 16/44] add test for BWC --- .../TestBackwardsCompatibility.java | 48 ++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBackwardsCompatibility.java index cd12a6fa4769..c4ce09d766f7 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBackwardsCompatibility.java @@ -1552,6 +1552,10 @@ public void createIndex(String dirName, boolean doCFS, boolean fullyMerged) thro } private void addDoc(IndexWriter writer, int id) throws IOException { + addDoc(writer, id, false); + } + + private void addDoc(IndexWriter writer, int id, boolean addBlock) throws IOException { Document doc = new Document(); doc.add(new TextField("content", "aaa", Field.Store.NO)); doc.add(new StringField("id", Integer.toString(id), Field.Store.YES)); @@ -1620,7 +1624,11 @@ private void addDoc(IndexWriter writer, int id) throws IOException { // TODO: // index different norms types via similarity (we use a random one currently?!) // remove any analyzer randomness, explicitly add payloads for certain fields. - writer.addDocument(doc); + if (addBlock) { + writer.addDocuments(Arrays.asList(doc, doc)); + } else { + writer.addDocument(doc); + } } private void addNoProxDoc(IndexWriter writer) throws IOException { @@ -2168,6 +2176,44 @@ public void testSortedIndex() throws Exception { } } + public void testSortedIndexWithBlocks() throws Exception { + for (String name : oldSortedNames) { + Path path = createTempDir("sorted"); + InputStream resource = TestBackwardsCompatibility.class.getResourceAsStream(name + ".zip"); + assertNotNull("Sorted index index " + name + " not found", resource); + TestUtil.unzip(resource, path); + + try (Directory dir = newFSDirectory(path)) { + try (DirectoryReader reader = DirectoryReader.open(dir)) { + + assertEquals(1, reader.leaves().size()); + Sort sort = reader.leaves().get(0).reader().getMetaData().getSort(); + assertNotNull(sort); + + // open writer + try (IndexWriter writer = + new IndexWriter( + dir, + newIndexWriterConfig(new MockAnalyzer(random())) + .setOpenMode(OpenMode.APPEND) + .setIndexSort(sort) + .setMergePolicy(newLogMergePolicy()))) { + // add 10 docs + for (int i = 0; i < 10; i++) { + addDoc(writer, DOCS_COUNT + i, true); + } + writer.forceMerge(1); + } + + // This will confirm the docs are really sorted + TestUtil.checkIndex(dir); + + searchExampleIndex(reader); + } + } + } + } + private void searchExampleIndex(DirectoryReader reader) throws IOException { IndexSearcher searcher = newSearcher(reader); From 0431a516da04be9af0e4a28277e93bb8afca3345 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Sun, 3 Dec 2023 14:19:33 +0100 Subject: [PATCH 17/44] fix test --- lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java | 1 - 1 file changed, 1 deletion(-) diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java index 9a5fbf2606a2..b7a6110d47bf 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -1686,7 +1686,6 @@ public void testIllegalIndexSortChange3() throws Exception { RandomIndexWriter w1 = new RandomIndexWriter(random(), dir1, iwc1); Document parent = new Document(); - parent.add(new NumericDocValuesField("foobar", 0)); w1.addDocuments(Arrays.asList(new Document(), new Document(), parent)); w1.commit(); w1.addDocuments(Arrays.asList(new Document(), new Document(), parent)); From 3e64766bca57ccedd16f9f99ddc5ab5f19200b92 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 4 Dec 2023 11:15:29 +0100 Subject: [PATCH 18/44] improve BWC testing --- .../TestBackwardsCompatibility.java | 91 +++++++++++++------ 1 file changed, 62 insertions(+), 29 deletions(-) diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBackwardsCompatibility.java index c4ce09d766f7..2485e7ce3e74 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBackwardsCompatibility.java @@ -98,6 +98,8 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.FieldExistsQuery; @@ -1552,10 +1554,6 @@ public void createIndex(String dirName, boolean doCFS, boolean fullyMerged) thro } private void addDoc(IndexWriter writer, int id) throws IOException { - addDoc(writer, id, false); - } - - private void addDoc(IndexWriter writer, int id, boolean addBlock) throws IOException { Document doc = new Document(); doc.add(new TextField("content", "aaa", Field.Store.NO)); doc.add(new StringField("id", Integer.toString(id), Field.Store.YES)); @@ -1624,11 +1622,7 @@ private void addDoc(IndexWriter writer, int id, boolean addBlock) throws IOExcep // TODO: // index different norms types via similarity (we use a random one currently?!) // remove any analyzer randomness, explicitly add payloads for certain fields. - if (addBlock) { - writer.addDocuments(Arrays.asList(doc, doc)); - } else { - writer.addDocument(doc); - } + writer.addDocument(doc); } private void addNoProxDoc(IndexWriter writer) throws IOException { @@ -2176,7 +2170,7 @@ public void testSortedIndex() throws Exception { } } - public void testSortedIndexWithBlocks() throws Exception { + public void testSortedIndexAddDocBlocks() throws Exception { for (String name : oldSortedNames) { Path path = createTempDir("sorted"); InputStream resource = TestBackwardsCompatibility.class.getResourceAsStream(name + ".zip"); @@ -2184,32 +2178,71 @@ public void testSortedIndexWithBlocks() throws Exception { TestUtil.unzip(resource, path); try (Directory dir = newFSDirectory(path)) { + final Sort sort; try (DirectoryReader reader = DirectoryReader.open(dir)) { - assertEquals(1, reader.leaves().size()); - Sort sort = reader.leaves().get(0).reader().getMetaData().getSort(); + sort = reader.leaves().get(0).reader().getMetaData().getSort(); assertNotNull(sort); - - // open writer - try (IndexWriter writer = - new IndexWriter( - dir, - newIndexWriterConfig(new MockAnalyzer(random())) - .setOpenMode(OpenMode.APPEND) - .setIndexSort(sort) - .setMergePolicy(newLogMergePolicy()))) { - // add 10 docs - for (int i = 0; i < 10; i++) { - addDoc(writer, DOCS_COUNT + i, true); + searchExampleIndex(reader); + } + // open writer + try (IndexWriter writer = + new IndexWriter( + dir, + newIndexWriterConfig(new MockAnalyzer(random())) + .setOpenMode(OpenMode.APPEND) + .setIndexSort(sort) + .setMergePolicy(newLogMergePolicy()))) { + // add 10 docs + for (int i = 0; i < 10; i++) { + Document child = new Document(); + child.add(new StringField("relation", "child", Field.Store.NO)); + child.add(new StringField("bid", "" + i, Field.Store.NO)); + child.add(new NumericDocValuesField("dateDV", i)); + Document parent = new Document(); + parent.add(new StringField("relation", "parent", Field.Store.NO)); + parent.add(new StringField("bid", "" + i, Field.Store.NO)); + parent.add(new NumericDocValuesField("dateDV", i)); + writer.addDocuments(Arrays.asList(child, child, parent)); + if (random().nextBoolean()) { + writer.flush(); } + } + if (random().nextBoolean()) { writer.forceMerge(1); } - - // This will confirm the docs are really sorted - TestUtil.checkIndex(dir); - - searchExampleIndex(reader); + writer.commit(); + try (IndexReader reader = DirectoryReader.open(dir)) { + IndexSearcher searcher = new IndexSearcher(reader); + for (int i = 0; i < 10; i++) { + TopDocs children = + searcher.search( + new BooleanQuery.Builder() + .add( + new TermQuery(new Term("relation", "child")), + BooleanClause.Occur.MUST) + .add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST) + .build(), + 2); + TopDocs parents = + searcher.search( + new BooleanQuery.Builder() + .add( + new TermQuery(new Term("relation", "parent")), + BooleanClause.Occur.MUST) + .add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST) + .build(), + 2); + assertEquals(2, children.totalHits.value); + assertEquals(1, parents.totalHits.value); + // make sure it's sorted + assertEquals(children.scoreDocs[0].doc + 1, children.scoreDocs[1].doc); + assertEquals(children.scoreDocs[1].doc + 1, parents.scoreDocs[0].doc); + } + } } + // This will confirm the docs are really sorted + TestUtil.checkIndex(dir); } } } From 5f6c29701f96e426a3afab8a5b4b8dd665ab40d4 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 7 Dec 2023 16:54:36 +0100 Subject: [PATCH 19/44] address concerns about field comparison --- .../index/DocumentsWriterPerThread.java | 35 ++++---- .../apache/lucene/index/IndexingChain.java | 89 ++++++++++++++++++- .../apache/lucene/index/TestIndexSorting.java | 4 +- 3 files changed, 102 insertions(+), 26 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 4a76658f901d..21a1e5209371 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -138,6 +138,7 @@ void abort() throws IOException { private int[] deleteDocIDs = new int[0]; private int numDeletedDocIds = 0; private final int indexVersionCreated; + private final IndexingChain.ReservedField parentField; DocumentsWriterPerThread( int indexVersionCreated, @@ -194,6 +195,14 @@ void abort() throws IOException { fieldInfos, indexWriterConfig, this::onAbortingException); + if (indexWriterConfig.getIndexSort() != null + && indexWriterConfig.getIndexSort().getParentField() != null) { + this.parentField = + indexingChain.markAsReserved( + new NumericDocValuesField(indexWriterConfig.getIndexSort().getParentField(), -1)); + } else { + this.parentField = null; + } } final void testPoint(String message) { @@ -240,16 +249,12 @@ long updateDocuments( while (iterator.hasNext()) { Iterable doc = iterator.next(); if (segmentInfo.getIndexSort() != null) { - String parentFieldName = segmentInfo.getIndexSort().getParentField(); - if (parentFieldName != null) { - final NumericDocValuesField parentField; + if (parentField != null) { if (iterator.hasNext() == false) { int numChildren = numDocsInRAM - docsInRamBefore; - parentField = new NumericDocValuesField(parentFieldName, numChildren); - } else { - parentField = null; + parentField.getDelegate().setLongValue(numChildren); + doc = addParentField(doc, parentField); } - doc = filterParentDocField(doc, parentFieldName, parentField); } else if (iterator.hasNext() && indexVersionCreated >= Version.LUCENE_10_0_0.major) { // sort is configured but parent field is missing, yet we have a doc-block // yet we must not fail if this index was created in an earlier version where this @@ -258,7 +263,6 @@ long updateDocuments( "a parent field must be set in order to use document blocks with index sorting; see Sort#getParentField"); } } - // Even on exception, the document is still added (but marked // deleted), so we don't need to un-reserve at that point. // Aborting exceptions will actually "lose" more than one @@ -290,14 +294,12 @@ long updateDocuments( } } - private Iterable filterParentDocField( - Iterable doc, - String parentFieldName, - NumericDocValuesField parentDocMarker) { + private Iterable addParentField( + Iterable doc, IndexableField parentField) { return () -> { final Iterator first = doc.iterator(); return new Iterator<>() { - NumericDocValuesField additionalField = parentDocMarker; + IndexableField additionalField = parentField; @Override public boolean hasNext() { @@ -308,13 +310,6 @@ public boolean hasNext() { public IndexableField next() { if (first.hasNext()) { IndexableField field = first.next(); - if (parentFieldName.equals(field.name()) - && DocValuesType.NUMERIC == field.fieldType().docValuesType()) { - throw new IllegalArgumentException( - "documents must not contain the parent doc values field \"" - + parentFieldName - + "\""); - } return field; } if (additionalField != null) { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index 26010c278e2d..5596cee68466 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -18,6 +18,7 @@ import java.io.Closeable; import java.io.IOException; +import java.io.Reader; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -39,6 +40,7 @@ import org.apache.lucene.codecs.PointsFormat; import org.apache.lucene.codecs.PointsWriter; import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.InvertableType; import org.apache.lucene.document.KnnByteVectorField; import org.apache.lucene.document.KnnFloatVectorField; import org.apache.lucene.document.StoredValue; @@ -567,7 +569,14 @@ void processDocument(int docID, Iterable document) thr // build schema for each unique doc field for (IndexableField field : document) { IndexableFieldType fieldType = field.fieldType(); - PerField pf = getOrAddPerField(field.name()); + final boolean isReserved = field.getClass() == ReservedField.class; + PerField pf = getOrAddPerField(field.name(), isReserved); + if (pf.reserved != isReserved) { + throw new IllegalArgumentException( + "\"" + + field.name() + + "\" is a reserved field and should not be added to any document"); + } if (pf.fieldGen != fieldGen) { // first time we see this field in this document fields[fieldCount++] = pf; pf.fieldGen = fieldGen; @@ -762,7 +771,7 @@ private boolean processField(int docID, IndexableField field, PerField pf) throw * Returns a previously created {@link PerField}, absorbing the type information from {@link * FieldType}, and creates a new {@link PerField} if this field name wasn't seen yet. */ - private PerField getOrAddPerField(String fieldName) { + private PerField getOrAddPerField(String fieldName, boolean reserved) { final int hashPos = fieldName.hashCode() & hashMask; PerField pf = fieldHash[hashPos]; while (pf != null && pf.fieldName.equals(fieldName) == false) { @@ -778,7 +787,8 @@ private PerField getOrAddPerField(String fieldName) { schema, indexWriterConfig.getSimilarity(), indexWriterConfig.getInfoStream(), - indexWriterConfig.getAnalyzer()); + indexWriterConfig.getAnalyzer(), + reserved); pf.next = fieldHash[hashPos]; fieldHash[hashPos] = pf; totalFieldCount++; @@ -1043,6 +1053,7 @@ private final class PerField implements Comparable { final String fieldName; final int indexCreatedVersionMajor; final FieldSchema schema; + final boolean reserved; FieldInfo fieldInfo; final Similarity similarity; @@ -1080,13 +1091,15 @@ private final class PerField implements Comparable { FieldSchema schema, Similarity similarity, InfoStream infoStream, - Analyzer analyzer) { + Analyzer analyzer, + boolean reserved) { this.fieldName = fieldName; this.indexCreatedVersionMajor = indexCreatedVersionMajor; this.schema = schema; this.similarity = similarity; this.infoStream = infoStream; this.analyzer = analyzer; + this.reserved = reserved; } void reset(int docId) { @@ -1533,4 +1546,72 @@ void assertSameSchema(FieldInfo fi) { assertSame("point num bytes", fi.getPointNumBytes(), pointNumBytes); } } + + ReservedField markAsReserved(T field) { + getOrAddPerField(field.name(), true); + return new ReservedField(field); + } + + static final class ReservedField implements IndexableField { + + private final T delegate; + + private ReservedField(T delegate) { + this.delegate = delegate; + } + + T getDelegate() { + return delegate; + } + + @Override + public String name() { + return delegate.name(); + } + + @Override + public IndexableFieldType fieldType() { + return delegate.fieldType(); + } + + @Override + public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) { + return delegate.tokenStream(analyzer, reuse); + } + + @Override + public BytesRef binaryValue() { + return delegate.binaryValue(); + } + + @Override + public String stringValue() { + return delegate.stringValue(); + } + + @Override + public CharSequence getCharSequenceValue() { + return delegate.getCharSequenceValue(); + } + + @Override + public Reader readerValue() { + return delegate.readerValue(); + } + + @Override + public Number numericValue() { + return delegate.numericValue(); + } + + @Override + public StoredValue storedValue() { + return delegate.storedValue(); + } + + @Override + public InvertableType invertableType() { + return delegate.invertableType(); + } + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 07ee24dcd722..16189eccdeac 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -3210,7 +3210,7 @@ public void testBlockContainsParentField() throws IOException { writer.addDocuments(Arrays.asList(doc, new Document())); }); assertEquals( - "documents must not contain the parent doc values field \"parent\"", + "\"parent\" is a reserved field and should not be added to any document", ex.getMessage()); }, () -> { @@ -3223,7 +3223,7 @@ public void testBlockContainsParentField() throws IOException { writer.addDocuments(Arrays.asList(new Document(), doc)); }); assertEquals( - "documents must not contain the parent doc values field \"parent\"", + "\"parent\" is a reserved field and should not be added to any document", ex.getMessage()); }); Collections.shuffle(runnabels, random()); From 6e64b51bd12985f7cf823196afacf14477059600 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 7 Dec 2023 16:59:59 +0100 Subject: [PATCH 20/44] cleanup --- .../java/org/apache/lucene/index/IndexingChain.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index 5596cee68466..363df0342e42 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -570,7 +570,10 @@ void processDocument(int docID, Iterable document) thr for (IndexableField field : document) { IndexableFieldType fieldType = field.fieldType(); final boolean isReserved = field.getClass() == ReservedField.class; - PerField pf = getOrAddPerField(field.name(), isReserved); + PerField pf = + getOrAddPerField( + field.name(), false + /* we never add reserved fields during indexing should be done during DWPT setup*/ ); if (pf.reserved != isReserved) { throw new IllegalArgumentException( "\"" @@ -1547,6 +1550,11 @@ void assertSameSchema(FieldInfo fi) { } } + /** + * Wraps the given field in a reserved field and registers it as reserved. Only DWPT should do + * this to mark fields as private / reserved to prevent this fieldname to be used from the outside + * of the IW / DWPT eco-system + */ ReservedField markAsReserved(T field) { getOrAddPerField(field.name(), true); return new ReservedField(field); From e31f17f7b2a55b394850d89430741a4ec27d23f7 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 7 Dec 2023 17:15:28 +0100 Subject: [PATCH 21/44] apply feedback --- .../org/apache/lucene/index/CheckIndex.java | 2 +- .../org/apache/lucene/index/IndexingChain.java | 18 +++++++++++++----- .../java/org/apache/lucene/index/Sorter.java | 14 +++++++++----- .../apache/lucene/index/TestAddIndexes.java | 11 ++++++++++- 4 files changed, 33 insertions(+), 12 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 24f56a4f4bb0..92a80f6961d0 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -1186,7 +1186,7 @@ public static Status.IndexSortStatus testSort( && sort.getParentField() == null && metaData.getCreatedVersionMajor() >= Version.LUCENE_10_0_0.major) { throw new IllegalStateException( - "parent field is not set but the index has block and was created with version: " + "parent field is not set but the index has document blocks and was created with version: " + metaData.getCreatedVersionMajor()); } final DocIdSetIterator iter = diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index 363df0342e42..989b36dea545 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -229,17 +229,25 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti if (state.segmentInfo.getHasBlocks() && indexSort.getParentField() != null) { final DocIdSetIterator readerValues = docValuesReader.getNumericDocValues(indexSort.getParentField()); + if (readerValues == null) { + throw new CorruptIndexException( + "missing doc values for parent field \"" + indexSort.getParentField() + "\"", + "IndexingChain"); + } BitSet parents = BitSet.of(readerValues, state.segmentInfo.maxDoc()); comparatorWrapper = in -> (docID1, docID2) -> in.compare(parents.nextSetBit(docID1), parents.nextSetBit(docID2)); } - assert state.segmentInfo.getHasBlocks() == false - || indexSort.getParentField() != null - || indexCreatedVersionMajor < Version.LUCENE_10_0_0.major - : "parent field is not set but the index has blocks. indexCreatedVersionMajor: " - + indexCreatedVersionMajor; + if (state.segmentInfo.getHasBlocks() + && indexSort.getParentField() == null + && indexCreatedVersionMajor >= Version.LUCENE_10_0_0.major) { + throw new CorruptIndexException( + "parent field is not set but the index has blocks. indexCreatedVersionMajor: " + + indexCreatedVersionMajor, + "IndexingChain"); + } List comparators = new ArrayList<>(); for (int i = 0; i < indexSort.getSort().length; i++) { SortField sortField = indexSort.getSort()[i]; diff --git a/lucene/core/src/java/org/apache/lucene/index/Sorter.java b/lucene/core/src/java/org/apache/lucene/index/Sorter.java index 2d8a69f1df8a..0eebda13804d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/Sorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/Sorter.java @@ -219,11 +219,15 @@ DocMap sort(LeafReader reader) throws IOException { (docID1, docID2) -> in.compare(parents.nextSetBit(docID1), parents.nextSetBit(docID2)); } - assert metaData.hasBlocks() == false - || sort.getParentField() != null - || metaData.getCreatedVersionMajor() < Version.LUCENE_10_0_0.major - : "parent field is not set but the index has blocks. indexCreatedVersionMajor: " - + metaData.getCreatedVersionMajor(); + if (metaData.hasBlocks() + && sort.getParentField() == null + && metaData.getCreatedVersionMajor() >= Version.LUCENE_10_0_0.major) { + throw new CorruptIndexException( + "parent field is not set but the index has blocks. indexCreatedVersionMajor: " + + metaData.getCreatedVersionMajor(), + "Sorter"); + } + for (int i = 0; i < fields.length; i++) { IndexSorter sorter = fields[i].getIndexSorter(); if (sorter == null) { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java index b7a6110d47bf..496f0cd38274 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -1721,7 +1721,16 @@ public void testIllegalIndexSortChange3() throws Exception { assertEquals( "cannot change index sort from parent field: foobar to ", message); - IOUtils.close(r1, dir1, w2, dir2); + + Directory dir3 = newDirectory(); + IndexWriterConfig iwc3 = newIndexWriterConfig(new MockAnalyzer(random())); + iwc3.setIndexSort(new Sort("foobar", new SortField("foo", SortField.Type.INT))); + RandomIndexWriter w3 = new RandomIndexWriter(random(), dir3, iwc3); + + w3.addIndexes((SegmentReader) getOnlyLeafReader(r1)); + w3.addIndexes(dir1); + + IOUtils.close(r1, dir1, w2, dir2, w3, dir3); } public void testAddIndexesDVUpdateSameSegmentName() throws Exception { From 4b33f85e1c34a214d7ad528fbccf4269268ce321 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 14 Dec 2023 17:10:43 +0100 Subject: [PATCH 22/44] Move parent field to IWC and FieldInfos --- .../SimpleTextFieldInfosFormat.java | 11 +- .../SimpleTextSegmentInfoFormat.java | 30 +--- .../lucene94/Lucene94FieldInfosFormat.java | 9 +- .../lucene99/Lucene99SegmentInfoFormat.java | 10 +- .../org/apache/lucene/index/CheckIndex.java | 7 +- .../index/DocumentsWriterPerThread.java | 32 ++-- .../org/apache/lucene/index/FieldInfo.java | 14 +- .../org/apache/lucene/index/FieldInfos.java | 75 ++++++++- .../org/apache/lucene/index/IndexWriter.java | 13 +- .../lucene/index/IndexWriterConfig.java | 5 + .../apache/lucene/index/IndexingChain.java | 11 +- .../lucene/index/LiveIndexWriterConfig.java | 7 + .../org/apache/lucene/index/MultiSorter.java | 10 +- .../lucene/index/ParallelLeafReader.java | 8 +- .../lucene/index/ReadersAndUpdates.java | 3 +- .../java/org/apache/lucene/index/Sorter.java | 7 +- .../internal/tests/IndexPackageAccess.java | 2 +- .../java/org/apache/lucene/search/Sort.java | 37 +---- .../apache/lucene/index/TestAddIndexes.java | 155 +++++++++--------- .../org/apache/lucene/index/TestCodecs.java | 7 +- .../test/org/apache/lucene/index/TestDoc.java | 2 +- .../apache/lucene/index/TestFieldInfos.java | 5 +- .../apache/lucene/index/TestFieldsReader.java | 6 +- .../apache/lucene/index/TestIndexSorting.java | 11 +- .../lucene/index/TestPendingSoftDeletes.java | 15 +- .../lucene/index/TestSegmentMerger.java | 2 +- .../lucene/search/TestSortOptimization.java | 3 +- .../dummy/DummyCompressingCodec.java | 2 +- .../index/BaseFieldInfoFormatTestCase.java | 20 ++- .../index/BaseIndexFileFormatTestCase.java | 3 +- .../index/BaseSegmentInfoFormatTestCase.java | 4 +- .../tests/index/MismatchedLeafReader.java | 3 +- .../tests/index/RandomPostingsTester.java | 2 + 33 files changed, 298 insertions(+), 233 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java index d2c8f563c8d8..21cfe9b613f9 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java @@ -72,6 +72,7 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat { static final BytesRef VECTOR_ENCODING = new BytesRef(" vector encoding "); static final BytesRef VECTOR_SIMILARITY = new BytesRef(" vector similarity "); static final BytesRef SOFT_DELETES = new BytesRef(" soft-deletes "); + static final BytesRef PARENT = new BytesRef(" parent "); @Override public FieldInfos read( @@ -170,6 +171,9 @@ public FieldInfos read( SimpleTextUtil.readLine(input, scratch); assert StringHelper.startsWith(scratch.get(), SOFT_DELETES); boolean isSoftDeletesField = Boolean.parseBoolean(readString(SOFT_DELETES.length, scratch)); + SimpleTextUtil.readLine(input, scratch); + assert StringHelper.startsWith(scratch.get(), PARENT); + boolean isParentField = Boolean.parseBoolean(readString(PARENT.length, scratch)); infos[i] = new FieldInfo( @@ -188,7 +192,8 @@ public FieldInfos read( vectorNumDimensions, vectorEncoding, vectorDistFunc, - isSoftDeletesField); + isSoftDeletesField, + isParentField); } SimpleTextUtil.checkFooter(input); @@ -320,6 +325,10 @@ public void write( SimpleTextUtil.write(out, SOFT_DELETES); SimpleTextUtil.write(out, Boolean.toString(fi.isSoftDeletesField()), scratch); SimpleTextUtil.writeNewline(out); + + SimpleTextUtil.write(out, PARENT); + SimpleTextUtil.write(out, Boolean.toString(fi.isParentField()), scratch); + SimpleTextUtil.writeNewline(out); } SimpleTextUtil.writeChecksum(out, scratch); success = true; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java index 0cc1496b7437..31b3cf2fa7f8 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java @@ -66,8 +66,6 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { static final BytesRef SI_FILE = new BytesRef(" file "); static final BytesRef SI_ID = new BytesRef(" id "); static final BytesRef SI_SORT = new BytesRef(" sort "); - static final BytesRef SI_SORT_HAS_PARENT_FIELD = new BytesRef(" has parent "); - static final BytesRef SI_SORT_PARENT_FIELD = new BytesRef(" parent "); static final BytesRef SI_SORT_TYPE = new BytesRef(" type "); static final BytesRef SI_SORT_NAME = new BytesRef(" name "); static final BytesRef SI_SORT_BYTES = new BytesRef(" bytes "); @@ -199,21 +197,12 @@ public SegmentInfo read( sortField[i] = SortFieldProvider.forName(provider).readSortField(bytes); assert bytes.eof(); } + final Sort indexSort; if (sortField.length == 0) { indexSort = null; } else { - String parentField = null; - SimpleTextUtil.readLine(input, scratch); - assert StringHelper.startsWith(scratch.get(), SI_SORT_HAS_PARENT_FIELD); - final boolean hasParent = - Boolean.parseBoolean(readString(SI_SORT_HAS_PARENT_FIELD.length, scratch)); - if (hasParent) { - SimpleTextUtil.readLine(input, scratch); - assert StringHelper.startsWith(scratch.get(), SI_SORT_PARENT_FIELD); - parentField = readString(SI_SORT_PARENT_FIELD.length, scratch); - } - indexSort = new Sort(parentField, sortField); + indexSort = new Sort(sortField); } SimpleTextUtil.checkFooter(input); @@ -353,21 +342,6 @@ public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOE SimpleTextUtil.write(output, b.bytes.get().toString(), scratch); SimpleTextUtil.writeNewline(output); } - if (numSortFields > 0) { - if (indexSort.getParentField() != null) { - SimpleTextUtil.write(output, SI_SORT_HAS_PARENT_FIELD); - SimpleTextUtil.write(output, "true", scratch); - SimpleTextUtil.writeNewline(output); - - SimpleTextUtil.write(output, SI_SORT_PARENT_FIELD); - SimpleTextUtil.write(output, indexSort.getParentField(), scratch); - SimpleTextUtil.writeNewline(output); - } else { - SimpleTextUtil.write(output, SI_SORT_HAS_PARENT_FIELD); - SimpleTextUtil.write(output, "false", scratch); - SimpleTextUtil.writeNewline(output); - } - } SimpleTextUtil.writeChecksum(output, scratch); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java index 99352053a55c..2e2476f6937d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java @@ -157,6 +157,7 @@ public FieldInfos read( boolean omitNorms = (bits & OMIT_NORMS) != 0; boolean storePayloads = (bits & STORE_PAYLOADS) != 0; boolean isSoftDeletesField = (bits & SOFT_DELETES_FIELD) != 0; + boolean isParentField = (bits & PARENT_FIELD_FIELD) != 0; final IndexOptions indexOptions = getIndexOptions(input, input.readByte()); @@ -200,7 +201,8 @@ public FieldInfos read( vectorDimension, vectorEncoding, vectorDistFunc, - isSoftDeletesField); + isSoftDeletesField, + isParentField); infos[i].checkConsistency(); } catch (IllegalStateException e) { throw new CorruptIndexException( @@ -348,6 +350,7 @@ public void write( if (fi.omitsNorms()) bits |= OMIT_NORMS; if (fi.hasPayloads()) bits |= STORE_PAYLOADS; if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD; + if (fi.isParentField()) bits |= PARENT_FIELD_FIELD; output.writeByte(bits); output.writeByte(indexOptionsByte(fi.getIndexOptions())); @@ -375,11 +378,13 @@ public void write( // Codec header static final String CODEC_NAME = "Lucene94FieldInfos"; static final int FORMAT_START = 0; - static final int FORMAT_CURRENT = FORMAT_START; + static final int FORMAT_PARENT_FIELD = 1; + static final int FORMAT_CURRENT = FORMAT_PARENT_FIELD; // Field flags static final byte STORE_TERMVECTOR = 0x1; static final byte OMIT_NORMS = 0x2; static final byte STORE_PAYLOADS = 0x4; static final byte SOFT_DELETES_FIELD = 0x8; + static final byte PARENT_FIELD_FIELD = 0x10; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99SegmentInfoFormat.java index bbf771408432..9341c312299e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99SegmentInfoFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99SegmentInfoFormat.java @@ -142,8 +142,7 @@ private SegmentInfo parseSegmentInfo( String name = input.readString(); sortFields[i] = SortFieldProvider.forName(name).readSortField(input); } - String parentField = input.readByte() == SegmentInfo.YES ? input.readString() : null; - indexSort = new Sort(parentField, sortFields); + indexSort = new Sort(sortFields); } else if (numSortFields < 0) { throw new CorruptIndexException("invalid index sort field count: " + numSortFields, input); } else { @@ -233,12 +232,5 @@ private void writeSegmentInfo(DataOutput output, SegmentInfo si) throws IOExcept output.writeString(sorter.getProviderName()); SortFieldProvider.write(sortField, output); } - if (numSortFields > 0) { - String parentField = indexSort.getParentField(); - output.writeByte((byte) (parentField != null ? SegmentInfo.YES : SegmentInfo.NO)); - if (parentField != null) { - output.writeString(parentField); - } - } } } diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 92a80f6961d0..54cdf6146a90 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -1182,16 +1182,17 @@ public static Status.IndexSortStatus testSort( try { LeafMetaData metaData = reader.getMetaData(); + FieldInfos fieldInfos = reader.getFieldInfos(); if (metaData.hasBlocks() - && sort.getParentField() == null + && fieldInfos.getParentField() == null && metaData.getCreatedVersionMajor() >= Version.LUCENE_10_0_0.major) { throw new IllegalStateException( "parent field is not set but the index has document blocks and was created with version: " + metaData.getCreatedVersionMajor()); } final DocIdSetIterator iter = - metaData.hasBlocks() && sort.getParentField() != null - ? reader.getNumericDocValues(sort.getParentField()) + metaData.hasBlocks() && fieldInfos.getParentField() != null + ? reader.getNumericDocValues(fieldInfos.getParentField()) : DocIdSetIterator.all(reader.maxDoc()); int prevDoc = iter.nextDoc(); int nextDoc; diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 21a1e5209371..588b41cd3da8 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -195,11 +195,10 @@ void abort() throws IOException { fieldInfos, indexWriterConfig, this::onAbortingException); - if (indexWriterConfig.getIndexSort() != null - && indexWriterConfig.getIndexSort().getParentField() != null) { + if (indexWriterConfig.getParentField() != null) { this.parentField = indexingChain.markAsReserved( - new NumericDocValuesField(indexWriterConfig.getIndexSort().getParentField(), -1)); + new NumericDocValuesField(indexWriterConfig.getParentField(), -1)); } else { this.parentField = null; } @@ -248,21 +247,22 @@ long updateDocuments( final Iterator> iterator = docs.iterator(); while (iterator.hasNext()) { Iterable doc = iterator.next(); - if (segmentInfo.getIndexSort() != null) { - if (parentField != null) { - if (iterator.hasNext() == false) { - int numChildren = numDocsInRAM - docsInRamBefore; - parentField.getDelegate().setLongValue(numChildren); - doc = addParentField(doc, parentField); - } - } else if (iterator.hasNext() && indexVersionCreated >= Version.LUCENE_10_0_0.major) { - // sort is configured but parent field is missing, yet we have a doc-block - // yet we must not fail if this index was created in an earlier version where this - // behavior was permitted. - throw new IllegalArgumentException( - "a parent field must be set in order to use document blocks with index sorting; see Sort#getParentField"); + if (parentField != null) { + if (iterator.hasNext() == false) { + int numChildren = numDocsInRAM - docsInRamBefore; + parentField.getDelegate().setLongValue(numChildren); + doc = addParentField(doc, parentField); } + } else if (segmentInfo.getIndexSort() != null + && iterator.hasNext() + && indexVersionCreated >= Version.LUCENE_10_0_0.major) { + // sort is configured but parent field is missing, yet we have a doc-block + // yet we must not fail if this index was created in an earlier version where this + // behavior was permitted. + throw new IllegalArgumentException( + "a parent field must be set in order to use document blocks with index sorting; see IndexWriterConfig#getParentField"); } + // Even on exception, the document is still added (but marked // deleted), so we don't need to un-reserve at that point. // Aborting exceptions will actually "lose" more than one diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java index 59184a6bff06..0b2a2f0f385b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java @@ -63,6 +63,8 @@ public final class FieldInfo { // whether this field is used as the soft-deletes field private final boolean softDeletesField; + private final boolean parentField; + /** * Sole constructor. * @@ -84,7 +86,8 @@ public FieldInfo( int vectorDimension, VectorEncoding vectorEncoding, VectorSimilarityFunction vectorSimilarityFunction, - boolean softDeletesField) { + boolean softDeletesField, + boolean parentField) { this.name = Objects.requireNonNull(name); this.number = number; this.docValuesType = @@ -111,6 +114,7 @@ public FieldInfo( this.vectorEncoding = vectorEncoding; this.vectorSimilarityFunction = vectorSimilarityFunction; this.softDeletesField = softDeletesField; + this.parentField = parentField; this.checkConsistency(); } @@ -633,4 +637,12 @@ public Map attributes() { public boolean isSoftDeletesField() { return softDeletesField; } + + /** + * Returns true if this field is configured and used as the parent document field field. See + * {@link IndexWriterConfig#setParentField(String)} + */ + public boolean isParentField() { + return parentField; + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java index fd07d171010b..c04ecb6b9f6c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java @@ -59,6 +59,8 @@ public class FieldInfos implements Iterable { private final boolean hasVectorValues; private final String softDeletesField; + private final String parentField; + // used only by fieldInfo(int) private final FieldInfo[] byNumber; @@ -78,6 +80,7 @@ public FieldInfos(FieldInfo[] infos) { boolean hasPointValues = false; boolean hasVectorValues = false; String softDeletesField = null; + String parentField = null; int size = 0; // number of elements in byNumberTemp, number of used array slots FieldInfo[] byNumberTemp = new FieldInfo[10]; // initial array capacity of 10 @@ -132,6 +135,13 @@ public FieldInfos(FieldInfo[] infos) { } softDeletesField = info.name; } + if (info.isParentField()) { + if (parentField != null && parentField.equals(info.name) == false) { + throw new IllegalArgumentException( + "multiple parent fields [" + info.name + ", " + parentField + "]"); + } + parentField = info.name; + } } this.hasVectors = hasVectors; @@ -145,6 +155,7 @@ public FieldInfos(FieldInfo[] infos) { this.hasPointValues = hasPointValues; this.hasVectorValues = hasVectorValues; this.softDeletesField = softDeletesField; + this.parentField = parentField; List valuesTemp = new ArrayList<>(); byNumber = new FieldInfo[size]; @@ -178,7 +189,13 @@ public static FieldInfos getMergedFieldInfos(IndexReader reader) { .filter(Objects::nonNull) .findAny() .orElse(null); - final Builder builder = new Builder(new FieldNumbers(softDeletesField)); + final String parentField = + leaves.stream() + .map(l -> l.reader().getFieldInfos().getParentField()) + .filter(Objects::nonNull) + .findAny() + .orElse(null); + final Builder builder = new Builder(new FieldNumbers(softDeletesField, parentField)); for (final LeafReaderContext ctx : leaves) { for (FieldInfo fieldInfo : ctx.reader().getFieldInfos()) { builder.add(fieldInfo); @@ -254,6 +271,11 @@ public String getSoftDeletesField() { return softDeletesField; } + /** Returns the parent document field name if exists; otherwise returns null */ + public String getParentField() { + return parentField; + } + /** Returns the number of fields */ public int size() { return byName.size(); @@ -345,7 +367,10 @@ static final class FieldNumbers { // The soft-deletes field from IWC to enforce a single soft-deletes field private final String softDeletesFieldName; - FieldNumbers(String softDeletesFieldName) { + // The parent document field from IWC to mark parent document when indexing + private final String parentFieldName; + + FieldNumbers(String softDeletesFieldName, String parentFieldName) { this.nameToNumber = new HashMap<>(); this.numberToName = new HashMap<>(); this.indexOptions = new HashMap<>(); @@ -355,11 +380,13 @@ static final class FieldNumbers { this.omitNorms = new HashMap<>(); this.storeTermVectors = new HashMap<>(); this.softDeletesFieldName = softDeletesFieldName; + this.parentFieldName = parentFieldName; } synchronized void verifyFieldInfo(FieldInfo fi) { String fieldName = fi.getName(); verifySoftDeletedFieldName(fieldName, fi.isSoftDeletesField()); + verifyParentFieldName(fieldName, fi.isParentField()); if (nameToNumber.containsKey(fieldName)) { verifySameSchema(fi); } @@ -373,6 +400,7 @@ synchronized void verifyFieldInfo(FieldInfo fi) { synchronized int addOrGet(FieldInfo fi) { String fieldName = fi.getName(); verifySoftDeletedFieldName(fieldName, fi.isSoftDeletesField()); + verifyParentFieldName(fieldName, fi.isParentField()); Integer fieldNumber = nameToNumber.get(fieldName); if (fieldNumber != null) { @@ -437,6 +465,31 @@ private void verifySoftDeletedFieldName(String fieldName, boolean isSoftDeletesF } } + private void verifyParentFieldName(String fieldName, boolean isParentField) { + if (isParentField) { + if (parentFieldName == null) { + throw new IllegalArgumentException( + "this index has [" + + fieldName + + "] as parent document field already but parent document field is not configured in IWC"); + } else if (fieldName.equals(parentFieldName) == false) { + throw new IllegalArgumentException( + "cannot configure [" + + parentFieldName + + "] as parent document field ; this index uses [" + + fieldName + + "] as parent document field already"); + } + } else if (fieldName.equals(parentFieldName)) { + throw new IllegalArgumentException( + "cannot configure [" + + parentFieldName + + "] as parent document field ; this index uses [" + + fieldName + + "] as non parent document field already"); + } + } + private void verifySameSchema(FieldInfo fi) { String fieldName = fi.getName(); IndexOptions currentOpts = this.indexOptions.get(fieldName); @@ -513,7 +566,8 @@ synchronized void verifyOrCreateDvOnlyField( 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, - (softDeletesFieldName != null && softDeletesFieldName.equals(fieldName))); + (softDeletesFieldName != null && softDeletesFieldName.equals(fieldName)), + (parentFieldName != null && parentFieldName.equals(fieldName))); addOrGet(fi); } } else { @@ -579,6 +633,7 @@ FieldInfo constructFieldInfo(String fieldName, DocValuesType dvType, int newFiel if (dvType != dvType0) return null; boolean isSoftDeletesField = fieldName.equals(softDeletesFieldName); + boolean isParentField = fieldName.equals(parentFieldName); return new FieldInfo( fieldName, newFieldNumber, @@ -595,7 +650,8 @@ FieldInfo constructFieldInfo(String fieldName, DocValuesType dvType, int newFiel 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, - isSoftDeletesField); + isSoftDeletesField, + isParentField); } synchronized Set getFieldNames() { @@ -627,6 +683,14 @@ public String getSoftDeletesFieldName() { return globalFieldNumbers.softDeletesFieldName; } + /** + * Returns the name of the parent document field or null if no parent field is + * configured + */ + public String getParentFieldName() { + return globalFieldNumbers.parentFieldName; + } + /** * Adds the provided FieldInfo to this Builder if this field doesn't exist in this Builder. Also * adds a new field with its schema options to the global FieldNumbers if the field doesn't @@ -710,7 +774,8 @@ FieldInfo add(FieldInfo fi, long dvGen) { fi.getVectorDimension(), fi.getVectorEncoding(), fi.getVectorSimilarityFunction(), - fi.isSoftDeletesField()); + fi.isSoftDeletesField(), + fi.isParentField()); byName.put(fiNew.getName(), fiNew); return fiNew; } diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index a4c960d4d19d..3ee576422cf0 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -33,7 +33,6 @@ import java.util.List; import java.util.Locale; import java.util.Map; -import java.util.Objects; import java.util.Queue; import java.util.Set; import java.util.concurrent.ConcurrentLinkedQueue; @@ -1225,9 +1224,6 @@ private void validateIndexSort() { /** Returns true if indexSort is a prefix of otherSort. */ static boolean isCongruentSort(Sort indexSort, Sort otherSort) { - if (Objects.equals(indexSort.getParentField(), otherSort.getParentField()) == false) { - return false; - } final SortField[] fields1 = indexSort.getSort(); final SortField[] fields2 = otherSort.getSort(); if (fields1.length > fields2.length) { @@ -1264,7 +1260,8 @@ static FieldInfos readFieldInfos(SegmentCommitInfo si) throws IOException { * If this {@link SegmentInfos} has no global field number map the returned instance is empty */ private FieldNumbers getFieldNumberMap() throws IOException { - final FieldNumbers map = new FieldNumbers(config.softDeletesField); + final FieldNumbers map = + new FieldNumbers(config.getSoftDeletesField(), config.getParentField()); for (SegmentCommitInfo info : segmentInfos) { FieldInfos fis = readFieldInfos(info); @@ -6617,10 +6614,12 @@ public void setIndexWriterMaxDocs(int limit) { } @Override - public FieldInfosBuilder newFieldInfosBuilder(String softDeletesFieldName) { + public FieldInfosBuilder newFieldInfosBuilder( + String softDeletesFieldName, String parentFieldName) { return new FieldInfosBuilder() { private FieldInfos.Builder builder = - new FieldInfos.Builder(new FieldInfos.FieldNumbers(softDeletesFieldName)); + new FieldInfos.Builder( + new FieldInfos.FieldNumbers(softDeletesFieldName, parentFieldName)); @Override public FieldInfosBuilder add(FieldInfo fi) { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java index c5e9a8bd2240..5d28e991de08 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java @@ -545,4 +545,9 @@ public IndexWriterConfig setIndexWriterEventListener( this.eventListener = eventListener; return this; } + + public IndexWriterConfig setParentField(String parentField) { + this.parentField = parentField; + return this; + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index 989b36dea545..7ccb8e3195fd 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -226,12 +226,12 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti LeafReader docValuesReader = getDocValuesLeafReader(); Function comparatorWrapper = in -> in; - if (state.segmentInfo.getHasBlocks() && indexSort.getParentField() != null) { + if (state.segmentInfo.getHasBlocks() && state.fieldInfos.getParentField() != null) { final DocIdSetIterator readerValues = - docValuesReader.getNumericDocValues(indexSort.getParentField()); + docValuesReader.getNumericDocValues(state.fieldInfos.getParentField()); if (readerValues == null) { throw new CorruptIndexException( - "missing doc values for parent field \"" + indexSort.getParentField() + "\"", + "missing doc values for parent field \"" + state.fieldInfos.getParentField() + "\"", "IndexingChain"); } BitSet parents = BitSet.of(readerValues, state.segmentInfo.maxDoc()); @@ -241,7 +241,7 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti in.compare(parents.nextSetBit(docID1), parents.nextSetBit(docID2)); } if (state.segmentInfo.getHasBlocks() - && indexSort.getParentField() == null + && state.fieldInfos.getParentField() == null && indexCreatedVersionMajor >= Version.LUCENE_10_0_0.major) { throw new CorruptIndexException( "parent field is not set but the index has blocks. indexCreatedVersionMajor: " @@ -687,7 +687,8 @@ private void initializeFieldInfo(PerField pf) throws IOException { s.vectorDimension, s.vectorEncoding, s.vectorSimilarityFunction, - pf.fieldName.equals(fieldInfos.getSoftDeletesFieldName()))); + pf.fieldName.equals(fieldInfos.getSoftDeletesFieldName()), + pf.fieldName.equals(fieldInfos.getParentFieldName()))); pf.setFieldInfo(fi); if (fi.getIndexOptions() != IndexOptions.NONE) { pf.setInvertState(); diff --git a/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java index 257c429818c4..132873b041b4 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java +++ b/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java @@ -98,6 +98,8 @@ public class LiveIndexWriterConfig { /** The field names involved in the index sort */ protected Set indexSortFields = Collections.emptySet(); + protected String parentField = null; + /** * if an indexing thread should check for pending flushes on update in order to help out on a full * flush @@ -458,6 +460,10 @@ public IndexWriterEventListener getIndexWriterEventListener() { return eventListener; } + public String getParentField() { + return parentField; + } + @Override public String toString() { StringBuilder sb = new StringBuilder(); @@ -486,6 +492,7 @@ public String toString() { sb.append("maxFullFlushMergeWaitMillis=").append(getMaxFullFlushMergeWaitMillis()).append("\n"); sb.append("leafSorter=").append(getLeafSorter()).append("\n"); sb.append("eventListener=").append(getIndexWriterEventListener()).append("\n"); + sb.append("parentField=").append(getParentField()).append("\n"); return sb.toString(); } } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index 9effcc1df490..52b1f8d624d3 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -54,12 +54,14 @@ static MergeState.DocMap[] sort(Sort sort, List readers) throws IOE comparables[i] = sorter.getComparableProviders(readers); for (int j = 0; j < readers.size(); j++) { CodecReader codecReader = readers.get(j); + FieldInfos fieldInfos = codecReader.getFieldInfos(); LeafMetaData metaData = codecReader.getMetaData(); - if (metaData.hasBlocks() && sort.getParentField() != null) { - NumericDocValues parentDocs = codecReader.getNumericDocValues(sort.getParentField()); + if (metaData.hasBlocks() && fieldInfos.getParentField() != null) { + NumericDocValues parentDocs = + codecReader.getNumericDocValues(fieldInfos.getParentField()); assert parentDocs != null : "parent field: " - + sort.getParentField() + + fieldInfos.getParentField() + " must be present if index sorting is used with blocks"; BitSet parents = BitSet.of(parentDocs, codecReader.maxDoc()); IndexSorter.ComparableProvider[] providers = comparables[i]; @@ -67,7 +69,7 @@ static MergeState.DocMap[] sort(Sort sort, List readers) throws IOE providers[j] = docId -> provider.getAsComparableLong(parents.nextSetBit(docId)); } assert metaData.hasBlocks() == false - || sort.getParentField() != null + || fieldInfos.getParentField() != null || metaData.getCreatedVersionMajor() < Version.LUCENE_10_0_0.major : "parent field is not set but the index has blocks. indexCreatedVersionMajor: " + metaData.getCreatedVersionMajor(); diff --git a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java index 80273cdebae7..374dbb31194f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java @@ -111,9 +111,15 @@ public ParallelLeafReader( .filter(Objects::nonNull) .findAny() .orElse(null); + final String parentField = + completeReaderSet.stream() + .map(r -> r.getFieldInfos().getParentField()) + .filter(Objects::nonNull) + .findAny() + .orElse(null); // TODO: make this read-only in a cleaner way? FieldInfos.Builder builder = - new FieldInfos.Builder(new FieldInfos.FieldNumbers(softDeletesField)); + new FieldInfos.Builder(new FieldInfos.FieldNumbers(softDeletesField, parentField)); Sort indexSort = null; int createdVersionMajor = -1; diff --git a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java index 04306c024cbd..e5eb05b35bb7 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java +++ b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java @@ -720,7 +720,8 @@ private FieldInfo cloneFieldInfo(FieldInfo fi, int fieldNumber) { fi.getVectorDimension(), fi.getVectorEncoding(), fi.getVectorSimilarityFunction(), - fi.isSoftDeletesField()); + fi.isSoftDeletesField(), + fi.isParentField()); } private SegmentReader createNewReaderWithLatestLiveDocs(SegmentReader reader) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/index/Sorter.java b/lucene/core/src/java/org/apache/lucene/index/Sorter.java index 0eebda13804d..741dfc6944a6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/Sorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/Sorter.java @@ -211,16 +211,17 @@ DocMap sort(LeafReader reader) throws IOException { Function comparatorWrapper = in -> in; LeafMetaData metaData = reader.getMetaData(); - if (metaData.hasBlocks() && sort.getParentField() != null) { + FieldInfos fieldInfos = reader.getFieldInfos(); + if (metaData.hasBlocks() && fieldInfos.getParentField() != null) { BitSet parents = - BitSet.of(reader.getNumericDocValues(sort.getParentField()), reader.maxDoc()); + BitSet.of(reader.getNumericDocValues(fieldInfos.getParentField()), reader.maxDoc()); comparatorWrapper = in -> (docID1, docID2) -> in.compare(parents.nextSetBit(docID1), parents.nextSetBit(docID2)); } if (metaData.hasBlocks() - && sort.getParentField() == null + && fieldInfos.getParentField() == null && metaData.getCreatedVersionMajor() >= Version.LUCENE_10_0_0.major) { throw new CorruptIndexException( "parent field is not set but the index has blocks. indexCreatedVersionMajor: " diff --git a/lucene/core/src/java/org/apache/lucene/internal/tests/IndexPackageAccess.java b/lucene/core/src/java/org/apache/lucene/internal/tests/IndexPackageAccess.java index 545b56599337..5ca6d9d1c58f 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/tests/IndexPackageAccess.java +++ b/lucene/core/src/java/org/apache/lucene/internal/tests/IndexPackageAccess.java @@ -31,7 +31,7 @@ public interface IndexPackageAccess { void setIndexWriterMaxDocs(int limit); - FieldInfosBuilder newFieldInfosBuilder(String softDeletesFieldName); + FieldInfosBuilder newFieldInfosBuilder(String softDeletesFieldName, String parentFieldName); void checkImpacts(Impacts impacts, int max); diff --git a/lucene/core/src/java/org/apache/lucene/search/Sort.java b/lucene/core/src/java/org/apache/lucene/search/Sort.java index 41ee8b7e1696..394dcb1a23d3 100644 --- a/lucene/core/src/java/org/apache/lucene/search/Sort.java +++ b/lucene/core/src/java/org/apache/lucene/search/Sort.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.util.Arrays; -import java.util.Objects; /** * Encapsulates sort criteria for returned hits. @@ -44,7 +43,6 @@ public final class Sort { // internal representation of the sort criteria private final SortField[] fields; - private final String parentField; /** * Sorts by computed relevance. This is the same sort criteria as calling {@link @@ -61,29 +59,10 @@ public Sort() { * is still a tie after all SortFields are checked, the internal Lucene docid is used to break it. */ public Sort(SortField... fields) { - this(null, fields); - } - - /** - * Sets the sort to the given criteria in succession: the first SortField is checked first, but if - * it produces a tie, then the second SortField is used to break the tie, etc. Finally, if there - * is still a tie after all SortFields are checked, the internal Lucene docid is used to break it. - * - * @param parentField the name of a numeric doc values field that marks the last document of a - * document blocks indexed with {@link - * org.apache.lucene.index.IndexWriter#addDocuments(Iterable)} or it's update relatives at - * index time. This is required for indices that use index sorting in combination with - * document blocks in order to maintain the document order of the blocks documents. Index - * sorting will effectively compare the parent (last document) of a block in order to stable - * sort all it's adjacent documents that belong to a block. This field must be a numeric doc - * values field. - */ - public Sort(String parentField, SortField... fields) { if (fields.length == 0) { throw new IllegalArgumentException("There must be at least 1 sort field"); } this.fields = fields; - this.parentField = parentField; } /** @@ -95,11 +74,6 @@ public SortField[] getSort() { return fields; } - /** Returns the field name that marks a document as a parent in a document block. */ - public String getParentField() { - return parentField; - } - /** * Rewrites the SortFields in this Sort, returning a new Sort if any of the fields changes during * their rewriting. @@ -111,9 +85,6 @@ public String getParentField() { */ public Sort rewrite(IndexSearcher searcher) throws IOException { boolean changed = false; - if (parentField != null) { - throw new IllegalStateException("parentFields must not be used with search time sorting"); - } SortField[] rewrittenSortFields = new SortField[fields.length]; for (int i = 0; i < fields.length; i++) { rewrittenSortFields[i] = fields[i].rewrite(searcher); @@ -128,9 +99,6 @@ public Sort rewrite(IndexSearcher searcher) throws IOException { @Override public String toString() { StringBuilder buffer = new StringBuilder(); - if (parentField != null) { - buffer.append("parent field: ").append(parentField).append(' '); - } for (int i = 0; i < fields.length; i++) { buffer.append(fields[i].toString()); if ((i + 1) < fields.length) buffer.append(','); @@ -145,14 +113,13 @@ public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof Sort)) return false; final Sort other = (Sort) o; - return Objects.equals(parentField, other.parentField) - && Arrays.equals(this.fields, other.fields); + return Arrays.equals(this.fields, other.fields); } /** Returns a hash code value for this object. */ @Override public int hashCode() { - return 0x45aaf665 + Arrays.hashCode(fields) + Objects.hashCode(parentField); + return 0x45aaf665 + Arrays.hashCode(fields); } /** Returns true if the relevance score is needed to sort documents. */ diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java index 496f0cd38274..5b839607cd35 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -1679,60 +1679,6 @@ public void testIllegalIndexSortChange2() throws Exception { IOUtils.close(r1, dir1, w2, dir2); } - public void testIllegalIndexSortChange3() throws Exception { - Directory dir1 = newDirectory(); - IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random())); - iwc1.setIndexSort(new Sort("foobar", new SortField("foo", SortField.Type.INT))); - - RandomIndexWriter w1 = new RandomIndexWriter(random(), dir1, iwc1); - Document parent = new Document(); - w1.addDocuments(Arrays.asList(new Document(), new Document(), parent)); - w1.commit(); - w1.addDocuments(Arrays.asList(new Document(), new Document(), parent)); - w1.commit(); - // so the index sort is in fact burned into the index: - w1.forceMerge(1); - w1.close(); - - Directory dir2 = newDirectory(); - IndexWriterConfig iwc2 = newIndexWriterConfig(new MockAnalyzer(random())); - iwc2.setIndexSort(new Sort(new SortField("foo", SortField.Type.INT))); - RandomIndexWriter w2 = new RandomIndexWriter(random(), dir2, iwc2); - - IndexReader r1 = DirectoryReader.open(dir1); - String message = - expectThrows( - IllegalArgumentException.class, - () -> { - w2.addIndexes((SegmentReader) getOnlyLeafReader(r1)); - }) - .getMessage(); - assertEquals( - "cannot change index sort from parent field: foobar to ", - message); - - message = - expectThrows( - IllegalArgumentException.class, - () -> { - w2.addIndexes(dir1); - }) - .getMessage(); - assertEquals( - "cannot change index sort from parent field: foobar to ", - message); - - Directory dir3 = newDirectory(); - IndexWriterConfig iwc3 = newIndexWriterConfig(new MockAnalyzer(random())); - iwc3.setIndexSort(new Sort("foobar", new SortField("foo", SortField.Type.INT))); - RandomIndexWriter w3 = new RandomIndexWriter(random(), dir3, iwc3); - - w3.addIndexes((SegmentReader) getOnlyLeafReader(r1)); - w3.addIndexes(dir1); - - IOUtils.close(r1, dir1, w2, dir2, w3, dir3); - } - public void testAddIndexesDVUpdateSameSegmentName() throws Exception { Directory dir1 = newDirectory(); IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random())); @@ -1944,28 +1890,28 @@ public void testAddIndicesWithBlocks() throws IOException { public void testSetDiagnostics() throws IOException { MergePolicy myMergePolicy = - new FilterMergePolicy(newLogMergePolicy(4)) { - @Override - public MergeSpecification findMerges(CodecReader... readers) throws IOException { - MergeSpecification spec = super.findMerges(readers); - if (spec == null) { - return null; - } - MergeSpecification newSpec = new MergeSpecification(); - for (OneMerge merge : spec.merges) { - newSpec.add( - new OneMerge(merge) { - @Override - public void setMergeInfo(SegmentCommitInfo info) { - super.setMergeInfo(info); - info.info.addDiagnostics( - Collections.singletonMap("merge_policy", "my_merge_policy")); - } - }); - } - return newSpec; - } - }; + new FilterMergePolicy(newLogMergePolicy(4)) { + @Override + public MergeSpecification findMerges(CodecReader... readers) throws IOException { + MergeSpecification spec = super.findMerges(readers); + if (spec == null) { + return null; + } + MergeSpecification newSpec = new MergeSpecification(); + for (OneMerge merge : spec.merges) { + newSpec.add( + new OneMerge(merge) { + @Override + public void setMergeInfo(SegmentCommitInfo info) { + super.setMergeInfo(info); + info.info.addDiagnostics( + Collections.singletonMap("merge_policy", "my_merge_policy")); + } + }); + } + return newSpec; + } + }; Directory sourceDir = newDirectory(); try (IndexWriter w = new IndexWriter(sourceDir, newIndexWriterConfig())) { Document doc = new Document(); @@ -1976,7 +1922,7 @@ public void setMergeInfo(SegmentCommitInfo info) { Directory targetDir = newDirectory(); try (IndexWriter w = - new IndexWriter(targetDir, newIndexWriterConfig().setMergePolicy(myMergePolicy))) { + new IndexWriter(targetDir, newIndexWriterConfig().setMergePolicy(myMergePolicy))) { w.addIndexes(codecReader); } @@ -1984,11 +1930,64 @@ public void setMergeInfo(SegmentCommitInfo info) { assertNotEquals(0, si.size()); for (SegmentCommitInfo sci : si) { assertEquals( - IndexWriter.SOURCE_ADDINDEXES_READERS, sci.info.getDiagnostics().get(IndexWriter.SOURCE)); + IndexWriter.SOURCE_ADDINDEXES_READERS, sci.info.getDiagnostics().get(IndexWriter.SOURCE)); assertEquals("my_merge_policy", sci.info.getDiagnostics().get("merge_policy")); } reader.close(); targetDir.close(); sourceDir.close(); } + + public void testIllegalParentDocChange() throws Exception { + Directory dir1 = newDirectory(); + IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random())); + iwc1.setParentField("foobar"); + RandomIndexWriter w1 = new RandomIndexWriter(random(), dir1, iwc1); + Document parent = new Document(); + w1.addDocuments(Arrays.asList(new Document(), new Document(), parent)); + w1.commit(); + w1.addDocuments(Arrays.asList(new Document(), new Document(), parent)); + w1.commit(); + // so the index sort is in fact burned into the index: + w1.forceMerge(1); + w1.close(); + + Directory dir2 = newDirectory(); + IndexWriterConfig iwc2 = newIndexWriterConfig(new MockAnalyzer(random())); + iwc2.setParentField("foo"); + RandomIndexWriter w2 = new RandomIndexWriter(random(), dir2, iwc2); + + IndexReader r1 = DirectoryReader.open(dir1); + String message = + expectThrows( + IllegalArgumentException.class, + () -> { + w2.addIndexes((SegmentReader) getOnlyLeafReader(r1)); + }) + .getMessage(); + assertEquals( + "cannot configure [foo] as parent document field ; this index uses [foobar] as parent document field already", + message); + + message = + expectThrows( + IllegalArgumentException.class, + () -> { + w2.addIndexes(dir1); + }) + .getMessage(); + assertEquals( + "cannot configure [foo] as parent document field ; this index uses [foobar] as parent document field already", + message); + + Directory dir3 = newDirectory(); + IndexWriterConfig iwc3 = newIndexWriterConfig(new MockAnalyzer(random())); + iwc3.setParentField("foobar"); + RandomIndexWriter w3 = new RandomIndexWriter(random(), dir3, iwc3); + + w3.addIndexes((SegmentReader) getOnlyLeafReader(r1)); + w3.addIndexes(dir1); + + IOUtils.close(r1, dir1, w2, dir2, w3, dir3); + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java index 40b00975805d..eff9cbe763fc 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java @@ -114,6 +114,7 @@ public FieldData( 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, + false, false)); } this.terms = terms; @@ -229,7 +230,8 @@ public void testFixedPostings() throws Throwable { terms[i] = new TermData(text, docs, null); } - final FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); + final FieldInfos.Builder builder = + new FieldInfos.Builder(new FieldInfos.FieldNumbers(null, null)); final FieldData field = new FieldData("field", builder, terms, true, false); final FieldData[] fields = new FieldData[] {field}; @@ -292,7 +294,8 @@ public void testFixedPostings() throws Throwable { } public void testRandomPostings() throws Throwable { - final FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); + final FieldInfos.Builder builder = + new FieldInfos.Builder(new FieldInfos.FieldNumbers(null, null)); final FieldData[] fields = new FieldData[NUM_FIELDS]; for (int i = 0; i < NUM_FIELDS; i++) { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDoc.java b/lucene/core/src/test/org/apache/lucene/index/TestDoc.java index 9a8eda06c888..3b245dd41326 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDoc.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDoc.java @@ -236,7 +236,7 @@ private SegmentCommitInfo merge( si, InfoStream.getDefault(), trackingDir, - new FieldInfos.FieldNumbers(null), + new FieldInfos.FieldNumbers(null, null), context); merger.merge(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java b/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java index ac527062767c..a19b32ba1506 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java @@ -243,7 +243,7 @@ public void testMergedFieldInfos_singleLeaf() throws IOException { } public void testFieldNumbersAutoIncrement() { - FieldInfos.FieldNumbers fieldNumbers = new FieldInfos.FieldNumbers("softDeletes"); + FieldInfos.FieldNumbers fieldNumbers = new FieldInfos.FieldNumbers("softDeletes", "parentDoc"); for (int i = 0; i < 10; i++) { fieldNumbers.addOrGet( new FieldInfo( @@ -262,6 +262,7 @@ public void testFieldNumbersAutoIncrement() { 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, + false, false)); } int idx = @@ -282,6 +283,7 @@ public void testFieldNumbersAutoIncrement() { 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, + false, false)); assertEquals("Field numbers 0 through 9 were allocated", 10, idx); @@ -304,6 +306,7 @@ public void testFieldNumbersAutoIncrement() { 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, + false, false)); assertEquals("Field numbers should reset after clear()", 0, idx); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java b/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java index 0c984fce0e4a..b526f04f39d5 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java @@ -45,8 +45,7 @@ public class TestFieldsReader extends LuceneTestCase { @BeforeClass public static void beforeClass() throws Exception { testDoc = new Document(); - final String softDeletesFieldName = null; - fieldInfos = new FieldInfos.Builder(new FieldInfos.FieldNumbers(softDeletesFieldName)); + fieldInfos = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null, null)); DocHelper.setupDoc(testDoc); for (IndexableField field : testDoc.getFields()) { IndexableFieldType ift = field.fieldType(); @@ -67,7 +66,8 @@ public static void beforeClass() throws Exception { 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, - field.name().equals(softDeletesFieldName))); + false, + false)); } dir = newDirectory(); IndexWriterConfig conf = diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 16189eccdeac..586169145374 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -3195,7 +3195,8 @@ public void testBlockContainsParentField() throws IOException { try (Directory dir = newDirectory()) { IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); String parentField = "parent"; - Sort indexSort = new Sort(parentField, new SortField("foo", SortField.Type.INT)); + iwc.setParentField(parentField); + Sort indexSort = new Sort(new SortField("foo", SortField.Type.INT)); iwc.setIndexSort(indexSort); try (IndexWriter writer = new IndexWriter(dir, iwc)) { List runnabels = @@ -3237,8 +3238,7 @@ public void testBlockContainsParentField() throws IOException { public void testIndexWithSortIsCongruent() throws IOException { try (Directory dir = newDirectory()) { IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); - String parentField = "parent"; - Sort indexSort = new Sort(parentField, new SortField("foo", SortField.Type.INT)); + Sort indexSort = new Sort(new SortField("foo", SortField.Type.INT)); iwc.setIndexSort(indexSort); try (IndexWriter writer = new IndexWriter(dir, iwc)) { Document child1 = new Document(); @@ -3259,7 +3259,7 @@ public void testIndexWithSortIsCongruent() throws IOException { IllegalArgumentException.class, () -> { IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random())); - Sort sort = new Sort("someOther", new SortField("foo", SortField.Type.INT)); + Sort sort = new Sort(new SortField("foo", SortField.Type.INT)); config.setIndexSort(sort); new IndexWriter(dir, config); }); @@ -3286,8 +3286,9 @@ public void testIndexSortWithBlocks() throws IOException { AssertingNeedsIndexSortCodec codec = new AssertingNeedsIndexSortCodec(); iwc.setCodec(codec); String parentField = "parent"; - Sort indexSort = new Sort(parentField, new SortField("foo", SortField.Type.INT)); + Sort indexSort = new Sort(new SortField("foo", SortField.Type.INT)); iwc.setIndexSort(indexSort); + iwc.setParentField(parentField); LogMergePolicy policy = newLogMergePolicy(); // make sure that merge factor is always > 2 if (policy.getMergeFactor() <= 2) { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java index 02600f5d59d1..3c6d57c5d2f9 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java @@ -199,7 +199,8 @@ public void testApplyUpdates() throws IOException { 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, - true); + true, + false); List docsDeleted = Arrays.asList(1, 3, 7, 8, DocIdSetIterator.NO_MORE_DOCS); List updates = Arrays.asList(singleUpdate(docsDeleted, 10, true)); for (DocValuesFieldUpdates update : updates) { @@ -237,7 +238,8 @@ public void testApplyUpdates() throws IOException { 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, - true); + true, + false); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); } @@ -301,7 +303,8 @@ public void testUpdateAppliedOnlyOnce() throws IOException { 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, - true); + true, + false); List docsDeleted = Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS); List updates = Arrays.asList(singleUpdate(docsDeleted, 3, true)); for (DocValuesFieldUpdates update : updates) { @@ -370,7 +373,8 @@ public void testResetOnUpdate() throws IOException { 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, - true); + true, + false); List updates = Arrays.asList(singleUpdate(Arrays.asList(0, 1, DocIdSetIterator.NO_MORE_DOCS), 3, false)); for (DocValuesFieldUpdates update : updates) { @@ -407,7 +411,8 @@ public void testResetOnUpdate() throws IOException { 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, - true); + true, + false); updates = Arrays.asList(singleUpdate(Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS), 3, true)); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java index da17f332e771..615accbb4df3 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java @@ -104,7 +104,7 @@ public void testMerge() throws IOException { si, InfoStream.getDefault(), mergedDir, - new FieldInfos.FieldNumbers(null), + new FieldInfos.FieldNumbers(null, null), newIOContext(random(), new IOContext(new MergeInfo(-1, -1, false, -1)))); MergeState mergeState = merger.merge(); int docsMerged = mergeState.segmentInfo.maxDoc(); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java index 765c355e534e..401bcb3e5c60 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java @@ -1297,7 +1297,8 @@ public FieldInfos getFieldInfos() { 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.DOT_PRODUCT, - fi.isSoftDeletesField()); + fi.isSoftDeletesField(), + fi.isParentField()); newInfos[i] = noIndexFI; i++; } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/compressing/dummy/DummyCompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/compressing/dummy/DummyCompressingCodec.java index 9d7cd7e7742d..ef9f58e1bb37 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/compressing/dummy/DummyCompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/compressing/dummy/DummyCompressingCodec.java @@ -78,7 +78,7 @@ public Decompressor clone() { @Override public void compress(ByteBuffersDataInput buffersInput, DataOutput out) throws IOException { - out.copyBytes(buffersInput, buffersInput.size()); + out.copyBytes(buffersInput, buffersInput.length()); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseFieldInfoFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseFieldInfoFormatTestCase.java index 83f8b293113c..5f09348fdfe4 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseFieldInfoFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseFieldInfoFormatTestCase.java @@ -68,7 +68,7 @@ public void testOneField() throws Exception { FieldInfo fi = createFieldInfo(); addAttributes(fi); - FieldInfos infos = INDEX_PACKAGE_ACCESS.newFieldInfosBuilder(null).add(fi).finish(); + FieldInfos infos = INDEX_PACKAGE_ACCESS.newFieldInfosBuilder(null, null).add(fi).finish(); codec.fieldInfosFormat().write(dir, segmentInfo, "", infos, IOContext.DEFAULT); @@ -96,7 +96,7 @@ public void testImmutableAttributes() throws Exception { fi.putAttribute("foo", "bar"); fi.putAttribute("bar", "baz"); - FieldInfos infos = INDEX_PACKAGE_ACCESS.newFieldInfosBuilder(null).add(fi).finish(); + FieldInfos infos = INDEX_PACKAGE_ACCESS.newFieldInfosBuilder(null, null).add(fi).finish(); codec.fieldInfosFormat().write(dir, segmentInfo, "", infos, IOContext.DEFAULT); @@ -136,7 +136,7 @@ public void eval(MockDirectoryWrapper dir) throws IOException { FieldInfo fi = createFieldInfo(); addAttributes(fi); - FieldInfos infos = INDEX_PACKAGE_ACCESS.newFieldInfosBuilder(null).add(fi).finish(); + FieldInfos infos = INDEX_PACKAGE_ACCESS.newFieldInfosBuilder(null, null).add(fi).finish(); fail.setDoFail(); expectThrows( @@ -171,7 +171,7 @@ public void eval(MockDirectoryWrapper dir) throws IOException { FieldInfo fi = createFieldInfo(); addAttributes(fi); - FieldInfos infos = INDEX_PACKAGE_ACCESS.newFieldInfosBuilder(null).add(fi).finish(); + FieldInfos infos = INDEX_PACKAGE_ACCESS.newFieldInfosBuilder(null, null).add(fi).finish(); fail.setDoFail(); expectThrows( @@ -206,7 +206,7 @@ public void eval(MockDirectoryWrapper dir) throws IOException { FieldInfo fi = createFieldInfo(); addAttributes(fi); - FieldInfos infos = INDEX_PACKAGE_ACCESS.newFieldInfosBuilder(null).add(fi).finish(); + FieldInfos infos = INDEX_PACKAGE_ACCESS.newFieldInfosBuilder(null, null).add(fi).finish(); codec.fieldInfosFormat().write(dir, segmentInfo, "", infos, IOContext.DEFAULT); @@ -243,7 +243,7 @@ public void eval(MockDirectoryWrapper dir) throws IOException { FieldInfo fi = createFieldInfo(); addAttributes(fi); - FieldInfos infos = INDEX_PACKAGE_ACCESS.newFieldInfosBuilder(null).add(fi).finish(); + FieldInfos infos = INDEX_PACKAGE_ACCESS.newFieldInfosBuilder(null, null).add(fi).finish(); codec.fieldInfosFormat().write(dir, segmentInfo, "", infos, IOContext.DEFAULT); @@ -276,7 +276,9 @@ public void testRandom() throws Exception { String softDeletesField = random().nextBoolean() ? TestUtil.randomUnicodeString(random()) : null; - var builder = INDEX_PACKAGE_ACCESS.newFieldInfosBuilder(softDeletesField); + String parentField = random().nextBoolean() ? TestUtil.randomUnicodeString(random()) : null; + + var builder = INDEX_PACKAGE_ACCESS.newFieldInfosBuilder(softDeletesField, parentField); for (String field : fieldNames) { IndexableFieldType fieldType = randomFieldType(random(), field); @@ -307,7 +309,8 @@ public void testRandom() throws Exception { fieldType.vectorDimension(), fieldType.vectorEncoding(), fieldType.vectorSimilarityFunction(), - field.equals(softDeletesField)); + field.equals(softDeletesField), + field.equals(parentField)); addAttributes(fi); builder.add(fi); } @@ -431,6 +434,7 @@ private FieldInfo createFieldInfo() { 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, + false, false); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java index 968fd8ed61ed..dcb67f986658 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java @@ -365,7 +365,8 @@ public void testMultiClose() throws IOException { proto.getVectorDimension(), proto.getVectorEncoding(), proto.getVectorSimilarityFunction(), - proto.isSoftDeletesField()); + proto.isSoftDeletesField(), + proto.isParentField()); FieldInfos fieldInfos = new FieldInfos(new FieldInfo[] {field}); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseSegmentInfoFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseSegmentInfoFormatTestCase.java index 91950cd5328b..1d65e3245dcb 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseSegmentInfoFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseSegmentInfoFormatTestCase.java @@ -392,9 +392,7 @@ public void testSort() throws IOException { sortFields[j] = randomIndexSortField(); } if (supportsHasBlocks()) { - String parentField = - random().nextBoolean() ? null : TestUtil.randomSimpleString(random(), 1, 10); - sort = new Sort(parentField, sortFields); + sort = new Sort(sortFields); } else { sort = new Sort(sortFields); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedLeafReader.java index de9a0215d1b4..e18c25ee0cf4 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedLeafReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedLeafReader.java @@ -112,7 +112,8 @@ static FieldInfos shuffleInfos(FieldInfos infos, Random random) { oldInfo.getVectorEncoding(), // numeric type of vector samples // distance function for calculating similarity of the field's vector oldInfo.getVectorSimilarityFunction(), - oldInfo.isSoftDeletesField()); // used as soft-deletes field + oldInfo.isSoftDeletesField(), // used as soft-deletes field + oldInfo.isParentField()); shuffled.set(i, newInfo); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java index 924bc4a130c4..251cd29db446 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java @@ -166,6 +166,7 @@ public RandomPostingsTester(Random random) throws IOException { 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, + false, false); fieldUpto++; @@ -739,6 +740,7 @@ public FieldsProducer buildIndex( 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, + false, false); } From f26646b67782a22e9f30f489a7aa26b0a32e155e Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 14 Dec 2023 18:39:25 +0100 Subject: [PATCH 23/44] respect version in file format --- .../lucene/codecs/lucene94/Lucene94FieldInfosFormat.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java index 2e2476f6937d..d87b5136e072 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java @@ -131,7 +131,7 @@ public FieldInfos read( Throwable priorE = null; FieldInfo[] infos = null; try { - CodecUtil.checkIndexHeader( + int format = CodecUtil.checkIndexHeader( input, Lucene94FieldInfosFormat.CODEC_NAME, Lucene94FieldInfosFormat.FORMAT_START, @@ -157,7 +157,7 @@ public FieldInfos read( boolean omitNorms = (bits & OMIT_NORMS) != 0; boolean storePayloads = (bits & STORE_PAYLOADS) != 0; boolean isSoftDeletesField = (bits & SOFT_DELETES_FIELD) != 0; - boolean isParentField = (bits & PARENT_FIELD_FIELD) != 0; + boolean isParentField = format >= FORMAT_PARENT_FIELD ? (bits & PARENT_FIELD_FIELD) != 0 : false; final IndexOptions indexOptions = getIndexOptions(input, input.readByte()); From 2a14c41f077925e5ad421990058e81ca20954aa8 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 14 Dec 2023 18:41:33 +0100 Subject: [PATCH 24/44] tidy --- .../lucene94/Lucene94FieldInfosFormat.java | 18 ++++++++++-------- .../lucene/index/memory/MemoryIndex.java | 4 +++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java index d87b5136e072..960172d56893 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java @@ -131,13 +131,14 @@ public FieldInfos read( Throwable priorE = null; FieldInfo[] infos = null; try { - int format = CodecUtil.checkIndexHeader( - input, - Lucene94FieldInfosFormat.CODEC_NAME, - Lucene94FieldInfosFormat.FORMAT_START, - Lucene94FieldInfosFormat.FORMAT_CURRENT, - segmentInfo.getId(), - segmentSuffix); + int format = + CodecUtil.checkIndexHeader( + input, + Lucene94FieldInfosFormat.CODEC_NAME, + Lucene94FieldInfosFormat.FORMAT_START, + Lucene94FieldInfosFormat.FORMAT_CURRENT, + segmentInfo.getId(), + segmentSuffix); final int size = input.readVInt(); // read in the size infos = new FieldInfo[size]; @@ -157,7 +158,8 @@ public FieldInfos read( boolean omitNorms = (bits & OMIT_NORMS) != 0; boolean storePayloads = (bits & STORE_PAYLOADS) != 0; boolean isSoftDeletesField = (bits & SOFT_DELETES_FIELD) != 0; - boolean isParentField = format >= FORMAT_PARENT_FIELD ? (bits & PARENT_FIELD_FIELD) != 0 : false; + boolean isParentField = + format >= FORMAT_PARENT_FIELD ? (bits & PARENT_FIELD_FIELD) != 0 : false; final IndexOptions indexOptions = getIndexOptions(input, input.readByte()); diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 09a521c880d7..2bd50c5355c3 100644 --- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -736,6 +736,7 @@ private FieldInfo createFieldInfo(String fieldName, int ord, IndexableFieldType fieldType.vectorDimension(), fieldType.vectorEncoding(), fieldType.vectorSimilarityFunction(), + false, false); } @@ -789,7 +790,8 @@ private void storeDocValues(Info info, DocValuesType docValuesType, Object docVa info.fieldInfo.getVectorDimension(), info.fieldInfo.getVectorEncoding(), info.fieldInfo.getVectorSimilarityFunction(), - info.fieldInfo.isSoftDeletesField()); + info.fieldInfo.isSoftDeletesField(), + info.fieldInfo.isParentField()); } else if (existingDocValuesType != docValuesType) { throw new IllegalArgumentException( "Can't add [" From 76d4ba27bde112587b46e2d5abc698fb5a0adec1 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 14 Dec 2023 18:48:06 +0100 Subject: [PATCH 25/44] tidy --- .../backward_codecs/lucene60/Lucene60FieldInfosFormat.java | 3 ++- .../backward_codecs/lucene90/Lucene90FieldInfosFormat.java | 3 ++- .../apache/lucene/search/highlight/TermVectorLeafReader.java | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene60/Lucene60FieldInfosFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene60/Lucene60FieldInfosFormat.java index 63347e18babc..a3e09db8ae9b 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene60/Lucene60FieldInfosFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene60/Lucene60FieldInfosFormat.java @@ -217,7 +217,8 @@ private FieldInfo[] readFieldInfos(IndexInput input, int version) throws IOExcep 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, - isSoftDeletesField); + isSoftDeletesField, + false); } catch (IllegalStateException e) { throw new CorruptIndexException( "invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e); diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90FieldInfosFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90FieldInfosFormat.java index b2b05c49614a..22eb4558e3b3 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90FieldInfosFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90FieldInfosFormat.java @@ -194,7 +194,8 @@ public FieldInfos read( vectorDimension, VectorEncoding.FLOAT32, vectorDistFunc, - isSoftDeletesField); + isSoftDeletesField, + false); infos[i].checkConsistency(); } catch (IllegalStateException e) { throw new CorruptIndexException( diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java index aeb68267732d..17ba68777fba 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java @@ -103,6 +103,7 @@ public int size() { 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, + false, false); fieldInfos = new FieldInfos(new FieldInfo[] {fieldInfo}); } From 227c3cfe8c311251618d29cb90446235f6c25f84 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 14 Dec 2023 19:01:34 +0100 Subject: [PATCH 26/44] tidy --- .../lucene/codecs/uniformsplit/TestBlockWriter.java | 3 ++- .../uniformsplit/sharedterms/TestSTBlockReader.java | 1 + .../src/java/org/apache/lucene/index/FieldInfo.java | 7 +++++++ .../java/org/apache/lucene/index/FieldInfos.java | 6 ++++++ .../org/apache/lucene/index/TestIndexWriter.java | 13 +++++++++++++ 5 files changed, 29 insertions(+), 1 deletion(-) diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java index 46a78341a5c3..2ee6b8fd2d23 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java @@ -119,6 +119,7 @@ private static FieldInfo getMockFieldInfo(String fieldName, int number) { 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, - true); + true, + false); } } diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java index 4571a49243d0..bf2b01332406 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java @@ -206,6 +206,7 @@ private static FieldInfo mockFieldInfo(String fieldName, int number) { 0, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN, + false, false); } diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java index 0b2a2f0f385b..e9f1eb442a87 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java @@ -210,6 +210,13 @@ public void checkConsistency() { throw new IllegalArgumentException( "vectorDimension must be >=0; got " + vectorDimension + " (field: '" + name + "')"); } + + if (softDeletesField && parentField) { + throw new IllegalArgumentException( + "field can't be used as soft-deletes field and parent document fields (field: '" + + name + + "')"); + } } /** diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java index c04ecb6b9f6c..e3392de6e1df 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java @@ -381,6 +381,12 @@ static final class FieldNumbers { this.storeTermVectors = new HashMap<>(); this.softDeletesFieldName = softDeletesFieldName; this.parentFieldName = parentFieldName; + if (softDeletesFieldName != null + && parentFieldName != null + && parentFieldName.equals(softDeletesFieldName)) { + throw new IllegalArgumentException( + "parent document and soft-deletes field can't be the same field"); + } } synchronized void verifyFieldInfo(FieldInfo fi) { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 0390edb36cf3..3d1485e7615c 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -4893,4 +4893,17 @@ private static void addDocWithField(IndexWriter writer, String field) throws IOE doc.add(newField(field, "value", storedTextType)); writer.addDocument(doc); } + + public void testParentAndSoftDeletesAreTheSame() throws IOException { + try (Directory dir = newDirectory()) { + IndexWriterConfig indexWriterConfig = newIndexWriterConfig(new MockAnalyzer(random())); + indexWriterConfig.setSoftDeletesField("foo"); + indexWriterConfig.setParentField("foo"); + IllegalArgumentException iae = + expectThrows( + IllegalArgumentException.class, () -> new IndexWriter(dir, indexWriterConfig)); + assertEquals( + "parent document and soft-deletes field can't be the same field", iae.getMessage()); + } + } } From ac8101965084ed6b98a89da518a67222a8382530 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Fri, 15 Dec 2023 12:32:31 +0100 Subject: [PATCH 27/44] move test --- .../apache/lucene/index/TestIndexSorting.java | 47 +------------------ .../apache/lucene/index/TestIndexWriter.java | 46 ++++++++++++++++++ .../lucene/tests/index/RandomIndexWriter.java | 1 + 3 files changed, 48 insertions(+), 46 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 586169145374..19c093b1b16f 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -3185,7 +3185,7 @@ public void testParentFieldNotConfigured() throws IOException { writer.addDocuments(Arrays.asList(new Document(), new Document())); }); assertEquals( - "a parent field must be set in order to use document blocks with index sorting; see Sort#getParentField", + "a parent field must be set in order to use document blocks with index sorting; see IndexWriterConfig#getParentField", ex.getMessage()); } } @@ -3235,51 +3235,6 @@ public void testBlockContainsParentField() throws IOException { } } - public void testIndexWithSortIsCongruent() throws IOException { - try (Directory dir = newDirectory()) { - IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); - Sort indexSort = new Sort(new SortField("foo", SortField.Type.INT)); - iwc.setIndexSort(indexSort); - try (IndexWriter writer = new IndexWriter(dir, iwc)) { - Document child1 = new Document(); - child1.add(new StringField("id", Integer.toString(1), Store.YES)); - Document child2 = new Document(); - child2.add(new StringField("id", Integer.toString(1), Store.YES)); - Document parent = new Document(); - parent.add(new StringField("id", Integer.toString(1), Store.YES)); - writer.addDocuments(Arrays.asList(child1, child2, parent)); - writer.flush(); - if (random().nextBoolean()) { - writer.addDocuments(Arrays.asList(child1, child2, parent)); - } - writer.commit(); - } - IllegalArgumentException ex = - expectThrows( - IllegalArgumentException.class, - () -> { - IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random())); - Sort sort = new Sort(new SortField("foo", SortField.Type.INT)); - config.setIndexSort(sort); - new IndexWriter(dir, config); - }); - assertTrue( - ex.getMessage(), - ex.getMessage().endsWith(" to new indexSort=parent field: someOther ")); - - ex = - expectThrows( - IllegalArgumentException.class, - () -> { - IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random())); - Sort sort = new Sort(new SortField("foo", SortField.Type.INT)); - config.setIndexSort(sort); - new IndexWriter(dir, config); - }); - assertTrue(ex.getMessage(), ex.getMessage().endsWith(" to new indexSort=")); - } - } - public void testIndexSortWithBlocks() throws IOException { try (Directory dir = newDirectory()) { IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 3d1485e7615c..233ef4a65e9b 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -4906,4 +4906,50 @@ public void testParentAndSoftDeletesAreTheSame() throws IOException { "parent document and soft-deletes field can't be the same field", iae.getMessage()); } } + + public void testIndexWithParentFieldIsCongruent() throws IOException { + try (Directory dir = newDirectory()) { + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + iwc.setParentField("parent"); + try (IndexWriter writer = new IndexWriter(dir, iwc)) { + if (random().nextBoolean()) { + Document child1 = new Document(); + child1.add(new StringField("id", Integer.toString(1), Field.Store.YES)); + Document child2 = new Document(); + child2.add(new StringField("id", Integer.toString(1), Field.Store.YES)); + Document parent = new Document(); + parent.add(new StringField("id", Integer.toString(1), Field.Store.YES)); + writer.addDocuments(Arrays.asList(child1, child2, parent)); + writer.flush(); + if (random().nextBoolean()) { + writer.addDocuments(Arrays.asList(child1, child2, parent)); + } + } else { + writer.addDocument(new Document()); + } + writer.commit(); + } + IllegalArgumentException ex = + expectThrows( + IllegalArgumentException.class, + () -> { + IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random())); + config.setParentField("someOtherField"); + new IndexWriter(dir, config); + }); + assertEquals( + "cannot configure [someOtherField] as parent document field ; this index uses [parent] as parent document field already", + ex.getMessage()); + ex = + expectThrows( + IllegalArgumentException.class, + () -> { + IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random())); + new IndexWriter(dir, config); + }); + assertEquals( + "this index has [parent] as parent document field already but parent document field is not configured in IWC", + ex.getMessage()); + } + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomIndexWriter.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomIndexWriter.java index 2988957ba295..f6443c5c4984 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomIndexWriter.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomIndexWriter.java @@ -158,6 +158,7 @@ private RandomIndexWriter( } else { softDeletesRatio = 0d; } + w = mockIndexWriter(dir, c, r); config = w.getConfig(); flushAt = TestUtil.nextInt(r, 10, 1000); From 7573c0a47369dad2433f23e5bca62597920bcb78 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Fri, 15 Dec 2023 12:59:21 +0100 Subject: [PATCH 28/44] add test if field is already used --- .../apache/lucene/index/TestIndexWriter.java | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 233ef4a65e9b..d06973c23636 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -4952,4 +4952,29 @@ public void testIndexWithParentFieldIsCongruent() throws IOException { ex.getMessage()); } } + + public void testParentFieldIsAlreadyUsed() throws IOException { + try (Directory dir = newDirectory()) { + + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + try (IndexWriter writer = new IndexWriter(dir, iwc)) { + Document doc = new Document(); + doc.add(new StringField("parent", Integer.toString(1), Field.Store.YES)); + writer.addDocument(doc); + writer.commit(); + } + IllegalArgumentException iae = + expectThrows( + IllegalArgumentException.class, + () -> { + IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random())); + config.setParentField("parent"); + + new IndexWriter(dir, config); + }); + assertEquals( + "cannot configure [parent] as parent document field ; this index uses [parent] as non parent document field already", + iae.getMessage()); + } + } } From 3781c866e0df9eefe34e7d0d48129c2ca7cb2926 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Fri, 15 Dec 2023 20:00:59 +0100 Subject: [PATCH 29/44] add javadocs --- .../org/apache/lucene/index/FieldInfos.java | 37 ++++++++++++++----- .../lucene/index/IndexWriterConfig.java | 11 ++++++ .../lucene/index/LiveIndexWriterConfig.java | 2 + 3 files changed, 40 insertions(+), 10 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java index e3392de6e1df..4af945b35e76 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java @@ -33,6 +33,7 @@ import java.util.Map; import java.util.Objects; import java.util.Set; +import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.StreamSupport; import org.apache.lucene.util.ArrayUtil; @@ -184,17 +185,10 @@ public static FieldInfos getMergedFieldInfos(IndexReader reader) { return leaves.get(0).reader().getFieldInfos(); } else { final String softDeletesField = - leaves.stream() - .map(l -> l.reader().getFieldInfos().getSoftDeletesField()) - .filter(Objects::nonNull) - .findAny() - .orElse(null); + getAndValidateSpecialField( + "soft-deletes", leaves, r -> r.getFieldInfos().getSoftDeletesField()); final String parentField = - leaves.stream() - .map(l -> l.reader().getFieldInfos().getParentField()) - .filter(Objects::nonNull) - .findAny() - .orElse(null); + getAndValidateSpecialField("parent", leaves, r -> r.getFieldInfos().getParentField()); final Builder builder = new Builder(new FieldNumbers(softDeletesField, parentField)); for (final LeafReaderContext ctx : leaves) { for (FieldInfo fieldInfo : ctx.reader().getFieldInfos()) { @@ -205,6 +199,29 @@ public static FieldInfos getMergedFieldInfos(IndexReader reader) { } } + private static String getAndValidateSpecialField( + String type, List leaves, Function provider) { + boolean set = false; + String theField = null; + for (LeafReaderContext ctx : leaves) { + String field = provider.apply(ctx.reader()); + if (set && Objects.equals(field, theField) == false) { + throw new IllegalStateException( + "expected " + + type + + " field to be \"" + + theField + + " \" across all segments but found a segment with different field \"" + + field + + "\""); + } else { + theField = field; + set = true; + } + } + return theField; + } + /** Returns a set of names of fields that have a terms index. The order is undefined. */ public static Collection getIndexedFields(IndexReader reader) { return reader.leaves().stream() diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java index 5d28e991de08..728004db0148 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java @@ -546,6 +546,17 @@ public IndexWriterConfig setIndexWriterEventListener( return this; } + /** + * Sets the parent document field. If this optional property is set, IndexWriter will add an + * internal field to every root document added to the index writer. A document is considered a + * parent document if it's the last document in a document block indexed via {@link + * IndexWriter#addDocuments(Iterable)} or {@link IndexWriter#updateDocuments(Term, Iterable)} + * (Iterable)} and it's relatives. Additionally, all individual documents added via the single + * document methods ({@link IndexWriter#addDocuments(Iterable)} etc.) are also considered parent + * documents. This property is optional for all indices that don't use document blocks in + * combination with index sorting. In order to maintain the API guarantee that the document order + * of a block is not altered by the {@link IndexWriter} a marker for parent documents is required. + */ public IndexWriterConfig setParentField(String parentField) { this.parentField = parentField; return this; diff --git a/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java index 132873b041b4..d9ac4f37d84c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java +++ b/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java @@ -98,6 +98,7 @@ public class LiveIndexWriterConfig { /** The field names involved in the index sort */ protected Set indexSortFields = Collections.emptySet(); + /** parent document field */ protected String parentField = null; /** @@ -460,6 +461,7 @@ public IndexWriterEventListener getIndexWriterEventListener() { return eventListener; } + /** Returns the parent document field name if configured. */ public String getParentField() { return parentField; } From cc173526477407bf19be939fbe7c51c35594fc0b Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Fri, 15 Dec 2023 20:16:59 +0100 Subject: [PATCH 30/44] tidy --- .../apache/lucene/index/TestAddIndexes.java | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java index 5b839607cd35..482424a68f01 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -1890,28 +1890,28 @@ public void testAddIndicesWithBlocks() throws IOException { public void testSetDiagnostics() throws IOException { MergePolicy myMergePolicy = - new FilterMergePolicy(newLogMergePolicy(4)) { - @Override - public MergeSpecification findMerges(CodecReader... readers) throws IOException { - MergeSpecification spec = super.findMerges(readers); - if (spec == null) { - return null; - } - MergeSpecification newSpec = new MergeSpecification(); - for (OneMerge merge : spec.merges) { - newSpec.add( - new OneMerge(merge) { - @Override - public void setMergeInfo(SegmentCommitInfo info) { - super.setMergeInfo(info); - info.info.addDiagnostics( - Collections.singletonMap("merge_policy", "my_merge_policy")); - } - }); - } - return newSpec; - } - }; + new FilterMergePolicy(newLogMergePolicy(4)) { + @Override + public MergeSpecification findMerges(CodecReader... readers) throws IOException { + MergeSpecification spec = super.findMerges(readers); + if (spec == null) { + return null; + } + MergeSpecification newSpec = new MergeSpecification(); + for (OneMerge merge : spec.merges) { + newSpec.add( + new OneMerge(merge) { + @Override + public void setMergeInfo(SegmentCommitInfo info) { + super.setMergeInfo(info); + info.info.addDiagnostics( + Collections.singletonMap("merge_policy", "my_merge_policy")); + } + }); + } + return newSpec; + } + }; Directory sourceDir = newDirectory(); try (IndexWriter w = new IndexWriter(sourceDir, newIndexWriterConfig())) { Document doc = new Document(); @@ -1922,7 +1922,7 @@ public void setMergeInfo(SegmentCommitInfo info) { Directory targetDir = newDirectory(); try (IndexWriter w = - new IndexWriter(targetDir, newIndexWriterConfig().setMergePolicy(myMergePolicy))) { + new IndexWriter(targetDir, newIndexWriterConfig().setMergePolicy(myMergePolicy))) { w.addIndexes(codecReader); } @@ -1930,7 +1930,7 @@ public void setMergeInfo(SegmentCommitInfo info) { assertNotEquals(0, si.size()); for (SegmentCommitInfo sci : si) { assertEquals( - IndexWriter.SOURCE_ADDINDEXES_READERS, sci.info.getDiagnostics().get(IndexWriter.SOURCE)); + IndexWriter.SOURCE_ADDINDEXES_READERS, sci.info.getDiagnostics().get(IndexWriter.SOURCE)); assertEquals("my_merge_policy", sci.info.getDiagnostics().get("merge_policy")); } reader.close(); From 7087f2cd9bfb52db3cc471c60a9f3466af8fd9d3 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Fri, 15 Dec 2023 20:18:20 +0100 Subject: [PATCH 31/44] add import --- lucene/core/src/java/org/apache/lucene/index/IndexWriter.java | 1 + 1 file changed, 1 insertion(+) diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index 3ee576422cf0..1fcedb422aee 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -33,6 +33,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Objects; import java.util.Queue; import java.util.Set; import java.util.concurrent.ConcurrentLinkedQueue; From c122994f6d34747bc2ac4730b3a598973dab8450 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Fri, 15 Dec 2023 20:23:34 +0100 Subject: [PATCH 32/44] remove hard checks for same soft deletes field --- .../org/apache/lucene/index/FieldInfos.java | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java index 4af945b35e76..a5b13181a569 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java @@ -33,7 +33,6 @@ import java.util.Map; import java.util.Objects; import java.util.Set; -import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.StreamSupport; import org.apache.lucene.util.ArrayUtil; @@ -185,10 +184,12 @@ public static FieldInfos getMergedFieldInfos(IndexReader reader) { return leaves.get(0).reader().getFieldInfos(); } else { final String softDeletesField = - getAndValidateSpecialField( - "soft-deletes", leaves, r -> r.getFieldInfos().getSoftDeletesField()); - final String parentField = - getAndValidateSpecialField("parent", leaves, r -> r.getFieldInfos().getParentField()); + leaves.stream() + .map(l -> l.reader().getFieldInfos().getSoftDeletesField()) + .filter(Objects::nonNull) + .findAny() + .orElse(null); + final String parentField = getAndValidateParentField(leaves); final Builder builder = new Builder(new FieldNumbers(softDeletesField, parentField)); for (final LeafReaderContext ctx : leaves) { for (FieldInfo fieldInfo : ctx.reader().getFieldInfos()) { @@ -199,17 +200,14 @@ public static FieldInfos getMergedFieldInfos(IndexReader reader) { } } - private static String getAndValidateSpecialField( - String type, List leaves, Function provider) { + private static String getAndValidateParentField(List leaves) { boolean set = false; String theField = null; for (LeafReaderContext ctx : leaves) { - String field = provider.apply(ctx.reader()); + String field = ctx.reader().getFieldInfos().getParentField(); if (set && Objects.equals(field, theField) == false) { throw new IllegalStateException( - "expected " - + type - + " field to be \"" + "expected parent doc field to be \"" + theField + " \" across all segments but found a segment with different field \"" + field From a73e52bd3edebcf4aae41971a365cb485e134bcd Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 21 Dec 2023 16:21:18 +0100 Subject: [PATCH 33/44] apply feedback --- .../codecs/lucene94/Lucene94FieldInfosFormat.java | 6 ++++++ .../java/org/apache/lucene/index/CheckIndex.java | 2 +- .../lucene/index/DocumentsWriterPerThread.java | 10 +++++----- .../src/java/org/apache/lucene/index/FieldInfo.java | 2 +- .../java/org/apache/lucene/index/IndexingChain.java | 5 +++-- .../java/org/apache/lucene/index/MultiSorter.java | 13 ++++++++----- 6 files changed, 24 insertions(+), 14 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java index 960172d56893..c8c00e084b38 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java @@ -161,6 +161,11 @@ public FieldInfos read( boolean isParentField = format >= FORMAT_PARENT_FIELD ? (bits & PARENT_FIELD_FIELD) != 0 : false; + assert (bits & 0xE0) == 0 + : "unused bits are set \"" + Integer.toBinaryString(bits) + "\""; + assert format >= FORMAT_PARENT_FIELD || (bits & 0xF0) == 0 + : "parent field bit is set but shouldn't' \"" + Integer.toBinaryString(bits) + "\""; + final IndexOptions indexOptions = getIndexOptions(input, input.readByte()); // DV Types are packed in one byte @@ -380,6 +385,7 @@ public void write( // Codec header static final String CODEC_NAME = "Lucene94FieldInfos"; static final int FORMAT_START = 0; + // this doesn't actually change the file format but uses up one more bit an existing bit pattern static final int FORMAT_PARENT_FIELD = 1; static final int FORMAT_CURRENT = FORMAT_PARENT_FIELD; diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 54cdf6146a90..fe74b03c32cc 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -1208,7 +1208,6 @@ public static Status.IndexSortStatus testSort( break; } } - prevDoc = nextDoc; if (cmp > 0) { throw new CheckIndexException( "segment has indexSort=" @@ -1218,6 +1217,7 @@ public static Status.IndexSortStatus testSort( + " sorts after docID=" + nextDoc); } + prevDoc = nextDoc; } msg( infoStream, diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 588b41cd3da8..9d9246fda7ca 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -303,20 +303,20 @@ private Iterable addParentField( @Override public boolean hasNext() { - return first.hasNext() || additionalField != null; + return additionalField != null || first.hasNext(); } @Override public IndexableField next() { - if (first.hasNext()) { - IndexableField field = first.next(); - return field; - } if (additionalField != null) { IndexableField field = additionalField; additionalField = null; return field; } + if (first.hasNext()) { + IndexableField field = first.next(); + return field; + } throw new NoSuchElementException(); } }; diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java index e9f1eb442a87..3ecbe4cac807 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java @@ -213,7 +213,7 @@ public void checkConsistency() { if (softDeletesField && parentField) { throw new IllegalArgumentException( - "field can't be used as soft-deletes field and parent document fields (field: '" + "field can't be used as soft-deletes field and parent document field (field: '" + name + "')"); } diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index 7ccb8e3195fd..560807d69730 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -224,7 +224,8 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti } LeafReader docValuesReader = getDocValuesLeafReader(); - Function comparatorWrapper = in -> in; + Function comparatorWrapper = + Function.identity(); if (state.segmentInfo.getHasBlocks() && state.fieldInfos.getParentField() != null) { final DocIdSetIterator readerValues = @@ -244,7 +245,7 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti && state.fieldInfos.getParentField() == null && indexCreatedVersionMajor >= Version.LUCENE_10_0_0.major) { throw new CorruptIndexException( - "parent field is not set but the index has blocks. indexCreatedVersionMajor: " + "parent field is not set but the index has blocks and uses index sorting. indexCreatedVersionMajor: " + indexCreatedVersionMajor, "IndexingChain"); } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index 52b1f8d624d3..9a8e48e5d3eb 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -68,11 +68,14 @@ static MergeState.DocMap[] sort(Sort sort, List readers) throws IOE IndexSorter.ComparableProvider provider = providers[j]; providers[j] = docId -> provider.getAsComparableLong(parents.nextSetBit(docId)); } - assert metaData.hasBlocks() == false - || fieldInfos.getParentField() != null - || metaData.getCreatedVersionMajor() < Version.LUCENE_10_0_0.major - : "parent field is not set but the index has blocks. indexCreatedVersionMajor: " - + metaData.getCreatedVersionMajor(); + if (metaData.hasBlocks() + && fieldInfos.getParentField() == null + && metaData.getCreatedVersionMajor() >= Version.LUCENE_10_0_0.major) { + throw new CorruptIndexException( + "parent field is not set but the index has blocks and uses index sorting. indexCreatedVersionMajor: " + + metaData.getCreatedVersionMajor(), + "IndexingChain"); + } } reverseMuls[i] = fields[i].getReverse() ? -1 : 1; } From 502703ec69322a28d89cc793b8feb475f5bd2298 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Sun, 24 Dec 2023 12:47:17 +0100 Subject: [PATCH 34/44] apply feedback --- .../codecs/lucene94/Lucene94FieldInfosFormat.java | 13 +++++++++---- .../src/java/org/apache/lucene/index/FieldInfo.java | 10 +++++----- .../java/org/apache/lucene/index/FieldInfos.java | 6 ++++-- .../org/apache/lucene/index/IndexWriterConfig.java | 12 ++++++------ .../org/apache/lucene/index/TestIndexWriter.java | 3 ++- 5 files changed, 26 insertions(+), 18 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java index c8c00e084b38..97c05435b969 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java @@ -161,10 +161,15 @@ public FieldInfos read( boolean isParentField = format >= FORMAT_PARENT_FIELD ? (bits & PARENT_FIELD_FIELD) != 0 : false; - assert (bits & 0xE0) == 0 - : "unused bits are set \"" + Integer.toBinaryString(bits) + "\""; - assert format >= FORMAT_PARENT_FIELD || (bits & 0xF0) == 0 - : "parent field bit is set but shouldn't' \"" + Integer.toBinaryString(bits) + "\""; + if ((bits & 0xE0) != 0) { + throw new CorruptIndexException( + "unused bits are set \"" + Integer.toBinaryString(bits) + "\"", input); + } + if (format < FORMAT_PARENT_FIELD && (bits & 0xF0) != 0) { + throw new CorruptIndexException( + "parent field bit is set but shouldn't \"" + Integer.toBinaryString(bits) + "\"", + input); + } final IndexOptions indexOptions = getIndexOptions(input, input.readByte()); diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java index 3ecbe4cac807..03f927a552d1 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java @@ -63,7 +63,7 @@ public final class FieldInfo { // whether this field is used as the soft-deletes field private final boolean softDeletesField; - private final boolean parentField; + private final boolean isParentField; /** * Sole constructor. @@ -87,7 +87,7 @@ public FieldInfo( VectorEncoding vectorEncoding, VectorSimilarityFunction vectorSimilarityFunction, boolean softDeletesField, - boolean parentField) { + boolean isParentField) { this.name = Objects.requireNonNull(name); this.number = number; this.docValuesType = @@ -114,7 +114,7 @@ public FieldInfo( this.vectorEncoding = vectorEncoding; this.vectorSimilarityFunction = vectorSimilarityFunction; this.softDeletesField = softDeletesField; - this.parentField = parentField; + this.isParentField = isParentField; this.checkConsistency(); } @@ -211,7 +211,7 @@ public void checkConsistency() { "vectorDimension must be >=0; got " + vectorDimension + " (field: '" + name + "')"); } - if (softDeletesField && parentField) { + if (softDeletesField && isParentField) { throw new IllegalArgumentException( "field can't be used as soft-deletes field and parent document field (field: '" + name @@ -650,6 +650,6 @@ public boolean isSoftDeletesField() { * {@link IndexWriterConfig#setParentField(String)} */ public boolean isParentField() { - return parentField; + return isParentField; } } diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java index a5b13181a569..8fbe68326332 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java @@ -400,7 +400,9 @@ static final class FieldNumbers { && parentFieldName != null && parentFieldName.equals(softDeletesFieldName)) { throw new IllegalArgumentException( - "parent document and soft-deletes field can't be the same field"); + "parent document and soft-deletes field can't be the same field \"" + + parentFieldName + + "\""); } } @@ -501,7 +503,7 @@ private void verifyParentFieldName(String fieldName, boolean isParentField) { + fieldName + "] as parent document field already"); } - } else if (fieldName.equals(parentFieldName)) { + } else if (fieldName.equals(parentFieldName)) { // else case -- isParentField == false throw new IllegalArgumentException( "cannot configure [" + parentFieldName diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java index 728004db0148..e20a6371ef2c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java @@ -550,12 +550,12 @@ public IndexWriterConfig setIndexWriterEventListener( * Sets the parent document field. If this optional property is set, IndexWriter will add an * internal field to every root document added to the index writer. A document is considered a * parent document if it's the last document in a document block indexed via {@link - * IndexWriter#addDocuments(Iterable)} or {@link IndexWriter#updateDocuments(Term, Iterable)} - * (Iterable)} and it's relatives. Additionally, all individual documents added via the single - * document methods ({@link IndexWriter#addDocuments(Iterable)} etc.) are also considered parent - * documents. This property is optional for all indices that don't use document blocks in - * combination with index sorting. In order to maintain the API guarantee that the document order - * of a block is not altered by the {@link IndexWriter} a marker for parent documents is required. + * IndexWriter#addDocuments(Iterable)} or {@link IndexWriter#updateDocuments(Term, Iterable)} and + * its relatives. Additionally, all individual documents added via the single document methods + * ({@link IndexWriter#addDocuments(Iterable)} etc.) are also considered parent documents. This + * property is optional for all indices that don't use document blocks in combination with index + * sorting. In order to maintain the API guarantee that the document order of a block is not + * altered by the {@link IndexWriter} a marker for parent documents is required. */ public IndexWriterConfig setParentField(String parentField) { this.parentField = parentField; diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index d06973c23636..759009abd43e 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -4903,7 +4903,8 @@ public void testParentAndSoftDeletesAreTheSame() throws IOException { expectThrows( IllegalArgumentException.class, () -> new IndexWriter(dir, indexWriterConfig)); assertEquals( - "parent document and soft-deletes field can't be the same field", iae.getMessage()); + "parent document and soft-deletes field can't be the same field \"foo\"", + iae.getMessage()); } } From 6f114365450b2391dc0c4b05969ce614f6b443a4 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Wed, 27 Dec 2023 17:13:52 +0100 Subject: [PATCH 35/44] beef up testing and revert to -1 as a default value isntead of the number of children --- .../index/DocumentsWriterPerThread.java | 2 - .../apache/lucene/index/TestIndexSorting.java | 137 +++++++++++++++++- 2 files changed, 136 insertions(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 9d9246fda7ca..fd1364508dcb 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -249,8 +249,6 @@ long updateDocuments( Iterable doc = iterator.next(); if (parentField != null) { if (iterator.hasNext() == false) { - int numChildren = numDocsInRAM - docsInRamBefore; - parentField.getDelegate().setLongValue(numChildren); doc = addParentField(doc, parentField); } } else if (segmentInfo.getIndexSort() != null diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 19c093b1b16f..0a7db5ce82ad 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -2121,6 +2121,10 @@ public void testBadAddIndexes() throws Exception { public void testAddIndexes(boolean withDeletes, boolean useReaders) throws Exception { Directory dir = newDirectory(); IndexWriterConfig iwc1 = newIndexWriterConfig(); + boolean useParent = rarely(); + if (useParent) { + iwc1.setParentField("___parent"); + } Sort indexSort = new Sort( new SortField("foo", SortField.Type.LONG), new SortField("bar", SortField.Type.LONG)); @@ -2153,6 +2157,9 @@ public void testAddIndexes(boolean withDeletes, boolean useReaders) throws Excep } else { iwc.setIndexSort(indexSort); } + if (useParent) { + iwc.setParentField("___parent"); + } IndexWriter w2 = new IndexWriter(dir2, iwc); if (useReaders) { @@ -3291,7 +3298,7 @@ public void testIndexSortWithBlocks() throws IOException { int doc; int expectedDocID = 2; while ((doc = parentDISI.nextDoc()) != NO_MORE_DOCS) { - assertEquals(2, parentDISI.longValue()); + assertEquals(-1, parentDISI.longValue()); assertEquals(expectedDocID, doc); int id = ids.nextDoc(); long child1ID = ids.longValue(); @@ -3316,4 +3323,132 @@ public void testIndexSortWithBlocks() throws IOException { } } } + + @SuppressWarnings("fallthrough") + public void testMixRandomDocumentsWithBlocks() throws IOException { + try (Directory dir = newDirectory()) { + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + AssertingNeedsIndexSortCodec codec = new AssertingNeedsIndexSortCodec(); + iwc.setCodec(codec); + String parentField = "parent"; + Sort indexSort = new Sort(new SortField("foo", SortField.Type.INT)); + iwc.setIndexSort(indexSort); + iwc.setParentField(parentField); + RandomIndexWriter randomIndexWriter = new RandomIndexWriter(random(), dir, iwc); + int numDocs = random().nextInt(100, 1000); + for (int i = 0; i < numDocs; i++) { + if (rarely()) { + randomIndexWriter.deleteDocuments(new Term("id", "" + random().nextInt(0, i + 1))); + } + List docs = new ArrayList<>(); + switch (random().nextInt(100) % 5) { + case 4: + Document child3 = new Document(); + child3.add(new StringField("id", Integer.toString(i), Store.YES)); + child3.add(new NumericDocValuesField("type", 2)); + child3.add(new NumericDocValuesField("child_ord", 3)); + child3.add(new NumericDocValuesField("foo", random().nextInt())); + docs.add(child3); + case 3: + Document child2 = new Document(); + child2.add(new StringField("id", Integer.toString(i), Store.YES)); + child2.add(new NumericDocValuesField("type", 2)); + child2.add(new NumericDocValuesField("child_ord", 2)); + child2.add(new NumericDocValuesField("foo", random().nextInt())); + docs.add(child2); + case 2: + Document child1 = new Document(); + child1.add(new StringField("id", Integer.toString(i), Store.YES)); + child1.add(new NumericDocValuesField("type", 2)); + child1.add(new NumericDocValuesField("child_ord", 1)); + child1.add(new NumericDocValuesField("foo", random().nextInt())); + docs.add(child1); + case 1: + Document root = new Document(); + root.add(new StringField("id", Integer.toString(i), Store.YES)); + root.add(new NumericDocValuesField("type", 1)); + root.add(new NumericDocValuesField("num_children", docs.size())); + root.add(new NumericDocValuesField("foo", random().nextInt())); + docs.add(root); + randomIndexWriter.addDocuments(docs); + break; + case 0: + Document single = new Document(); + single.add(new StringField("id", Integer.toString(i), Store.YES)); + single.add(new NumericDocValuesField("type", 0)); + single.add(new NumericDocValuesField("foo", random().nextInt())); + randomIndexWriter.addDocument(single); + } + if (rarely()) { + randomIndexWriter.forceMerge(1); + } + randomIndexWriter.commit(); + } + + randomIndexWriter.close(); + try (DirectoryReader reader = DirectoryReader.open(dir)) { + for (LeafReaderContext ctx : reader.leaves()) { + LeafReader leaf = ctx.reader(); + NumericDocValues parentDISI = leaf.getNumericDocValues(parentField); + assertNotNull(parentDISI); + NumericDocValues type = leaf.getNumericDocValues("type"); + NumericDocValues childOrd = leaf.getNumericDocValues("child_ord"); + NumericDocValues numChildren = leaf.getNumericDocValues("num_children"); + int numCurrentChildren = 0; + int totalPendingChildren = 0; + String childId = null; + for (int i = 0; i < leaf.maxDoc(); i++) { + if (leaf.getLiveDocs() == null || leaf.getLiveDocs().get(i)) { + assertTrue(type.advanceExact(i)); + int typeValue = (int) type.longValue(); + switch (typeValue) { + case 2: + assertFalse(parentDISI.advanceExact(i)); + assertTrue(childOrd.advanceExact(i)); + if (numCurrentChildren == 0) { // first child + childId = leaf.storedFields().document(i).get("id"); + totalPendingChildren = (int) childOrd.longValue() - 1; + } else { + assertNotNull(childId); + assertEquals(totalPendingChildren--, childOrd.longValue()); + assertEquals(childId, leaf.storedFields().document(i).get("id")); + } + numCurrentChildren++; + break; + case 1: + assertTrue(parentDISI.advanceExact(i)); + assertEquals(-1, parentDISI.longValue()); + if (childOrd != null) { + assertFalse(childOrd.advanceExact(i)); + } + assertTrue(numChildren.advanceExact(i)); + assertEquals(0, totalPendingChildren); + assertEquals(numCurrentChildren, numChildren.longValue()); + if (numCurrentChildren > 0) { + assertEquals(childId, leaf.storedFields().document(i).get("id")); + } else { + assertNull(childId); + } + numCurrentChildren = 0; + childId = null; + break; + case 0: + assertEquals(-1, parentDISI.longValue()); + assertTrue(parentDISI.advanceExact(i)); + if (childOrd != null) { + assertFalse(childOrd.advanceExact(i)); + } + if (numChildren != null) { + assertFalse(numChildren.advanceExact(i)); + } + break; + default: + fail(); + } + } + } + } + } + } + } } From 023d242d18ed02715e825a878c41b65913afbbbd Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Tue, 2 Jan 2024 13:25:01 +0100 Subject: [PATCH 36/44] add changes --- lucene/CHANGES.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 29f0bba34494..539e053ff5b0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -90,6 +90,11 @@ New Features * LUCENE-10626 Hunspell: add tools to aid dictionary editing: analysis introspection, stem expansion and stem/flag suggestion (Peter Gromov) +* GITHUB#12829: IndexWriter now preserves document blocks indexed via IndexWriter#addDocuments + et.al. when index sorting is configured. Document blocks are maintained alongside their + parent documents during sort and merge. IndexWriterConfig requires a parent field to be specified + if index sorting is used together with documents blocks. (Simon Willnauer) + Improvements --------------------- @@ -131,6 +136,13 @@ Bug Fixes * GITHUB#12878: Fix the declared Exceptions of Expression#evaluate() to match those of DoubleValues#doubleValue(). (Uwe Schindler) +Changes in Backwards Compatibility Policy +----------------------------------------- + +* GITHUB#12829: IndexWriter#addDocuments et.al. now require a parent field name to be + specified in IndexWriterConfig is documents blocks are indexed and index time sorting + is configured. (Simon Willnauer) + Other --------------------- From d36dde5498cce3f1ba202dea6b9baa51674a6f35 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Wed, 3 Jan 2024 11:44:37 +0100 Subject: [PATCH 37/44] fix polishing comments --- .../lucene/index/DocumentsWriterPerThread.java | 13 ++++++------- .../java/org/apache/lucene/index/FieldInfos.java | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index fd1364508dcb..f26671809a28 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -137,11 +137,11 @@ void abort() throws IOException { private final ReentrantLock lock = new ReentrantLock(); private int[] deleteDocIDs = new int[0]; private int numDeletedDocIds = 0; - private final int indexVersionCreated; + private final int indexMajorVersionCreated; private final IndexingChain.ReservedField parentField; DocumentsWriterPerThread( - int indexVersionCreated, + int indexMajorVersionCreated, String segmentName, Directory directoryOrig, Directory directory, @@ -150,7 +150,7 @@ void abort() throws IOException { FieldInfos.Builder fieldInfos, AtomicLong pendingNumDocs, boolean enableTestPoints) { - this.indexVersionCreated = indexVersionCreated; + this.indexMajorVersionCreated = indexMajorVersionCreated; this.directory = new TrackingDirectoryWrapper(directory); this.fieldInfos = fieldInfos; this.indexWriterConfig = indexWriterConfig; @@ -189,7 +189,7 @@ void abort() throws IOException { this.enableTestPoints = enableTestPoints; indexingChain = new IndexingChain( - indexVersionCreated, + indexMajorVersionCreated, segmentInfo, this.directory, fieldInfos, @@ -253,7 +253,7 @@ long updateDocuments( } } else if (segmentInfo.getIndexSort() != null && iterator.hasNext() - && indexVersionCreated >= Version.LUCENE_10_0_0.major) { + && indexMajorVersionCreated >= Version.LUCENE_10_0_0.major) { // sort is configured but parent field is missing, yet we have a doc-block // yet we must not fail if this index was created in an earlier version where this // behavior was permitted. @@ -312,8 +312,7 @@ public IndexableField next() { return field; } if (first.hasNext()) { - IndexableField field = first.next(); - return field; + return first.next(); } throw new NoSuchElementException(); } diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java index 8fbe68326332..f6bfddf4bd4b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java @@ -501,7 +501,7 @@ private void verifyParentFieldName(String fieldName, boolean isParentField) { + parentFieldName + "] as parent document field ; this index uses [" + fieldName - + "] as parent document field already"); + + "] as parent document field already"); } } else if (fieldName.equals(parentFieldName)) { // else case -- isParentField == false throw new IllegalArgumentException( From 4c6e152b637b759284d862d88c3602aba2b43610 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Wed, 3 Jan 2024 12:00:06 +0100 Subject: [PATCH 38/44] add test for single doc blocks don't trigger hasBlocks --- .../apache/lucene/index/TestIndexWriter.java | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 8d267237067d..b11ae9942997 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -1755,6 +1755,44 @@ public void testHasBlocksMergeFullyDelSegments() throws IOException { } } + public void testSingleDocsDoNotTriggerHasBlocks() throws IOException { + try (Directory dir = newDirectory()) { + try (IndexWriter w = + new IndexWriter( + dir, + new IndexWriterConfig(new MockAnalyzer(random())) + .setMaxBufferedDocs(Integer.MAX_VALUE) + .setRAMBufferSizeMB(100))) { + + int docs = random().nextInt(1, 100); + for (int i = 0; i < docs; i++) { + Document doc = new Document(); + doc.add(new StringField("id", "" + i, Field.Store.NO)); + w.addDocuments(Arrays.asList(doc)); + } + w.commit(); + SegmentInfos si = w.cloneSegmentInfos(); + assertEquals(1, si.size()); + assertFalse(si.asList().get(0).info.getHasBlocks()); + + Document doc = new Document(); + doc.add(new StringField("id", "XXX", Field.Store.NO)); + w.addDocuments(Arrays.asList(doc, doc)); + w.commit(); + si = w.cloneSegmentInfos(); + assertEquals(2, si.size()); + assertFalse(si.asList().get(0).info.getHasBlocks()); + assertTrue(si.asList().get(1).info.getHasBlocks()); + w.forceMerge(1); + + w.commit(); + si = w.cloneSegmentInfos(); + assertEquals(1, si.size()); + assertTrue(si.asList().get(0).info.getHasBlocks()); + } + } + } + public void testCarryOverHasBlocks() throws Exception { try (Directory dir = newDirectory()) { try (IndexWriter w = From ff8f561587f19413e7c1dd5e9fdd3394c96c7829 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Wed, 3 Jan 2024 12:00:37 +0100 Subject: [PATCH 39/44] remove extra whitespace in error msg --- .../core/src/test/org/apache/lucene/index/TestAddIndexes.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java index 482424a68f01..13e11406be4d 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -1966,7 +1966,7 @@ public void testIllegalParentDocChange() throws Exception { }) .getMessage(); assertEquals( - "cannot configure [foo] as parent document field ; this index uses [foobar] as parent document field already", + "cannot configure [foo] as parent document field ; this index uses [foobar] as parent document field already", message); message = From 0334d257a5b86b6da55cc2bb8886b6958ba9baa6 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Wed, 3 Jan 2024 12:00:58 +0100 Subject: [PATCH 40/44] remove extra whitespace in error msg --- .../core/src/test/org/apache/lucene/index/TestAddIndexes.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java index 13e11406be4d..8fc2eac88d28 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -1977,7 +1977,7 @@ public void testIllegalParentDocChange() throws Exception { }) .getMessage(); assertEquals( - "cannot configure [foo] as parent document field ; this index uses [foobar] as parent document field already", + "cannot configure [foo] as parent document field ; this index uses [foobar] as parent document field already", message); Directory dir3 = newDirectory(); From 25e6da47f989675d310de5f319ff38088589b4d4 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Wed, 3 Jan 2024 12:21:00 +0100 Subject: [PATCH 41/44] add test and docs to make mike not staring at this forever --- .../org/apache/lucene/index/FieldInfos.java | 8 ++-- .../apache/lucene/index/TestAddIndexes.java | 40 +++++++++++++++++++ 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java index f6bfddf4bd4b..e2c8795ae952 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java @@ -503,13 +503,15 @@ private void verifyParentFieldName(String fieldName, boolean isParentField) { + fieldName + "] as parent document field already"); } - } else if (fieldName.equals(parentFieldName)) { // else case -- isParentField == false + } else if (fieldName.equals(parentFieldName)) { // isParent == false + // this would be the case if the current index has a parent field that is + // not a parent field in the incoming index (think addIndices) throw new IllegalArgumentException( "cannot configure [" + parentFieldName - + "] as parent document field ; this index uses [" + + "] as non parent document field ; this index uses [" + fieldName - + "] as non parent document field already"); + + "] as parent document field already"); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java index 8fc2eac88d28..382cbd7bf548 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -1990,4 +1990,44 @@ public void testIllegalParentDocChange() throws Exception { IOUtils.close(r1, dir1, w2, dir2, w3, dir3); } + + public void testIllegalNonParentField() throws IOException { + Directory dir1 = newDirectory(); + IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random())); + RandomIndexWriter w1 = new RandomIndexWriter(random(), dir1, iwc1); + Document parent = new Document(); + parent.add(new StringField("foo", "XXX", Field.Store.NO)); + w1.addDocument(parent); + w1.close(); + + Directory dir2 = newDirectory(); + IndexWriterConfig iwc2 = newIndexWriterConfig(new MockAnalyzer(random())); + iwc2.setParentField("foo"); + RandomIndexWriter w2 = new RandomIndexWriter(random(), dir2, iwc2); + + IndexReader r1 = DirectoryReader.open(dir1); + String message = + expectThrows( + IllegalArgumentException.class, + () -> { + w2.addIndexes((SegmentReader) getOnlyLeafReader(r1)); + }) + .getMessage(); + assertEquals( + "cannot configure [foo] as non parent document field ; this index uses [foo] as parent document field already", + message); + + message = + expectThrows( + IllegalArgumentException.class, + () -> { + w2.addIndexes(dir1); + }) + .getMessage(); + assertEquals( + "cannot configure [foo] as non parent document field ; this index uses [foo] as parent document field already", + message); + + IOUtils.close(r1, dir1, w2, dir2); + } } From 363ce3d52b638c90356aab05a03f073cf14e64b3 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Wed, 3 Jan 2024 12:34:53 +0100 Subject: [PATCH 42/44] apply review comments --- lucene/CHANGES.txt | 13 ++++++------- lucene/MIGRATE.md | 8 ++++++++ .../java/org/apache/lucene/index/CheckIndex.java | 10 ++++++---- .../lucene/index/DocumentsWriterPerThread.java | 2 +- .../org/apache/lucene/index/TestIndexWriter.java | 4 ++-- 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 539e053ff5b0..eb62aed59623 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -90,10 +90,10 @@ New Features * LUCENE-10626 Hunspell: add tools to aid dictionary editing: analysis introspection, stem expansion and stem/flag suggestion (Peter Gromov) -* GITHUB#12829: IndexWriter now preserves document blocks indexed via IndexWriter#addDocuments - et.al. when index sorting is configured. Document blocks are maintained alongside their - parent documents during sort and merge. IndexWriterConfig requires a parent field to be specified - if index sorting is used together with documents blocks. (Simon Willnauer) +* GITHUB#12829: For indices newly created as of 10.0.0 onwards, IndexWriter preserves document blocks indexed via + IndexWriter#addDocuments or IndexWriter#updateDocuments also when index sorting is configured. Document blocks are + maintained alongside their parent documents during sort and merge. IndexWriterConfig now requires a parent field to be + specified if index sorting is used together with documents blocks. (Simon Willnauer) Improvements --------------------- @@ -139,9 +139,8 @@ Bug Fixes Changes in Backwards Compatibility Policy ----------------------------------------- -* GITHUB#12829: IndexWriter#addDocuments et.al. now require a parent field name to be - specified in IndexWriterConfig is documents blocks are indexed and index time sorting - is configured. (Simon Willnauer) +* GITHUB#12829: IndexWriter#addDocuments or IndexWriter#updateDocuments now require a parent field name to be + specified in IndexWriterConfig is documents blocks are indexed and index time sorting is configured. (Simon Willnauer) Other --------------------- diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md index 75f7c5b4eeba..698548fdfd71 100644 --- a/lucene/MIGRATE.md +++ b/lucene/MIGRATE.md @@ -19,6 +19,14 @@ ## Migration from Lucene 9.x to Lucene 10.0 +### IndexWriter requires a parent document field in order to use index sorting with document blocks (GITHUB#12829) + +For indices newly created as of 10.0.0 onwards, IndexWriter preserves document blocks indexed via +IndexWriter#addDocuments or IndexWriter#updateDocuments when index sorting is configured. Document blocks are maintained +alongside their parent documents during sort and merge. The internally used parent field must be configured in +IndexWriterConfig only if index sorting is used together with documents blocks. See `IndexWriterConfig#setParendField` +for reference. + ### Minor API changes in MatchHighlighter and MatchRegionRetriever. (GITHUB#12881) The API of interfaces for accepting highlights has changed to allow performance improvements. Look at the issue and the PR diff to get diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index cdb346ebc029..00d79fa8934a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -1186,10 +1186,12 @@ public static Status.IndexSortStatus testSort( "parent field is not set but the index has document blocks and was created with version: " + metaData.getCreatedVersionMajor()); } - final DocIdSetIterator iter = - metaData.hasBlocks() && fieldInfos.getParentField() != null - ? reader.getNumericDocValues(fieldInfos.getParentField()) - : DocIdSetIterator.all(reader.maxDoc()); + final DocIdSetIterator iter; + if (metaData.hasBlocks() && fieldInfos.getParentField() != null) { + iter = reader.getNumericDocValues(fieldInfos.getParentField()); + } else { + iter = DocIdSetIterator.all(reader.maxDoc()); + } int prevDoc = iter.nextDoc(); int nextDoc; while ((nextDoc = iter.nextDoc()) != NO_MORE_DOCS) { diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index f26671809a28..4a0f2f71666d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -258,7 +258,7 @@ long updateDocuments( // yet we must not fail if this index was created in an earlier version where this // behavior was permitted. throw new IllegalArgumentException( - "a parent field must be set in order to use document blocks with index sorting; see IndexWriterConfig#getParentField"); + "a parent field must be set in order to use document blocks with index sorting; see IndexWriterConfig#setParentField"); } // Even on exception, the document is still added (but marked diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index b11ae9942997..b7efce865f38 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -4911,7 +4911,7 @@ public void testIndexWithParentFieldIsCongruent() throws IOException { new IndexWriter(dir, config); }); assertEquals( - "cannot configure [someOtherField] as parent document field ; this index uses [parent] as parent document field already", + "cannot configure [someOtherField] as parent document field ; this index uses [parent] as parent document field already", ex.getMessage()); ex = expectThrows( @@ -4946,7 +4946,7 @@ public void testParentFieldIsAlreadyUsed() throws IOException { new IndexWriter(dir, config); }); assertEquals( - "cannot configure [parent] as parent document field ; this index uses [parent] as non parent document field already", + "cannot configure [parent] as non parent document field ; this index uses [parent] as parent document field already", iae.getMessage()); } } From 4e829cff90ca89cb46161d8d996246f642a084c2 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Wed, 3 Jan 2024 13:57:11 +0100 Subject: [PATCH 43/44] fix test --- .../core/src/test/org/apache/lucene/index/TestIndexSorting.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 0a7db5ce82ad..7199e1a86058 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -3192,7 +3192,7 @@ public void testParentFieldNotConfigured() throws IOException { writer.addDocuments(Arrays.asList(new Document(), new Document())); }); assertEquals( - "a parent field must be set in order to use document blocks with index sorting; see IndexWriterConfig#getParentField", + "a parent field must be set in order to use document blocks with index sorting; see IndexWriterConfig#setParentField", ex.getMessage()); } } From 2980f57194c0b61d5fd9f29bcadbf91d13dd489b Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Tue, 9 Jan 2024 12:42:54 +0100 Subject: [PATCH 44/44] change wording --- lucene/CHANGES.txt | 2 +- .../org/apache/lucene/index/FieldInfos.java | 20 +++++++++---------- .../apache/lucene/index/TestAddIndexes.java | 8 ++++---- .../apache/lucene/index/TestIndexWriter.java | 6 +++--- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index eb62aed59623..196d728b2bf9 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -93,7 +93,7 @@ New Features * GITHUB#12829: For indices newly created as of 10.0.0 onwards, IndexWriter preserves document blocks indexed via IndexWriter#addDocuments or IndexWriter#updateDocuments also when index sorting is configured. Document blocks are maintained alongside their parent documents during sort and merge. IndexWriterConfig now requires a parent field to be - specified if index sorting is used together with documents blocks. (Simon Willnauer) + specified if index sorting is used together with document blocks. (Simon Willnauer) Improvements --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java index e2c8795ae952..5d483f7ece10 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java @@ -492,26 +492,26 @@ private void verifyParentFieldName(String fieldName, boolean isParentField) { if (isParentField) { if (parentFieldName == null) { throw new IllegalArgumentException( - "this index has [" + "can't add field [" + fieldName - + "] as parent document field already but parent document field is not configured in IWC"); + + "] as parent document field; this IndexWriter has no parent document field configured"); } else if (fieldName.equals(parentFieldName) == false) { throw new IllegalArgumentException( - "cannot configure [" - + parentFieldName - + "] as parent document field ; this index uses [" + "can't add field [" + fieldName - + "] as parent document field already"); + + "] as parent document field; this IndexWriter is configured with [" + + parentFieldName + + "] as parent document field"); } } else if (fieldName.equals(parentFieldName)) { // isParent == false // this would be the case if the current index has a parent field that is // not a parent field in the incoming index (think addIndices) throw new IllegalArgumentException( - "cannot configure [" - + parentFieldName - + "] as non parent document field ; this index uses [" + "can't add [" + fieldName - + "] as parent document field already"); + + "] as non parent document field; this IndexWriter is configured with [" + + parentFieldName + + "] as parent document field"); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java index 382cbd7bf548..152dcdddc554 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -1966,7 +1966,7 @@ public void testIllegalParentDocChange() throws Exception { }) .getMessage(); assertEquals( - "cannot configure [foo] as parent document field ; this index uses [foobar] as parent document field already", + "can't add field [foobar] as parent document field; this IndexWriter is configured with [foo] as parent document field", message); message = @@ -1977,7 +1977,7 @@ public void testIllegalParentDocChange() throws Exception { }) .getMessage(); assertEquals( - "cannot configure [foo] as parent document field ; this index uses [foobar] as parent document field already", + "can't add field [foobar] as parent document field; this IndexWriter is configured with [foo] as parent document field", message); Directory dir3 = newDirectory(); @@ -2014,7 +2014,7 @@ public void testIllegalNonParentField() throws IOException { }) .getMessage(); assertEquals( - "cannot configure [foo] as non parent document field ; this index uses [foo] as parent document field already", + "can't add [foo] as non parent document field; this IndexWriter is configured with [foo] as parent document field", message); message = @@ -2025,7 +2025,7 @@ public void testIllegalNonParentField() throws IOException { }) .getMessage(); assertEquals( - "cannot configure [foo] as non parent document field ; this index uses [foo] as parent document field already", + "can't add [foo] as non parent document field; this IndexWriter is configured with [foo] as parent document field", message); IOUtils.close(r1, dir1, w2, dir2); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index b7efce865f38..c5e95d40b06c 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -4911,7 +4911,7 @@ public void testIndexWithParentFieldIsCongruent() throws IOException { new IndexWriter(dir, config); }); assertEquals( - "cannot configure [someOtherField] as parent document field ; this index uses [parent] as parent document field already", + "can't add field [parent] as parent document field; this IndexWriter is configured with [someOtherField] as parent document field", ex.getMessage()); ex = expectThrows( @@ -4921,7 +4921,7 @@ public void testIndexWithParentFieldIsCongruent() throws IOException { new IndexWriter(dir, config); }); assertEquals( - "this index has [parent] as parent document field already but parent document field is not configured in IWC", + "can't add field [parent] as parent document field; this IndexWriter has no parent document field configured", ex.getMessage()); } } @@ -4946,7 +4946,7 @@ public void testParentFieldIsAlreadyUsed() throws IOException { new IndexWriter(dir, config); }); assertEquals( - "cannot configure [parent] as non parent document field ; this index uses [parent] as parent document field already", + "can't add [parent] as non parent document field; this IndexWriter is configured with [parent] as parent document field", iae.getMessage()); } }