Skip to content

Commit

Permalink
Add support for index sorting with document blocks (#12829)
Browse files Browse the repository at this point in the history
Today index sorting will most likely break document blocks added with `IndexWriter#addDocuments(...)` and `#updateDocuments(...)` since the index sorter has no indication of what documents are part of a block. This change automatically adds a marker field to parent documents if configured in `IWC`. These marker documents are optional unless document blocks are indexed and index sorting is configured. In this case indexing blocks will fail unless a parent field is configured. Index sorting will preserve document blocks during sort. Documents within a block will not be reordered by the sorting algorithm and will sort alongside their parent documents.

Relates to #12711
  • Loading branch information
s1monw committed Jan 17, 2024
1 parent 00e2fe6 commit 0aa8891
Show file tree
Hide file tree
Showing 47 changed files with 1,106 additions and 91 deletions.
6 changes: 6 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ New Features
better-scoring nodes are available, or the best candidate is below a score of `traversalSimilarity` in the lowest
level. (Aditya Prakash, Kaival Parikh)

* GITHUB#12829: For indices newly created as of 9.10.0 onwards, IndexWriter preserves document blocks indexed via
IndexWriter#addDocuments or IndexWriter#updateDocuments also when index sorting is configured. Document blocks are
maintained alongside their parent documents during sort and merge. IndexWriterConfig accepts a parent field that is used
  to maintain block orders if index sorting is used. Note, this is fully optional in Lucene 9.x, while it will be mandatory for
indices that use document blocks together with index sorting as of 10.0.0. (Simon Willnauer)

Improvements
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,8 @@ private FieldInfo[] readFieldInfos(IndexInput input, int version) throws IOExcep
0,
VectorEncoding.FLOAT32,
VectorSimilarityFunction.EUCLIDEAN,
isSoftDeletesField);
isSoftDeletesField,
false);
} catch (IllegalStateException e) {
throw new CorruptIndexException(
"invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,8 @@ public FieldInfos read(
vectorDimension,
VectorEncoding.FLOAT32,
vectorDistFunc,
isSoftDeletesField);
isSoftDeletesField,
false);
infos[i].checkConsistency();
} catch (IllegalStateException e) {
throw new CorruptIndexException(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,9 @@ protected Version[] getVersions() {
/** Returns the read/write back-compat codec under test. */
protected Codec getCodec() {
return new Lucene84RWCodec();
}

// Disable document-block support for this back-compat test — presumably the
// Lucene84 format predates the hasBlocks flag; confirm against the codec.
@Override
protected boolean supportsHasBlocks() {
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,9 @@ protected Version[] getVersions() {
/** Returns the read/write back-compat codec under test. */
protected Codec getCodec() {
return new Lucene87RWCodec();
}

// Disable document-block support for this back-compat test — presumably the
// Lucene87 format predates the hasBlocks flag; confirm against the codec.
@Override
protected boolean supportsHasBlocks() {
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,9 @@ protected Version[] getVersions() {
/** Returns the read/write back-compat codec under test. */
protected Codec getCodec() {
return new Lucene90RWCodec();
}

// Disable document-block support for this back-compat test — presumably the
// Lucene90 format predates the hasBlocks flag; confirm against the codec.
@Override
protected boolean supportsHasBlocks() {
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.FieldExistsQuery;
Expand Down Expand Up @@ -2187,6 +2189,83 @@ public void testSortedIndex() throws Exception {
}
}

/**
 * Verifies that document blocks appended via {@link IndexWriter#addDocuments} to a
 * pre-existing sorted index are kept intact: after random flushes and an optional
 * force-merge, the two child documents of each block must remain adjacent and be
 * immediately followed by their parent document in docID order.
 */
public void testSortedIndexAddDocBlocks() throws Exception {
  for (String name : oldSortedNames) {
    Path path = createTempDir("sorted");
    InputStream resource = TestBackwardsCompatibility.class.getResourceAsStream(name + ".zip");
    // Fixed duplicated word in the failure message ("Sorted index index").
    assertNotNull("Sorted index " + name + " not found", resource);
    TestUtil.unzip(resource, path);

    try (Directory dir = newFSDirectory(path)) {
      // Grab the index sort from the existing single-segment index so the
      // appending writer preserves it.
      final Sort sort;
      try (DirectoryReader reader = DirectoryReader.open(dir)) {
        assertEquals(1, reader.leaves().size());
        sort = reader.leaves().get(0).reader().getMetaData().getSort();
        assertNotNull(sort);
        searchExampleIndex(reader);
      }
      // open writer
      try (IndexWriter writer =
          new IndexWriter(
              dir,
              newIndexWriterConfig(new MockAnalyzer(random()))
                  .setOpenMode(OpenMode.APPEND)
                  .setIndexSort(sort)
                  .setMergePolicy(newLogMergePolicy()))) {
        // Index 10 blocks of [child, child, parent]; random flushes exercise
        // block preservation across segment boundaries.
        for (int i = 0; i < 10; i++) {
          Document child = new Document();
          child.add(new StringField("relation", "child", Field.Store.NO));
          child.add(new StringField("bid", "" + i, Field.Store.NO));
          child.add(new NumericDocValuesField("dateDV", i));
          Document parent = new Document();
          parent.add(new StringField("relation", "parent", Field.Store.NO));
          parent.add(new StringField("bid", "" + i, Field.Store.NO));
          parent.add(new NumericDocValuesField("dateDV", i));
          // The same child Document instance is deliberately added twice to
          // produce two identical children per block.
          writer.addDocuments(Arrays.asList(child, child, parent));
          if (random().nextBoolean()) {
            writer.flush();
          }
        }
        if (random().nextBoolean()) {
          writer.forceMerge(1);
        }
        writer.commit();
        try (IndexReader reader = DirectoryReader.open(dir)) {
          IndexSearcher searcher = new IndexSearcher(reader);
          for (int i = 0; i < 10; i++) {
            TopDocs children =
                searcher.search(
                    new BooleanQuery.Builder()
                        .add(
                            new TermQuery(new Term("relation", "child")),
                            BooleanClause.Occur.MUST)
                        .add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST)
                        .build(),
                    2);
            TopDocs parents =
                searcher.search(
                    new BooleanQuery.Builder()
                        .add(
                            new TermQuery(new Term("relation", "parent")),
                            BooleanClause.Occur.MUST)
                        .add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST)
                        .build(),
                    2);
            assertEquals(2, children.totalHits.value);
            assertEquals(1, parents.totalHits.value);
            // make sure it's sorted: children are adjacent and directly precede
            // their parent in docID order.
            assertEquals(children.scoreDocs[0].doc + 1, children.scoreDocs[1].doc);
            assertEquals(children.scoreDocs[1].doc + 1, parents.scoreDocs[0].doc);
          }
        }
      }
      // This will confirm the docs are really sorted
      TestUtil.checkIndex(dir);
    }
  }
}

private void searchExampleIndex(DirectoryReader reader) throws IOException {
IndexSearcher searcher = newSearcher(reader);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat {
static final BytesRef VECTOR_ENCODING = new BytesRef(" vector encoding ");
static final BytesRef VECTOR_SIMILARITY = new BytesRef(" vector similarity ");
static final BytesRef SOFT_DELETES = new BytesRef(" soft-deletes ");
static final BytesRef PARENT = new BytesRef(" parent ");

@Override
public FieldInfos read(
Expand Down Expand Up @@ -170,6 +171,9 @@ public FieldInfos read(
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), SOFT_DELETES);
boolean isSoftDeletesField = Boolean.parseBoolean(readString(SOFT_DELETES.length, scratch));
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), PARENT);
boolean isParentField = Boolean.parseBoolean(readString(PARENT.length, scratch));

infos[i] =
new FieldInfo(
Expand All @@ -188,7 +192,8 @@ public FieldInfos read(
vectorNumDimensions,
vectorEncoding,
vectorDistFunc,
isSoftDeletesField);
isSoftDeletesField,
isParentField);
}

SimpleTextUtil.checkFooter(input);
Expand Down Expand Up @@ -320,6 +325,10 @@ public void write(
SimpleTextUtil.write(out, SOFT_DELETES);
SimpleTextUtil.write(out, Boolean.toString(fi.isSoftDeletesField()), scratch);
SimpleTextUtil.writeNewline(out);

SimpleTextUtil.write(out, PARENT);
SimpleTextUtil.write(out, Boolean.toString(fi.isParentField()), scratch);
SimpleTextUtil.writeNewline(out);
}
SimpleTextUtil.writeChecksum(out, scratch);
success = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,13 @@ public SegmentInfo read(
sortField[i] = SortFieldProvider.forName(provider).readSortField(bytes);
assert bytes.eof();
}
Sort indexSort = sortField.length == 0 ? null : new Sort(sortField);

final Sort indexSort;
if (sortField.length == 0) {
indexSort = null;
} else {
indexSort = new Sort(sortField);
}

SimpleTextUtil.checkFooter(input);

Expand Down Expand Up @@ -335,7 +341,6 @@ public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOE
SimpleTextUtil.write(output, b.bytes.get().toString(), scratch);
SimpleTextUtil.writeNewline(output);
}

SimpleTextUtil.writeChecksum(output, scratch);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ private static FieldInfo getMockFieldInfo(String fieldName, int number) {
0,
VectorEncoding.FLOAT32,
VectorSimilarityFunction.EUCLIDEAN,
true);
true,
false);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ private static FieldInfo mockFieldInfo(String fieldName, int number) {
0,
VectorEncoding.FLOAT32,
VectorSimilarityFunction.EUCLIDEAN,
false,
false);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,14 @@ public FieldInfos read(
Throwable priorE = null;
FieldInfo[] infos = null;
try {
CodecUtil.checkIndexHeader(
input,
Lucene94FieldInfosFormat.CODEC_NAME,
Lucene94FieldInfosFormat.FORMAT_START,
Lucene94FieldInfosFormat.FORMAT_CURRENT,
segmentInfo.getId(),
segmentSuffix);
int format =
CodecUtil.checkIndexHeader(
input,
Lucene94FieldInfosFormat.CODEC_NAME,
Lucene94FieldInfosFormat.FORMAT_START,
Lucene94FieldInfosFormat.FORMAT_CURRENT,
segmentInfo.getId(),
segmentSuffix);

final int size = input.readVInt(); // read in the size
infos = new FieldInfo[size];
Expand All @@ -157,6 +158,18 @@ public FieldInfos read(
boolean omitNorms = (bits & OMIT_NORMS) != 0;
boolean storePayloads = (bits & STORE_PAYLOADS) != 0;
boolean isSoftDeletesField = (bits & SOFT_DELETES_FIELD) != 0;
boolean isParentField =
format >= FORMAT_PARENT_FIELD ? (bits & PARENT_FIELD_FIELD) != 0 : false;

if ((bits & 0xE0) != 0) {
throw new CorruptIndexException(
"unused bits are set \"" + Integer.toBinaryString(bits) + "\"", input);
}
if (format < FORMAT_PARENT_FIELD && (bits & 0xF0) != 0) {
throw new CorruptIndexException(
"parent field bit is set but shouldn't \"" + Integer.toBinaryString(bits) + "\"",
input);
}

final IndexOptions indexOptions = getIndexOptions(input, input.readByte());

Expand Down Expand Up @@ -200,7 +213,8 @@ public FieldInfos read(
vectorDimension,
vectorEncoding,
vectorDistFunc,
isSoftDeletesField);
isSoftDeletesField,
isParentField);
infos[i].checkConsistency();
} catch (IllegalStateException e) {
throw new CorruptIndexException(
Expand Down Expand Up @@ -348,6 +362,7 @@ public void write(
if (fi.omitsNorms()) bits |= OMIT_NORMS;
if (fi.hasPayloads()) bits |= STORE_PAYLOADS;
if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD;
if (fi.isParentField()) bits |= PARENT_FIELD_FIELD;
output.writeByte(bits);

output.writeByte(indexOptionsByte(fi.getIndexOptions()));
Expand Down Expand Up @@ -375,11 +390,14 @@ public void write(
// Codec header
static final String CODEC_NAME = "Lucene94FieldInfos";
static final int FORMAT_START = 0;
static final int FORMAT_CURRENT = FORMAT_START;
  // this doesn't actually change the file format but uses up one more bit in an existing bit
  // pattern
static final int FORMAT_PARENT_FIELD = 1;
static final int FORMAT_CURRENT = FORMAT_PARENT_FIELD;

// Field flags
static final byte STORE_TERMVECTOR = 0x1;
static final byte OMIT_NORMS = 0x2;
static final byte STORE_PAYLOADS = 0x4;
static final byte SOFT_DELETES_FIELD = 0x8;
static final byte PARENT_FIELD_FIELD = 0x10;
}
29 changes: 17 additions & 12 deletions lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -1196,34 +1196,39 @@ public static Status.IndexSortStatus testSort(
comparators[i] = fields[i].getComparator(1, Pruning.NONE).getLeafComparator(readerContext);
}

int maxDoc = reader.maxDoc();

try {

for (int docID = 1; docID < maxDoc; docID++) {

LeafMetaData metaData = reader.getMetaData();
FieldInfos fieldInfos = reader.getFieldInfos();
final DocIdSetIterator iter;
if (metaData.hasBlocks() && fieldInfos.getParentField() != null) {
iter = reader.getNumericDocValues(fieldInfos.getParentField());
} else {
iter = DocIdSetIterator.all(reader.maxDoc());
}
int prevDoc = iter.nextDoc();
int nextDoc;
while ((nextDoc = iter.nextDoc()) != NO_MORE_DOCS) {
int cmp = 0;

for (int i = 0; i < comparators.length; i++) {
// TODO: would be better if copy() didnt cause a term lookup in TermOrdVal & co,
// TODO: would be better if copy() didn't cause a term lookup in TermOrdVal & co,
// the segments are always the same here...
comparators[i].copy(0, docID - 1);
comparators[i].copy(0, prevDoc);
comparators[i].setBottom(0);
cmp = reverseMul[i] * comparators[i].compareBottom(docID);
cmp = reverseMul[i] * comparators[i].compareBottom(nextDoc);
if (cmp != 0) {
break;
}
}

if (cmp > 0) {
throw new CheckIndexException(
"segment has indexSort="
+ sort
+ " but docID="
+ (docID - 1)
+ (prevDoc)
+ " sorts after docID="
+ docID);
+ nextDoc);
}
prevDoc = nextDoc;
}
msg(
infoStream,
Expand Down
Loading

0 comments on commit 0aa8891

Please sign in to comment.