Backport #12829 to 9.x #13013

Closed · wants to merge 2 commits
lucene/CHANGES.txt (6 additions, 0 deletions)
@@ -24,6 +24,12 @@ New Features
better-scoring nodes are available, or the best candidate is below a score of `traversalSimilarity` in the lowest
level. (Aditya Prakash, Kaival Parikh)

* GITHUB#12829: For indices newly created as of 9.10.0 onwards, IndexWriter preserves document blocks indexed via
IndexWriter#addDocuments or IndexWriter#updateDocuments even when index sorting is configured. Document blocks are
kept together with their parent documents during sorting and merging. IndexWriterConfig accepts a parent field that
is used to maintain block order when index sorting is used. Note that this is fully optional in Lucene 9.x but will
be mandatory for indices that use document blocks together with index sorting as of 10.0.0. (Simon Willnauer)

Improvements
---------------------

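To make the CHANGES entry above concrete, here is a minimal sketch of the new configuration: an IndexWriter opened with an index sort plus the parent field this backport introduces, indexing one parent/child block via addDocuments. The field names, class name, and in-memory directory are illustrative assumptions, not part of this PR.

import java.util.Arrays;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class ParentFieldBlockExample {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new ByteBuffersDirectory()) {
      IndexWriterConfig config =
          new IndexWriterConfig(new StandardAnalyzer())
              // index sort over a numeric doc-values field
              .setIndexSort(new Sort(new SortField("dateDV", SortField.Type.LONG)))
              // optional in 9.x; mandatory with index sorting as of 10.0.0
              .setParentField("parent");
      try (IndexWriter writer = new IndexWriter(dir, config)) {
        Document child = new Document();
        child.add(new StringField("relation", "child", Field.Store.NO));
        child.add(new NumericDocValuesField("dateDV", 1));
        Document parent = new Document();
        parent.add(new StringField("relation", "parent", Field.Store.NO));
        parent.add(new NumericDocValuesField("dateDV", 1));
        // the parent document comes last in the block
        writer.addDocuments(Arrays.asList(child, child, parent));
        writer.commit();
      }
    }
  }
}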
@@ -217,7 +217,8 @@ private FieldInfo[] readFieldInfos(IndexInput input, int version) throws IOExcep
0,
VectorEncoding.FLOAT32,
VectorSimilarityFunction.EUCLIDEAN,
isSoftDeletesField);
isSoftDeletesField,
false);
} catch (IllegalStateException e) {
throw new CorruptIndexException(
"invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e);
@@ -194,7 +194,8 @@ public FieldInfos read(
vectorDimension,
VectorEncoding.FLOAT32,
vectorDistFunc,
isSoftDeletesField);
isSoftDeletesField,
false);
infos[i].checkConsistency();
} catch (IllegalStateException e) {
throw new CorruptIndexException(
@@ -34,4 +34,9 @@ protected Version[] getVersions() {
protected Codec getCodec() {
return new Lucene84RWCodec();
}

@Override
protected boolean supportsHasBlocks() {
return false;
}
}
@@ -34,4 +34,9 @@ protected Version[] getVersions() {
protected Codec getCodec() {
return new Lucene87RWCodec();
}

@Override
protected boolean supportsHasBlocks() {
return false;
}
}
@@ -32,4 +32,9 @@ protected Version[] getVersions() {
protected Codec getCodec() {
return new Lucene90RWCodec();
}

@Override
protected boolean supportsHasBlocks() {
return false;
}
}
@@ -75,6 +75,7 @@
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafMetaData;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.LogByteSizeMergePolicy;
@@ -98,6 +99,8 @@
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.FieldExistsQuery;
@@ -2187,6 +2190,225 @@ public void testSortedIndex() throws Exception {
}
}

public void testSortedIndexAddDocBlocks() throws Exception {
for (String name : oldSortedNames) {
Path path = createTempDir("sorted");
InputStream resource = TestBackwardsCompatibility.class.getResourceAsStream(name + ".zip");
assertNotNull("Sorted index index " + name + " not found", resource);
TestUtil.unzip(resource, path);

try (Directory dir = newFSDirectory(path)) {
final Sort sort;
try (DirectoryReader reader = DirectoryReader.open(dir)) {
assertEquals(1, reader.leaves().size());
sort = reader.leaves().get(0).reader().getMetaData().getSort();
assertNotNull(sort);
searchExampleIndex(reader);
}
// open writer
try (IndexWriter writer =
new IndexWriter(
dir,
newIndexWriterConfig(new MockAnalyzer(random()))
.setOpenMode(OpenMode.APPEND)
.setIndexSort(sort)
.setMergePolicy(newLogMergePolicy()))) {
// add 10 docs
for (int i = 0; i < 10; i++) {
Document child = new Document();
child.add(new StringField("relation", "child", Field.Store.NO));
child.add(new StringField("bid", "" + i, Field.Store.NO));
child.add(new NumericDocValuesField("dateDV", i));
Document parent = new Document();
parent.add(new StringField("relation", "parent", Field.Store.NO));
parent.add(new StringField("bid", "" + i, Field.Store.NO));
parent.add(new NumericDocValuesField("dateDV", i));
writer.addDocuments(Arrays.asList(child, child, parent));
if (random().nextBoolean()) {
writer.flush();
}
}
if (random().nextBoolean()) {
writer.forceMerge(1);
}
writer.commit();
try (IndexReader reader = DirectoryReader.open(dir)) {
IndexSearcher searcher = new IndexSearcher(reader);
for (int i = 0; i < 10; i++) {
TopDocs children =
searcher.search(
new BooleanQuery.Builder()
.add(
new TermQuery(new Term("relation", "child")),
BooleanClause.Occur.MUST)
.add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST)
.build(),
2);
TopDocs parents =
searcher.search(
new BooleanQuery.Builder()
.add(
new TermQuery(new Term("relation", "parent")),
BooleanClause.Occur.MUST)
.add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST)
.build(),
2);
assertEquals(2, children.totalHits.value);
assertEquals(1, parents.totalHits.value);
// make sure it's sorted
assertEquals(children.scoreDocs[0].doc + 1, children.scoreDocs[1].doc);
assertEquals(children.scoreDocs[1].doc + 1, parents.scoreDocs[0].doc);
}
}
}
// This will confirm the docs are really sorted
TestUtil.checkIndex(dir);
}
}
}

public void testAddParentFieldToSortedIndex() throws IOException {
for (String name : oldSortedNames) {
Path path = createTempDir("sorted");
InputStream resource = TestBackwardsCompatibility.class.getResourceAsStream(name + ".zip");
assertNotNull("Sorted index index " + name + " not found", resource);
TestUtil.unzip(resource, path);

try (Directory dir = newFSDirectory(path)) {
final Sort sort;
LeafMetaData metaData;
try (DirectoryReader reader = DirectoryReader.open(dir)) {
assertEquals(1, reader.leaves().size());
metaData = reader.leaves().get(0).reader().getMetaData();
assertFalse(
"expected no reader with blocks",
reader.leaves().stream()
.filter(l -> l.reader().getMetaData().hasBlocks())
.findAny()
.isPresent());
sort = metaData.getSort();
assertNotNull(sort);
searchExampleIndex(reader);
}
if (metaData.getMinVersion().onOrAfter(Version.LUCENE_9_9_0) == false) {
continue; // not relevant here
}

try (IndexWriter writer =
new IndexWriter(
dir,
newIndexWriterConfig(new MockAnalyzer(random()))
.setOpenMode(OpenMode.APPEND)
.setIndexSort(sort)
.setParentField("parent")
.setMergePolicy(newLogMergePolicy()))) {
// add 10 docs
for (int i = 0; i < 10; i++) {
// children don't have the sort field here, we make sure it preserves blocks
Document child = new Document();
child.add(new StringField("relation", "child", Field.Store.NO));
child.add(new StringField("bid", "" + i, Field.Store.NO));
Document parent = new Document();
parent.add(new StringField("relation", "parent", Field.Store.NO));
parent.add(new StringField("bid", "" + i, Field.Store.NO));
parent.add(new NumericDocValuesField("dateDV", i));
writer.addDocuments(Arrays.asList(child, child, parent));
if (random().nextBoolean()) {
writer.flush();
}
}
if (random().nextBoolean()) {
writer.forceMerge(1);
}
writer.commit();
try (IndexReader reader = DirectoryReader.open(dir)) {
assertTrue(
"expected at least one reader with blocks",
reader.leaves().stream()
.filter(l -> l.reader().getMetaData().hasBlocks())
.findAny()
.isPresent());
IndexSearcher searcher = new IndexSearcher(reader);
for (int i = 0; i < 10; i++) {
TopDocs children =
searcher.search(
new BooleanQuery.Builder()
.add(
new TermQuery(new Term("relation", "child")),
BooleanClause.Occur.MUST)
.add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST)
.build(),
2);
TopDocs parents =
searcher.search(
new BooleanQuery.Builder()
.add(
new TermQuery(new Term("relation", "parent")),
BooleanClause.Occur.MUST)
.add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST)
.build(),
2);
assertEquals(2, children.totalHits.value);
assertEquals(1, parents.totalHits.value);
// make sure it's sorted
assertEquals(children.scoreDocs[0].doc + 1, children.scoreDocs[1].doc);
assertEquals(children.scoreDocs[1].doc + 1, parents.scoreDocs[0].doc);
}
}
}
// This will confirm the docs are really sorted
TestUtil.checkIndex(dir);
}
}
}

public void testSortedIndexAddDocBlocksFailsOnParentField() throws Exception {

for (String name : oldSortedNames) {
Path path = createTempDir("sorted");
InputStream resource = TestBackwardsCompatibility.class.getResourceAsStream(name + ".zip");
assertNotNull("Sorted index index " + name + " not found", resource);
TestUtil.unzip(resource, path);

try (Directory dir = newFSDirectory(path)) {
final Sort sort;
LeafMetaData metaData;
try (DirectoryReader reader = DirectoryReader.open(dir)) {
assertEquals(1, reader.leaves().size());
metaData = reader.leaves().get(0).reader().getMetaData();
sort = metaData.getSort();
assertNotNull(sort);
searchExampleIndex(reader);
}
if (metaData.getMinVersion().onOrAfter(Version.LUCENE_9_9_0)) {
new IndexWriter(
dir,
newIndexWriterConfig(new MockAnalyzer(random()))
.setOpenMode(OpenMode.APPEND)
.setIndexSort(sort)
.setParentField("parent")
.setMergePolicy(newLogMergePolicy()))
.close();
} else {
IllegalArgumentException iae =
expectThrows(
IllegalArgumentException.class,
() ->
new IndexWriter(
dir,
newIndexWriterConfig(new MockAnalyzer(random()))
.setOpenMode(OpenMode.APPEND)
.setIndexSort(sort)
.setParentField("parent")
.setMergePolicy(newLogMergePolicy())));
assertEquals(
"can't add a parent field to an index that has segments form a lucene version older than 9.9.0",
iae.getMessage());
}
}
}
}

private void searchExampleIndex(DirectoryReader reader) throws IOException {
IndexSearcher searcher = newSearcher(reader);

@@ -72,6 +72,7 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat {
static final BytesRef VECTOR_ENCODING = new BytesRef(" vector encoding ");
static final BytesRef VECTOR_SIMILARITY = new BytesRef(" vector similarity ");
static final BytesRef SOFT_DELETES = new BytesRef(" soft-deletes ");
static final BytesRef PARENT = new BytesRef(" parent ");

@Override
public FieldInfos read(
@@ -170,6 +171,9 @@ public FieldInfos read(
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), SOFT_DELETES);
boolean isSoftDeletesField = Boolean.parseBoolean(readString(SOFT_DELETES.length, scratch));
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), PARENT);
boolean isParentField = Boolean.parseBoolean(readString(PARENT.length, scratch));

infos[i] =
new FieldInfo(
@@ -188,7 +192,8 @@
vectorNumDimensions,
vectorEncoding,
vectorDistFunc,
isSoftDeletesField);
isSoftDeletesField,
isParentField);
}

SimpleTextUtil.checkFooter(input);
@@ -320,6 +325,10 @@ public void write(
SimpleTextUtil.write(out, SOFT_DELETES);
SimpleTextUtil.write(out, Boolean.toString(fi.isSoftDeletesField()), scratch);
SimpleTextUtil.writeNewline(out);

SimpleTextUtil.write(out, PARENT);
SimpleTextUtil.write(out, Boolean.toString(fi.isParentField()), scratch);
SimpleTextUtil.writeNewline(out);
}
SimpleTextUtil.writeChecksum(out, scratch);
success = true;
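The SimpleText hunks above add a per-field "parent" flag to the field-infos file and round-trip it through FieldInfo. As a hedged illustration (not part of this PR), here is a reader-side sketch that lists fields whose flag is set, using the FieldInfo#isParentField() accessor that the format serializes; the class name is made up for the example.

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.LeafReaderContext;

public class ParentFieldInspector {
  /** Prints, per segment, any field whose parent flag is set. */
  public static void printParentFields(DirectoryReader reader) {
    for (LeafReaderContext ctx : reader.leaves()) {
      // FieldInfos is iterable over the segment's FieldInfo entries
      for (FieldInfo fi : ctx.reader().getFieldInfos()) {
        if (fi.isParentField()) {
          System.out.println("segment " + ctx.ord + " parent field: " + fi.name);
        }
      }
    }
  }
}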
@@ -196,7 +196,13 @@ public SegmentInfo read(
sortField[i] = SortFieldProvider.forName(provider).readSortField(bytes);
assert bytes.eof();
}
Sort indexSort = sortField.length == 0 ? null : new Sort(sortField);

final Sort indexSort;
if (sortField.length == 0) {
indexSort = null;
} else {
indexSort = new Sort(sortField);
}

SimpleTextUtil.checkFooter(input);

@@ -335,7 +341,6 @@ public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOE
SimpleTextUtil.write(output, b.bytes.get().toString(), scratch);
SimpleTextUtil.writeNewline(output);
}

SimpleTextUtil.writeChecksum(output, scratch);
}
}
@@ -119,6 +119,7 @@ private static FieldInfo getMockFieldInfo(String fieldName, int number) {
0,
VectorEncoding.FLOAT32,
VectorSimilarityFunction.EUCLIDEAN,
true);
true,
false);
}
}
@@ -206,6 +206,7 @@ private static FieldInfo mockFieldInfo(String fieldName, int number) {
0,
VectorEncoding.FLOAT32,
VectorSimilarityFunction.EUCLIDEAN,
false,
false);
}
