Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for index sorting with document blocks #12829

Merged
merged 47 commits into from
Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
193d87a
first cut at sorting only the parent documents
s1monw Nov 16, 2023
f5eb4f8
progress
s1monw Nov 17, 2023
f3b90c0
add validator
s1monw Nov 18, 2023
97b35e0
tidy
s1monw Nov 18, 2023
b65eda0
fix tests
s1monw Nov 20, 2023
191ebaf
remove dead code
s1monw Nov 20, 2023
345edc5
add more tests
s1monw Nov 21, 2023
1693953
tidy
s1monw Nov 21, 2023
baaedcd
add test for validations
s1monw Nov 21, 2023
2162cce
add test for validations
s1monw Nov 21, 2023
56a4444
fix imports
s1monw Nov 21, 2023
ff8144a
apply feedback
s1monw Dec 3, 2023
ab4b3bc
remove dead code
s1monw Dec 3, 2023
224c7df
fix comparison
s1monw Dec 3, 2023
86d7032
add bwc
s1monw Dec 3, 2023
5a15f58
add test for BWC
s1monw Dec 3, 2023
0431a51
fix test
s1monw Dec 3, 2023
3e64766
improve BWC testing
s1monw Dec 4, 2023
5f6c297
address concerns about field comparison
s1monw Dec 7, 2023
6e64b51
cleanup
s1monw Dec 7, 2023
e31f17f
apply feedback
s1monw Dec 7, 2023
4b33f85
Move parent field to IWC and FieldInfos
s1monw Dec 14, 2023
f26646b
respect version in file format
s1monw Dec 14, 2023
2a14c41
tidy
s1monw Dec 14, 2023
76d4ba2
tidy
s1monw Dec 14, 2023
227c3cf
tidy
s1monw Dec 14, 2023
ac81019
move test
s1monw Dec 15, 2023
7573c0a
add test if field is already used
s1monw Dec 15, 2023
3781c86
add javadocs
s1monw Dec 15, 2023
cc17352
tidy
s1monw Dec 15, 2023
7087f2c
add import
s1monw Dec 15, 2023
c122994
remove hard checks for same soft deletes field
s1monw Dec 15, 2023
a73e52b
apply feedback
s1monw Dec 21, 2023
2635711
Merge branch 'main' into parent_field
s1monw Dec 21, 2023
502703e
apply feedback
s1monw Dec 24, 2023
6f11436
beef up testing and revert to -1 as a default value isntead of the nu…
s1monw Dec 27, 2023
e2876da
Merge branch 'main' into parent_field
s1monw Jan 2, 2024
023d242
add changes
s1monw Jan 2, 2024
d36dde5
fix polishing comments
s1monw Jan 3, 2024
4c6e152
add test for single doc blocks don't trigger hasBlocks
s1monw Jan 3, 2024
ff8f561
remove extra whitespace in error msg
s1monw Jan 3, 2024
0334d25
remove extra whitespace in error msg
s1monw Jan 3, 2024
25e6da4
add test and docs to make mike not staring at this forever
s1monw Jan 3, 2024
363ce3d
apply review comments
s1monw Jan 3, 2024
4e829cf
fix test
s1monw Jan 3, 2024
a4b91b5
Merge branch 'main' into parent_field
s1monw Jan 9, 2024
2980f57
change wording
s1monw Jan 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,8 @@ private FieldInfo[] readFieldInfos(IndexInput input, int version) throws IOExcep
0,
VectorEncoding.FLOAT32,
VectorSimilarityFunction.EUCLIDEAN,
isSoftDeletesField);
isSoftDeletesField,
false);
} catch (IllegalStateException e) {
throw new CorruptIndexException(
"invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,8 @@ public FieldInfos read(
vectorDimension,
VectorEncoding.FLOAT32,
vectorDistFunc,
isSoftDeletesField);
isSoftDeletesField,
false);
infos[i].checkConsistency();
} catch (IllegalStateException e) {
throw new CorruptIndexException(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,9 @@ protected Version[] getVersions() {
protected Codec getCodec() {
return new Lucene84RWCodec();
}

@Override
protected boolean supportsHasBlocks() {
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,9 @@ protected Version[] getVersions() {
protected Codec getCodec() {
return new Lucene87RWCodec();
}

@Override
protected boolean supportsHasBlocks() {
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,9 @@ protected Version[] getVersions() {
protected Codec getCodec() {
return new Lucene90RWCodec();
}

@Override
protected boolean supportsHasBlocks() {
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.FieldExistsQuery;
Expand Down Expand Up @@ -2171,6 +2173,83 @@ public void testSortedIndex() throws Exception {
}
}

public void testSortedIndexAddDocBlocks() throws Exception {
for (String name : oldSortedNames) {
Path path = createTempDir("sorted");
InputStream resource = TestBackwardsCompatibility.class.getResourceAsStream(name + ".zip");
assertNotNull("Sorted index index " + name + " not found", resource);
TestUtil.unzip(resource, path);

try (Directory dir = newFSDirectory(path)) {
final Sort sort;
try (DirectoryReader reader = DirectoryReader.open(dir)) {
assertEquals(1, reader.leaves().size());
sort = reader.leaves().get(0).reader().getMetaData().getSort();
assertNotNull(sort);
searchExampleIndex(reader);
}
// open writer
try (IndexWriter writer =
new IndexWriter(
dir,
newIndexWriterConfig(new MockAnalyzer(random()))
.setOpenMode(OpenMode.APPEND)
.setIndexSort(sort)
.setMergePolicy(newLogMergePolicy()))) {
// add 10 docs
for (int i = 0; i < 10; i++) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is testing indexing doc blocks into an old (pre-10.0) index right? In this case does IW still add the DV field to the parent doc? The test here seems not to be configuring a parent field, so I guess no? Only indices created 10.0+ will do this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, unless we 1. backport this which is possible and 2. if you set a parent field in the old index. yet I can't fully test that just yet since it's not backported. makes sense?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah this makes sense -- if you have a pre-10.0 index, then none of the new code is run. You must create a new index to get the stronger checking.

(Or, 9.10.x index if we backport).

Document child = new Document();
child.add(new StringField("relation", "child", Field.Store.NO));
child.add(new StringField("bid", "" + i, Field.Store.NO));
child.add(new NumericDocValuesField("dateDV", i));
Document parent = new Document();
parent.add(new StringField("relation", "parent", Field.Store.NO));
parent.add(new StringField("bid", "" + i, Field.Store.NO));
parent.add(new NumericDocValuesField("dateDV", i));
writer.addDocuments(Arrays.asList(child, child, parent));
if (random().nextBoolean()) {
writer.flush();
}
}
if (random().nextBoolean()) {
writer.forceMerge(1);
}
writer.commit();
try (IndexReader reader = DirectoryReader.open(dir)) {
IndexSearcher searcher = new IndexSearcher(reader);
for (int i = 0; i < 10; i++) {
TopDocs children =
searcher.search(
new BooleanQuery.Builder()
.add(
new TermQuery(new Term("relation", "child")),
BooleanClause.Occur.MUST)
.add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST)
.build(),
2);
TopDocs parents =
searcher.search(
new BooleanQuery.Builder()
.add(
new TermQuery(new Term("relation", "parent")),
BooleanClause.Occur.MUST)
.add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST)
.build(),
2);
assertEquals(2, children.totalHits.value);
assertEquals(1, parents.totalHits.value);
// make sure it's sorted
assertEquals(children.scoreDocs[0].doc + 1, children.scoreDocs[1].doc);
assertEquals(children.scoreDocs[1].doc + 1, parents.scoreDocs[0].doc);
}
}
}
// This will confirm the docs are really sorted
TestUtil.checkIndex(dir);
s1monw marked this conversation as resolved.
Show resolved Hide resolved
}
}
}

private void searchExampleIndex(DirectoryReader reader) throws IOException {
IndexSearcher searcher = newSearcher(reader);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat {
static final BytesRef VECTOR_ENCODING = new BytesRef(" vector encoding ");
static final BytesRef VECTOR_SIMILARITY = new BytesRef(" vector similarity ");
static final BytesRef SOFT_DELETES = new BytesRef(" soft-deletes ");
static final BytesRef PARENT = new BytesRef(" parent ");

@Override
public FieldInfos read(
Expand Down Expand Up @@ -170,6 +171,9 @@ public FieldInfos read(
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), SOFT_DELETES);
boolean isSoftDeletesField = Boolean.parseBoolean(readString(SOFT_DELETES.length, scratch));
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), PARENT);
boolean isParentField = Boolean.parseBoolean(readString(PARENT.length, scratch));

infos[i] =
new FieldInfo(
Expand All @@ -188,7 +192,8 @@ public FieldInfos read(
vectorNumDimensions,
vectorEncoding,
vectorDistFunc,
isSoftDeletesField);
isSoftDeletesField,
isParentField);
}

SimpleTextUtil.checkFooter(input);
Expand Down Expand Up @@ -320,6 +325,10 @@ public void write(
SimpleTextUtil.write(out, SOFT_DELETES);
SimpleTextUtil.write(out, Boolean.toString(fi.isSoftDeletesField()), scratch);
SimpleTextUtil.writeNewline(out);

SimpleTextUtil.write(out, PARENT);
SimpleTextUtil.write(out, Boolean.toString(fi.isParentField()), scratch);
SimpleTextUtil.writeNewline(out);
}
SimpleTextUtil.writeChecksum(out, scratch);
success = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,13 @@ public SegmentInfo read(
sortField[i] = SortFieldProvider.forName(provider).readSortField(bytes);
assert bytes.eof();
}
Sort indexSort = sortField.length == 0 ? null : new Sort(sortField);

final Sort indexSort;
if (sortField.length == 0) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you! Die ternary operator die!

indexSort = null;
} else {
indexSort = new Sort(sortField);
}

SimpleTextUtil.checkFooter(input);

Expand Down Expand Up @@ -336,7 +342,6 @@ public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOE
SimpleTextUtil.write(output, b.bytes.get().toString(), scratch);
SimpleTextUtil.writeNewline(output);
}

SimpleTextUtil.writeChecksum(output, scratch);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ private static FieldInfo getMockFieldInfo(String fieldName, int number) {
0,
VectorEncoding.FLOAT32,
VectorSimilarityFunction.EUCLIDEAN,
true);
true,
false);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ private static FieldInfo mockFieldInfo(String fieldName, int number) {
0,
VectorEncoding.FLOAT32,
VectorSimilarityFunction.EUCLIDEAN,
false,
false);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,14 @@ public FieldInfos read(
Throwable priorE = null;
FieldInfo[] infos = null;
try {
CodecUtil.checkIndexHeader(
input,
Lucene94FieldInfosFormat.CODEC_NAME,
Lucene94FieldInfosFormat.FORMAT_START,
Lucene94FieldInfosFormat.FORMAT_CURRENT,
segmentInfo.getId(),
segmentSuffix);
int format =
CodecUtil.checkIndexHeader(
input,
Lucene94FieldInfosFormat.CODEC_NAME,
Lucene94FieldInfosFormat.FORMAT_START,
Lucene94FieldInfosFormat.FORMAT_CURRENT,
segmentInfo.getId(),
segmentSuffix);

final int size = input.readVInt(); // read in the size
infos = new FieldInfo[size];
Expand All @@ -157,6 +158,13 @@ public FieldInfos read(
boolean omitNorms = (bits & OMIT_NORMS) != 0;
boolean storePayloads = (bits & STORE_PAYLOADS) != 0;
boolean isSoftDeletesField = (bits & SOFT_DELETES_FIELD) != 0;
boolean isParentField =
format >= FORMAT_PARENT_FIELD ? (bits & PARENT_FIELD_FIELD) != 0 : false;
s1monw marked this conversation as resolved.
Show resolved Hide resolved

assert (bits & 0xE0) == 0
: "unused bits are set \"" + Integer.toBinaryString(bits) + "\"";
assert format >= FORMAT_PARENT_FIELD || (bits & 0xF0) == 0
: "parent field bit is set but shouldn't' \"" + Integer.toBinaryString(bits) + "\"";
s1monw marked this conversation as resolved.
Show resolved Hide resolved

final IndexOptions indexOptions = getIndexOptions(input, input.readByte());

Expand Down Expand Up @@ -200,7 +208,8 @@ public FieldInfos read(
vectorDimension,
vectorEncoding,
vectorDistFunc,
isSoftDeletesField);
isSoftDeletesField,
isParentField);
infos[i].checkConsistency();
} catch (IllegalStateException e) {
throw new CorruptIndexException(
Expand Down Expand Up @@ -348,6 +357,7 @@ public void write(
if (fi.omitsNorms()) bits |= OMIT_NORMS;
if (fi.hasPayloads()) bits |= STORE_PAYLOADS;
if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD;
if (fi.isParentField()) bits |= PARENT_FIELD_FIELD;
output.writeByte(bits);

output.writeByte(indexOptionsByte(fi.getIndexOptions()));
Expand Down Expand Up @@ -375,11 +385,14 @@ public void write(
// Codec header
static final String CODEC_NAME = "Lucene94FieldInfos";
static final int FORMAT_START = 0;
static final int FORMAT_CURRENT = FORMAT_START;
// this doesn't actually change the file format but uses up one more bit an existing bit pattern
static final int FORMAT_PARENT_FIELD = 1;
s1monw marked this conversation as resolved.
Show resolved Hide resolved
static final int FORMAT_CURRENT = FORMAT_PARENT_FIELD;

// Field flags
static final byte STORE_TERMVECTOR = 0x1;
static final byte OMIT_NORMS = 0x2;
static final byte STORE_PAYLOADS = 0x4;
static final byte SOFT_DELETES_FIELD = 0x8;
static final byte PARENT_FIELD_FIELD = 0x10;
}
36 changes: 23 additions & 13 deletions lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -1180,34 +1180,44 @@ public static Status.IndexSortStatus testSort(
comparators[i] = fields[i].getComparator(1, Pruning.NONE).getLeafComparator(readerContext);
}

int maxDoc = reader.maxDoc();

try {

for (int docID = 1; docID < maxDoc; docID++) {

LeafMetaData metaData = reader.getMetaData();
FieldInfos fieldInfos = reader.getFieldInfos();
if (metaData.hasBlocks()
&& fieldInfos.getParentField() == null
&& metaData.getCreatedVersionMajor() >= Version.LUCENE_10_0_0.major) {
throw new IllegalStateException(
"parent field is not set but the index has document blocks and was created with version: "
+ metaData.getCreatedVersionMajor());
}
final DocIdSetIterator iter =
s1monw marked this conversation as resolved.
Show resolved Hide resolved
metaData.hasBlocks() && fieldInfos.getParentField() != null
? reader.getNumericDocValues(fieldInfos.getParentField())
: DocIdSetIterator.all(reader.maxDoc());
int prevDoc = iter.nextDoc();
int nextDoc;
while ((nextDoc = iter.nextDoc()) != NO_MORE_DOCS) {
int cmp = 0;

for (int i = 0; i < comparators.length; i++) {
// TODO: would be better if copy() didnt cause a term lookup in TermOrdVal & co,
// TODO: would be better if copy() didn't cause a term lookup in TermOrdVal & co,
// the segments are always the same here...
comparators[i].copy(0, docID - 1);
comparators[i].copy(0, prevDoc);
comparators[i].setBottom(0);
cmp = reverseMul[i] * comparators[i].compareBottom(docID);
cmp = reverseMul[i] * comparators[i].compareBottom(nextDoc);
if (cmp != 0) {
break;
}
}

if (cmp > 0) {
throw new CheckIndexException(
"segment has indexSort="
+ sort
+ " but docID="
+ (docID - 1)
+ (prevDoc)
+ " sorts after docID="
+ docID);
+ nextDoc);
}
prevDoc = nextDoc;
}
msg(
infoStream,
Expand All @@ -1216,7 +1226,7 @@ public static Status.IndexSortStatus testSort(
if (failFast) {
throw IOUtils.rethrowAlways(e);
}
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
msg(infoStream, "ERROR [" + e.getMessage() + "]");
status.error = e;
if (infoStream != null) {
e.printStackTrace(infoStream);
Expand Down
Loading
Loading