-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Diversity check bugfix #11781
Merged
Merged
Diversity check bugfix #11781
Changes from 1 commit
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -504,6 +504,7 @@ public void testDiversity() throws IOException { | |
unitVector2d(0.9), | ||
unitVector2d(0.8), | ||
unitVector2d(0.77), | ||
unitVector2d(0.6) | ||
}; | ||
if (vectorEncoding == VectorEncoding.BYTE) { | ||
for (float[] v : values) { | ||
|
@@ -555,6 +556,78 @@ public void testDiversity() throws IOException { | |
assertLevel0Neighbors(builder.hnsw, 5, 1, 4); | ||
} | ||
|
||
// Checks the level-0 neighbor lists of a small 3-d EUCLIDEAN graph as nodes 1, 2 and 3 are added. | ||
public void testDiversityFallback() throws IOException { | ||
vectorEncoding = randomVectorEncoding(); | ||
similarityFunction = VectorSimilarityFunction.EUCLIDEAN; | ||
// Some test cases can't be exercised in two dimensions; | ||
// in particular if a new neighbor displaces an existing neighbor | ||
// by being closer to the target, yet none of the existing neighbors is closer to the new vector | ||
// than to the target -- i.e. they all remain diverse, so we simply drop the farthest one. | ||
float[][] values = { | ||
{0, 0, 0}, | ||
{0, 1, 0}, | ||
{0, 0, 2}, | ||
{1, 0, 0}, | ||
// NOTE(review): index 4 is never passed to addGraphNode in this test, and its fractional | ||
// component may not be representable when the random encoding is BYTE -- confirm. | ||
{0, 0.4f, 0}
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hm, I guess this works for bytes too, but we should probably multiply everything here to make it non-fractional |
||
}; | ||
MockVectorValues vectors = new MockVectorValues(values); | ||
// First add nodes until everybody gets a full neighbor list | ||
// NOTE(review): the arguments 1 and 10 are presumably max connections and beam width; | ||
// verify against HnswGraphBuilder.create. | ||
HnswGraphBuilder<?> builder = | ||
HnswGraphBuilder.create( | ||
vectors, vectorEncoding, similarityFunction, 1, 10, random().nextInt()); | ||
// node 0 is added by the builder constructor | ||
// builder.addGraphNode(vectors.vectorValue(0)); | ||
RandomAccessVectorValues vectorsCopy = vectors.copy(); | ||
builder.addGraphNode(1, vectorsCopy); | ||
builder.addGraphNode(2, vectorsCopy); | ||
assertLevel0Neighbors(builder.hnsw, 0, 1, 2); | ||
// 2 is closer to 0 than it is to 1, so it is excluded from 1's neighbors as non-diverse | ||
assertLevel0Neighbors(builder.hnsw, 1, 0); | ||
// 1 is closer to 0 than it is to 2, so it is excluded from 2's neighbors as non-diverse | ||
assertLevel0Neighbors(builder.hnsw, 2, 0); | ||
|
||
builder.addGraphNode(3, vectorsCopy); | ||
// this is one case we are testing; 2 has been displaced by 3 in node 0's neighbor list, | ||
// i.e. node 0's neighbors go from {1, 2} to {1, 3} while the other lists stay at {0} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nice test! |
||
assertLevel0Neighbors(builder.hnsw, 0, 1, 3); | ||
assertLevel0Neighbors(builder.hnsw, 1, 0); | ||
assertLevel0Neighbors(builder.hnsw, 2, 0); | ||
assertLevel0Neighbors(builder.hnsw, 3, 0); | ||
} | ||
|
||
// Checks the level-0 neighbor lists of a small 3-d EUCLIDEAN graph when node 3 arrives | ||
// very close to the existing node 1. | ||
public void testDiversity3d() throws IOException { | ||
vectorEncoding = randomVectorEncoding(); | ||
similarityFunction = VectorSimilarityFunction.EUCLIDEAN; | ||
// test the case when a neighbor *becomes* non-diverse when a newer better neighbor arrives | ||
float[][] values = { | ||
{0, 0, 0}, | ||
{0, 10, 0}, | ||
{0, 0, 20}, | ||
{0, 9, 0} | ||
}; | ||
MockVectorValues vectors = new MockVectorValues(values); | ||
// First add nodes until everybody gets a full neighbor list | ||
// NOTE(review): the arguments 1 and 10 are presumably max connections and beam width; | ||
// verify against HnswGraphBuilder.create. | ||
HnswGraphBuilder<?> builder = | ||
HnswGraphBuilder.create( | ||
vectors, vectorEncoding, similarityFunction, 1, 10, random().nextInt()); | ||
// node 0 is added by the builder constructor | ||
// builder.addGraphNode(vectors.vectorValue(0)); | ||
RandomAccessVectorValues vectorsCopy = vectors.copy(); | ||
builder.addGraphNode(1, vectorsCopy); | ||
builder.addGraphNode(2, vectorsCopy); | ||
assertLevel0Neighbors(builder.hnsw, 0, 1, 2); | ||
// 2 is closer to 0 than it is to 1, so it is excluded from 1's neighbors as non-diverse | ||
assertLevel0Neighbors(builder.hnsw, 1, 0); | ||
// 1 is closer to 0 than it is to 2, so it is excluded from 2's neighbors as non-diverse | ||
assertLevel0Neighbors(builder.hnsw, 2, 0); | ||
|
||
builder.addGraphNode(3, vectorsCopy); | ||
// this is one case we are testing; 1 has been displaced by 3 in node 0's neighbor list: | ||
// node 0's neighbors go from {1, 2} to {2, 3}, and nodes 1 and 3 are linked to each other | ||
assertLevel0Neighbors(builder.hnsw, 0, 2, 3); | ||
assertLevel0Neighbors(builder.hnsw, 1, 0, 3); | ||
assertLevel0Neighbors(builder.hnsw, 2, 0); | ||
assertLevel0Neighbors(builder.hnsw, 3, 0, 1); | ||
} | ||
|
||
private void assertLevel0Neighbors(OnHeapHnswGraph graph, int node, int... expected) { | ||
Arrays.sort(expected); | ||
NeighborArray nn = graph.getNeighbors(0, node); | ||
|
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am surprised that with this big change, we had only a small reduction in recall. I guess the reason could be that in our tests the diversity check was really relevant only for a small number of nodes; in the majority of cases the algorithm just eliminated the most distant node.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I know - how did this garbage even work at all!☹️ It's kind of astonishing how insensitive this whole process is to the diversity checking. Initially we didn't have it at all though (we just always picked the closest neighbors), and things still kind of worked. Then I had the wonky implementation that did not sort the neighbors while indexing, but did some best-effort kind of thing, and still it mostly worked. So we need good tests here to ensure we are doing the right thing, because bugs here can lead to small degradations.