Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MOD-6738] IndexComputer #535

Merged
merged 40 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from 37 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
3aeb6e0
initial implementation of IndexComputer and DistanceCalculator
meiravgri Aug 29, 2024
4ffba35
replace distFunc with calling indexComputer->calcDistance
meiravgri Aug 29, 2024
cbb9273
add distance calculation API to index abstract
meiravgri Aug 29, 2024
17b8e37
remove distFunc from VecSimIndexAbstract,
meiravgri Sep 2, 2024
3305dc2
free computer and calculator
meiravgri Sep 2, 2024
190d49d
Merge branch 'main' into meiravg_introduce_computer
meiravgri Sep 2, 2024
ca62ef7
introduce preprocessor and use it to normalize if cosine
meiravgri Sep 3, 2024
7f2521b
fix leak
meiravgri Sep 3, 2024
ed87ab6
add preprocessForStorage
meiravgri Sep 5, 2024
1b9d33c
remove addvector with auxiliary context.
meiravgri Sep 8, 2024
b70f076
better organization of computer
meiravgri Sep 9, 2024
797653e
tiered: preprocess queries according to the frontend index and then p…
meiravgri Sep 9, 2024
6b4bb52
Merge branch 'main' into meiravg_introduce_computer
meiravgri Sep 9, 2024
8e46176
computer:
meiravgri Sep 11, 2024
b08e82f
Merge remote-tracking branch 'origin/main' into meiravg_introduce_com…
meiravgri Sep 11, 2024
ff6518c
fix assert
meiravgri Sep 11, 2024
170d615
fix leak
meiravgri Sep 11, 2024
e7529f5
review fixes:
meiravgri Sep 14, 2024
a4da462
IndexComputerExtended
meiravgri Sep 14, 2024
1dbf547
use override instead of redeclaration as virtual
meiravgri Sep 15, 2024
899aa74
simplify preprocess logic
meiravgri Sep 18, 2024
3c9fedd
remove all query wrappers API.
meiravgri Sep 19, 2024
a177538
add a test to tiered index with hnsw preprocessor
meiravgri Sep 19, 2024
b3c1456
add comment in computer::preprocess to emphasize that at least one bl…
meiravgri Sep 24, 2024
3962e05
review changes:
meiravgri Oct 9, 2024
5ef9be1
revert metric change in HNSWTieredIndexTest, CreateIndexInstance
meiravgri Oct 9, 2024
d1be229
rename factory_utils.h to computer_factory.h
meiravgri Oct 9, 2024
07182c3
add another component to encapsulate the preprocessors logic: Preproc…
meiravgri Oct 15, 2024
592deca
avoid dynamic allocation of containers in tests when possible
meiravgri Oct 15, 2024
64d1199
Merge branch 'main' into meiravg_introduce_computer
meiravgri Oct 15, 2024
44a16c5
remove index computer
meiravgri Oct 20, 2024
d6627eb
format with clang-format-14
meiravgri Oct 21, 2024
a78e8b5
add formatted bindings file
meiravgri Oct 21, 2024
4614a8c
fix
meiravgri Oct 21, 2024
ee434a4
ADD PP TO THE TIERED INDEX - THIS COMMIT WILL BE REVERTED!
meiravgri Oct 21, 2024
15028fe
Revert "ADD PP TO THE TIERED INDEX - THIS COMMIT WILL BE REVERTED!"
meiravgri Oct 21, 2024
d67a1aa
better names for preprocessing tests in tiered unit tests
meiravgri Oct 21, 2024
b683ae4
run with macos
meiravgri Oct 21, 2024
b63b3b5
remove addvector from VecSimIndexAbstract
meiravgri Oct 21, 2024
04ca716
disable flow temp
meiravgri Oct 21, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions .github/workflows/flow-temp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,19 @@ name: temporary testing

on:
push:
branches-ignore: ['**'] # ignore all branches. Comment this line to run your workflow below on every push.
# branches-ignore: ['**'] # ignore all branches. Comment this line to run your workflow below on every push.
jobs:
jammy:
uses: ./.github/workflows/task-unit-test.yml
with:
container: ubuntu:jammy
run-valgrind: true
alpine3:
uses: ./.github/workflows/task-unit-test.yml
with:
container: alpine:3
pre-checkout-script: apk add bash
run-valgrind: true
# alpine3:
# uses: ./.github/workflows/task-unit-test.yml
# with:
# container: alpine:3
# pre-checkout-script: apk add bash
# run-valgrind: true
# focal:
# uses: ./.github/workflows/task-unit-test.yml
# with:
Expand Down
3 changes: 1 addition & 2 deletions src/VecSim/algorithms/brute_force/bfm_batch_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,12 @@ class BFM_BatchIterator : public BF_BatchIterator<DataType, DistType> {

idType curr_id = 0;
auto vectors_it = this->index->getVectorsIterator();
auto DistFunc = this->index->getDistFunc();
while (auto *vector = vectors_it->next()) {
// Compute the scores for every vector and extend the scores array.
if (VECSIM_TIMEOUT(this->getTimeoutCtx())) {
return VecSim_QueryReply_TimedOut;
}
auto score = DistFunc(vector, this->getQueryBlob(), this->index->getDim());
auto score = this->index->calcDistance(vector, this->getQueryBlob());
labelType curr_label = this->index->getVectorLabel(curr_id);
auto curr_pair = tmp_scores.find(curr_label);
// For each score, emplace or update the score of the label.
Expand Down
3 changes: 1 addition & 2 deletions src/VecSim/algorithms/brute_force/bfs_batch_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,12 @@ class BFS_BatchIterator : public BF_BatchIterator<DataType, DistType> {

idType curr_id = 0;
auto vectors_it = this->index->getVectorsIterator();
auto DistFunc = this->index->getDistFunc();
while (auto *vector = vectors_it->next()) {
// Compute the scores for every vector and extend the scores array.
if (VECSIM_TIMEOUT(this->getTimeoutCtx())) {
return VecSim_QueryReply_TimedOut;
}
auto score = DistFunc(vector, this->getQueryBlob(), this->index->getDim());
auto score = this->index->calcDistance(vector, this->getQueryBlob());
this->scores.emplace_back(score, this->index->getVectorLabel(curr_id));
++curr_id;
}
Expand Down
23 changes: 16 additions & 7 deletions src/VecSim/algorithms/brute_force/brute_force.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ class BruteForceIndex : public VecSimIndexAbstract<DataType, DistType> {
idType count;

public:
BruteForceIndex(const BFParams *params, const AbstractIndexInitParams &abstractInitParams);
BruteForceIndex(const BFParams *params, const AbstractIndexInitParams &abstractInitParams,
const IndexComponents<DataType, DistType> &components);

size_t indexSize() const override;
size_t indexCapacity() const override;
Expand Down Expand Up @@ -138,8 +139,9 @@ class BruteForceIndex : public VecSimIndexAbstract<DataType, DistType> {
/******************** Ctor / Dtor **************/
template <typename DataType, typename DistType>
BruteForceIndex<DataType, DistType>::BruteForceIndex(
const BFParams *params, const AbstractIndexInitParams &abstractInitParams)
: VecSimIndexAbstract<DataType, DistType>(abstractInitParams),
const BFParams *params, const AbstractIndexInitParams &abstractInitParams,
const IndexComponents<DataType, DistType> &components)
: VecSimIndexAbstract<DataType, DistType>(abstractInitParams, components),
idToLabelMapping(this->allocator), count(0) {
assert(VecSimType_sizeof(this->vecType) == sizeof(DataType));
// Round up the initial capacity to the nearest multiple of the block size.
Expand All @@ -153,6 +155,7 @@ BruteForceIndex<DataType, DistType>::BruteForceIndex(

template <typename DataType, typename DistType>
void BruteForceIndex<DataType, DistType>::appendVector(const void *vector_data, labelType label) {
auto processed_blob = this->preprocessForStorage(vector_data);
// Give the vector new id and increase count.
idType id = this->count++;

Expand All @@ -161,7 +164,7 @@ void BruteForceIndex<DataType, DistType>::appendVector(const void *vector_data,
growByBlock();
}
// add vector data to vector raw data container
vectors->addElement(vector_data, id);
vectors->addElement(processed_blob.get(), id);

// add label to idToLabelMapping
setVectorLabel(id, label);
Expand Down Expand Up @@ -230,6 +233,8 @@ BruteForceIndex<DataType, DistType>::topKQuery(const void *queryBlob, size_t k,
return rep;
}

auto processed_query_ptr = this->preprocessQuery(queryBlob);
const void *processed_query = processed_query_ptr.get();
DistType upperBound = std::numeric_limits<DistType>::lowest();
vecsim_stl::abstract_priority_queue<DistType, labelType> *TopCandidates =
getNewMaxPriorityQueue();
Expand All @@ -243,7 +248,7 @@ BruteForceIndex<DataType, DistType>::topKQuery(const void *queryBlob, size_t k,
delete TopCandidates;
return rep;
}
auto score = this->distFunc(vector, queryBlob, this->dim);
auto score = this->calcDistance(vector, processed_query);
// If we have less than k or a better score, insert it.
if (score < upperBound || TopCandidates->size() < k) {
TopCandidates->emplace(score, getVectorLabel(curr_id));
Expand All @@ -270,6 +275,7 @@ template <typename DataType, typename DistType>
VecSimQueryReply *
BruteForceIndex<DataType, DistType>::rangeQuery(const void *queryBlob, double radius,
VecSimQueryParams *queryParams) const {
auto processed_query_ptr = this->preprocessQuery(queryBlob);
auto rep = new VecSimQueryReply(this->allocator);
void *timeoutCtx = queryParams ? queryParams->timeoutCtx : nullptr;
this->lastMode = RANGE_QUERY;
Expand All @@ -281,12 +287,13 @@ BruteForceIndex<DataType, DistType>::rangeQuery(const void *queryBlob, double ra
DistType radius_ = DistType(radius);
auto vectors_it = vectors->getIterator();
idType curr_id = 0;
const void *processed_query = processed_query_ptr.get();
while (vectors_it->hasNext()) {
if (VECSIM_TIMEOUT(timeoutCtx)) {
rep->code = VecSim_QueryReply_TimedOut;
break;
}
auto score = this->distFunc(vectors_it->next(), queryBlob, this->dim);
auto score = this->calcDistance(vectors_it->next(), processed_query);
if (score <= radius_) {
res_container->emplace(getVectorLabel(curr_id), score);
}
Expand Down Expand Up @@ -342,8 +349,10 @@ template <typename DataType, typename DistType>
VecSimBatchIterator *
BruteForceIndex<DataType, DistType>::newBatchIterator(const void *queryBlob,
VecSimQueryParams *queryParams) const {
auto *queryBlobCopy = this->allocator->allocate(sizeof(DataType) * this->dim);
auto *queryBlobCopy =
this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment());
memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType));
this->preprocessQueryInPlace(queryBlobCopy);
// Ownership of queryBlobCopy moves to BF_BatchIterator that will free it at the end.
return newBatchIterator_Instance(queryBlobCopy, queryParams);
}
Expand Down
12 changes: 6 additions & 6 deletions src/VecSim/algorithms/brute_force/brute_force_multi.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@ class BruteForceIndex_Multi : public BruteForceIndex<DataType, DistType> {
vecsim_stl::unordered_map<labelType, vecsim_stl::vector<idType>> labelToIdsLookup;

public:
BruteForceIndex_Multi(const BFParams *params, const AbstractIndexInitParams &abstractInitParams)
: BruteForceIndex<DataType, DistType>(params, abstractInitParams),
BruteForceIndex_Multi(const BFParams *params, const AbstractIndexInitParams &abstractInitParams,
const IndexComponents<DataType, DistType> &components)
: BruteForceIndex<DataType, DistType>(params, abstractInitParams, components),
labelToIdsLookup(this->allocator) {}

~BruteForceIndex_Multi() = default;

int addVector(const void *vector_data, labelType label, void *auxiliaryCtx = nullptr) override;
int addVector(const void *vector_data, labelType label) override;
int deleteVector(labelType labelType) override;
int deleteVectorById(labelType label, idType id) override;
double getDistanceFrom_Unsafe(labelType label, const void *vector_data) const override;
Expand Down Expand Up @@ -93,8 +94,7 @@ class BruteForceIndex_Multi : public BruteForceIndex<DataType, DistType> {
/******************************* Implementation **********************************/

template <typename DataType, typename DistType>
int BruteForceIndex_Multi<DataType, DistType>::addVector(const void *vector_data, labelType label,
void *auxiliaryCtx) {
int BruteForceIndex_Multi<DataType, DistType>::addVector(const void *vector_data, labelType label) {
this->appendVector(vector_data, label);
return 1;
}
Expand Down Expand Up @@ -203,7 +203,7 @@ BruteForceIndex_Multi<DataType, DistType>::getDistanceFrom_Unsafe(labelType labe

DistType dist = std::numeric_limits<DistType>::infinity();
for (auto id : IDs->second) {
DistType d = this->distFunc(this->getDataByInternalId(id), vector_data, this->dim);
DistType d = this->calcDistance(this->getDataByInternalId(id), vector_data);
dist = (dist < d) ? dist : d;
}

Expand Down
16 changes: 9 additions & 7 deletions src/VecSim/algorithms/brute_force/brute_force_single.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@ class BruteForceIndex_Single : public BruteForceIndex<DataType, DistType> {

public:
BruteForceIndex_Single(const BFParams *params,
const AbstractIndexInitParams &abstractInitParams);
const AbstractIndexInitParams &abstractInitParams,
const IndexComponents<DataType, DistType> &components);
~BruteForceIndex_Single() = default;

int addVector(const void *vector_data, labelType label, void *auxiliaryCtx = nullptr) override;
int addVector(const void *vector_data, labelType label) override;
int deleteVector(labelType label) override;
int deleteVectorById(labelType label, idType id) override;
double getDistanceFrom_Unsafe(labelType label, const void *vector_data) const override;
Expand Down Expand Up @@ -97,13 +98,14 @@ class BruteForceIndex_Single : public BruteForceIndex<DataType, DistType> {

template <typename DataType, typename DistType>
BruteForceIndex_Single<DataType, DistType>::BruteForceIndex_Single(
const BFParams *params, const AbstractIndexInitParams &abstractInitParams)
: BruteForceIndex<DataType, DistType>(params, abstractInitParams),
const BFParams *params, const AbstractIndexInitParams &abstractInitParams,
const IndexComponents<DataType, DistType> &components)
: BruteForceIndex<DataType, DistType>(params, abstractInitParams, components),
labelToIdLookup(this->allocator) {}

template <typename DataType, typename DistType>
int BruteForceIndex_Single<DataType, DistType>::addVector(const void *vector_data, labelType label,
void *auxiliaryCtx) {
int BruteForceIndex_Single<DataType, DistType>::addVector(const void *vector_data,
labelType label) {

auto optionalID = this->labelToIdLookup.find(label);
// Check if label already exists, so it is an update operation.
Expand Down Expand Up @@ -178,5 +180,5 @@ BruteForceIndex_Single<DataType, DistType>::getDistanceFrom_Unsafe(labelType lab
}
idType id = optionalId->second;

return this->distFunc(this->getDataByInternalId(id), vector_data, this->dim);
return this->calcDistance(this->getDataByInternalId(id), vector_data);
}
2 changes: 1 addition & 1 deletion src/VecSim/algorithms/hnsw/graph_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ struct ElementLevelData {
idType links[];

explicit ElementLevelData(std::shared_ptr<VecSimAllocator> allocator)
: incomingUnidirectionalEdges(new(allocator) vecsim_stl::vector<idType>(allocator)),
: incomingUnidirectionalEdges(new (allocator) vecsim_stl::vector<idType>(allocator)),
numLinks(0) {}

linkListSize getNumLinks() const { return this->numLinks; }
Expand Down
Loading