Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

C++ Improvements - API enhancement and increase testing #85

Merged
merged 8 commits into from
Sep 10, 2024
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ java/classpath.txt
java/linux-build/include/*
python/voyager-headers
.asv/
*.dSYM

# Cmake
CMakeLists.txt.user
Expand Down
9 changes: 9 additions & 0 deletions cpp/src/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,11 @@ class Index {

/**
 * Add a single vector to the index.
 *
 * @param vector the vector to add.
 * @param id     optional label for the vector. NOTE(review): what happens when
 *               no id is supplied is decided by the implementation - confirm
 *               there.
 * @return a label for the added vector (presumably the one it was stored
 *         under - TODO confirm against the implementation).
 */
virtual hnswlib::labeltype addItem(std::vector<float> vector,
std::optional<hnswlib::labeltype> id) = 0;

/**
 * Add several vectors at once, supplied as a vector of rows.
 *
 * Convenience overload of the NDArray-based addItems for callers that hold
 * their data in nested std::vectors.
 *
 * @param input      one inner vector per item to add.
 * @param ids        labels for the items; defaults to an empty list.
 *                   NOTE(review): presumably labels are auto-assigned when
 *                   empty - confirm against the implementation.
 * @param numThreads defaults to -1. NOTE(review): presumably "let the
 *                   implementation choose" - confirm.
 * @return one label per added item (TODO confirm ordering).
 */
virtual std::vector<hnswlib::labeltype>
addItems(std::vector<std::vector<float>> input,
std::vector<hnswlib::labeltype> ids = {}, int numThreads = -1) = 0;

/**
 * Add several vectors at once, supplied as a 2-D NDArray of floats.
 *
 * @param input      the vectors to add. NOTE(review): assumed to be shaped
 *                   (number of vectors, number of dimensions) - confirm.
 * @param ids        labels for the items; defaults to an empty list.
 * @param numThreads defaults to -1. NOTE(review): presumably "let the
 *                   implementation choose" - confirm.
 * @return a list of labels (presumably one per added item - TODO confirm).
 */
virtual std::vector<hnswlib::labeltype>
addItems(NDArray<float, 2> input, std::vector<hnswlib::labeltype> ids = {},
int numThreads = -1) = 0;
Expand All @@ -86,6 +91,10 @@ class Index {
/**
 * Query the index with a single vector.
 *
 * Note that the third positional argument here is queryEf, whereas in the
 * batch overloads of query() the third positional argument is numThreads.
 *
 * @param queryVector the vector to search for.
 * @param k           number of neighbors requested; defaults to 1.
 * @param queryEf     defaults to -1. NOTE(review): presumably "use the
 *                    index's configured ef" - confirm.
 * @return a tuple of (labels, distances) for the neighbors found.
 */
virtual std::tuple<std::vector<hnswlib::labeltype>, std::vector<float>>
query(std::vector<float> queryVector, int k = 1, long queryEf = -1) = 0;

/**
 * Query the index with several vectors at once, supplied as a vector of rows.
 *
 * Convenience overload of the NDArray-based batch query for callers that hold
 * their data in nested std::vectors.
 *
 * @param queryVectors one inner vector per query.
 * @param k            number of neighbors requested per query; defaults to 1.
 * @param numThreads   defaults to -1. NOTE(review): presumably "let the
 *                     implementation choose" - confirm.
 * @param queryEf      defaults to -1. NOTE(review): presumably "use the
 *                     index's configured ef" - confirm.
 * @return a tuple of 2-D arrays (labels, distances). NOTE(review): expected
 *         shape is (number of queries, k) - confirm against the
 *         implementation.
 */
virtual std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<float, 2>>
query(std::vector<std::vector<float>> queryVectors, int k = 1,
int numThreads = -1, long queryEf = -1) = 0;

/**
 * Query the index with several vectors at once, supplied as a 2-D NDArray.
 *
 * @param queryVectors the query vectors. NOTE(review): assumed to be shaped
 *                     (number of queries, number of dimensions) - confirm.
 * @param k            number of neighbors requested per query; defaults to 1.
 * @param numThreads   defaults to -1. NOTE(review): presumably "let the
 *                     implementation choose" - confirm.
 * @param queryEf      defaults to -1. NOTE(review): presumably "use the
 *                     index's configured ef" - confirm.
 * @return a tuple of 2-D arrays (labels, distances).
 */
virtual std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<float, 2>>
query(NDArray<float, 2> queryVectors, int k = 1, int numThreads = -1,
long queryEf = -1) = 0;
Expand Down
12 changes: 12 additions & 0 deletions cpp/src/TypedIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,12 @@ class TypedIndex : public Index {
return addItems(NDArray<float, 2>(vector, {1, (int)vector.size()}), ids)[0];
}

/**
 * Add several vectors supplied as nested std::vectors.
 *
 * Flattens the rows into an NDArray<float, 2> with vectorsToNDArray() and
 * forwards to the NDArray overload of addItems(); the returned labels are
 * whatever that overload returns.
 *
 * @param vectors    one inner vector per item; vectorsToNDArray() throws
 *                   std::invalid_argument if the rows differ in length.
 * @param ids        labels for the items, passed through unchanged.
 * @param numThreads passed through unchanged.
 */
std::vector<hnswlib::labeltype>
addItems(const std::vector<std::vector<float>> vectors,
std::vector<hnswlib::labeltype> ids = {}, int numThreads = -1) {
// Delegate so that the NDArray overload remains the single implementation.
return addItems(vectorsToNDArray(vectors), ids, numThreads);
}

std::vector<hnswlib::labeltype>
addItems(NDArray<float, 2> floatInput,
std::vector<hnswlib::labeltype> ids = {}, int numThreads = -1) {
Expand Down Expand Up @@ -502,6 +508,12 @@ class TypedIndex : public Index {
return algorithmImpl->label_lookup_;
}

/**
 * Batch query with the query vectors supplied as nested std::vectors.
 *
 * Flattens the rows into an NDArray<float, 2> with vectorsToNDArray() and
 * forwards to the NDArray overload of query(); the returned (labels,
 * distances) tuple is whatever that overload returns.
 *
 * @param floatQueryVectors one inner vector per query; vectorsToNDArray()
 *                          throws std::invalid_argument if the rows differ in
 *                          length.
 * @param k                 passed through unchanged.
 * @param numThreads        passed through unchanged.
 * @param queryEf           passed through unchanged.
 */
std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<dist_t, 2>>
query(std::vector<std::vector<float>> floatQueryVectors, int k = 1,
int numThreads = -1, long queryEf = -1) {
// Delegate so that the NDArray overload remains the single implementation.
return query(vectorsToNDArray(floatQueryVectors), k, numThreads, queryEf);
}

std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<dist_t, 2>>
query(NDArray<float, 2> floatQueryVectors, int k = 1, int numThreads = -1,
long queryEf = -1) {
Expand Down
27 changes: 27 additions & 0 deletions cpp/src/array_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -309,3 +309,30 @@ std::string toFloatVectorString(std::vector<data_t> vec) {
return toFloatVectorString<dist_t, data_t, scalefactor>(vec.data(),
vec.size());
}

/**
 * Convert a 2D vector of float to NDArray<float, 2>.
 *
 * The rows are stored back to back (row-major) in the resulting array, whose
 * shape is {number of rows, length of the first row}. An empty input yields a
 * {0, 0} array.
 *
 * Declared inline because the function is defined in a header: without it,
 * including this header from more than one translation unit produces
 * multiple-definition link errors.
 *
 * @param vectors the rows to flatten; every row must have the same length as
 *                the first one. Taken by const reference so the (potentially
 *                large) input is not deep-copied just to be read.
 * @return an NDArray holding a copy of the input values.
 * @throws std::invalid_argument if any row's length differs from the length
 *         of the first row.
 */
inline NDArray<float, 2>
vectorsToNDArray(const std::vector<std::vector<float>> &vectors) {
  const std::size_t numVectors = vectors.size();
  const std::size_t dimensions = numVectors > 0 ? vectors[0].size() : 0;

  // Flatten the 2D input into the NDArray's underlying 1D vector. Reserving
  // up front means the appends below never reallocate.
  std::vector<float> flatArray;
  flatArray.reserve(numVectors * dimensions);
  for (const auto &row : vectors) {
    // Check that all provided vectors are the same size, using the first
    // vector as the reference.
    if (row.size() != dimensions) {
      throw std::invalid_argument("All vectors must be of the same size, but "
                                  "received vectors of size: " +
                                  std::to_string(dimensions) + " and " +
                                  std::to_string(row.size()) + ".");
    }
    // A range insert is well defined for empty rows too, unlike memcpy with
    // null source/destination pointers.
    flatArray.insert(flatArray.end(), row.begin(), row.end());
  }

  // NDArray shapes are expressed in int, so narrow explicitly.
  const std::array<int, 2> shape = {static_cast<int>(numVectors),
                                    static_cast<int>(dimensions)};
  return NDArray<float, 2>(flatArray, shape);
}
3 changes: 3 additions & 0 deletions cpp/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ set(TEST_FILES test_main.cpp doctest_setup.cpp) # Add any test files here
# Create an executable for the tests
add_executable(VoyagerTests ${TEST_FILES})

# Add compiler flags: -g builds the test executable with debugging symbols so
# that it can be used with a debugger.
target_compile_options(VoyagerTests PRIVATE -g)
Comment on lines +7 to +8
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The -g flag builds the executable with debugging symbols so that it can be used with a debugger.


# Link the test executable with the main project and Doctest
# target_link_libraries(MyProjectTests PRIVATE MyProject doctest::doctest)
target_link_libraries(VoyagerTests
Expand Down
229 changes: 197 additions & 32 deletions cpp/test/test_main.cpp
Original file line number Diff line number Diff line change
@@ -1,53 +1,218 @@
#include "doctest.h"

#include "TypedIndex.h"
#include "test_utils.cpp"
#include <tuple>
#include <type_traits>

template <typename dist_t, typename data_t = dist_t,
typename scalefactor = std::ratio<1, 1>>
void testCombination(TypedIndex<dist_t, data_t, scalefactor> &index,
SpaceType spaceType, int numDimensions,
StorageDataType storageType) {
CHECK(toString(index.getSpace()) == toString(spaceType));
CHECK(index.getNumDimensions() == numDimensions);
CHECK(toString(index.getStorageDataType()) == toString(storageType));
void testIndexProperties(TypedIndex<dist_t, data_t, scalefactor> &index,
SpaceType spaceType, int numDimensions,
StorageDataType storageType) {
REQUIRE(toString(index.getSpace()) == toString(spaceType));
REQUIRE(index.getNumDimensions() == numDimensions);
REQUIRE(toString(index.getStorageDataType()) == toString(storageType));
}

TEST_CASE("Test combinations of different instantiations and sizes") {
std::vector<SpaceType> spaceTypesSet = {SpaceType::Euclidean,
SpaceType::InnerProduct};
std::vector<int> numDimensionsSet = {4, 16, 128, 1024};
std::vector<int> numElementsSet = {100, 1000, 100000};
/**
 * Test the query method of the index. The index is populated with random
 * vectors, and then queried with the same vectors. The expected result is that
 * each vector's nearest neighbor is itself and that the distance is zero
 * (allowing for some precision error based on the storage type).
 *
 * Both query interfaces are exercised: one call per vector, and one bulk call
 * with all vectors. Each is repeated for queryEf = 100, 1000, ... while
 * queryEf <= numVectors.
 *
 * @param index                  the (empty) index under test; items are added
 *                               to it by this function.
 * @param numVectors             number of random vectors to add and query.
 * @param numDimensions          length of each generated vector.
 * @param spaceType              distance space of the index; the self-match
 *                               assertions are skipped for InnerProduct.
 * @param storageType            storage type of the index; selects how the
 *                               input data is generated, and the self-match
 *                               assertions are skipped for E4M3.
 * @param testSingleVectorMethod if true, populate the index with one addItem
 *                               call per vector; otherwise with one addItems
 *                               call.
 * @param precisionTolerance     absolute tolerance around zero allowed for the
 *                               distance between a vector and itself.
 */
template <typename dist_t, typename data_t = dist_t,
          typename scalefactor = std::ratio<1, 1>>
void testQuery(TypedIndex<dist_t, data_t, scalefactor> &index, int numVectors,
               int numDimensions, SpaceType spaceType,
               StorageDataType storageType, bool testSingleVectorMethod,
               float precisionTolerance) {
  /**
   * Create test data and ids. If we are using Float8 or E4M3 storage, quantize
   * the vector values, if we are using Float32 storage, keep the float values
   * as-is. We want to match the storage type use case with the input data.
   */
  std::vector<std::vector<float>> inputData;
  if (storageType == StorageDataType::Float8 ||
      storageType == StorageDataType::E4M3) {
    inputData = randomQuantizedVectors(numVectors, numDimensions);
  } else if (storageType == StorageDataType::Float32) {
    inputData = randomVectors(numVectors, numDimensions);
  }
  // A storage type handled by neither branch would leave inputData empty and
  // make every inputData[i] below an out-of-bounds read; fail loudly instead.
  REQUIRE(inputData.size() == static_cast<std::size_t>(numVectors));

  std::vector<hnswlib::labeltype> ids(numVectors);
  for (int i = 0; i < numVectors; i++) {
    ids[i] = i;
  }

  // add items to index
  if (testSingleVectorMethod) {
    for (auto id : ids) {
      index.addItem(inputData[id], id);
    }
  } else {
    index.addItems(inputData, ids, -1);
  }

  const int k = 1;
  const float lowerBound = 0.0f - precisionTolerance;
  const float upperBound = 0.0f + precisionTolerance;

  /**
   * E4M3 is too low precision for us to confidently assume that querying
   * with the unquantized (fp32) vector will return the quantized vector as
   * its NN. InnerProduct will have negative distance to the closest item,
   * not zero. Neither input changes while querying, so decide once up front
   * whether the self-match assertions apply.
   */
  const bool expectSelfMatch = storageType != StorageDataType::E4M3 &&
                               spaceType != SpaceType::InnerProduct;

  // Use the single-query interface (query with a single target vector)
  for (long queryEf = 100; queryEf <= numVectors; queryEf *= 10) {
    for (int i = 0; i < numVectors; i++) {

      /**
       * Use the raw inputData as target vectors for querying. We don't use the
       * index data because once data has been added to the index, the model can
       * change the "ground truth" by changing the data format.
       */
      const auto &targetVector = inputData[i];
      const auto nearestNeighbor = index.query(targetVector, k, queryEf);

      const auto &labels = std::get<0>(nearestNeighbor);
      const auto &distances = std::get<1>(nearestNeighbor);
      REQUIRE(labels.size() == static_cast<std::size_t>(k));
      REQUIRE(distances.size() == static_cast<std::size_t>(k));

      if (expectSelfMatch) {
        REQUIRE(i == labels[0]);
        REQUIRE(distances[0] >= lowerBound);
        REQUIRE(distances[0] <= upperBound);
      }
    }
  }

  // Use the bulk-query interface (query with multiple target vectors at once)
  for (long queryEf = 100; queryEf <= numVectors; queryEf *= 10) {
    const auto nearestNeighbors = index.query(
        inputData, /* k= */ k, /* numThreads= */ -1, /* queryEf= */ queryEf);
    const NDArray<hnswlib::labeltype, 2> &labels =
        std::get<0>(nearestNeighbors);
    const NDArray<dist_t, 2> &distances = std::get<1>(nearestNeighbors);
    REQUIRE(labels.shape[0] == numVectors);
    REQUIRE(labels.shape[1] == k);
    REQUIRE(distances.shape[0] == numVectors);
    REQUIRE(distances.shape[1] == k);

    for (int i = 0; i < numVectors; i++) {
      // Results are (numVectors, k) arrays stored row by row, so the nearest
      // neighbor of query i sits at offset i * k. Indexing with the row
      // stride keeps this correct should k ever be raised above 1.
      const auto label = labels.data[i * k];
      const auto distance = distances.data[i * k];

      if (expectSelfMatch) {
        REQUIRE(i == label);
        REQUIRE(distance >= lowerBound);
        REQUIRE(distance <= upperBound);
      }
    }
  }
}

TEST_CASE("Test combinations of different instantiations. Test that each "
"vector's NN is itself and distance is approximately zero.") {
std::unordered_map<StorageDataType, float> PRECISION_TOLERANCE_PER_DATA_TYPE =
{{StorageDataType::Float32, 0.00001f},
{StorageDataType::Float8, 0.10f},
{StorageDataType::E4M3, 0.20f}};
std::vector<SpaceType> spaceTypesSet = {
SpaceType::Euclidean, SpaceType::InnerProduct, SpaceType::Cosine};
std::vector<int> numDimensionsSet = {32};
std::vector<int> numVectorsSet = {2000};
std::vector<StorageDataType> storageTypesSet = {
StorageDataType::Float8, StorageDataType::Float32, StorageDataType::E4M3};

auto count = 0;
std::vector<bool> testSingleVectorMethods = {true, false};

for (auto spaceType : spaceTypesSet) {
for (auto numDimensions : numDimensionsSet) {
for (auto numElements : numElementsSet) {
for (auto storageType : storageTypesSet) {
SUBCASE("Test instantiation ") {
CAPTURE(spaceType);
CAPTURE(numDimensions);
CAPTURE(numElements);
CAPTURE(storageType);

if (storageType == StorageDataType::Float8) {
auto index = TypedIndex<float, int8_t, std::ratio<1, 127>>(
spaceType, numDimensions);
testCombination(index, spaceType, numDimensions, storageType);
} else if (storageType == StorageDataType::Float32) {
auto index = TypedIndex<float>(spaceType, numDimensions);
testCombination(index, spaceType, numDimensions, storageType);
} else if (storageType == StorageDataType::E4M3) {
auto index = TypedIndex<float, E4M3>(spaceType, numDimensions);
testCombination(index, spaceType, numDimensions, storageType);
for (auto storageType : storageTypesSet) {
for (auto numDimensions : numDimensionsSet) {
for (auto numVectors : numVectorsSet) {
for (auto testSingleVectorMethod : testSingleVectorMethods) {

SUBCASE("Test instantiation ") {
CAPTURE(spaceType);
CAPTURE(numDimensions);
CAPTURE(numVectors);
CAPTURE(storageType);
CAPTURE(testSingleVectorMethod);

if (storageType == StorageDataType::Float8) {
auto index = TypedIndex<float, int8_t, std::ratio<1, 127>>(
spaceType, numDimensions);
testIndexProperties(index, spaceType, numDimensions,
storageType);
testQuery(index, numVectors, numDimensions, spaceType,
storageType, testSingleVectorMethod,
PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]);
} else if (storageType == StorageDataType::Float32) {
auto index = TypedIndex<float>(spaceType, numDimensions);
testIndexProperties(index, spaceType, numDimensions,
storageType);
testQuery(index, numVectors, numDimensions, spaceType,
storageType, testSingleVectorMethod,
PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]);
} else if (storageType == StorageDataType::E4M3) {
auto index = TypedIndex<float, E4M3>(spaceType, numDimensions);
testIndexProperties(index, spaceType, numDimensions,
storageType);
testQuery(index, numVectors, numDimensions, spaceType,
storageType, testSingleVectorMethod,
PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]);
}
}
}
}
}
}
}
}

TEST_CASE("Test vectorsToNDArray converts 2D vector of float to NDArray<float, "
          "2>") {
  // Three rows of four values; flattened row by row they read 1, 2, ..., 12.
  std::vector<std::vector<float>> rows = {{1.0f, 2.0f, 3.0f, 4.0f},
                                          {5.0f, 6.0f, 7.0f, 8.0f},
                                          {9.0f, 10.0f, 11.0f, 12.0f}};
  NDArray<float, 2> converted = vectorsToNDArray(rows);

  // Shape must be {rows, columns}.
  REQUIRE(converted.shape.size() == 2);
  REQUIRE(converted.shape[0] == 3);
  REQUIRE(converted.shape[1] == 4);

  // The flat storage must hold every value in row-major order.
  REQUIRE(converted.data.size() == 12);
  for (int flatIndex = 0; flatIndex < 12; ++flatIndex) {
    REQUIRE(converted.data[flatIndex] == static_cast<float>(flatIndex + 1));
  }

  // Indexing by row must point at the first element of that row.
  for (int row = 0; row < 3; ++row) {
    REQUIRE(*converted[row] == static_cast<float>(row * 4 + 1));
  }
}

TEST_CASE("Test vectorsToNDArray throws error if vectors are not of the same "
          "size") {
  // Each entry is an input whose rows disagree in length: first a short row
  // in the middle, then a short first row (the reference row).
  const std::vector<std::vector<std::vector<float>>> raggedInputs = {
      {{1.0f, 2.0f, 3.0f, 4.0f}, {5.0f, 6.0f, 7.0f}, {9.0f, 10.0f, 11.0f, 12.0f}},
      {{1.0f}, {5.0f, 6.0f, 7.0f}, {9.0f, 10.0f, 11.0f}}};

  for (const auto &ragged : raggedInputs) {
    REQUIRE_THROWS_AS(vectorsToNDArray(ragged), std::invalid_argument);
  }
}
41 changes: 41 additions & 0 deletions cpp/test/test_utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#include <random>
#include <vector>

#include "array_utils.h"

// Build random test data intended for Float8 or E4M3 storage: numVectors rows
// of `dimensions` values, each truncated to a multiple of 0.1 between -1 and 1.
// The generator is seeded from std::random_device, so every call differs.
std::vector<std::vector<float>> randomQuantizedVectors(int numVectors,
                                                       int dimensions) {
  std::vector<std::vector<float>> result(numVectors,
                                         std::vector<float>(dimensions));

  std::random_device seedSource;
  std::mt19937 engine(seedSource());
  std::uniform_real_distribution<> unitInterval(0, 1.0);

  for (auto &row : result) {
    for (float &value : row) {
      // Map [0, 1) onto [-10, 10), then drop the fraction via the int cast so
      // that dividing by 10 leaves at most one decimal digit.
      const auto scaled = (unitInterval(engine) * 2 - 1) * 10.0f;
      value = static_cast<int>(scaled) / 10.0f;
    }
  }

  return result;
}

// Build random test data intended for Float32 storage: numVectors rows of
// `dimensions` values drawn uniformly and rescaled to lie between -1 and 1.
// The generator is seeded from std::random_device, so every call differs.
std::vector<std::vector<float>> randomVectors(int numVectors, int dimensions) {
  std::vector<std::vector<float>> result(numVectors,
                                         std::vector<float>(dimensions));

  std::random_device seedSource;
  std::mt19937 engine(seedSource());
  std::uniform_real_distribution<> unitInterval(0, 1.0);

  for (auto &row : result) {
    for (float &value : row) {
      // Narrow the sample to float first, then stretch [0, 1) to [-1, 1).
      const float sample = static_cast<float>(unitInterval(engine));
      value = sample * 2 - 1;
    }
  }

  return result;
}
Loading