Clustering #513

Merged 15 commits on Oct 29, 2024
1 change: 1 addition & 0 deletions .vscode/settings.json
@@ -157,6 +157,7 @@
"ivdep",
"jaccard",
"Jemalloc",
"kmeans",
"Kullback",
"Leibler",
"libjemalloc",
13 changes: 12 additions & 1 deletion BENCHMARKS.md
@@ -58,6 +58,8 @@ Within this repository you will find two commonly used utilities:
- `cpp/bench.cpp`, which produces the `bench_cpp` binary for broad USearch benchmarks.
- `python/bench.py` and `python/bench.ipynb` for interactive charts against FAISS.

### C++ Benchmarking Utilities

To achieve the best results, we suggest compiling locally for the target architecture.

```sh
# ... (build commands collapsed in the diff view) ...
```

@@ -147,11 +149,20 @@ build_profile/bench_cpp \
--cos
```


> Optional parameters include `connectivity`, `expansion_add`, `expansion_search`.

For Python, just open the Jupyter Notebook and start playing around.

### Python Benchmarking Utilities

Several benchmarking suites are available for Python: approximate search, exact search, and clustering.

```sh
python/scripts/bench.py --help
python/scripts/bench_exact.py --help
python/scripts/bench_cluster.py --help
```
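
As a quick orientation, below is a minimal sketch of the call that `bench_cluster.py` exercises, assuming a small random `float32` dataset. The `kmeans` helper from `usearch.index` returns the same (assignments, distances, centroids) triple that the script unpacks further down in this PR.

```py
import numpy as np
from usearch.index import kmeans

# Cluster 10,000 random 256-dimensional vectors into 16 groups
X = np.random.rand(10_000, 256).astype(np.float32)

# `assignments` maps every vector to its nearest centroid, `distances` holds
# the distance to that centroid, and `centroids` is a (16, 256) matrix
assignments, distances, centroids = kmeans(X, 16, max_iterations=100)
print(assignments.shape, distances.shape, centroids.shape)
```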

## Datasets

BigANN benchmark is a good starting point, if you are searching for large collections of high-dimensional vectors.
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -200,7 +200,7 @@ cibuildwheel --platform macos # works only on MacOS
cibuildwheel --platform windows # works only on Windows
```

-You may need root previligies for multi-architecture builds:
+You may need root privileges for multi-architecture builds:

```sh
sudo $(which cibuildwheel) --platform linux
16 changes: 16 additions & 0 deletions cpp/test.cpp
@@ -1100,6 +1100,22 @@ int main(int, char**) {
test_uint40();
test_cosine<float, std::int64_t, uint40_t>(10, 10);

// Test plugins, like K-Means clustering.
{
std::size_t vectors_count = 1000, centroids_count = 10, dimensions = 256;
kmeans_clustering_t clustering;
clustering.max_iterations = 2;
std::vector<float> vectors(vectors_count * dimensions), centroids(centroids_count * dimensions);
matrix_slice_gt<float const> vectors_slice(vectors.data(), dimensions, vectors_count);
matrix_slice_gt<float> centroids_slice(centroids.data(), dimensions, centroids_count);
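// Both slices view the raw buffers as row-major matrices: one row per vector, `dimensions` scalars each.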
std::generate(vectors.begin(), vectors.end(), [] { return float(std::rand()) / float(RAND_MAX); });
std::vector<std::size_t> assignments(vectors_count);
std::vector<distance_punned_t> distances(vectors_count);
auto clustering_result = clustering(vectors_slice, centroids_slice, {assignments.data(), assignments.size()},
{distances.data(), distances.size()});
expect(clustering_result);
}

// Exact search without constructing indexes.
// Great for validating the distance functions.
std::printf("Testing exact search\n");
2 changes: 1 addition & 1 deletion include/usearch/index.hpp
@@ -370,7 +370,7 @@ template <typename scalar_at, typename allocator_at = std::allocator<scalar_at>>
void reset() noexcept {
if (!std::is_trivially_destructible<scalar_at>::value)
for (std::size_t i = 0; i != size_; ++i)
-            destroy_at(data_ + i);
+            unum::usearch::destroy_at(data_ + i); //< Facing some symbol visibility/ambiguity issues
allocator_at{}.deallocate(data_, size_);
data_ = nullptr;
size_ = 0;
2 changes: 1 addition & 1 deletion include/usearch/index_dense.hpp
@@ -2254,4 +2254,4 @@ static join_result_t join( //
}

 } // namespace usearch
-} // namespace unum
\ No newline at end of file
+} // namespace unum
348 changes: 342 additions & 6 deletions include/usearch/index_plugins.hpp

Large diffs are not rendered by default.

139 changes: 116 additions & 23 deletions python/lib.cpp
@@ -452,7 +452,7 @@ static py::tuple search_many_in_index( //
}

/**
- * @brief Brute-force exact search implementation, compatible with
+ * @brief Brute-force @b exact search implementation, compatible with
* NumPy-like Tensors and other objects supporting Buffer Protocol.
*/
static py::tuple search_many_brute_force( //
@@ -545,6 +545,81 @@
return results;
}

/**
* @brief Brute-force @b K-Means clustering, compatible with
* NumPy-like Tensors and other objects supporting Buffer Protocol.
*/
static py::tuple cluster_many_brute_force( //
py::buffer dataset, //
std::size_t wanted, //
std::size_t max_iterations, //
double inertia_threshold, //
double max_seconds, //
double min_shifts, //
std::uint64_t seed, //
std::size_t threads, //
scalar_kind_t scalar_kind, //
metric_kind_t metric_kind, //
progress_func_t const& progress_func) {

using distance_t = typename kmeans_clustering_t::distance_t;
py::buffer_info dataset_info = dataset.request();
if (dataset_info.ndim != 2)
throw std::invalid_argument("Expects a matrix (rank-2 tensor) as the dataset to cluster!");

std::size_t dataset_count = static_cast<std::size_t>(dataset_info.shape[0]);
std::size_t dataset_dimensions = static_cast<std::size_t>(dataset_info.shape[1]);
std::size_t dataset_stride = static_cast<std::size_t>(dataset_info.strides[0]);
scalar_kind_t dataset_kind = numpy_string_to_kind(dataset_info.format);
std::size_t bytes_per_scalar = bits_per_scalar_word(dataset_kind) / CHAR_BIT;

std::vector<std::size_t> point_to_centroid_index(dataset_count, 0);
std::vector<distance_t> point_to_centroid_distance(dataset_count, 0);
std::vector<byte_t> centroids(wanted * dataset_dimensions * bytes_per_scalar, 0);

if (!threads)
threads = std::thread::hardware_concurrency();

// Dispatch brute-force K-Means clustering
progress_t progress{progress_func};
executor_default_t executor{threads};
kmeans_clustering_t engine;
engine.metric_kind = metric_kind;
engine.quantization_kind = scalar_kind;
engine.max_iterations = max_iterations;
engine.min_shifts = min_shifts;
engine.max_seconds = max_seconds;
engine.inertia_threshold = inertia_threshold;

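// Run the clustering over the raw byte buffers. The trailing lambda merges the
// user-supplied progress callback with PyErr_CheckSignals, so Ctrl+C can abort a long run.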
kmeans_clustering_result_t result = engine( //
reinterpret_cast<byte_t const*>(dataset_info.ptr), dataset_count, dataset_stride, //
centroids.data(), wanted, dataset_dimensions * bytes_per_scalar, //
point_to_centroid_index.data(), point_to_centroid_distance.data(), dataset_kind, dataset_dimensions, executor,
[&](std::size_t passed, std::size_t total) { return PyErr_CheckSignals() == 0 && progress(passed, total); });

if (!result)
throw std::runtime_error(result.error.release());

// The following constructor doesn't seem to be documented, but it's used in the `pybind11` source code:
// https://github.com/pybind/pybind11/blob/aeda49ed0b4e6e8abba7abc265ace86a6c26ba66/include/pybind11/numpy.h#L918-L919
// https://github.com/pybind/pybind11/blob/aeda49ed0b4e6e8abba7abc265ace86a6c26ba66/include/pybind11/buffer_info.h#L60-L75
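// Note that `buffer_info::strides` are expressed in bytes, hence the `bytes_per_scalar` factor below.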
py::buffer_info centroids_info;
centroids_info.ptr = reinterpret_cast<void*>(centroids.data());
centroids_info.itemsize = dataset_info.itemsize;
centroids_info.size = wanted * dataset_dimensions;
centroids_info.format = dataset_info.format;
centroids_info.ndim = 2;
centroids_info.shape = {wanted, dataset_dimensions};
centroids_info.strides = {dataset_dimensions * bytes_per_scalar, bytes_per_scalar};

py::tuple results(3);
results[0] = py::array_t<std::size_t>({dataset_count}, point_to_centroid_index.data());
results[1] = py::array_t<distance_t>({dataset_count}, point_to_centroid_distance.data());
results[2] = py::array(centroids_info);

return results;
}

template <typename scalar_at> struct rows_lookup_gt {
byte_t* data_;
std::size_t stride_;
@@ -936,16 +1011,33 @@ PYBIND11_MODULE(compiled, m) {
return index_metadata(meta);
});

m.def("exact_search", &search_many_brute_force, //
py::arg("dataset"), //
py::arg("queries"), //
py::arg("count") = 10, //
py::kw_only(), //
py::arg("threads") = 0, //
py::arg("metric_kind") = metric_kind_t::cos_k, //
py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
py::arg("metric_pointer") = 0, //
py::arg("progress") = nullptr //
m.def( //
"exact_search", &search_many_brute_force, //
py::arg("dataset"), //
py::arg("queries"), //
py::arg("count") = 10, //
py::kw_only(), //
py::arg("threads") = 0, //
py::arg("metric_kind") = metric_kind_t::cos_k, //
py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
py::arg("metric_pointer") = 0, //
py::arg("progress") = nullptr //
);

+    m.def( //
+        "kmeans", &cluster_many_brute_force, //
+        py::arg("dataset"), //
+        py::arg("count") = 10, //
+        py::kw_only(), //
+        py::arg("max_iterations") = kmeans_clustering_t::max_iterations_default_k, //
+        py::arg("inertia_threshold") = kmeans_clustering_t::inertia_threshold_default_k, //
+        py::arg("max_seconds") = kmeans_clustering_t::max_seconds_default_k, //
+        py::arg("min_shifts") = kmeans_clustering_t::min_shifts_default_k, //
+        py::arg("seed") = 0, //
+        py::arg("threads") = 0, //
+        py::arg("dtype") = scalar_kind_t::bf16_k, //
+        py::arg("metric_kind") = metric_kind_t::l2sq_k, //
+        py::arg("progress") = nullptr //
+    );

m.def(
@@ -961,18 +1053,19 @@

auto i = py::class_<dense_index_py_t, std::shared_ptr<dense_index_py_t>>(m, "Index");

-    i.def(py::init(&make_index), //
-          py::kw_only(), //
-          py::arg("ndim") = 0, //
-          py::arg("dtype") = scalar_kind_t::f32_k, //
-          py::arg("connectivity") = default_connectivity(), //
-          py::arg("expansion_add") = default_expansion_add(), //
-          py::arg("expansion_search") = default_expansion_search(), //
-          py::arg("metric_kind") = metric_kind_t::cos_k, //
-          py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
-          py::arg("metric_pointer") = 0, //
-          py::arg("multi") = false, //
-          py::arg("enable_key_lookups") = true //
+    i.def( //
+        py::init(&make_index), //
+        py::kw_only(), //
+        py::arg("ndim") = 0, //
+        py::arg("dtype") = scalar_kind_t::f32_k, //
+        py::arg("connectivity") = default_connectivity(), //
+        py::arg("expansion_add") = default_expansion_add(), //
+        py::arg("expansion_search") = default_expansion_search(), //
+        py::arg("metric_kind") = metric_kind_t::cos_k, //
+        py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
+        py::arg("metric_pointer") = 0, //
+        py::arg("multi") = false, //
+        py::arg("enable_key_lookups") = true //
);

i.def( //
136 changes: 136 additions & 0 deletions python/scripts/bench_cluster.py
@@ -0,0 +1,136 @@
#!/usr/bin/env python3
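"""Compare K-Means clustering backends (NumPy, FAISS, USearch) on a binary matrix file."""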
import os
import argparse

import numpy as np
import faiss
from tqdm import tqdm

import usearch
from usearch.index import kmeans
from usearch.io import load_matrix


def evaluate_clustering_euclidean(X, labels, centroids):
"""Evaluate clustering quality as average distance to centroids"""
distances = np.linalg.norm(X - centroids[labels], axis=1)
return np.mean(distances)


def evaluate_clustering_cosine(X, labels, centroids):
"""Evaluate clustering quality as average cosine distance to centroids"""

# Normalize both data points and centroids
X_normalized = X / np.linalg.norm(X, axis=1, keepdims=True)
centroids_normalized = centroids / np.linalg.norm(centroids, axis=1, keepdims=True)

# Compute cosine similarity using dot product
cosine_similarities = np.sum(X_normalized * centroids_normalized[labels], axis=1)

# Convert cosine similarity to cosine distance
cosine_distances = 1 - cosine_similarities

# Return the average cosine distance
return np.mean(cosine_distances)


def numpy_initialize_centroids(X, k):
"""Randomly choose k data points as initial centroids"""
indices = np.random.choice(X.shape[0], k, replace=False)
return X[indices]


def numpy_assign_clusters(X, centroids):
"""Assign each data point to the nearest centroid (numpy NumPy implementation)"""
distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
return np.argmin(distances, axis=1)


def numpy_update_centroids(X, labels, k):
"""Compute new centroids as the mean of all data points assigned to each cluster"""
return np.array([X[labels == i].mean(axis=0) for i in range(k)])


def cluster_with_numpy(X, k, max_iters=100, tol=1e-4):
centroids = numpy_initialize_centroids(X, k)

for i in tqdm(range(max_iters), desc="KMeans Iterations"):
labels = numpy_assign_clusters(X, centroids)
new_centroids = numpy_update_centroids(X, labels, k)

if np.linalg.norm(new_centroids - centroids) < tol:
break

centroids = new_centroids

return labels, centroids


def cluster_with_faiss(X, k, max_iters=100):
# Docs: https://github.com/facebookresearch/faiss/wiki/Faiss-building-blocks:-clustering,-PCA,-quantization
# Header: https://github.com/facebookresearch/faiss/blob/main/faiss/Clustering.h
# Source: https://github.com/facebookresearch/faiss/blob/main/faiss/Clustering.cpp
verbose = False
d: int = X.shape[1]
kmeans = faiss.Kmeans(d, k, niter=max_iters, verbose=verbose)
kmeans.train(X)
D, I = kmeans.index.search(X, 1)
return I.flatten(), kmeans.centroids


def cluster_with_usearch(X, k, max_iters=100):
assignments, _, centroids = kmeans(X, k, max_iterations=max_iters)
return assignments, centroids


def main():
parser = argparse.ArgumentParser(description="Compare KMeans clustering algorithms")
parser.add_argument("--vectors", type=str, required=True, help="Path to binary matrix file")
parser.add_argument("-k", default=10, type=int, required=True, help="Number of centroids")
parser.add_argument("-i", default=100, type=int, help="Upper bound on number of iterations")
parser.add_argument("-n", type=int, help="Upper bound on number of points to use")
parser.add_argument(
"--method",
type=str,
choices=["numpy", "faiss", "usearch"],
default="numpy",
help="Clustering backend",
)

args = parser.parse_args()

max_iters = args.i
X = load_matrix(args.vectors, count_rows=args.n)
k = args.k
method = args.method

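# Note: os.times()[0] is user CPU time, so multi-threaded backends like FAISS
# and USearch can accumulate more CPU seconds than wall-clock time.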
time_before = os.times()
if method == "usearch":
labels, centroids = cluster_with_usearch(X, k, max_iters=max_iters)
elif method == "faiss":
labels, centroids = cluster_with_faiss(X, k, max_iters=max_iters)
else:
labels, centroids = cluster_with_numpy(X, k, max_iters=max_iters)
time_after = os.times()
time_duration = time_after[0] - time_before[0]
print(f"Time: {time_duration:.2f}s, {time_duration / max_iters:.2f}s per iteration")

quality = evaluate_clustering_euclidean(X, labels, centroids)
quality_cosine = evaluate_clustering_cosine(X, labels, centroids)
print(f"Clustering quality (average distance to centroids): {quality:.4f}, cosine: {quality_cosine:.4f}")

# Let's compare it to some random uniform assignment
random_labels = np.random.randint(0, k, size=X.shape[0])
random_quality = evaluate_clustering_euclidean(X, random_labels, centroids)
random_quality_cosine = evaluate_clustering_cosine(X, random_labels, centroids)
print(f"... while random assignment quality: {random_quality:.4f}, cosine: {random_quality_cosine:.4f}")

cluster_sizes = np.unique(labels, return_counts=True)[1]
cluster_sizes_mean = np.mean(cluster_sizes)
cluster_sizes_stddev = np.std(cluster_sizes)
print(f"Cluster sizes: {cluster_sizes_mean:.2f} ± {cluster_sizes_stddev:.2f}")
print(cluster_sizes)


if __name__ == "__main__":
main()