Merge pull request #513 from unum-cloud/main-dev-cluster
Clustering
ashvardanian authored Oct 29, 2024
2 parents 0dac789 + 91c0bcb commit f9fc617
Showing 12 changed files with 754 additions and 38 deletions.
1 change: 1 addition & 0 deletions .vscode/settings.json
@@ -157,6 +157,7 @@
"ivdep",
"jaccard",
"Jemalloc",
"kmeans",
"Kullback",
"Leibler",
"libjemalloc",
13 changes: 12 additions & 1 deletion BENCHMARKS.md
@@ -58,6 +58,8 @@
Within this repository you will find two commonly used utilities:
- `cpp/bench.cpp`, which produces the `bench_cpp` binary for broad USearch benchmarks.
- `python/bench.py` and `python/bench.ipynb` for interactive charts against FAISS.

### C++ Benchmarking Utilities

To achieve the best results, we suggest compiling locally for the target architecture.

```sh
# … build commands collapsed in the diff view …
```

@@ -147,11 +149,20 @@

```sh
build_profile/bench_cpp \
    --cos
```

> Optional parameters include `connectivity`, `expansion_add`, and `expansion_search`; a hypothetical invocation is sketched below.
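
The flag names below are assumed to mirror those parameter names and may differ in the actual `bench_cpp` CLI:

```sh
build_profile/bench_cpp \
    --connectivity 16 \
    --expansion_add 128 \
    --expansion_search 64 \
    --cos
```
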
For Python, just open the Jupyter Notebook and start playing around.

### Python Benchmarking Utilities

Several benchmarking suites are available for Python: approximate search, exact search, and clustering.

```sh
python/scripts/bench.py --help
python/scripts/bench_exact.py --help
python/scripts/bench_cluster.py --help
```
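
For example, to cluster the first million points of a dataset into 1000 centroids with the USearch backend (the dataset path below is a placeholder):

```sh
python/scripts/bench_cluster.py --vectors datasets/base.fbin -n 1000000 -k 1000 -i 100 --method usearch
```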

## Datasets

The BigANN benchmark is a good starting point if you are looking for large collections of high-dimensional vectors.
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -200,7 +200,7 @@
cibuildwheel --platform macos # works only on MacOS
cibuildwheel --platform windows # works only on Windows
```

You may need root previligies for multi-architecture builds:
You may need root privileges for multi-architecture builds:

```sh
sudo $(which cibuildwheel) --platform linux
16 changes: 16 additions & 0 deletions cpp/test.cpp
@@ -1100,6 +1100,22 @@ int main(int, char**) {
test_uint40();
test_cosine<float, std::int64_t, uint40_t>(10, 10);

// Test plugins, like K-Means clustering.
{
std::size_t vectors_count = 1000, centroids_count = 10, dimensions = 256;
kmeans_clustering_t clustering;
clustering.max_iterations = 2;
std::vector<float> vectors(vectors_count * dimensions), centroids(centroids_count * dimensions);
matrix_slice_gt<float const> vectors_slice(vectors.data(), dimensions, vectors_count);
matrix_slice_gt<float> centroids_slice(centroids.data(), dimensions, centroids_count);
std::generate(vectors.begin(), vectors.end(), [] { return float(std::rand()) / float(INT_MAX); });
std::vector<std::size_t> assignments(vectors_count);
std::vector<distance_punned_t> distances(vectors_count);
auto clustering_result = clustering(vectors_slice, centroids_slice, {assignments.data(), assignments.size()},
{distances.data(), distances.size()});
expect(clustering_result);
}

// Exact search without constructing indexes.
// Great for validating the distance functions.
std::printf("Testing exact search\n");
2 changes: 1 addition & 1 deletion include/usearch/index.hpp
@@ -370,7 +370,7 @@ template <typename scalar_at, typename allocator_at = std::allocator<scalar_at>>
void reset() noexcept {
if (!std::is_trivially_destructible<scalar_at>::value)
for (std::size_t i = 0; i != size_; ++i)
destroy_at(data_ + i);
unum::usearch::destroy_at(data_ + i); //< Facing some symbol visibility/ambiguity issues
allocator_at{}.deallocate(data_, size_);
data_ = nullptr;
size_ = 0;
2 changes: 1 addition & 1 deletion include/usearch/index_dense.hpp
@@ -2254,4 +2254,4 @@ static join_result_t join( //
}

} // namespace usearch
} // namespace unum
} // namespace unum
348 changes: 342 additions & 6 deletions include/usearch/index_plugins.hpp

Large diffs are not rendered by default.
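
Since this file's diff is collapsed, here is a rough sketch of the configuration surface the new `kmeans_clustering_t` plugin appears to expose, inferred purely from the call sites in `cpp/test.cpp` and `python/lib.cpp` shown in this commit; the exact types, defaults, and signatures are assumptions, not the actual header:

```cpp
// Hypothetical reconstruction - the real definition lives in include/usearch/index_plugins.hpp.
struct kmeans_clustering_t {
    metric_kind_t metric_kind;       // distance to minimize, e.g. `metric_kind_t::l2sq_k`
    scalar_kind_t quantization_kind; // internal representation, e.g. `scalar_kind_t::bf16_k`
    std::size_t max_iterations;      // hard cap on the number of Lloyd's iterations
    double inertia_threshold;        // early exit once inertia stops improving
    double max_seconds;              // wall-clock budget for the whole run
    double min_shifts;               // early exit once few points change clusters

    // Typed overload, as exercised in `cpp/test.cpp`:
    //   clustering(vectors_slice, centroids_slice, assignments_span, distances_span);
    // Byte-level overload, as exercised in `python/lib.cpp`:
    //   clustering(dataset_ptr, count, stride, centroids_ptr, wanted, centroid_stride,
    //              indexes_ptr, distances_ptr, scalar_kind, dimensions, executor, progress);
};
```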

139 changes: 116 additions & 23 deletions python/lib.cpp
@@ -452,7 +452,7 @@ static py::tuple search_many_in_index( //
}

/**
* @brief Brute-force exact search implementation, compatible with
* @brief Brute-force @b exact search implementation, compatible with
* NumPy-like Tensors and other objects supporting Buffer Protocol.
*/
static py::tuple search_many_brute_force( //
@@ -545,6 +545,81 @@ static py::tuple search_many_brute_force( //
return results;
}

/**
* @brief Brute-force @b K-Means clustering, compatible with
* NumPy-like Tensors and other objects supporting Buffer Protocol.
*/
static py::tuple cluster_many_brute_force( //
py::buffer dataset, //
std::size_t wanted, //
std::size_t max_iterations, //
double inertia_threshold, //
double max_seconds, //
double min_shifts, //
std::uint64_t seed, //
std::size_t threads, //
scalar_kind_t scalar_kind, //
metric_kind_t metric_kind, //
progress_func_t const& progress_func) {

using distance_t = typename kmeans_clustering_t::distance_t;
py::buffer_info dataset_info = dataset.request();
if (dataset_info.ndim != 2)
throw std::invalid_argument("Expects a matrix (rank-2 tensor) of dataset to cluster!");

std::size_t dataset_count = static_cast<std::size_t>(dataset_info.shape[0]);
std::size_t dataset_dimensions = static_cast<std::size_t>(dataset_info.shape[1]);
std::size_t dataset_stride = static_cast<std::size_t>(dataset_info.strides[0]);
scalar_kind_t dataset_kind = numpy_string_to_kind(dataset_info.format);
std::size_t bytes_per_scalar = bits_per_scalar_word(dataset_kind) / CHAR_BIT;

std::vector<std::size_t> point_to_centroid_index(dataset_count, 0);
std::vector<distance_t> point_to_centroid_distance(dataset_count, 0);
std::vector<byte_t> centroids(wanted * dataset_dimensions * bytes_per_scalar, 0);

if (!threads)
threads = std::thread::hardware_concurrency();

// Dispatch brute-force K-Means clustering
progress_t progress{progress_func};
executor_default_t executor{threads};
kmeans_clustering_t engine;
engine.metric_kind = metric_kind;
engine.quantization_kind = scalar_kind;
engine.max_iterations = max_iterations;
engine.min_shifts = min_shifts;
engine.max_seconds = max_seconds;
engine.inertia_threshold = inertia_threshold;

kmeans_clustering_result_t result = engine( //
reinterpret_cast<byte_t const*>(dataset_info.ptr), dataset_count, dataset_stride, //
centroids.data(), wanted, dataset_dimensions * bytes_per_scalar, //
point_to_centroid_index.data(), point_to_centroid_distance.data(), dataset_kind, dataset_dimensions, executor,
[&](std::size_t passed, std::size_t total) { return PyErr_CheckSignals() == 0 && progress(passed, total); });

if (!result)
throw std::runtime_error(result.error.release());

// The following constructor doesn't seem to be documented, but it's used in the source code of `pybind11`:
// https://github.com/pybind/pybind11/blob/aeda49ed0b4e6e8abba7abc265ace86a6c26ba66/include/pybind11/numpy.h#L918-L919
// https://github.com/pybind/pybind11/blob/aeda49ed0b4e6e8abba7abc265ace86a6c26ba66/include/pybind11/buffer_info.h#L60-L75
py::buffer_info centroids_info;
centroids_info.ptr = reinterpret_cast<void*>(centroids.data());
centroids_info.itemsize = dataset_info.itemsize;
centroids_info.size = wanted * dataset_dimensions;
centroids_info.format = dataset_info.format;
centroids_info.ndim = 2;
centroids_info.shape = {wanted, dataset_dimensions};
centroids_info.strides = {dataset_dimensions * bytes_per_scalar, bytes_per_scalar};

py::tuple results(3);
results[0] = py::array_t<std::size_t>({dataset_count}, point_to_centroid_index.data());
results[1] = py::array_t<distance_t>({dataset_count}, point_to_centroid_distance.data());
results[2] = py::array(centroids_info);

return results;
}

template <typename scalar_at> struct rows_lookup_gt {
byte_t* data_;
std::size_t stride_;
@@ -936,16 +1011,33 @@ PYBIND11_MODULE(compiled, m) {
return index_metadata(meta);
});

m.def("exact_search", &search_many_brute_force, //
py::arg("dataset"), //
py::arg("queries"), //
py::arg("count") = 10, //
py::kw_only(), //
py::arg("threads") = 0, //
py::arg("metric_kind") = metric_kind_t::cos_k, //
py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
py::arg("metric_pointer") = 0, //
py::arg("progress") = nullptr //
m.def( //
"exact_search", &search_many_brute_force, //
py::arg("dataset"), //
py::arg("queries"), //
py::arg("count") = 10, //
py::kw_only(), //
py::arg("threads") = 0, //
py::arg("metric_kind") = metric_kind_t::cos_k, //
py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
py::arg("metric_pointer") = 0, //
py::arg("progress") = nullptr //
);

m.def( //
"kmeans", &cluster_many_brute_force, //
py::arg("dataset"), //
py::arg("count") = 10, //
py::kw_only(), //
py::arg("max_iterations") = kmeans_clustering_t::max_iterations_default_k, //
py::arg("inertia_threshold") = kmeans_clustering_t::inertia_threshold_default_k, //
py::arg("max_seconds") = kmeans_clustering_t::max_seconds_default_k, //
py::arg("min_shifts") = kmeans_clustering_t::min_shifts_default_k, //
py::arg("seed") = 0, //
py::arg("threads") = 0, //
py::arg("dtype") = scalar_kind_t::bf16_k, //
py::arg("metric_kind") = metric_kind_t::l2sq_k, //
py::arg("progress") = nullptr //
);

m.def(
@@ -961,18 +1053,19 @@

auto i = py::class_<dense_index_py_t, std::shared_ptr<dense_index_py_t>>(m, "Index");

i.def(py::init(&make_index), //
py::kw_only(), //
py::arg("ndim") = 0, //
py::arg("dtype") = scalar_kind_t::f32_k, //
py::arg("connectivity") = default_connectivity(), //
py::arg("expansion_add") = default_expansion_add(), //
py::arg("expansion_search") = default_expansion_search(), //
py::arg("metric_kind") = metric_kind_t::cos_k, //
py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
py::arg("metric_pointer") = 0, //
py::arg("multi") = false, //
py::arg("enable_key_lookups") = true //
i.def( //
py::init(&make_index), //
py::kw_only(), //
py::arg("ndim") = 0, //
py::arg("dtype") = scalar_kind_t::f32_k, //
py::arg("connectivity") = default_connectivity(), //
py::arg("expansion_add") = default_expansion_add(), //
py::arg("expansion_search") = default_expansion_search(), //
py::arg("metric_kind") = metric_kind_t::cos_k, //
py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
py::arg("metric_pointer") = 0, //
py::arg("multi") = false, //
py::arg("enable_key_lookups") = true //
);

i.def( //
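Taken together, the `kmeans` binding registered above can be exercised from Python roughly like this (a minimal sketch with a synthetic dataset; the three returned arrays mirror the tuple packed at the end of `cluster_many_brute_force`):

```py
import numpy as np
from usearch.index import kmeans

# 1000 random 256-dimensional points, the same shape as the C++ test above
vectors = np.random.rand(1000, 256).astype(np.float32)

# Returns per-point centroid indexes, per-point distances, and the centroids matrix
assignments, distances, centroids = kmeans(vectors, 10, max_iterations=2)
print(assignments.shape, distances.shape, centroids.shape)  # (1000,), (1000,), (10, 256)
```
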
136 changes: 136 additions & 0 deletions python/scripts/bench_cluster.py
@@ -0,0 +1,136 @@
#!/usr/bin/env python3
import os
import argparse

import numpy as np
import faiss
from tqdm import tqdm

import usearch
from usearch.index import kmeans
from usearch.io import load_matrix


def evaluate_clustering_euclidean(X, labels, centroids):
"""Evaluate clustering quality as average distance to centroids"""
distances = np.linalg.norm(X - centroids[labels], axis=1)
return np.mean(distances)


def evaluate_clustering_cosine(X, labels, centroids):
"""Evaluate clustering quality as average cosine distance to centroids"""

# Normalize both data points and centroids
X_normalized = X / np.linalg.norm(X, axis=1, keepdims=True)
centroids_normalized = centroids / np.linalg.norm(centroids, axis=1, keepdims=True)

# Compute cosine similarity using dot product
cosine_similarities = np.sum(X_normalized * centroids_normalized[labels], axis=1)

# Convert cosine similarity to cosine distance
cosine_distances = 1 - cosine_similarities

# Return the average cosine distance
return np.mean(cosine_distances)


def numpy_initialize_centroids(X, k):
"""Randomly choose k data points as initial centroids"""
indices = np.random.choice(X.shape[0], k, replace=False)
return X[indices]


def numpy_assign_clusters(X, centroids):
"""Assign each data point to the nearest centroid (numpy NumPy implementation)"""
distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
return np.argmin(distances, axis=1)


def numpy_update_centroids(X, labels, k):
"""Compute new centroids as the mean of all data points assigned to each cluster"""
return np.array([X[labels == i].mean(axis=0) for i in range(k)])


def cluster_with_numpy(X, k, max_iters=100, tol=1e-4):
centroids = numpy_initialize_centroids(X, k)

for i in tqdm(range(max_iters), desc="KMeans Iterations"):
labels = numpy_assign_clusters(X, centroids)
new_centroids = numpy_update_centroids(X, labels, k)

if np.linalg.norm(new_centroids - centroids) < tol:
break

centroids = new_centroids

return labels, centroids


def cluster_with_faiss(X, k, max_iters=100):
# Docs: https://github.com/facebookresearch/faiss/wiki/Faiss-building-blocks:-clustering,-PCA,-quantization
# Header: https://github.com/facebookresearch/faiss/blob/main/faiss/Clustering.h
# Source: https://github.com/facebookresearch/faiss/blob/main/faiss/Clustering.cpp
verbose = False
d: int = X.shape[1]
kmeans = faiss.Kmeans(d, k, niter=max_iters, verbose=verbose)
kmeans.train(X)
D, I = kmeans.index.search(X, 1)
return I.flatten(), kmeans.centroids


def cluster_with_usearch(X, k, max_iters=100):
assignments, _, centroids = kmeans(X, k, max_iterations=max_iters)
return assignments, centroids


def main():
parser = argparse.ArgumentParser(description="Compare KMeans clustering algorithms")
parser.add_argument("--vectors", type=str, required=True, help="Path to binary matrix file")
parser.add_argument("-k", default=10, type=int, required=True, help="Number of centroids")
parser.add_argument("-i", default=100, type=int, help="Upper bound on number of iterations")
parser.add_argument("-n", type=int, help="Upper bound on number of points to use")
parser.add_argument(
"--method",
type=str,
choices=["numpy", "faiss", "usearch"],
default="numpy",
help="Clustering backend",
)

args = parser.parse_args()

max_iters = args.i
X = load_matrix(args.vectors, count_rows=args.n)
k = args.k
method = args.method

time_before = os.times()
if method == "usearch":
labels, centroids = cluster_with_usearch(X, k, max_iters=max_iters)
elif method == "faiss":
labels, centroids = cluster_with_faiss(X, k, max_iters=max_iters)
else:
labels, centroids = cluster_with_numpy(X, k, max_iters=max_iters)
time_after = os.times()
time_duration = time_after[0] - time_before[0]
print(f"Time: {time_duration:.2f}s, {time_duration / max_iters:.2f}s per iteration")

quality = evaluate_clustering_euclidean(X, labels, centroids)
quality_cosine = evaluate_clustering_cosine(X, labels, centroids)
print(f"Clustering quality (average distance to centroids): {quality:.4f}, cosine: {quality_cosine:.4f}")

# Let's compare it to some random uniform assignment
random_labels = np.random.randint(0, k, size=X.shape[0])
random_quality = evaluate_clustering_euclidean(X, random_labels, centroids)
random_quality_cosine = evaluate_clustering_cosine(X, random_labels, centroids)
print(f"... while random assignment quality: {random_quality:.4f}, cosine: {random_quality_cosine:.4f}")

cluster_sizes = np.unique(labels, return_counts=True)[1]
cluster_sizes_mean = np.mean(cluster_sizes)
cluster_sizes_stddev = np.std(cluster_sizes)
print(f"Cluster sizes: {cluster_sizes_mean:.2f} ± {cluster_sizes_stddev:.2f}")
print(cluster_sizes)


if __name__ == "__main__":
main()