From d68fb62795196349b1a8cd183a71ad941761068f Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Fri, 23 Aug 2024 17:34:03 +0000
Subject: [PATCH 01/32] Improve: Clustering

---
 include/usearch/index_dense.hpp | 171 ++++++++++++++++++++++++++++----
 1 file changed, 151 insertions(+), 20 deletions(-)

diff --git a/include/usearch/index_dense.hpp b/include/usearch/index_dense.hpp
index 4b6a798b..416c1227 100644
--- a/include/usearch/index_dense.hpp
+++ b/include/usearch/index_dense.hpp
@@ -1791,6 +1791,110 @@ class index_dense_gt {
         }
     };
 
+    /**
+     *  @brief  Implements K-Means clustering for all the points in the index.
+     *
+     *  Unlike the normal K-Means, we compute a hierarchical version.
+     *  1. Find the first layer that contains enough nodes to exceed `config.clusters`.
+     *  2. Assign random centroids to those points, and perform K-Means between them.
+     *  3. Iterate a few times and go down, assigning neighbors of the child nodes to the same cluster.
+     *
+     *  ! Doesn't guarantee that the clusters are balanced in size.
+     *  ! Brute-forces the centroids, so it's not very efficient for large datasets,
+     *  ! but benefits from initializing the centroids with the second largest level.
+     *  ! Not designed for multi-indexes.
+     *  ! Assumes all slots are occupied and no deletions are present.
+     *
+     *  @param[in] config Configuration parameters for K-Means clustering.
+     *  @param[in] executor Thread-pool to execute the job in parallel.
+     *  @param[in] progress Callback to report the execution progress.
+     *
+     *  @param[out] cluster_keys Pointer to the array where the cluster keys will be exported.
+     *  @param[out] cluster_distances Pointer to the array where the distances to those centroids will be exported.
+     */
+    template <                                   //
+        typename executor_at = dummy_executor_t, //
+        typename progress_at = dummy_progress_t  //
+        >
+    clustering_result_t cluster(                       //
+        index_dense_clustering_kmeans_config_t config, //
+        vector_key_t* member_keys,                     //
+        std::size_t* centroid_index,                   //
+        distance_t* centroid_distances,                //
+        f64_t** centroids,                             //
+        std::size_t* centroids_stride,                 //
+        executor_at&& executor = executor_at{},        //
+        progress_at&& progress = progress_at{}) {
+
+        clustering_result_t result;
+
+        // Perform sanity checks.
+        if (config.clusters < 2)
+            return result.failed("The number of clusters must be at least 2");
+        if (config.max_iterations < 1)
+            return result.failed("The number of iterations must be at least 1");
+        std::size_t const nodes_count = typed_->size();
+        std::size_t const vectors_count = size();
+        if (nodes_count != vectors_count)
+            return result.failed("Multi-indexes and deletions are not supported");
+        if (config.clusters >= vectors_count)
+            return result.failed("The number of clusters must be less than the number of vectors");
+
+        // Find the first level (top -> down) that has enough nodes to exceed `config.clusters`.
+        level_t level = static_cast<level_t>(max_level());
+        for (; level > 0; --level) {
+            if (stats(level).nodes > config.clusters)
+                break;
+        }
+
+        // Let's allocate memory for the centroids coordinates and make sure it's
+        // rows are aligned to cache lines to avoid false sharing.
+        using centroid_allocator_t = aligned_allocator_gt<f64_t, 64>;
+        std::size_t centroids_rows = config.clusters;
+        std::size_t centroids_row_bytes = metric_.bytes_per_vector();
+        std::size_t centroids_row_stride = divide_round_up<64>(centroids_row_bytes);
+        f64_t* centroids_data = centroid_allocator_t{}.allocate(centroids_rows * centroids_row_stride);
+        if (!centroids_data)
+            return result.failed("Out of memory!");
+
+        std::random_device random_device;
+        std::mt19937 random_engine(random_device());
+
+        // Randomly initialize centroids keys - go through all the non-removed slots.
+        {
+            std::size_t vectors_exported_count = 0;
+            slot_lookup_.for_each([&](key_and_slot_t const& key_and_slot) {
+                member_keys[key_and_slot.slot] = key_and_slot.key;
+                centroid_index[key_and_slot.slot] = random_engine() % centroids_rows;
+                ++vectors_exported_count;
+            });
+            usearch_assert_m(vectors_exported_count == vectors_count, "Vectors count mismatch");
+        }
+        std::fill(centroid_distances, centroid_distances + vectors_count, std::numeric_limits<distance_t>::max());
+
+        // Instead of generating random values for centroids,
+        // we can copy and upcast some random points from the bottom layer.
+        for (std::size_t i = 0; i < centroids_rows; i++) {
+            std::size_t slot = random_engine() % nodes_count;
+            byte_t const* vector = vectors_lookup_[slot];
+            if (!casts_.to_f64(vector, centroids_data + i * centroids_row_stride))
+                std::memcpy(centroids_data + i * centroids_row_stride, vector, centroids_row_bytes);
+            centroid_index[slot] = i;
+            centroid_distances[slot] = 0;
+        }
+
+        // Go down the layers, performing K-Means clustering on each one.
+        for (; level >= 0; --level) {
+            for (std::size_t slot = 0; slot < nodes_count; slot++) {
+                byte_t const* vector = vectors_lookup_[slot];
+                std::size_t closest_centroid = centroid_index[slot];
+                distance_t distance =
+                    metric_.distance(vector, centroids_data + closest_centroid * centroids_row_stride);
+                centroid_distances[slot] = distance;
+            }
+        }
+    }
+
     /**
      *  @brief  Implements clustering, classifying the given objects (vectors of member keys)
      *          into a given number of clusters.
@@ -1858,23 +1962,56 @@ class index_dense_gt {
     map_to_clusters:
         // Concurrently perform search until a certain depth
         executor.dynamic(queries_count, [&](std::size_t thread_idx, std::size_t query_idx) {
-            auto result = cluster(queries_begin[query_idx], level, thread_idx);
-            if (!result) {
-                atomic_error = result.error.release();
-                return false;
-            }
+            auto query_key = queries_begin[query_idx];
+            if (level) {
+                auto result = cluster(query_key, level, thread_idx);
+                if (!result) {
+                    atomic_error = result.error.release();
+                    return false;
+                }
 
-            cluster_keys[query_idx] = result.cluster.member.key;
-            cluster_distances[query_idx] = result.cluster.distance;
+                cluster_keys[query_idx] = result.cluster.member.key;
+                cluster_distances[query_idx] = result.cluster.distance;
 
-            // Export in case we need to refine afterwards
-            clusters[query_idx].centroid = result.cluster.member.key;
-            clusters[query_idx].vector = vectors_lookup_[result.cluster.member.slot];
-            clusters[query_idx].merged_into = free_key();
-            clusters[query_idx].popularity = 1;
+                // Export in case we need to refine afterwards
+                clusters[query_idx].centroid = result.cluster.member.key;
+                clusters[query_idx].vector = vectors_lookup_[result.cluster.member.slot];
+                clusters[query_idx].merged_into = free_key();
+                clusters[query_idx].popularity = 1;
+
+                visited_members += result.visited_members;
+                computed_distances += result.computed_distances;
+            } else {
+                index_search_config_t search_config;
+                search_config.thread = thread_idx;
+                search_config.expansion = config_.expansion_search;
+                search_config.exact = false;
+
+                vector_key_t free_key_copy = free_key_;
+                auto allow = [free_key_copy](member_cref_t const& member) noexcept {
+                    return (vector_key_t)member.key != free_key_copy;
+                };
+                auto query_slot = slot_lookup_.find(key_and_slot_t::any_slot(query_key))->slot;
+                byte_t const* vector_data = vectors_lookup_[query_slot];
+                auto result = typed_->search(vector_data, 1, metric_proxy_t{*this}, search_config, allow);
+                if (!result) {
+                    atomic_error = result.error.release();
+                    return false;
+                }
+
+                cluster_keys[query_idx] = result.front().member.key;
+                cluster_distances[query_idx] = result.front().distance;
+
+                // Export in case we need to refine afterwards
+                clusters[query_idx].centroid = result.front().member.key;
+                clusters[query_idx].vector = vectors_lookup_[result.front().member.slot];
+                clusters[query_idx].merged_into = free_key();
+                clusters[query_idx].popularity = 1;
+
+                visited_members += result.visited_members;
+                computed_distances += result.computed_distances;
+            }
 
-            visited_members += result.visited_members;
-            computed_distances += result.computed_distances;
             return true;
         });
 
@@ -1995,7 +2132,6 @@ class index_dense_gt {
     add_result_t add_(                             //
         vector_key_t key, scalar_at const* vector, //
         std::size_t thread, bool force_vector_copy, cast_t const& cast) {
-
         if (!multi() && config().enable_key_lookups && contains(key))
             return add_result_t{}.failed("Duplicate keys not allowed in high-level wrappers");
 
@@ -2045,7 +2181,6 @@ class index_dense_gt {
     template <typename scalar_at, typename predicate_at>
     search_result_t search_(scalar_at const* vector, std::size_t wanted, predicate_at&& predicate, std::size_t thread,
                             bool exact, cast_t const& cast) const {
-
         // Cast the vector, if needed for compatibility with `metric_`
         thread_lock_t lock = thread_lock_(thread);
         byte_t const* vector_data = reinterpret_cast<byte_t const*>(vector);
@@ -2081,7 +2216,6 @@ class index_dense_gt {
     cluster_result_t cluster_(                      //
         scalar_at const* vector, std::size_t level, //
         std::size_t thread, cast_t const& cast) const {
-
         // Cast the vector, if needed for compatibility with `metric_`
         thread_lock_t lock = thread_lock_(thread);
         byte_t const* vector_data = reinterpret_cast<byte_t const*>(vector);
@@ -2105,7 +2239,6 @@ class index_dense_gt {
     aggregated_distances_t distance_between_(      //
         vector_key_t key, scalar_at const* vector, //
         std::size_t thread, cast_t const& cast) const {
-
         // Cast the vector, if needed for compatibility with `metric_`
         thread_lock_t lock = thread_lock_(thread);
         byte_t const* vector_data = reinterpret_cast<byte_t const*>(vector);
@@ -2149,7 +2282,6 @@ class index_dense_gt {
     }
 
     void reindex_keys_() {
-
         // Estimate number of entries first
         std::size_t count_total = typed_->size();
         std::size_t count_removed = 0;
@@ -2182,7 +2314,6 @@ class index_dense_gt {
 
     template <typename scalar_at>
     std::size_t get_(vector_key_t key, scalar_at* reconstructed, std::size_t vectors_limit, cast_t const& cast) const {
-
         if (!multi()) {
             compressed_slot_t slot;
             // Find the matching ID

From f0c250e0b436f44044bb09f688d99ca6a5a5b20a Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 2 Sep 2024 18:51:29 +0000
Subject: [PATCH 02/32] Add: `kmeans_clustering_gt` plugin

---
 .vscode/settings.json             |   1 +
 cpp/test.cpp                      |  11 ++
 include/usearch/index_plugins.hpp | 302 +++++++++++++++++++++++++++---
 3 files changed, 293 insertions(+), 21 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 3b5d8520..e2aa8e66 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -155,6 +155,7 @@
     "ibin",
     "jaccard",
     "Jemalloc",
+    "kmeans",
     "Kullback",
     "Leibler",
     "libjemalloc",
diff --git a/cpp/test.cpp b/cpp/test.cpp
index f8302c34..74c4b27d 100644
--- a/cpp/test.cpp
+++ b/cpp/test.cpp
@@ -1100,6 +1100,17 @@ int main(int, char**) {
     test_uint40();
     test_cosine<float, std::int64_t, uint40_t>(10, 10);
 
+    //
+    {
+        std::size_t vectors_count = 1000, centroids_count = 10, dimensions = 256;
+        kmeans_clustering_t clustering;
+        std::vector<float> vectors(vectors_count * dimensions), centroids(centroids_count * dimensions);
+        matrix_slice_gt<float const> vectors_slice(vectors.data(), vectors_count, dimensions);
+        matrix_slice_gt<float> centroids_slice(centroids.data(), centroids_count, dimensions);
+        std::generate(vectors.begin(), vectors.end(), [] { return float(std::rand()) / float(INT_MAX); });
+        clustering(vectors_slice, centroids_slice);
+    }
+
     // Exact search without constructing indexes.
     // Great for validating the distance functions.
     std::printf("Testing exact search\n");
diff --git a/include/usearch/index_plugins.hpp b/include/usearch/index_plugins.hpp
index 039b18e6..afedcdc4 100644
--- a/include/usearch/index_plugins.hpp
+++ b/include/usearch/index_plugins.hpp
@@ -156,12 +156,6 @@ enum class scalar_kind_t : std::uint8_t {
     i8_k = 23,
 };
 
-enum class prefetching_kind_t {
-    none_k,
-    cpu_k,
-    io_uring_k,
-};
-
 template <typename scalar_at> scalar_kind_t scalar_kind() noexcept {
     if (std::is_same<scalar_at, b1x8_t>())
         return scalar_kind_t::b1x8_k;
@@ -1142,6 +1136,71 @@ template <> struct cast_gt<f64_t, b1x8_t> : public cast_to_b1x8_gt<f64_t> {};
 template <> struct cast_gt<b1x8_t, i8_t> : public cast_from_b1x8_gt<i8_t> {};
 template <> struct cast_gt<i8_t, b1x8_t> : public cast_to_b1x8_gt<i8_t> {};
 
+/**
+ *  @brief  Type-punned array casting function.
+ *          Arguments: input buffer, bytes in input buffer, output buffer.
+ *          Returns `true` if the casting was performed successfully, `false` otherwise.
+ */
+using cast_punned_t = bool (*)(byte_t const*, std::size_t, byte_t*);
+
+/**
+ *  @brief  A collection of casting functions for typical vector types.
+ *          Covers to/from conversions for boolean, integer, half-precision,
+ *          single-precision, and double-precision scalars.
+ */
+struct casts_punned_t {
+    struct group_t {
+        cast_punned_t b1x8{};
+        cast_punned_t i8{};
+        cast_punned_t f16{};
+        cast_punned_t f32{};
+        cast_punned_t f64{};
+
+        cast_punned_t operator[](scalar_kind_t scalar_kind) const noexcept {
+            switch (scalar_kind) {
+            case scalar_kind_t::f64_k: return f64;
+            case scalar_kind_t::f32_k: return f32;
+            case scalar_kind_t::f16_k: return f16;
+            case scalar_kind_t::bf16_k: return f16;
+            case scalar_kind_t::i8_k: return i8;
+            case scalar_kind_t::b1x8_k: return b1x8;
+            default: return nullptr;
+            }
+        }
+
+    } from, to;
+
+    template <typename scalar_at> static casts_punned_t make() noexcept {
+        casts_punned_t result;
+
+        result.from.b1x8 = &cast_gt<b1x8_t, scalar_at>::try_;
+        result.from.i8 = &cast_gt<i8_t, scalar_at>::try_;
+        result.from.f16 = &cast_gt<f16_t, scalar_at>::try_;
+        result.from.f32 = &cast_gt<f32_t, scalar_at>::try_;
+        result.from.f64 = &cast_gt<f64_t, scalar_at>::try_;
+
+        result.to.b1x8 = &cast_gt<scalar_at, b1x8_t>::try_;
+        result.to.i8 = &cast_gt<scalar_at, i8_t>::try_;
+        result.to.f16 = &cast_gt<scalar_at, f16_t>::try_;
+        result.to.f32 = &cast_gt<scalar_at, f32_t>::try_;
+        result.to.f64 = &cast_gt<scalar_at, f64_t>::try_;
+
+        return result;
+    }
+
+    static casts_punned_t make(scalar_kind_t scalar_kind) noexcept {
+        switch (scalar_kind) {
+        case scalar_kind_t::f64_k: return casts_punned_t::make<f64_t>();
+        case scalar_kind_t::f32_k: return casts_punned_t::make<f32_t>();
+        case scalar_kind_t::f16_k: return casts_punned_t::make<f16_t>();
+        case scalar_kind_t::bf16_k: return casts_punned_t::make<bf16_t>();
+        case scalar_kind_t::i8_k: return casts_punned_t::make<i8_t>();
+        case scalar_kind_t::b1x8_k: return casts_punned_t::make<b1x8_t>();
+        default: return {};
+        }
+    }
+};
+
 /*  Don't complain if the vectorization of the inner loops fails:
  *
  *  > warning: loop not vectorized: the optimizer was unable to perform the requested transformation;
@@ -1862,32 +1921,32 @@ class metric_punned_t {
  *  @brief  View over a potentially-strided memory buffer, containing a row-major matrix.
  */
 template <typename scalar_at> //
-class vectors_view_gt {
+class matrix_slice_gt {
     using scalar_t = scalar_at;
 
-    scalar_t const* begin_{};
+    scalar_t* begin_{};
     std::size_t dimensions_{};
     std::size_t count_{};
     std::size_t stride_bytes_{};
 
   public:
-    vectors_view_gt() noexcept = default;
-    vectors_view_gt(vectors_view_gt const&) noexcept = default;
-    vectors_view_gt& operator=(vectors_view_gt const&) noexcept = default;
+    matrix_slice_gt() noexcept = default;
+    matrix_slice_gt(matrix_slice_gt const&) noexcept = default;
+    matrix_slice_gt& operator=(matrix_slice_gt const&) noexcept = default;
 
-    vectors_view_gt(scalar_t const* begin, std::size_t dimensions, std::size_t count = 1) noexcept
-        : vectors_view_gt(begin, dimensions, count, dimensions * sizeof(scalar_at)) {}
+    matrix_slice_gt(scalar_t* begin, std::size_t dimensions, std::size_t count = 1) noexcept
+        : matrix_slice_gt(begin, dimensions, count, dimensions * sizeof(scalar_at)) {}
 
-    vectors_view_gt(scalar_t const* begin, std::size_t dimensions, std::size_t count, std::size_t stride_bytes) noexcept
+    matrix_slice_gt(scalar_t* begin, std::size_t dimensions, std::size_t count, std::size_t stride_bytes) noexcept
         : begin_(begin), dimensions_(dimensions), count_(count), stride_bytes_(stride_bytes) {}
 
     explicit operator bool() const noexcept { return begin_; }
     std::size_t size() const noexcept { return count_; }
     std::size_t dimensions() const noexcept { return dimensions_; }
     std::size_t stride() const noexcept { return stride_bytes_; }
-    scalar_t const* data() const noexcept { return begin_; }
-    scalar_t const* at(std::size_t i) const noexcept {
-        return reinterpret_cast<scalar_t const*>(reinterpret_cast<byte_t const*>(begin_) + i * stride_bytes_);
+    scalar_t* data() const noexcept { return begin_; }
+    scalar_t* at(std::size_t i) const noexcept {
+        return reinterpret_cast<scalar_t*>(reinterpret_cast<byte_t*>(begin_) + i * stride_bytes_);
     }
 };
 
@@ -1896,7 +1955,7 @@ struct exact_offset_and_distance_t {
     f32_t distance;
 };
 
-using exact_search_results_t = vectors_view_gt<exact_offset_and_distance_t>;
+using exact_search_results_t = matrix_slice_gt<exact_offset_and_distance_t const>;
 
 /**
  *  @brief  Helper-structure for exact search operations.
@@ -1917,9 +1976,9 @@ class exact_search_t {
 
   public:
     template <typename scalar_at, typename executor_at = dummy_executor_t, typename progress_at = dummy_progress_t>
-    exact_search_results_t operator()(                                          //
-        vectors_view_gt<scalar_at> dataset, vectors_view_gt<scalar_at> queries, //
-        std::size_t wanted, metric_punned_t const& metric,                      //
+    exact_search_results_t operator()(                                                      //
+        matrix_slice_gt<scalar_at const> dataset, matrix_slice_gt<scalar_at const> queries, //
+        std::size_t wanted, metric_punned_t const& metric,                                  //
         executor_at&& executor = executor_at{}, progress_at&& progress = progress_at{}) {
         return operator()(                                                                     //
             metric,                                                                            //
@@ -2001,6 +2060,207 @@ class exact_search_t {
     }
 };
 
+struct kmeans_clustering_result_t {
+    error_t error{};
+    std::size_t computed_distances{};
+    std::size_t iterations{};
+
+    explicit operator bool() const noexcept { return !error; }
+    kmeans_clustering_result_t failed(error_t message) noexcept {
+        error = std::move(message);
+        return std::move(*this);
+    }
+};
+
+/**
+ *  @brief  Helper-class for K-Means clustering of dense vectors.
+ *          Doesn't require constructing the index, but benefits from mixed-precision logic.
+ *          ! Doesn't guarantee that the clusters are balanced in size.
+ *
+ *  The algorithm is as follows:
+ *  - Initialization: Select K initial centroids (randomly or with a heuristic).
+ *  - Assignment: Assign each data point to the nearest centroid based on the Euclidean distance.
+ *  - Update: Recalculate the centroids as the mean of all points assigned to each centroid.
+ *  - Repeat: Repeat the assignment and update steps until the centroids no longer change significantly.
+ */
+template <typename allocator_at = std::allocator<char>> class kmeans_clustering_gt {
+  public:
+    std::size_t max_iterations = 0;
+    f64_t tolerance = 0.0;
+    metric_kind_t metric_kind = metric_kind_t::l2sq_k;
+    scalar_kind_t quantization_kind = scalar_kind_t::bf16_k;
+
+    template <typename scalar_at, typename executor_at = dummy_executor_t, typename progress_at = dummy_progress_t>
+    kmeans_clustering_result_t operator()(                                              //
+        matrix_slice_gt<scalar_at const> dataset, matrix_slice_gt<scalar_at> centroids, //
+        executor_at&& executor = executor_at{}, progress_at&& progress = progress_at{}) {
+        return operator()(                                                                     //
+            reinterpret_cast<byte_t const*>(dataset.data()), dataset.size(), dataset.stride(), //
+            reinterpret_cast<byte_t*>(centroids.data()), centroids.size(), centroids.stride(), //
+            scalar_kind<scalar_at>(), dataset.dimensions(), executor, progress);
+    }
+
+    template <typename executor_at = dummy_executor_t, typename progress_at = dummy_progress_t>
+    kmeans_clustering_result_t operator()(                                                 //
+        byte_t const* dataset_data, std::size_t dataset_count, std::size_t dataset_stride, //
+        byte_t* centroids_data, std::size_t wanted_clusters, std::size_t centroids_stride, //
+        scalar_kind_t original_scalar_kind, std::size_t dimensions, executor_at&& executor = executor_at{},
+        progress_at&& progress = progress_at{}) {
+
+        // Perform sanity checks for algorithm settings.
+        kmeans_clustering_result_t result;
+        if (max_iterations < 1)
+            return result.failed("The number of iterations must be at least 1");
+
+        // Perform sanity checks for input arguments.
+        if (wanted_clusters < 2)
+            return result.failed("The number of clusters must be at least 2");
+        if (wanted_clusters >= dataset_count)
+            return result.failed("The number of clusters must be less than the number of vectors");
+
+        metric_punned_t metric = metric_punned_t::builtin(dimensions, metric_kind, quantization_kind);
+        if (!metric)
+            return result.failed("Unsupported metric or scalar kind");
+
+        // Let's allocate memory for the centroids coordinates and make sure it's
+        // rows are aligned to cache lines to avoid false sharing.
+        buffer_gt<aligned_allocator_gt<distance_t, 64>> centroid_distances(dataset_count);
+        buffer_gt<aligned_allocator_gt<std::size_t, 64>> centroids_indexes(dataset_count);
+        buffer_gt<aligned_allocator_gt<std::atomic<std::size_t>, 64>> cluster_sizes(wanted_clusters);
+
+        // For a mixed precision computation, we keep the centroids represented in two forms -
+        // double precision and quantized the same way as in the index, to avoid paying conversion penalties.
+        // Double precision is needed to avoid accumulating errors when aggregating too many entries.
+        std::size_t const bytes_per_vector_original =
+            divide_round_up<CHAR_BIT>(dimensions * bits_per_scalar(original_scalar_kind));
+        std::size_t const bytes_per_vector_quantized = metric.bytes_per_vector();
+        std::size_t const stride_per_vector_quantized = divide_round_up<64>(bytes_per_vector_quantized);
+        buffer_gt<aligned_allocator_gt<byte_t, 64>> dataset_quantized(dataset_count * stride_per_vector_quantized);
+        buffer_gt<aligned_allocator_gt<byte_t, 64>> centroids_quantized(wanted_clusters * stride_per_vector_quantized);
+
+        // When aggregating centroids, we want to parallelize the operation and need more memory.
+        // For every thread we keep two double-precision vectors. One is the up-casting output buffer for quantized
+        // coordinates, and the other is the temporary buffer for the partial sums of the double-precision coordinates.
+        // The ordering:
+        //
+        //      - thread 0: [centroid 0, centroid 1, centroid 2, centroid 3, ...]
+        //      - thread 1: [centroid 0, centroid 1, centroid 2, centroid 3, ...]
+        //      - thread 2: [centroid 0, centroid 1, centroid 2, centroid 3, ...]
+        //
+        std::size_t const thread_count = executor.size();
+        buffer_gt<aligned_allocator_gt<f64_t, 64>> centroids_precise(wanted_clusters * dimensions * thread_count);
+        buffer_gt<aligned_allocator_gt<f64_t, 64>> dataset_precise(wanted_clusters * dimensions * thread_count);
+        if (!centroids_precise || !dataset_precise || !centroids_quantized || !centroids_indexes ||
+            !centroid_distances || !dataset_quantized)
+            return result.failed("No memory for result outputs!");
+
+        std::fill_n(centroids_indexes.data(), dataset_count, wanted_clusters);
+        std::fill_n(centroid_distances.data(), dataset_count, std::numeric_limits<distance_t>::max());
+
+        // Initialize the casting kernel for quantization and export.
+        casts_punned_t casts = casts_punned_t::make(quantization_kind);
+        cast_punned_t const& compress_dataset = casts.from[original_scalar_kind];
+        cast_punned_t const& decompress_dataset = casts.to[original_scalar_kind];
+        for (std::size_t i = 0; i < dataset_count; i++) {
+            byte_t const* vector = dataset_data + i * dataset_stride;
+            byte_t* quantized = dataset_quantized.data() + i * stride_per_vector_quantized;
+            if (!compress_dataset(vector, bytes_per_vector_original, quantized))
+                std::memcpy(quantized, vector, bytes_per_vector_original);
+        }
+
+        // Initialize centroids with random dataset vectors.
+        std::random_device random_device;
+        std::mt19937 random_engine(random_device());
+        for (std::size_t i = 0; i < wanted_clusters; i++) {
+            // Generate the random index of the dataset vector,
+            // that is unique and not already used as a centroid.
+            std::size_t random_index;
+            do {
+                random_index = random_engine() % dataset_count;
+                bool is_unique = true;
+                for (std::size_t j = 0; j < i; j++) {
+                    if (centroids_indexes[j] == random_index) {
+                        is_unique = false;
+                        break;
+                    }
+                }
+                if (is_unique)
+                    break;
+            } while (true);
+
+            // Copy the vector to the centroid and quantize it.
+            byte_t const* quantized_dataset = dataset_quantized.data() + random_index * stride_per_vector_quantized;
+            byte_t* quantized_centroid = centroids_quantized.data() + i * stride_per_vector_quantized;
+            std::memcpy(quantized_centroid, quantized_dataset, bytes_per_vector_quantized);
+            centroid_indexes[random_index] = i;
+            centroid_distances[random_index] = 0;
+        }
+
+        // In the future we can go down the layers, performing K-Means clustering on each one.
+        // For now, we just jump to the bottom layer and use our SIMD-accelerated kernels for mixed-precision compute.
+        std::size_t iterations = 0;
+        while (iterations < config.max_iterations) {
+
+            // For every point in the dataset, find the closest centroid.
+            executor.dynamic(dataset_count, [&](std::size_t thread_idx, std::size_t dataset_idx) {
+                byte_t const* quantized_dataset = dataset_quantized.data() + dataset_idx * stride_per_vector_quantized;
+                byte_t const* quantized_centroids = centroids_quantized.data();
+                distance_t& closest_distance = centroid_distances[dataset_idx];
+                std::size_t& closest_idx = centroids_indexes[dataset_idx];
+                for (std::size_t centroid_idx = 0; centroid_idx < wanted_clusters; centroid_idx++) {
+                    byte_t const* quantized_centroid = quantized_centroids + centroid_idx * stride_per_vector_quantized;
+                    distance_t distance = metric(quantized_dataset, quantized_centroid);
+                    if (distance < closest_distance) {
+                        closest_distance = distance;
+                        closest_idx = centroid_idx;
+                    }
+                }
+                return true;
+            });
+
+            // For every centroid, recalculate the mean of all points assigned to it.
+            // That part is problematic to parallelize on many-core-systems, because of the contention.
+            // Alternatively, a tree-like approach can be used, where every core accumulates it's own partial sums.
+            // And those are later aggregated by a single thread.
+            std::memset(centroids_precise.data(), 0, wanted_clusters * dimensions * 2 * thread_count * sizeof(f64_t));
+            executor.dynamic(dataset_count, [&](std::size_t thread_idx, std::size_t dataset_idx) {
+                std::size_t centroid_idx = centroids_indexes[dataset_idx];
+                byte_t const* quantized_dataset = dataset_quantized.data() + dataset_idx * stride_per_vector_quantized;
+                f64_t* centroid_precise =
+                    centroids_precise.data() + wanted_clusters * dimensions * thread_idx + centroid_idx * dimensions;
+
+                // Upcast the dataset point into a buffer of double-precision floats.
+                f64_t* dataset_precise =
+                    dataset_precise.data() + wanted_clusters * dimensions * thread_idx + centroid_idx * dimensions;
+                if (!decompress_dataset(quantized_dataset, dimensions, dataset_precise))
+                    std::memcpy(dataset_precise, quantized_dataset, bytes_per_vector_quantized);
+
+                // Now add the vector from the dataset into the centroid partial sum.
+                for (std::size_t i = 0; i < dimensions; i++)
+                    centroid_precise[i] += dataset_precise[i];
+
+                cluster_sizes[centroid_idx].fetch_add(1, std::memory_order_relaxed);
+                return true;
+            });
+
+            // Aggregate the partial sums into the final centroids - storing them in the high-precision
+            // buffer of the first thread. Normalization procedure is different for different metrics.
+            for (std::size_t centroid_idx = 0; centroid_idx < wanted_clusters; centroid_idx++) {
+                f64_t* centroid_precise_aggregated = centroids_precise.data() + centroid_idx * dimensions;
+                for (std::size_t thread_idx = 1; thread_idx < thread_count; thread_idx++) {
+                    f64_t* centroid_precise = centroids_precise.data() + wanted_clusters * dimensions * thread_idx +
+                                              centroid_idx * dimensions;
+                    for (std::size_t i = 0; i < dimensions; i++)
+                        centroid_precise_aggregated[i] += centroid_precise[i];
+                }
+                // Normalize:
+            }
+        }
+    }
+};
+
+using kmeans_clustering_t = kmeans_clustering_gt<>;
+
 /**
  *  @brief  C++11 Multi-Hash-Set with Linear Probing.
  *

From 0f4c98cc9ac5901f9bfe43f6245ecf45ffac5386 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 2 Sep 2024 19:59:24 +0000
Subject: [PATCH 03/32] Add: Early exit strategies for KMeans

---
 include/usearch/index_plugins.hpp | 241 +++++++++++++++++++++---------
 1 file changed, 170 insertions(+), 71 deletions(-)

diff --git a/include/usearch/index_plugins.hpp b/include/usearch/index_plugins.hpp
index afedcdc4..98bc5da0 100644
--- a/include/usearch/index_plugins.hpp
+++ b/include/usearch/index_plugins.hpp
@@ -4,6 +4,7 @@
 #include <stdlib.h> // `aligned_alloc`
 
 #include <atomic>  // `std::atomic`
+#include <chrono>  // `std::chrono`
 #include <cstring> // `std::strncmp`
 #include <thread>  // `std::thread`
 
@@ -2063,7 +2064,14 @@ class exact_search_t {
 struct kmeans_clustering_result_t {
     error_t error{};
     std::size_t computed_distances{};
+    /// @brief The number of iterations the algorithm took to converge.
     std::size_t iterations{};
+    /// @brief The number of points that changed clusters in the last iteration.
+    std::size_t last_iteration_point_shifts{};
+    /// @brief The inertia of the last iteration (sum of squared distances to centroids).
+    f64_t last_iteration_inertia{};
+    /// @brief The total elapsed runtime of the algorithm in seconds.
+    f64_t runtime_seconds{};
 
     explicit operator bool() const noexcept { return !error; }
     kmeans_clustering_result_t failed(error_t message) noexcept {
@@ -2081,29 +2089,40 @@ struct kmeans_clustering_result_t {
  *  - Initialization: Select K initial centroids (randomly or with a heuristic).
  *  - Assignment: Assign each data point to the nearest centroid based on the Euclidean distance.
  *  - Update: Recalculate the centroids as the mean of all points assigned to each centroid.
- *  - Repeat: Repeat the assignment and update steps until the centroids no longer change significantly.
+ *  - Repeat: Repeat the assignment and update steps until the centroids no longer change significantly
+ *            or an early-exit condition is met.
  */
 template <typename allocator_at = std::allocator<char>> class kmeans_clustering_gt {
   public:
-    std::size_t max_iterations = 0;
-    f64_t tolerance = 0.0;
-    metric_kind_t metric_kind = metric_kind_t::l2sq_k;
-    scalar_kind_t quantization_kind = scalar_kind_t::bf16_k;
+    using distance_t = distance_punned_t;
+
+    metric_kind_t metric_kind{metric_kind_t::l2sq_k};
+    scalar_kind_t quantization_kind{scalar_kind_t::bf16_k};
+
+    /// @brief Early-exit parameter - the maximum number of iterations to perform.
+    std::size_t max_iterations{};
+    /// @brief Early-exit parameter - the threshold for the final inertia to terminate early.
+    f64_t final_inertia_threshold{};
+    /// @brief Early-exit parameter - the maximum runtime allowed in seconds.
+    f64_t runtime_limit_seconds{};
+    /// @brief Early-exit parameter - the minimum number of points that must change clusters per iteration.
+    std::size_t min_point_shifts_per_iteration{};
 
     template <typename scalar_at, typename executor_at = dummy_executor_t, typename progress_at = dummy_progress_t>
-    kmeans_clustering_result_t operator()(                                              //
-        matrix_slice_gt<scalar_at const> dataset, matrix_slice_gt<scalar_at> centroids, //
+    kmeans_clustering_result_t operator()(                                             //
+        matrix_slice_gt<scalar_at const> points, matrix_slice_gt<scalar_at> centroids, //
         executor_at&& executor = executor_at{}, progress_at&& progress = progress_at{}) {
         return operator()(                                                                     //
-            reinterpret_cast<byte_t const*>(dataset.data()), dataset.size(), dataset.stride(), //
+            reinterpret_cast<byte_t const*>(points.data()), points.size(), points.stride(),    //
             reinterpret_cast<byte_t*>(centroids.data()), centroids.size(), centroids.stride(), //
-            scalar_kind<scalar_at>(), dataset.dimensions(), executor, progress);
+            scalar_kind<scalar_at>(), points.dimensions(), executor, progress);
     }
 
     template <typename executor_at = dummy_executor_t, typename progress_at = dummy_progress_t>
     kmeans_clustering_result_t operator()(                                                 //
-        byte_t const* dataset_data, std::size_t dataset_count, std::size_t dataset_stride, //
+        byte_t const* points_data, std::size_t points_count, std::size_t points_stride,    //
         byte_t* centroids_data, std::size_t wanted_clusters, std::size_t centroids_stride, //
+        std::size_t* point_to_centroid_index,                                              //
         scalar_kind_t original_scalar_kind, std::size_t dimensions, executor_at&& executor = executor_at{},
         progress_at&& progress = progress_at{}) {
 
@@ -2115,7 +2134,7 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
         // Perform sanity checks for input arguments.
         if (wanted_clusters < 2)
             return result.failed("The number of clusters must be at least 2");
-        if (wanted_clusters >= dataset_count)
+        if (wanted_clusters >= points_count)
             return result.failed("The number of clusters must be less than the number of vectors");
 
         metric_punned_t metric = metric_punned_t::builtin(dimensions, metric_kind, quantization_kind);
@@ -2124,9 +2143,10 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
 
         // Let's allocate memory for the centroids coordinates and make sure it's
         // rows are aligned to cache lines to avoid false sharing.
-        buffer_gt<aligned_allocator_gt<distance_t, 64>> centroid_distances(dataset_count);
-        buffer_gt<aligned_allocator_gt<std::size_t, 64>> centroids_indexes(dataset_count);
-        buffer_gt<aligned_allocator_gt<std::atomic<std::size_t>, 64>> cluster_sizes(wanted_clusters);
+        buffer_gt<distance_t, aligned_allocator_gt<distance_t, 64>> point_to_centroid_distance_buffer(points_count);
+        buffer_gt<std::size_t, aligned_allocator_gt<std::size_t, 64>> point_to_centroid_index_buffer(points_count);
+        buffer_gt<std::atomic<std::size_t>, aligned_allocator_gt<std::atomic<std::size_t>, 64>> cluster_sizes_buffer(
+            wanted_clusters);
 
         // For a mixed precision computation, we keep the centroids represented in two forms -
         // double precision and quantized the same way as in the index, to avoid paying conversion penalties.
@@ -2135,8 +2155,10 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
             divide_round_up<CHAR_BIT>(dimensions * bits_per_scalar(original_scalar_kind));
         std::size_t const bytes_per_vector_quantized = metric.bytes_per_vector();
         std::size_t const stride_per_vector_quantized = divide_round_up<64>(bytes_per_vector_quantized);
-        buffer_gt<aligned_allocator_gt<byte_t, 64>> dataset_quantized(dataset_count * stride_per_vector_quantized);
-        buffer_gt<aligned_allocator_gt<byte_t, 64>> centroids_quantized(wanted_clusters * stride_per_vector_quantized);
+        buffer_gt<byte_t, aligned_allocator_gt<byte_t, 64>> points_quantized_buffer(points_count *
+                                                                                    stride_per_vector_quantized);
+        buffer_gt<byte_t, aligned_allocator_gt<byte_t, 64>> centroids_quantized_buffer(wanted_clusters *
+                                                                                       stride_per_vector_quantized);
 
         // When aggregating centroids, we want to parallelize the operation and need more memory.
         // For every thread we keep two double-precision vectors. One is the up-casting output buffer for quantized
@@ -2148,38 +2170,42 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
         //      - thread 2: [centroid 0, centroid 1, centroid 2, centroid 3, ...]
         //
         std::size_t const thread_count = executor.size();
-        buffer_gt<aligned_allocator_gt<f64_t, 64>> centroids_precise(wanted_clusters * dimensions * thread_count);
-        buffer_gt<aligned_allocator_gt<f64_t, 64>> dataset_precise(wanted_clusters * dimensions * thread_count);
-        if (!centroids_precise || !dataset_precise || !centroids_quantized || !centroids_indexes ||
-            !centroid_distances || !dataset_quantized)
+        buffer_gt<f64_t, aligned_allocator_gt<f64_t, 64>> centroids_precise_buffer(wanted_clusters * dimensions *
+                                                                                   thread_count);
+        buffer_gt<f64_t, aligned_allocator_gt<f64_t, 64>> points_precise_buffer(wanted_clusters * dimensions *
+                                                                                thread_count);
+        if (!centroids_precise_buffer || !points_precise_buffer || !centroids_quantized_buffer ||
+            !point_to_centroid_index_buffer || !point_to_centroid_distance_buffer || !points_quantized_buffer)
             return result.failed("No memory for result outputs!");
 
-        std::fill_n(centroids_indexes.data(), dataset_count, wanted_clusters);
-        std::fill_n(centroid_distances.data(), dataset_count, std::numeric_limits<distance_t>::max());
+        std::fill_n(point_to_centroid_index_buffer.data(), points_count, wanted_clusters);
+        std::fill_n(point_to_centroid_distance_buffer.data(), points_count, std::numeric_limits<distance_t>::max());
 
         // Initialize the casting kernel for quantization and export.
         casts_punned_t casts = casts_punned_t::make(quantization_kind);
-        cast_punned_t const& compress_dataset = casts.from[original_scalar_kind];
-        cast_punned_t const& decompress_dataset = casts.to[original_scalar_kind];
-        for (std::size_t i = 0; i < dataset_count; i++) {
-            byte_t const* vector = dataset_data + i * dataset_stride;
-            byte_t* quantized = dataset_quantized.data() + i * stride_per_vector_quantized;
-            if (!compress_dataset(vector, bytes_per_vector_original, quantized))
+        cast_punned_t const& compress_points = casts.from[original_scalar_kind];
+        cast_punned_t const& decompress_points = casts.to[original_scalar_kind];
+        cast_punned_t const& compress_precise = casts.from.f64;
+        cast_punned_t const& decompress_precise = casts.to.f64;
+        for (std::size_t i = 0; i < points_count; i++) {
+            byte_t const* vector = points_data + i * points_stride;
+            byte_t* quantized = points_quantized_buffer.data() + i * stride_per_vector_quantized;
+            if (!compress_points(vector, bytes_per_vector_original, quantized))
                 std::memcpy(quantized, vector, bytes_per_vector_original);
         }
 
-        // Initialize centroids with random dataset vectors.
+        // Initialize centroids with random points vectors.
         std::random_device random_device;
         std::mt19937 random_engine(random_device());
         for (std::size_t i = 0; i < wanted_clusters; i++) {
-            // Generate the random index of the dataset vector,
+            // Generate the random index of the points vector,
             // that is unique and not already used as a centroid.
             std::size_t random_index;
             do {
-                random_index = random_engine() % dataset_count;
+                random_index = random_engine() % points_count;
                 bool is_unique = true;
                 for (std::size_t j = 0; j < i; j++) {
-                    if (centroids_indexes[j] == random_index) {
+                    if (point_to_centroid_index_buffer[j] == random_index) {
                         is_unique = false;
                         break;
                     }
@@ -2189,73 +2215,146 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
             } while (true);
 
             // Copy the vector to the centroid and quantize it.
-            byte_t const* quantized_dataset = dataset_quantized.data() + random_index * stride_per_vector_quantized;
-            byte_t* quantized_centroid = centroids_quantized.data() + i * stride_per_vector_quantized;
-            std::memcpy(quantized_centroid, quantized_dataset, bytes_per_vector_quantized);
-            centroid_indexes[random_index] = i;
-            centroid_distances[random_index] = 0;
+            byte_t const* quantized_point = points_quantized_buffer.data() + random_index * stride_per_vector_quantized;
+            byte_t* quantized_centroid = centroids_quantized_buffer.data() + i * stride_per_vector_quantized;
+            std::memcpy(quantized_centroid, quantized_point, bytes_per_vector_quantized);
+            point_to_centroid_index_buffer[random_index] = i;
+            point_to_centroid_distance_buffer[random_index] = 0;
         }
 
-        // In the future we can go down the layers, performing K-Means clustering on each one.
-        // For now, we just jump to the bottom layer and use our SIMD-accelerated kernels for mixed-precision compute.
+        auto start_time = std::chrono::high_resolution_clock::now();
         std::size_t iterations = 0;
-        while (iterations < config.max_iterations) {
-
-            // For every point in the dataset, find the closest centroid.
-            executor.dynamic(dataset_count, [&](std::size_t thread_idx, std::size_t dataset_idx) {
-                byte_t const* quantized_dataset = dataset_quantized.data() + dataset_idx * stride_per_vector_quantized;
-                byte_t const* quantized_centroids = centroids_quantized.data();
-                distance_t& closest_distance = centroid_distances[dataset_idx];
-                std::size_t& closest_idx = centroids_indexes[dataset_idx];
+        while (iterations < max_iterations) {
+            iterations++;
+
+            // For every point in the points, find the closest centroid.
+            std::atomic<std::size_t> point_shifts = 0;
+            std::atomic<distance_t> total_inertia = 0;
+            executor.dynamic(points_count, [&](std::size_t thread_idx, std::size_t points_idx) {
+                byte_t const* quantized_point =
+                    points_quantized_buffer.data() + points_idx * stride_per_vector_quantized;
+                byte_t const* quantized_centroids = centroids_quantized_buffer.data();
+                distance_t closest_distance_local = std::numeric_limits<distance_t>::max();
+                std::size_t closest_idx_local = 0;
                 for (std::size_t centroid_idx = 0; centroid_idx < wanted_clusters; centroid_idx++) {
                     byte_t const* quantized_centroid = quantized_centroids + centroid_idx * stride_per_vector_quantized;
-                    distance_t distance = metric(quantized_dataset, quantized_centroid);
-                    if (distance < closest_distance) {
-                        closest_distance = distance;
-                        closest_idx = centroid_idx;
+                    distance_t distance = metric(quantized_point, quantized_centroid);
+                    if (distance < closest_distance_local) {
+                        closest_distance_local = distance;
+                        closest_idx_local = centroid_idx;
                     }
                 }
+
+                distance_t& closest_distance_ref = point_to_centroid_distance_buffer[points_idx];
+                std::size_t& closest_idx_ref = point_to_centroid_index_buffer[points_idx];
+                if (closest_idx_local != closest_idx_ref) {
+                    closest_idx_ref = closest_idx_local;
+                    point_shifts.fetch_add(1, std::memory_order_relaxed);
+                }
+
+                closest_distance_ref = closest_distance_local;
+                // total_inertia.add(closest_distance_local * closest_distance_local, std::memory_order_relaxed);
                 return true;
             });
 
+            // Check for early-exit conditions
+            result.last_iteration_inertia = total_inertia.load(std::memory_order_relaxed);
+            if (final_inertia_threshold != 0)
+                if (result.last_iteration_inertia <= final_inertia_threshold)
+                    break;
+
+            result.last_iteration_point_shifts = point_shifts.load(std::memory_order_relaxed);
+            if (min_point_shifts_per_iteration != 0)
+                if (result.last_iteration_point_shifts <= min_point_shifts_per_iteration)
+                    break;
+
+            auto current_time = std::chrono::high_resolution_clock::now();
+            std::chrono::duration<f64_t> elapsed_time = current_time - start_time;
+            result.runtime_seconds = elapsed_time.count();
+            if (runtime_limit_seconds != 0)
+                if (result.runtime_seconds >= runtime_limit_seconds)
+                    break;
+
             // For every centroid, recalculate the mean of all points assigned to it.
             // That part is problematic to parallelize on many-core-systems, because of the contention.
             // Alternatively, a tree-like approach can be used, where every core accumulates it's own partial sums.
             // And those are later aggregated by a single thread.
-            std::memset(centroids_precise.data(), 0, wanted_clusters * dimensions * 2 * thread_count * sizeof(f64_t));
-            executor.dynamic(dataset_count, [&](std::size_t thread_idx, std::size_t dataset_idx) {
-                std::size_t centroid_idx = centroids_indexes[dataset_idx];
-                byte_t const* quantized_dataset = dataset_quantized.data() + dataset_idx * stride_per_vector_quantized;
-                f64_t* centroid_precise =
-                    centroids_precise.data() + wanted_clusters * dimensions * thread_idx + centroid_idx * dimensions;
-
-                // Upcast the dataset point into a buffer of double-precision floats.
-                f64_t* dataset_precise =
-                    dataset_precise.data() + wanted_clusters * dimensions * thread_idx + centroid_idx * dimensions;
-                if (!decompress_dataset(quantized_dataset, dimensions, dataset_precise))
-                    std::memcpy(dataset_precise, quantized_dataset, bytes_per_vector_quantized);
-
-                // Now add the vector from the dataset into the centroid partial sum.
+            std::memset(centroids_precise_buffer.data(), 0,
+                        wanted_clusters * dimensions * 2 * thread_count * sizeof(f64_t));
+            std::memset(cluster_sizes_buffer.data(), 0, wanted_clusters * sizeof(std::atomic<std::size_t>));
+            executor.dynamic(points_count, [&](std::size_t thread_idx, std::size_t points_idx) {
+                std::size_t centroid_idx = point_to_centroid_index_buffer[points_idx];
+                byte_t const* quantized_point =
+                    points_quantized_buffer.data() + points_idx * stride_per_vector_quantized;
+                f64_t* centroid_precise = centroids_precise_buffer.data() + wanted_clusters * dimensions * thread_idx +
+                                          centroid_idx * dimensions;
+
+                // Upcast the points point into a buffer of double-precision floats.
+                f64_t* point_precise = points_precise_buffer.data() + wanted_clusters * dimensions * thread_idx +
+                                       centroid_idx * dimensions;
+                if (!decompress_precise(quantized_point, dimensions, reinterpret_cast<byte_t*>(point_precise)))
+                    std::memcpy(reinterpret_cast<byte_t*>(point_precise), quantized_point, bytes_per_vector_quantized);
+
+                // Now add the vector from the points into the centroid partial sum.
                 for (std::size_t i = 0; i < dimensions; i++)
-                    centroid_precise[i] += dataset_precise[i];
+                    centroid_precise[i] += point_precise[i];
 
-                cluster_sizes[centroid_idx].fetch_add(1, std::memory_order_relaxed);
+                cluster_sizes_buffer[centroid_idx].fetch_add(1, std::memory_order_relaxed);
                 return true;
             });
 
             // Aggregate the partial sums into the final centroids - storing them in the high-precision
             // buffer of the first thread. Normalization procedure is different for different metrics.
             for (std::size_t centroid_idx = 0; centroid_idx < wanted_clusters; centroid_idx++) {
-                f64_t* centroid_precise_aggregated = centroids_precise.data() + centroid_idx * dimensions;
+                f64_t* centroid_precise_aggregated = centroids_precise_buffer.data() + centroid_idx * dimensions;
                 for (std::size_t thread_idx = 1; thread_idx < thread_count; thread_idx++) {
-                    f64_t* centroid_precise = centroids_precise.data() + wanted_clusters * dimensions * thread_idx +
-                                              centroid_idx * dimensions;
+                    f64_t* centroid_precise = centroids_precise_buffer.data() +
+                                              wanted_clusters * dimensions * thread_idx + centroid_idx * dimensions;
                     for (std::size_t i = 0; i < dimensions; i++)
                         centroid_precise_aggregated[i] += centroid_precise[i];
                 }
-                // Normalize:
+
+                // Normalize based on the metric kind
+                if (metric_kind == metric_kind_t::l2sq_k) {
+                    // Normalize for Euclidean distance (L2)
+                    std::size_t cluster_size = cluster_sizes_buffer[centroid_idx].load(std::memory_order_relaxed);
+                    if (cluster_size > 0)
+                        for (std::size_t i = 0; i < dimensions; i++)
+                            centroid_precise_aggregated[i] /= static_cast<f64_t>(cluster_size);
+
+                } else if (metric_kind == metric_kind_t::cos_k) {
+                    // Normalize for Cosine distance
+                    f64_t norm = 0.0;
+                    for (std::size_t i = 0; i < dimensions; i++)
+                        norm += centroid_precise_aggregated[i] * centroid_precise_aggregated[i];
+                    norm = std::sqrt(norm);
+                    if (norm > 0.0)
+                        for (std::size_t i = 0; i < dimensions; i++)
+                            centroid_precise_aggregated[i] /= norm;
+                }
+
+                // Quantize the centroid after normalization for further iterations
+                byte_t* quantized_centroid =
+                    centroids_quantized_buffer.data() + centroid_idx * stride_per_vector_quantized;
+                if (!compress_precise(reinterpret_cast<byte_t*>(centroid_precise_aggregated),
+                                      bytes_per_vector_quantized, quantized_centroid))
+                    std::memcpy(quantized_centroid, reinterpret_cast<byte_t*>(centroid_precise_aggregated),
+                                bytes_per_vector_quantized);
             }
         }
+
+        result.iterations = iterations;
+
+        // We've finished all the iterations, now we can export the centroids back to the original precision.
+        std::memcpy(point_to_centroid_index, point_to_centroid_index_buffer.data(), points_count * sizeof(std::size_t));
+        for (std::size_t i = 0; i < wanted_clusters; i++) {
+            byte_t const* quantized_centroid = centroids_quantized_buffer.data() + i * stride_per_vector_quantized;
+            byte_t* centroid = centroids_data + i * centroids_stride;
+            if (!decompress_points(quantized_centroid, bytes_per_vector_quantized, centroid))
+                std::memcpy(centroid, quantized_centroid, bytes_per_vector_quantized);
+        }
+
+        return result;
     }
 };
 

From 7416247e892f4be9162568af7763d8ce000e03e9 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 2 Sep 2024 20:32:52 +0000
Subject: [PATCH 04/32] Chore: Revert integrated K-Means attempts

---
 cpp/bench.cpp                     |   4 +-
 include/usearch/index_dense.hpp   | 306 +++++++-----------------------
 include/usearch/index_plugins.hpp |   3 +-
 3 files changed, 70 insertions(+), 243 deletions(-)

diff --git a/cpp/bench.cpp b/cpp/bench.cpp
index 63e7f9d3..2da000e2 100644
--- a/cpp/bench.cpp
+++ b/cpp/bench.cpp
@@ -185,7 +185,7 @@ struct persisted_dataset_gt {
     std::size_t vectors_count() const noexcept {
         return vectors_to_take_ ? vectors_to_take_ : (vectors_.rows - vectors_to_skip_);
     }
-    vectors_view_gt<scalar_t> vectors_view() const noexcept {
+    matrix_slice_gt<scalar_t const> vectors_view() const noexcept {
         return {vector(vectors_to_skip_), vectors_count(), dimensions()};
     }
 };
@@ -224,7 +224,7 @@ struct in_memory_dataset_gt {
     scalar_t* query(std::size_t i) noexcept { return queries_.data() + i * dimensions_; }
     compressed_slot_t* neighborhood(std::size_t i) noexcept { return neighborhoods_.data() + i * neighborhood_size_; }
 
-    vectors_view_gt<scalar_t> vectors_view() const noexcept { return {vector(0), vectors_count(), dimensions()}; }
+    matrix_slice_gt<scalar_t const> vectors_view() const noexcept { return {vector(0), vectors_count(), dimensions()}; }
 };
 
 char const* getenv_or(char const* name, char const* default_) { return getenv(name) ? getenv(name) : default_; }
diff --git a/include/usearch/index_dense.hpp b/include/usearch/index_dense.hpp
index 416c1227..1fb24e73 100644
--- a/include/usearch/index_dense.hpp
+++ b/include/usearch/index_dense.hpp
@@ -406,8 +406,6 @@ class index_dense_gt {
     using tape_allocator_t = memory_mapping_allocator_gt<64>;
 
   private:
-    /// @brief Schema: input buffer, bytes in input buffer, output buffer.
-    using cast_t = bool (*)(byte_t const*, std::size_t, byte_t*);
     /// @brief Punned index.
     using index_t = index_gt<                        //
         distance_t, vector_key_t, compressed_slot_t, //
@@ -446,19 +444,7 @@ class index_dense_gt {
 
     /// @brief  Temporary memory for every thread to store a casted vector.
     mutable cast_buffer_t cast_buffer_;
-    struct casts_t {
-        cast_t from_b1x8;
-        cast_t from_i8;
-        cast_t from_f16;
-        cast_t from_f32;
-        cast_t from_f64;
-
-        cast_t to_b1x8;
-        cast_t to_i8;
-        cast_t to_f16;
-        cast_t to_f32;
-        cast_t to_f64;
-    } casts_;
+    casts_punned_t casts_;
 
     /// @brief An instance of a potentially stateful `metric_t` used to initialize copies and forks.
     metric_t metric_;
@@ -677,7 +663,7 @@ class index_dense_gt {
         // In some cases the metric is not provided, and will be set later.
         if (metric) {
             scalar_kind_t scalar_kind = metric.scalar_kind();
-            index.casts_ = make_casts_(scalar_kind);
+            index.casts_ = casts_punned_t::make(scalar_kind);
             index.metric_ = metric;
         }
 
@@ -767,41 +753,41 @@ class index_dense_gt {
     };
 
     // clang-format off
-    add_result_t add(vector_key_t key, b1x8_t const* vector, std::size_t thread = any_thread(), bool force_vector_copy = true) { return add_(key, vector, thread, force_vector_copy, casts_.from_b1x8); }
-    add_result_t add(vector_key_t key, i8_t const* vector, std::size_t thread = any_thread(), bool force_vector_copy = true) { return add_(key, vector, thread, force_vector_copy, casts_.from_i8); }
-    add_result_t add(vector_key_t key, f16_t const* vector, std::size_t thread = any_thread(), bool force_vector_copy = true) { return add_(key, vector, thread, force_vector_copy, casts_.from_f16); }
-    add_result_t add(vector_key_t key, f32_t const* vector, std::size_t thread = any_thread(), bool force_vector_copy = true) { return add_(key, vector, thread, force_vector_copy, casts_.from_f32); }
-    add_result_t add(vector_key_t key, f64_t const* vector, std::size_t thread = any_thread(), bool force_vector_copy = true) { return add_(key, vector, thread, force_vector_copy, casts_.from_f64); }
-
-    search_result_t search(b1x8_t const* vector, std::size_t wanted, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, dummy_predicate_t {}, thread, exact, casts_.from_b1x8); }
-    search_result_t search(i8_t const* vector, std::size_t wanted, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, dummy_predicate_t {}, thread, exact, casts_.from_i8); }
-    search_result_t search(f16_t const* vector, std::size_t wanted, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, dummy_predicate_t {}, thread, exact, casts_.from_f16); }
-    search_result_t search(f32_t const* vector, std::size_t wanted, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, dummy_predicate_t {}, thread, exact, casts_.from_f32); }
-    search_result_t search(f64_t const* vector, std::size_t wanted, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, dummy_predicate_t {}, thread, exact, casts_.from_f64); }
-
-    template <typename predicate_at> search_result_t filtered_search(b1x8_t const* vector, std::size_t wanted, predicate_at&& predicate, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, std::forward<predicate_at>(predicate), thread, exact, casts_.from_b1x8); }
-    template <typename predicate_at> search_result_t filtered_search(i8_t const* vector, std::size_t wanted, predicate_at&& predicate, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, std::forward<predicate_at>(predicate), thread, exact, casts_.from_i8); }
-    template <typename predicate_at> search_result_t filtered_search(f16_t const* vector, std::size_t wanted, predicate_at&& predicate, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, std::forward<predicate_at>(predicate), thread, exact, casts_.from_f16); }
-    template <typename predicate_at> search_result_t filtered_search(f32_t const* vector, std::size_t wanted, predicate_at&& predicate, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, std::forward<predicate_at>(predicate), thread, exact, casts_.from_f32); }
-    template <typename predicate_at> search_result_t filtered_search(f64_t const* vector, std::size_t wanted, predicate_at&& predicate, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, std::forward<predicate_at>(predicate), thread, exact, casts_.from_f64); }
-
-    std::size_t get(vector_key_t key, b1x8_t* vector, std::size_t vectors_count = 1) const { return get_(key, vector, vectors_count, casts_.to_b1x8); }
-    std::size_t get(vector_key_t key, i8_t* vector, std::size_t vectors_count = 1) const { return get_(key, vector, vectors_count, casts_.to_i8); }
-    std::size_t get(vector_key_t key, f16_t* vector, std::size_t vectors_count = 1) const { return get_(key, vector, vectors_count, casts_.to_f16); }
-    std::size_t get(vector_key_t key, f32_t* vector, std::size_t vectors_count = 1) const { return get_(key, vector, vectors_count, casts_.to_f32); }
-    std::size_t get(vector_key_t key, f64_t* vector, std::size_t vectors_count = 1) const { return get_(key, vector, vectors_count, casts_.to_f64); }
-
-    cluster_result_t cluster(b1x8_t const* vector, std::size_t level, std::size_t thread = any_thread()) const { return cluster_(vector, level, thread, casts_.from_b1x8); }
-    cluster_result_t cluster(i8_t const* vector, std::size_t level, std::size_t thread = any_thread()) const { return cluster_(vector, level, thread, casts_.from_i8); }
-    cluster_result_t cluster(f16_t const* vector, std::size_t level, std::size_t thread = any_thread()) const { return cluster_(vector, level, thread, casts_.from_f16); }
-    cluster_result_t cluster(f32_t const* vector, std::size_t level, std::size_t thread = any_thread()) const { return cluster_(vector, level, thread, casts_.from_f32); }
-    cluster_result_t cluster(f64_t const* vector, std::size_t level, std::size_t thread = any_thread()) const { return cluster_(vector, level, thread, casts_.from_f64); }
-
-    aggregated_distances_t distance_between(vector_key_t key, b1x8_t const* vector, std::size_t thread = any_thread()) const { return distance_between_(key, vector, thread, casts_.to_b1x8); }
-    aggregated_distances_t distance_between(vector_key_t key, i8_t const* vector, std::size_t thread = any_thread()) const { return distance_between_(key, vector, thread, casts_.to_i8); }
-    aggregated_distances_t distance_between(vector_key_t key, f16_t const* vector, std::size_t thread = any_thread()) const { return distance_between_(key, vector, thread, casts_.to_f16); }
-    aggregated_distances_t distance_between(vector_key_t key, f32_t const* vector, std::size_t thread = any_thread()) const { return distance_between_(key, vector, thread, casts_.to_f32); }
-    aggregated_distances_t distance_between(vector_key_t key, f64_t const* vector, std::size_t thread = any_thread()) const { return distance_between_(key, vector, thread, casts_.to_f64); }
+    add_result_t add(vector_key_t key, b1x8_t const* vector, std::size_t thread = any_thread(), bool force_vector_copy = true) { return add_(key, vector, thread, force_vector_copy, casts_.from.b1x8); }
+    add_result_t add(vector_key_t key, i8_t const* vector, std::size_t thread = any_thread(), bool force_vector_copy = true) { return add_(key, vector, thread, force_vector_copy, casts_.from.i8); }
+    add_result_t add(vector_key_t key, f16_t const* vector, std::size_t thread = any_thread(), bool force_vector_copy = true) { return add_(key, vector, thread, force_vector_copy, casts_.from.f16); }
+    add_result_t add(vector_key_t key, f32_t const* vector, std::size_t thread = any_thread(), bool force_vector_copy = true) { return add_(key, vector, thread, force_vector_copy, casts_.from.f32); }
+    add_result_t add(vector_key_t key, f64_t const* vector, std::size_t thread = any_thread(), bool force_vector_copy = true) { return add_(key, vector, thread, force_vector_copy, casts_.from.f64); }
+
+    search_result_t search(b1x8_t const* vector, std::size_t wanted, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, dummy_predicate_t {}, thread, exact, casts_.from.b1x8); }
+    search_result_t search(i8_t const* vector, std::size_t wanted, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, dummy_predicate_t {}, thread, exact, casts_.from.i8); }
+    search_result_t search(f16_t const* vector, std::size_t wanted, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, dummy_predicate_t {}, thread, exact, casts_.from.f16); }
+    search_result_t search(f32_t const* vector, std::size_t wanted, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, dummy_predicate_t {}, thread, exact, casts_.from.f32); }
+    search_result_t search(f64_t const* vector, std::size_t wanted, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, dummy_predicate_t {}, thread, exact, casts_.from.f64); }
+
+    template <typename predicate_at> search_result_t filtered_search(b1x8_t const* vector, std::size_t wanted, predicate_at&& predicate, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, std::forward<predicate_at>(predicate), thread, exact, casts_.from.b1x8); }
+    template <typename predicate_at> search_result_t filtered_search(i8_t const* vector, std::size_t wanted, predicate_at&& predicate, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, std::forward<predicate_at>(predicate), thread, exact, casts_.from.i8); }
+    template <typename predicate_at> search_result_t filtered_search(f16_t const* vector, std::size_t wanted, predicate_at&& predicate, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, std::forward<predicate_at>(predicate), thread, exact, casts_.from.f16); }
+    template <typename predicate_at> search_result_t filtered_search(f32_t const* vector, std::size_t wanted, predicate_at&& predicate, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, std::forward<predicate_at>(predicate), thread, exact, casts_.from.f32); }
+    template <typename predicate_at> search_result_t filtered_search(f64_t const* vector, std::size_t wanted, predicate_at&& predicate, std::size_t thread = any_thread(), bool exact = false) const { return search_(vector, wanted, std::forward<predicate_at>(predicate), thread, exact, casts_.from.f64); }
+
+    std::size_t get(vector_key_t key, b1x8_t* vector, std::size_t vectors_count = 1) const { return get_(key, vector, vectors_count, casts_.to.b1x8); }
+    std::size_t get(vector_key_t key, i8_t* vector, std::size_t vectors_count = 1) const { return get_(key, vector, vectors_count, casts_.to.i8); }
+    std::size_t get(vector_key_t key, f16_t* vector, std::size_t vectors_count = 1) const { return get_(key, vector, vectors_count, casts_.to.f16); }
+    std::size_t get(vector_key_t key, f32_t* vector, std::size_t vectors_count = 1) const { return get_(key, vector, vectors_count, casts_.to.f32); }
+    std::size_t get(vector_key_t key, f64_t* vector, std::size_t vectors_count = 1) const { return get_(key, vector, vectors_count, casts_.to.f64); }
+
+    cluster_result_t cluster(b1x8_t const* vector, std::size_t level, std::size_t thread = any_thread()) const { return cluster_(vector, level, thread, casts_.from.b1x8); }
+    cluster_result_t cluster(i8_t const* vector, std::size_t level, std::size_t thread = any_thread()) const { return cluster_(vector, level, thread, casts_.from.i8); }
+    cluster_result_t cluster(f16_t const* vector, std::size_t level, std::size_t thread = any_thread()) const { return cluster_(vector, level, thread, casts_.from.f16); }
+    cluster_result_t cluster(f32_t const* vector, std::size_t level, std::size_t thread = any_thread()) const { return cluster_(vector, level, thread, casts_.from.f32); }
+    cluster_result_t cluster(f64_t const* vector, std::size_t level, std::size_t thread = any_thread()) const { return cluster_(vector, level, thread, casts_.from.f64); }
+
+    aggregated_distances_t distance_between(vector_key_t key, b1x8_t const* vector, std::size_t thread = any_thread()) const { return distance_between_(key, vector, thread, casts_.to.b1x8); }
+    aggregated_distances_t distance_between(vector_key_t key, i8_t const* vector, std::size_t thread = any_thread()) const { return distance_between_(key, vector, thread, casts_.to.i8); }
+    aggregated_distances_t distance_between(vector_key_t key, f16_t const* vector, std::size_t thread = any_thread()) const { return distance_between_(key, vector, thread, casts_.to.f16); }
+    aggregated_distances_t distance_between(vector_key_t key, f32_t const* vector, std::size_t thread = any_thread()) const { return distance_between_(key, vector, thread, casts_.to.f32); }
+    aggregated_distances_t distance_between(vector_key_t key, f64_t const* vector, std::size_t thread = any_thread()) const { return distance_between_(key, vector, thread, casts_.to.f64); }
     // clang-format on
 
     /**
@@ -1154,7 +1140,7 @@ class index_dense_gt {
             cast_buffer_ = cast_buffer_t(available_threads_.size() * metric_.bytes_per_vector());
             if (!cast_buffer_)
                 return result.failed("Failed to allocate memory for the casts");
-            casts_ = make_casts_(head.kind_scalar);
+            casts_ = casts_punned_t::make(head.kind_scalar);
         }
 
         // Pull the actual proximity graph
@@ -1266,7 +1252,7 @@ class index_dense_gt {
             cast_buffer_ = cast_buffer_t(available_threads_.size() * metric_.bytes_per_vector());
             if (!cast_buffer_)
                 return result.failed("Failed to allocate memory for the casts");
-            casts_ = make_casts_(head.kind_scalar);
+            casts_ = casts_punned_t::make(head.kind_scalar);
             offset += sizeof(buffer);
         }
 
@@ -1791,110 +1777,6 @@ class index_dense_gt {
         }
     };
 
-    /**
-     *  @brief  Implements K-Means clustering for all the points in the index.
-     *
-     *  Unlike the normal K-Means, we compute a hierarchical version.
-     *  1. Find the first layer that contains enough nodes to exceed `config.clusters`.
-     *  2. Assign random centroids to those points, and perform K-Means between them.
-     *  3. Iterate a few times and go down, assigning neighbors of the child nodes to the same cluster.
-     *
-     *  ! Doesn't guarantee that the clusters are balanced in size.
-     *  ! Brute-forces the centroids, so it's not very efficient for large datasets,
-     *  ! but benefits from initializing the centroids with the second largest level.
-     *  ! Not designed for multi-indexes.
-     *  ! Assumes all slots are occupied and no deletions are present.
-     *
-     *  @param[in] config Configuration parameters for K-Means clustering.
-     *  @param[in] executor Thread-pool to execute the job in parallel.
-     *  @param[in] progress Callback to report the execution progress.
-     *
-     *  @param[out] cluster_keys Pointer to the array where the cluster keys will be exported.
-     *  @param[out] cluster_distances Pointer to the array where the distances to those centroids will be exported.
-     */
-    template <                                   //
-        typename executor_at = dummy_executor_t, //
-        typename progress_at = dummy_progress_t  //
-        >
-    clustering_result_t cluster(                       //
-        index_dense_clustering_kmeans_config_t config, //
-        vector_key_t* member_keys,                     //
-        std::size_t* centroid_index,                   //
-        distance_t* centroid_distances,                //
-        f64_t** centroids,                             //
-        std::size_t* centroids_stride,                 //
-        executor_at&& executor = executor_at{},        //
-        progress_at&& progress = progress_at{}) {
-
-        clustering_result_t result;
-
-        // Perform sanity checks.
-        if (config.clusters < 2)
-            return result.failed("The number of clusters must be at least 2");
-        if (config.max_iterations < 1)
-            return result.failed("The number of iterations must be at least 1");
-        std::size_t const nodes_count = typed_->size();
-        std::size_t const vectors_count = size();
-        if (nodes_count != vectors_count)
-            return result.failed("Multi-indexes and deletions are not supported");
-        if (config.clusters >= vectors_count)
-            return result.failed("The number of clusters must be less than the number of vectors");
-
-        // Find the first level (top -> down) that has enough nodes to exceed `config.clusters`.
-        level_t level = static_cast<level_t>(max_level());
-        for (; level > 0; --level) {
-            if (stats(level).nodes > config.clusters)
-                break;
-        }
-
-        // Let's allocate memory for the centroids coordinates and make sure it's
-        // rows are aligned to cache lines to avoid false sharing.
-        using centroid_allocator_t = aligned_allocator_gt<f64_t, 64>;
-        std::size_t centroids_rows = config.clusters;
-        std::size_t centroids_row_bytes = metric_.bytes_per_vector();
-        std::size_t centroids_row_stride = divide_round_up<64>(centroids_row_bytes);
-        f64_t* centroids_data = centroid_allocator_t{}.allocate(centroids_rows * centroids_row_stride);
-        if (!centroids_data)
-            return result.failed("Out of memory!");
-
-        std::random_device random_device;
-        std::mt19937 random_engine(random_device());
-
-        // Randomly initialize centroids keys - go through all the non-removed slots.
-        {
-            std::size_t vectors_exported_count = 0;
-            slot_lookup_.for_each([&](key_and_slot_t const& key_and_slot) {
-                member_keys[key_and_slot.slot] = key_and_slot.key;
-                centroid_index[key_and_slot.slot] = random_engine() % centroids_rows;
-                ++vectors_exported_count;
-            });
-            usearch_assert_m(vectors_exported_count == vectors_count, "Vectors count mismatch");
-        }
-        std::fill(centroid_distances, centroid_distances + vectors_count, std::numeric_limits<distance_t>::max());
-
-        // Instead of generating random values for centroids,
-        // we can copy and upcast some random points from the bottom layer.
-        for (std::size_t i = 0; i < centroids_rows; i++) {
-            std::size_t slot = random_engine() % nodes_count;
-            byte_t const* vector = vectors_lookup_[slot];
-            if (!casts_.to_f64(vector, centroids_data + i * centroids_row_stride))
-                std::memcpy(centroids_data + i * centroids_row_stride, vector, centroids_row_bytes);
-            centroid_index[slot] = i;
-            centroid_distances[slot] = 0;
-        }
-
-        // Go down the layers, performing K-Means clustering on each one.
-        for (; level >= 0; --level) {
-            for (std::size_t slot = 0; slot < nodes_count; slot++) {
-                byte_t const* vector = vectors_lookup_[slot];
-                std::size_t closest_centroid = centroid_index[slot];
-                distance_t distance =
-                    metric_.distance(vector, centroids_data + closest_centroid * centroids_row_stride);
-                centroid_distances[slot] = distance;
-            }
-        }
-    }
-
     /**
      *  @brief  Implements clustering, classifying the given objects (vectors of member keys)
      *          into a given number of clusters.
@@ -1962,56 +1844,23 @@ class index_dense_gt {
     map_to_clusters:
         // Concurrently perform search until a certain depth
         executor.dynamic(queries_count, [&](std::size_t thread_idx, std::size_t query_idx) {
-            auto query_key = queries_begin[query_idx];
-            if (level) {
-                auto result = cluster(query_key, level, thread_idx);
-                if (!result) {
-                    atomic_error = result.error.release();
-                    return false;
-                }
-
-                cluster_keys[query_idx] = result.cluster.member.key;
-                cluster_distances[query_idx] = result.cluster.distance;
-
-                // Export in case we need to refine afterwards
-                clusters[query_idx].centroid = result.cluster.member.key;
-                clusters[query_idx].vector = vectors_lookup_[result.cluster.member.slot];
-                clusters[query_idx].merged_into = free_key();
-                clusters[query_idx].popularity = 1;
-
-                visited_members += result.visited_members;
-                computed_distances += result.computed_distances;
-            } else {
-                index_search_config_t search_config;
-                search_config.thread = thread_idx;
-                search_config.expansion = config_.expansion_search;
-                search_config.exact = false;
-
-                vector_key_t free_key_copy = free_key_;
-                auto allow = [free_key_copy](member_cref_t const& member) noexcept {
-                    return (vector_key_t)member.key != free_key_copy;
-                };
-                auto query_slot = slot_lookup_.find(key_and_slot_t::any_slot(query_key))->slot;
-                byte_t const* vector_data = vectors_lookup_[query_slot];
-                auto result = typed_->search(vector_data, 1, metric_proxy_t{*this}, search_config, allow);
-                if (!result) {
-                    atomic_error = result.error.release();
-                    return false;
-                }
-
-                cluster_keys[query_idx] = result.front().member.key;
-                cluster_distances[query_idx] = result.front().distance;
+            auto result = cluster(queries_begin[query_idx], level, thread_idx);
+            if (!result) {
+                atomic_error = result.error.release();
+                return false;
+            }
 
-                // Export in case we need to refine afterwards
-                clusters[query_idx].centroid = result.front().member.key;
-                clusters[query_idx].vector = vectors_lookup_[result.front().member.slot];
-                clusters[query_idx].merged_into = free_key();
-                clusters[query_idx].popularity = 1;
+            cluster_keys[query_idx] = result.cluster.member.key;
+            cluster_distances[query_idx] = result.cluster.distance;
 
-                visited_members += result.visited_members;
-                computed_distances += result.computed_distances;
-            }
+            // Export in case we need to refine afterwards
+            clusters[query_idx].centroid = result.cluster.member.key;
+            clusters[query_idx].vector = vectors_lookup_[result.cluster.member.slot];
+            clusters[query_idx].merged_into = free_key();
+            clusters[query_idx].popularity = 1;
 
+            visited_members += result.visited_members;
+            computed_distances += result.computed_distances;
             return true;
         });
 
@@ -2131,7 +1980,8 @@ class index_dense_gt {
     template <typename scalar_at>
     add_result_t add_(                             //
         vector_key_t key, scalar_at const* vector, //
-        std::size_t thread, bool force_vector_copy, cast_t const& cast) {
+        std::size_t thread, bool force_vector_copy, cast_punned_t const& cast) {
+
         if (!multi() && config().enable_key_lookups && contains(key))
             return add_result_t{}.failed("Duplicate keys not allowed in high-level wrappers");
 
@@ -2180,7 +2030,8 @@ class index_dense_gt {
 
     template <typename scalar_at, typename predicate_at>
     search_result_t search_(scalar_at const* vector, std::size_t wanted, predicate_at&& predicate, std::size_t thread,
-                            bool exact, cast_t const& cast) const {
+                            bool exact, cast_punned_t const& cast) const {
+
         // Cast the vector, if needed for compatibility with `metric_`
         thread_lock_t lock = thread_lock_(thread);
         byte_t const* vector_data = reinterpret_cast<byte_t const*>(vector);
@@ -2215,7 +2066,8 @@ class index_dense_gt {
     template <typename scalar_at>
     cluster_result_t cluster_(                      //
         scalar_at const* vector, std::size_t level, //
-        std::size_t thread, cast_t const& cast) const {
+        std::size_t thread, cast_punned_t const& cast) const {
+
         // Cast the vector, if needed for compatibility with `metric_`
         thread_lock_t lock = thread_lock_(thread);
         byte_t const* vector_data = reinterpret_cast<byte_t const*>(vector);
@@ -2238,7 +2090,8 @@ class index_dense_gt {
     template <typename scalar_at>
     aggregated_distances_t distance_between_(      //
         vector_key_t key, scalar_at const* vector, //
-        std::size_t thread, cast_t const& cast) const {
+        std::size_t thread, cast_punned_t const& cast) const {
+
         // Cast the vector, if needed for compatibility with `metric_`
         thread_lock_t lock = thread_lock_(thread);
         byte_t const* vector_data = reinterpret_cast<byte_t const*>(vector);
@@ -2282,6 +2135,7 @@ class index_dense_gt {
     }
 
     void reindex_keys_() {
+
         // Estimate number of entries first
         std::size_t count_total = typed_->size();
         std::size_t count_removed = 0;
@@ -2313,7 +2167,9 @@ class index_dense_gt {
     }
 
     template <typename scalar_at>
-    std::size_t get_(vector_key_t key, scalar_at* reconstructed, std::size_t vectors_limit, cast_t const& cast) const {
+    std::size_t get_(vector_key_t key, scalar_at* reconstructed, std::size_t vectors_limit,
+                     cast_punned_t const& cast) const {
+
         if (!multi()) {
             compressed_slot_t slot;
             // Find the matching ID
@@ -2347,36 +2203,6 @@ class index_dense_gt {
             return count_exported;
         }
     }
-
-    template <typename to_scalar_at> static casts_t make_casts_() {
-        casts_t result;
-
-        result.from_b1x8 = &cast_gt<b1x8_t, to_scalar_at>::try_;
-        result.from_i8 = &cast_gt<i8_t, to_scalar_at>::try_;
-        result.from_f16 = &cast_gt<f16_t, to_scalar_at>::try_;
-        result.from_f32 = &cast_gt<f32_t, to_scalar_at>::try_;
-        result.from_f64 = &cast_gt<f64_t, to_scalar_at>::try_;
-
-        result.to_b1x8 = &cast_gt<to_scalar_at, b1x8_t>::try_;
-        result.to_i8 = &cast_gt<to_scalar_at, i8_t>::try_;
-        result.to_f16 = &cast_gt<to_scalar_at, f16_t>::try_;
-        result.to_f32 = &cast_gt<to_scalar_at, f32_t>::try_;
-        result.to_f64 = &cast_gt<to_scalar_at, f64_t>::try_;
-
-        return result;
-    }
-
-    static casts_t make_casts_(scalar_kind_t scalar_kind) {
-        switch (scalar_kind) {
-        case scalar_kind_t::f64_k: return make_casts_<f64_t>();
-        case scalar_kind_t::f32_k: return make_casts_<f32_t>();
-        case scalar_kind_t::f16_k: return make_casts_<f16_t>();
-        case scalar_kind_t::bf16_k: return make_casts_<bf16_t>();
-        case scalar_kind_t::i8_k: return make_casts_<i8_t>();
-        case scalar_kind_t::b1x8_k: return make_casts_<b1x8_t>();
-        default: return {};
-        }
-    }
 };
 
 using index_dense_t = index_dense_gt<>;
@@ -2423,4 +2249,4 @@ static join_result_t join(                                    //
 }
 
 } // namespace usearch
-} // namespace unum
+} // namespace unum
\ No newline at end of file
diff --git a/include/usearch/index_plugins.hpp b/include/usearch/index_plugins.hpp
index 98bc5da0..9dec50e0 100644
--- a/include/usearch/index_plugins.hpp
+++ b/include/usearch/index_plugins.hpp
@@ -1924,6 +1924,7 @@ class metric_punned_t {
 template <typename scalar_at> //
 class matrix_slice_gt {
     using scalar_t = scalar_at;
+    using byte_addressable_t = typename std::conditional<std::is_const<scalar_t>::value, byte_t const, byte_t>::type;
 
     scalar_t* begin_{};
     std::size_t dimensions_{};
@@ -1947,7 +1948,7 @@ class matrix_slice_gt {
     std::size_t stride() const noexcept { return stride_bytes_; }
     scalar_t* data() const noexcept { return begin_; }
     scalar_t* at(std::size_t i) const noexcept {
-        return reinterpret_cast<scalar_t*>(reinterpret_cast<byte_t*>(begin_) + i * stride_bytes_);
+        return reinterpret_cast<scalar_t*>(reinterpret_cast<byte_addressable_t*>(begin_) + i * stride_bytes_);
     }
 };
 

From 9c799e18ed27c7006cccf6b34c3167ccf99cdfe9 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 2 Sep 2024 22:15:25 +0000
Subject: [PATCH 05/32] Fix: Ambiguous `destroy_at` call

---
 include/usearch/index.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/usearch/index.hpp b/include/usearch/index.hpp
index e524085a..fa69cd1d 100644
--- a/include/usearch/index.hpp
+++ b/include/usearch/index.hpp
@@ -367,7 +367,7 @@ template <typename scalar_at, typename allocator_at = std::allocator<scalar_at>>
     void reset() noexcept {
         if (!std::is_trivially_destructible<scalar_at>::value)
             for (std::size_t i = 0; i != size_; ++i)
-                destroy_at(data_ + i);
+                unum::usearch::destroy_at(data_ + i); //< Facing some symbol visibility/ambiguity issues
         allocator_at{}.deallocate(data_, size_);
         data_ = nullptr;
         size_ = 0;

From e35eab89530c89d68509e52af65a04123880f994 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 2 Sep 2024 22:16:31 +0000
Subject: [PATCH 06/32] Improve: Export aggregate distances

---
 cpp/test.cpp                      | 13 +++++++----
 include/usearch/index_plugins.hpp | 37 ++++++++++++++++++++-----------
 2 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/cpp/test.cpp b/cpp/test.cpp
index 74c4b27d..4e03bfb8 100644
--- a/cpp/test.cpp
+++ b/cpp/test.cpp
@@ -1100,15 +1100,20 @@ int main(int, char**) {
     test_uint40();
     test_cosine<float, std::int64_t, uint40_t>(10, 10);
 
-    //
+    // Test plugins, like K-Means clustering.
     {
         std::size_t vectors_count = 1000, centroids_count = 10, dimensions = 256;
         kmeans_clustering_t clustering;
+        clustering.max_iterations = 2;
         std::vector<float> vectors(vectors_count * dimensions), centroids(centroids_count * dimensions);
-        matrix_slice_gt<float const> vectors_slice(vectors.data(), vectors_count, dimensions);
-        matrix_slice_gt<float> centroids_slice(centroids.data(), centroids_count, dimensions);
+        matrix_slice_gt<float const> vectors_slice(vectors.data(), dimensions, vectors_count);
+        matrix_slice_gt<float> centroids_slice(centroids.data(), dimensions, centroids_count);
         std::generate(vectors.begin(), vectors.end(), [] { return float(std::rand()) / float(INT_MAX); });
-        clustering(vectors_slice, centroids_slice);
+        std::vector<std::size_t> assignments(vectors_count);
+        std::vector<distance_punned_t> distances(vectors_count);
+        auto clustering_result = clustering(vectors_slice, centroids_slice, {assignments.data(), assignments.size()},
+                                            {distances.data(), distances.size()});
+        expect(clustering_result);
     }
 
     // Exact search without constructing indexes.
diff --git a/include/usearch/index_plugins.hpp b/include/usearch/index_plugins.hpp
index 9dec50e0..d1019e8e 100644
--- a/include/usearch/index_plugins.hpp
+++ b/include/usearch/index_plugins.hpp
@@ -2073,6 +2073,8 @@ struct kmeans_clustering_result_t {
     f64_t last_iteration_inertia{};
     /// @brief The total elapsed runtime of the algorithm in seconds.
     f64_t runtime_seconds{};
+    /// @brief The total distance between the points and their assigned centroids.
+    f64_t aggregate_distance{};
 
     explicit operator bool() const noexcept { return !error; }
     kmeans_clustering_result_t failed(error_t message) noexcept {
@@ -2110,12 +2112,14 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
     std::size_t min_point_shifts_per_iteration{};
 
     template <typename scalar_at, typename executor_at = dummy_executor_t, typename progress_at = dummy_progress_t>
-    kmeans_clustering_result_t operator()(                                             //
-        matrix_slice_gt<scalar_at const> points, matrix_slice_gt<scalar_at> centroids, //
+    kmeans_clustering_result_t operator()( //
+        matrix_slice_gt<scalar_at const> points, matrix_slice_gt<scalar_at> centroids,
+        span_gt<std::size_t> point_to_centroid_index, span_gt<distance_t> point_to_centroid_distance, //
         executor_at&& executor = executor_at{}, progress_at&& progress = progress_at{}) {
-        return operator()(                                                                     //
-            reinterpret_cast<byte_t const*>(points.data()), points.size(), points.stride(),    //
-            reinterpret_cast<byte_t*>(centroids.data()), centroids.size(), centroids.stride(), //
+        return operator()(                                                                  //
+            reinterpret_cast<byte_t const*>(points.data()), points.size(), points.stride(), //
+            reinterpret_cast<byte_t*>(centroids.data()), centroids.size(), centroids.stride(),
+            point_to_centroid_index.data(), //
             scalar_kind<scalar_at>(), points.dimensions(), executor, progress);
     }
 
@@ -2123,7 +2127,7 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
     kmeans_clustering_result_t operator()(                                                 //
         byte_t const* points_data, std::size_t points_count, std::size_t points_stride,    //
         byte_t* centroids_data, std::size_t wanted_clusters, std::size_t centroids_stride, //
-        std::size_t* point_to_centroid_index,                                              //
+        std::size_t* point_to_centroid_index, distance_t* point_to_centroid_distance,      //
         scalar_kind_t original_scalar_kind, std::size_t dimensions, executor_at&& executor = executor_at{},
         progress_at&& progress = progress_at{}) {
 
@@ -2155,7 +2159,7 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
         std::size_t const bytes_per_vector_original =
             divide_round_up<CHAR_BIT>(dimensions * bits_per_scalar(original_scalar_kind));
         std::size_t const bytes_per_vector_quantized = metric.bytes_per_vector();
-        std::size_t const stride_per_vector_quantized = divide_round_up<64>(bytes_per_vector_quantized);
+        std::size_t const stride_per_vector_quantized = divide_round_up<64>(bytes_per_vector_quantized) * 64;
         buffer_gt<byte_t, aligned_allocator_gt<byte_t, 64>> points_quantized_buffer(points_count *
                                                                                     stride_per_vector_quantized);
         buffer_gt<byte_t, aligned_allocator_gt<byte_t, 64>> centroids_quantized_buffer(wanted_clusters *
@@ -2191,7 +2195,7 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
         for (std::size_t i = 0; i < points_count; i++) {
             byte_t const* vector = points_data + i * points_stride;
             byte_t* quantized = points_quantized_buffer.data() + i * stride_per_vector_quantized;
-            if (!compress_points(vector, bytes_per_vector_original, quantized))
+            if (!compress_points(vector, dimensions, quantized))
                 std::memcpy(quantized, vector, bytes_per_vector_original);
         }
 
@@ -2281,8 +2285,9 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
             // Alternatively, a tree-like approach can be used, where every core accumulates it's own partial sums.
             // And those are later aggregated by a single thread.
             std::memset(centroids_precise_buffer.data(), 0,
-                        wanted_clusters * dimensions * 2 * thread_count * sizeof(f64_t));
-            std::memset(cluster_sizes_buffer.data(), 0, wanted_clusters * sizeof(std::atomic<std::size_t>));
+                        wanted_clusters * dimensions * thread_count * sizeof(f64_t));
+            std::memset(reinterpret_cast<byte_t*>(cluster_sizes_buffer.data()), 0,
+                        wanted_clusters * sizeof(std::atomic<std::size_t>));
             executor.dynamic(points_count, [&](std::size_t thread_idx, std::size_t points_idx) {
                 std::size_t centroid_idx = point_to_centroid_index_buffer[points_idx];
                 byte_t const* quantized_point =
@@ -2337,21 +2342,27 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
                 // Quantize the centroid after normalization for further iterations
                 byte_t* quantized_centroid =
                     centroids_quantized_buffer.data() + centroid_idx * stride_per_vector_quantized;
-                if (!compress_precise(reinterpret_cast<byte_t*>(centroid_precise_aggregated),
-                                      bytes_per_vector_quantized, quantized_centroid))
+                if (!compress_precise(reinterpret_cast<byte_t*>(centroid_precise_aggregated), dimensions,
+                                      quantized_centroid))
                     std::memcpy(quantized_centroid, reinterpret_cast<byte_t*>(centroid_precise_aggregated),
                                 bytes_per_vector_quantized);
             }
         }
 
+        // Export stats.
         result.iterations = iterations;
+        result.computed_distances = points_count * wanted_clusters * iterations;
+        result.aggregate_distance =
+            std::accumulate(point_to_centroid_distance_buffer.begin(), point_to_centroid_distance_buffer.end(), 0.0);
 
         // We've finished all the iterations, now we can export the centroids back to the original precision.
         std::memcpy(point_to_centroid_index, point_to_centroid_index_buffer.data(), points_count * sizeof(std::size_t));
+        std::memcpy(point_to_centroid_distance, point_to_centroid_distance_buffer.data(),
+                    points_count * sizeof(distance_t));
         for (std::size_t i = 0; i < wanted_clusters; i++) {
             byte_t const* quantized_centroid = centroids_quantized_buffer.data() + i * stride_per_vector_quantized;
             byte_t* centroid = centroids_data + i * centroids_stride;
-            if (!decompress_points(quantized_centroid, bytes_per_vector_quantized, centroid))
+            if (!decompress_points(quantized_centroid, dimensions, centroid))
                 std::memcpy(centroid, quantized_centroid, bytes_per_vector_quantized);
         }
 

From db0f44d025103493c972fcadec53a9254912b530 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 2 Sep 2024 22:17:16 +0000
Subject: [PATCH 07/32] Add: PyBind11 API for K-Means

---
 python/lib.cpp | 79 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 1 deletion(-)

diff --git a/python/lib.cpp b/python/lib.cpp
index c21f330a..f072fcba 100644
--- a/python/lib.cpp
+++ b/python/lib.cpp
@@ -452,7 +452,7 @@ static py::tuple search_many_in_index( //
 }
 
 /**
- *  @brief  Brute-force exact search implementation, compatible with
+ *  @brief  Brute-force @b exact search implementation, compatible with
  *          NumPy-like Tensors and other objects supporting Buffer Protocol.
  */
 static py::tuple search_many_brute_force(       //
@@ -545,6 +545,73 @@ static py::tuple search_many_brute_force(       //
     return results;
 }
 
+/**
+ *  @brief  Brute-force @b K-Means clustering, compatible with
+ *          NumPy-like Tensors and other objects supporting Buffer Protocol.
+ */
+static py::tuple cluster_many_brute_force( //
+    py::buffer dataset,                    //
+    std::size_t wanted,                    //
+    std::size_t max_iterations,            //
+    std::size_t threads,                   //
+    metric_kind_t metric_kind,             //
+    progress_func_t const& progress_func) {
+
+    using distance_t = typename kmeans_clustering_t::distance_t;
+    py::buffer_info dataset_info = dataset.request();
+    if (dataset_info.ndim != 2)
+        throw std::invalid_argument("Expects a matrix (rank-2 tensor) of dataset to cluster!");
+
+    std::size_t dataset_count = static_cast<std::size_t>(dataset_info.shape[0]);
+    std::size_t dataset_dimensions = static_cast<std::size_t>(dataset_info.shape[1]);
+    std::size_t dataset_stride = static_cast<std::size_t>(dataset_info.strides[0]);
+    scalar_kind_t dataset_kind = numpy_string_to_kind(dataset_info.format);
+    std::size_t bytes_per_scalar = bits_per_scalar_word(dataset_kind) / CHAR_BIT;
+
+    std::vector<std::size_t> point_to_centroid_index(dataset_count, 0);
+    std::vector<distance_t> point_to_centroid_distance(dataset_count, 0);
+    std::vector<byte_t> centroids(wanted * dataset_dimensions * bytes_per_scalar, 0);
+
+    if (!threads)
+        threads = std::thread::hardware_concurrency();
+
+    // Dispatch brute-force search
+    progress_t progress{progress_func};
+    executor_default_t executor{threads};
+    kmeans_clustering_t engine;
+    engine.metric_kind = metric_kind;
+    engine.quantization_kind = scalar_kind_t::bf16_k;
+    engine.max_iterations = max_iterations;
+
+    kmeans_clustering_result_t result = engine(                                           //
+        reinterpret_cast<byte_t const*>(dataset_info.ptr), dataset_count, dataset_stride, //
+        centroids.data(), wanted, dataset_dimensions,                                     //
+        point_to_centroid_index.data(), point_to_centroid_distance.data(), dataset_kind, dataset_dimensions, executor,
+        [&](std::size_t passed, std::size_t total) { return PyErr_CheckSignals() == 0 && progress(passed, total); });
+
+    if (!result)
+        throw std::runtime_error(result.error.release());
+
+    // Following constructor doesn't seem to be documented, but it's used in the source code of `pybind11`
+    // https://github.com/pybind/pybind11/blob/aeda49ed0b4e6e8abba7abc265ace86a6c26ba66/include/pybind11/numpy.h#L918-L919
+    // https://github.com/pybind/pybind11/blob/aeda49ed0b4e6e8abba7abc265ace86a6c26ba66/include/pybind11/buffer_info.h#L60-L75
+    py::buffer_info centroids_info;
+    centroids_info.ptr = reinterpret_cast<void*>(centroids.data());
+    centroids_info.itemsize = dataset_info.itemsize;
+    centroids_info.size = wanted * dataset_dimensions;
+    centroids_info.format = dataset_info.format;
+    centroids_info.ndim = 2;
+    centroids_info.shape = {wanted, dataset_dimensions};
+    centroids_info.strides = {dataset_dimensions * bytes_per_scalar, bytes_per_scalar};
+
+    py::tuple results(3);
+    results[0] = py::array_t<std::size_t>({dataset_count}, point_to_centroid_index.data());
+    results[1] = py::array_t<distance_t>({dataset_count}, point_to_centroid_distance.data());
+    results[2] = py::array(centroids_info);
+
+    return results;
+}
+
 template <typename scalar_at> struct rows_lookup_gt {
     byte_t* data_;
     std::size_t stride_;
@@ -943,6 +1010,16 @@ PYBIND11_MODULE(compiled, m) {
           py::arg("progress") = nullptr                                           //
     );
 
+    m.def("kmeans", &cluster_many_brute_force,            //
+          py::arg("dataset"),                             //
+          py::arg("count") = 10,                          //
+          py::kw_only(),                                  //
+          py::arg("max_iterations") = 100,                //
+          py::arg("threads") = 0,                         //
+          py::arg("metric_kind") = metric_kind_t::l2sq_k, //
+          py::arg("progress") = nullptr                   //
+    );
+
     m.def(
         "hardware_acceleration",
         [](scalar_kind_t scalar_kind, std::size_t dimensions, metric_kind_t metric_kind) -> py::str {

From 3af4801b2fed9d8832cfa8fd5685801327643b2a Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 2 Sep 2024 22:17:45 +0000
Subject: [PATCH 08/32] Add: Clustering benchmark

---
 cluster.py | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 142 insertions(+)
 create mode 100644 cluster.py

diff --git a/cluster.py b/cluster.py
new file mode 100644
index 00000000..4f8df455
--- /dev/null
+++ b/cluster.py
@@ -0,0 +1,142 @@
+import os
+import argparse
+
+import numpy as np
+import faiss
+from tqdm import tqdm
+
+import usearch
+from usearch.index import Index
+from usearch.io import load_matrix
+
+
+def initialize_centroids(X, k):
+    """Randomly choose k data points as initial centroids"""
+    indices = np.random.choice(X.shape[0], k, replace=False)
+    return X[indices]
+
+
+def assign_clusters_numpy(X, centroids):
+    """Assign each data point to the nearest centroid (numpy NumPy implementation)"""
+    distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
+    return np.argmin(distances, axis=1)
+
+
+def assign_clusters_faiss(X, centroids):
+    """Assign each data point to the nearest centroid using FAISS"""
+    index = faiss.IndexFlatL2(centroids.shape[1])
+    index.add(centroids)
+    _, labels = index.search(X, 1)
+    return labels.flatten()
+
+
+def assign_clusters_usearch(X, centroids):
+    """Assign each data point to the nearest centroid using USearch"""
+    dim = centroids.shape[1]
+    index = Index(ndim=dim, metric="l2sq")
+    index.add(None, centroids)
+    results = index.search(X, 1)
+    labels = results.keys.flatten()
+    return labels
+
+
+def update_centroids(X, labels, k):
+    """Compute new centroids as the mean of all data points assigned to each cluster"""
+    return np.array([X[labels == i].mean(axis=0) for i in range(k)])
+
+
+def kmeans(X, k, max_iters=100, tol=1e-4, assign_clusters_func=assign_clusters_numpy):
+    centroids = initialize_centroids(X, k)
+
+    for i in tqdm(range(max_iters), desc="KMeans Iterations"):
+        labels = assign_clusters_func(X, centroids)
+        new_centroids = update_centroids(X, labels, k)
+
+        if np.linalg.norm(new_centroids - centroids) < tol:
+            break
+
+        centroids = new_centroids
+
+    return labels, centroids
+
+
+def evaluate_clustering(X, labels, centroids):
+    """Evaluate clustering quality as average distance to centroids"""
+    distances = np.linalg.norm(X - centroids[labels], axis=1)
+    return np.mean(distances)
+
+
+def inverted_kmeans(X, k, max_iters=100, tol=1e-4):
+    # This algorithm is a bit different.
+    # In a typical KMeans algorithm - we would construct an index of centroids, and search all points in it.
+    # That's not very relevant, as there are very few centroids, and many points. Inverted setup makes more sense.
+
+    pass
+
+
+def custom_faiss_clustering(X, k, max_iters=100):
+    verbose = False
+    d: int = X.shape[1]
+    kmeans = faiss.Kmeans(d, k, niter=max_iters, verbose=verbose)
+    kmeans.train(X)
+    D, I = kmeans.index.search(X, 1)
+    return I.flatten(), kmeans.centroids
+
+
+def custom_usearch_clustering(X, k, max_iters=100):
+    # index = Index(ndim=X.shape[1], metric="l2sq")
+    # index.add(None, X)
+    # clustering = index.cluster(min_count=k, max_count=k)
+    # query_ids = clustering.queries.flatten()
+    # cluster_ids = clustering.matches.keys.flatten()
+    # reordered_cluster_ids = cluster_ids[query_ids]
+    # return reordered_cluster_ids, X[query_ids]
+
+    assignments, distances, centroids = usearch.compiled.kmeans(X, k, max_iterations=max_iters)
+    return assignments, centroids
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Compare KMeans clustering algorithms")
+    parser.add_argument("--vectors", type=str, required=True, help="Path to binary matrix file")
+    parser.add_argument("-k", type=int, required=True, help="Number of centroids")
+    parser.add_argument("-n", type=int, help="Upper bound on number of points to use")
+    parser.add_argument(
+        "--method",
+        type=str,
+        choices=["numpy", "faiss", "usearch", "custom_usearch", "custom_faiss"],
+        default="numpy",
+        help="Clustering backend",
+    )
+
+    args = parser.parse_args()
+
+    X = load_matrix(args.vectors, count_rows=args.n)
+    k = args.k
+    method = args.method
+
+    if method == "custom_usearch":
+        labels, centroids = custom_usearch_clustering(X, k)
+    elif method == "custom_faiss":
+        labels, centroids = custom_faiss_clustering(X, k)
+    else:
+        if method == "numpy":
+            assign_clusters_func = assign_clusters_numpy
+        elif method == "faiss":
+            assign_clusters_func = assign_clusters_faiss
+        elif method == "usearch":
+            assign_clusters_func = assign_clusters_usearch
+        labels, centroids = kmeans(X, k, assign_clusters_func=assign_clusters_func)
+
+    quality = evaluate_clustering(X, labels, centroids)
+    print(f"Clustering quality (average distance to centroids): {quality:.4f}")
+
+    cluster_sizes = np.unique(labels, return_counts=True)[1]
+    cluster_sizes_mean = np.mean(cluster_sizes)
+    cluster_sizes_stddev = np.std(cluster_sizes)
+    print(f"Cluster sizes: {cluster_sizes_mean:.2f} ± {cluster_sizes_stddev:.2f}")
+    print(cluster_sizes)
+
+
+if __name__ == "__main__":
+    main()

From ce1af09643c8cbd83365504b449bf03e0f9a8ac0 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Sep 2024 16:46:51 +0000
Subject: [PATCH 09/32] Fix: Stride arithmetic

---
 cluster.py                        |  40 +++++++----
 include/usearch/index_plugins.hpp | 109 +++++++++++++++++++-----------
 python/lib.cpp                    |   8 ++-
 3 files changed, 99 insertions(+), 58 deletions(-)

diff --git a/cluster.py b/cluster.py
index 4f8df455..6f1362a8 100644
--- a/cluster.py
+++ b/cluster.py
@@ -66,12 +66,21 @@ def evaluate_clustering(X, labels, centroids):
     return np.mean(distances)
 
 
-def inverted_kmeans(X, k, max_iters=100, tol=1e-4):
-    # This algorithm is a bit different.
-    # In a typical KMeans algorithm - we would construct an index of centroids, and search all points in it.
-    # That's not very relevant, as there are very few centroids, and many points. Inverted setup makes more sense.
+def evaluate_clustering_cosine(X, labels, centroids):
+    """Evaluate clustering quality as average cosine distance to centroids"""
 
-    pass
+    # Normalize both data points and centroids
+    X_normalized = X / np.linalg.norm(X, axis=1, keepdims=True)
+    centroids_normalized = centroids / np.linalg.norm(centroids, axis=1, keepdims=True)
+
+    # Compute cosine similarity using dot product
+    cosine_similarities = np.sum(X_normalized * centroids_normalized[labels], axis=1)
+
+    # Convert cosine similarity to cosine distance
+    cosine_distances = 1 - cosine_similarities
+
+    # Return the average cosine distance
+    return np.mean(cosine_distances)
 
 
 def custom_faiss_clustering(X, k, max_iters=100):
@@ -84,15 +93,7 @@ def custom_faiss_clustering(X, k, max_iters=100):
 
 
 def custom_usearch_clustering(X, k, max_iters=100):
-    # index = Index(ndim=X.shape[1], metric="l2sq")
-    # index.add(None, X)
-    # clustering = index.cluster(min_count=k, max_count=k)
-    # query_ids = clustering.queries.flatten()
-    # cluster_ids = clustering.matches.keys.flatten()
-    # reordered_cluster_ids = cluster_ids[query_ids]
-    # return reordered_cluster_ids, X[query_ids]
-
-    assignments, distances, centroids = usearch.compiled.kmeans(X, k, max_iterations=max_iters)
+    assignments, distances, centroids = usearch.compiled.kmeans(X, k, max_iterations=max_iters, threads=192)
     return assignments, centroids
 
 
@@ -128,8 +129,17 @@ def main():
             assign_clusters_func = assign_clusters_usearch
         labels, centroids = kmeans(X, k, assign_clusters_func=assign_clusters_func)
 
+    print("centroids: ", centroids.shape, centroids)
+
     quality = evaluate_clustering(X, labels, centroids)
-    print(f"Clustering quality (average distance to centroids): {quality:.4f}")
+    quality_cosine = evaluate_clustering_cosine(X, labels, centroids)
+    print(f"Clustering quality (average distance to centroids): {quality:.4f}, cosine: {quality_cosine:.4f}")
+
+    # Let's compare it to some random uniform assignment
+    random_labels = np.random.randint(0, k, size=X.shape[0])
+    random_quality = evaluate_clustering(X, random_labels, centroids)
+    random_quality_cosine = evaluate_clustering_cosine(X, random_labels, centroids)
+    print(f"- while random assignment quality: {random_quality:.4f}, cosine: {random_quality_cosine:.4f}")
 
     cluster_sizes = np.unique(labels, return_counts=True)[1]
     cluster_sizes_mean = np.mean(cluster_sizes)
diff --git a/include/usearch/index_plugins.hpp b/include/usearch/index_plugins.hpp
index d1019e8e..dcea75c4 100644
--- a/include/usearch/index_plugins.hpp
+++ b/include/usearch/index_plugins.hpp
@@ -1945,7 +1945,7 @@ class matrix_slice_gt {
     explicit operator bool() const noexcept { return begin_; }
     std::size_t size() const noexcept { return count_; }
     std::size_t dimensions() const noexcept { return dimensions_; }
-    std::size_t stride() const noexcept { return stride_bytes_; }
+    std::size_t stride_bytes() const noexcept { return stride_bytes_; }
     scalar_t* data() const noexcept { return begin_; }
     scalar_t* at(std::size_t i) const noexcept {
         return reinterpret_cast<scalar_t*>(reinterpret_cast<byte_addressable_t*>(begin_) + i * stride_bytes_);
@@ -1982,10 +1982,10 @@ class exact_search_t {
         matrix_slice_gt<scalar_at const> dataset, matrix_slice_gt<scalar_at const> queries, //
         std::size_t wanted, metric_punned_t const& metric,                                  //
         executor_at&& executor = executor_at{}, progress_at&& progress = progress_at{}) {
-        return operator()(                                                                     //
-            metric,                                                                            //
-            reinterpret_cast<byte_t const*>(dataset.data()), dataset.size(), dataset.stride(), //
-            reinterpret_cast<byte_t const*>(queries.data()), queries.size(), queries.stride(), //
+        return operator()(                                                                           //
+            metric,                                                                                  //
+            reinterpret_cast<byte_t const*>(dataset.data()), dataset.size(), dataset.stride_bytes(), //
+            reinterpret_cast<byte_t const*>(queries.data()), queries.size(), queries.stride_bytes(), //
             wanted, executor, progress);
     }
 
@@ -2068,7 +2068,7 @@ struct kmeans_clustering_result_t {
     /// @brief The number of iterations the algorithm took to converge.
     std::size_t iterations{};
     /// @brief The number of points that changed clusters in the last iteration.
-    std::size_t last_iteration_point_shifts{};
+    std::size_t last_iteration_points_shifted{};
     /// @brief The inertia of the last iteration (sum of squared distances to centroids).
     f64_t last_iteration_inertia{};
     /// @brief The total elapsed runtime of the algorithm in seconds.
@@ -2109,25 +2109,25 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
     /// @brief Early-exit parameter - the maximum runtime allowed in seconds.
     f64_t runtime_limit_seconds{};
     /// @brief Early-exit parameter - the minimum number of points that must change clusters per iteration.
-    std::size_t min_point_shifts_per_iteration{};
+    std::size_t min_points_shifted_per_iteration{};
 
     template <typename scalar_at, typename executor_at = dummy_executor_t, typename progress_at = dummy_progress_t>
     kmeans_clustering_result_t operator()( //
         matrix_slice_gt<scalar_at const> points, matrix_slice_gt<scalar_at> centroids,
         span_gt<std::size_t> point_to_centroid_index, span_gt<distance_t> point_to_centroid_distance, //
         executor_at&& executor = executor_at{}, progress_at&& progress = progress_at{}) {
-        return operator()(                                                                  //
-            reinterpret_cast<byte_t const*>(points.data()), points.size(), points.stride(), //
-            reinterpret_cast<byte_t*>(centroids.data()), centroids.size(), centroids.stride(),
-            point_to_centroid_index.data(), //
+        return operator()(                                                                        //
+            reinterpret_cast<byte_t const*>(points.data()), points.size(), points.stride_bytes(), //
+            reinterpret_cast<byte_t*>(centroids.data()), centroids.size(), centroids.stride_bytes(),
+            point_to_centroid_index.data(), point_to_centroid_distance.data(), //
             scalar_kind<scalar_at>(), points.dimensions(), executor, progress);
     }
 
     template <typename executor_at = dummy_executor_t, typename progress_at = dummy_progress_t>
-    kmeans_clustering_result_t operator()(                                                 //
-        byte_t const* points_data, std::size_t points_count, std::size_t points_stride,    //
-        byte_t* centroids_data, std::size_t wanted_clusters, std::size_t centroids_stride, //
-        std::size_t* point_to_centroid_index, distance_t* point_to_centroid_distance,      //
+    kmeans_clustering_result_t operator()(                                                       //
+        byte_t const* points_data, std::size_t points_count, std::size_t points_stride_bytes,    //
+        byte_t* centroids_data, std::size_t wanted_clusters, std::size_t centroids_stride_bytes, //
+        std::size_t* point_to_centroid_index, distance_t* point_to_centroid_distance,            //
         scalar_kind_t original_scalar_kind, std::size_t dimensions, executor_at&& executor = executor_at{},
         progress_at&& progress = progress_at{}) {
 
@@ -2160,10 +2160,10 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
             divide_round_up<CHAR_BIT>(dimensions * bits_per_scalar(original_scalar_kind));
         std::size_t const bytes_per_vector_quantized = metric.bytes_per_vector();
         std::size_t const stride_per_vector_quantized = divide_round_up<64>(bytes_per_vector_quantized) * 64;
-        buffer_gt<byte_t, aligned_allocator_gt<byte_t, 64>> points_quantized_buffer(points_count *
-                                                                                    stride_per_vector_quantized);
-        buffer_gt<byte_t, aligned_allocator_gt<byte_t, 64>> centroids_quantized_buffer(wanted_clusters *
-                                                                                       stride_per_vector_quantized);
+        buffer_gt<byte_t, aligned_allocator_gt<byte_t, 64>> points_quantized_buffer( //
+            points_count * stride_per_vector_quantized);
+        buffer_gt<byte_t, aligned_allocator_gt<byte_t, 64>> centroids_quantized_buffer( //
+            wanted_clusters * stride_per_vector_quantized);
 
         // When aggregating centroids, we want to parallelize the operation and need more memory.
         // For every thread we keep two double-precision vectors. One is the up-casting output buffer for quantized
@@ -2175,12 +2175,15 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
         //      - thread 2: [centroid 0, centroid 1, centroid 2, centroid 3, ...]
         //
         std::size_t const thread_count = executor.size();
-        buffer_gt<f64_t, aligned_allocator_gt<f64_t, 64>> centroids_precise_buffer(wanted_clusters * dimensions *
-                                                                                   thread_count);
-        buffer_gt<f64_t, aligned_allocator_gt<f64_t, 64>> points_precise_buffer(wanted_clusters * dimensions *
-                                                                                thread_count);
+        buffer_gt<f64_t, aligned_allocator_gt<f64_t, 64>> centroids_precise_buffer( //
+            wanted_clusters * dimensions * thread_count);
+        buffer_gt<f64_t, aligned_allocator_gt<f64_t, 64>> points_precise_buffer( //
+            wanted_clusters * dimensions * thread_count);
+
+        // Check if all memory allocations were successful.
         if (!centroids_precise_buffer || !points_precise_buffer || !centroids_quantized_buffer ||
-            !point_to_centroid_index_buffer || !point_to_centroid_distance_buffer || !points_quantized_buffer)
+            !point_to_centroid_index_buffer || !cluster_sizes_buffer || !point_to_centroid_distance_buffer ||
+            !points_quantized_buffer)
             return result.failed("No memory for result outputs!");
 
         std::fill_n(point_to_centroid_index_buffer.data(), points_count, wanted_clusters);
@@ -2193,7 +2196,7 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
         cast_punned_t const& compress_precise = casts.from.f64;
         cast_punned_t const& decompress_precise = casts.to.f64;
         for (std::size_t i = 0; i < points_count; i++) {
-            byte_t const* vector = points_data + i * points_stride;
+            byte_t const* vector = points_data + i * points_stride_bytes;
             byte_t* quantized = points_quantized_buffer.data() + i * stride_per_vector_quantized;
             if (!compress_points(vector, dimensions, quantized))
                 std::memcpy(quantized, vector, bytes_per_vector_original);
@@ -2232,8 +2235,8 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
         while (iterations < max_iterations) {
             iterations++;
 
-            // For every point in the points, find the closest centroid.
-            std::atomic<std::size_t> point_shifts = 0;
+            // For every point, find the closest centroid.
+            std::atomic<std::size_t> points_shifted = 0;
             std::atomic<distance_t> total_inertia = 0;
             executor.dynamic(points_count, [&](std::size_t thread_idx, std::size_t points_idx) {
                 byte_t const* quantized_point =
@@ -2254,7 +2257,7 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
                 std::size_t& closest_idx_ref = point_to_centroid_index_buffer[points_idx];
                 if (closest_idx_local != closest_idx_ref) {
                     closest_idx_ref = closest_idx_local;
-                    point_shifts.fetch_add(1, std::memory_order_relaxed);
+                    points_shifted.fetch_add(1, std::memory_order_relaxed);
                 }
 
                 closest_distance_ref = closest_distance_local;
@@ -2262,24 +2265,33 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
                 return true;
             });
 
-            // Check for early-exit conditions
+            auto current_time = std::chrono::high_resolution_clock::now();
+            std::chrono::duration<f64_t> elapsed_time = current_time - start_time;
+            result.runtime_seconds = elapsed_time.count();
             result.last_iteration_inertia = total_inertia.load(std::memory_order_relaxed);
+            result.last_iteration_points_shifted = points_shifted.load(std::memory_order_relaxed);
+
+            // Check for early-exit conditions
             if (final_inertia_threshold != 0)
                 if (result.last_iteration_inertia <= final_inertia_threshold)
                     break;
-
-            result.last_iteration_point_shifts = point_shifts.load(std::memory_order_relaxed);
-            if (min_point_shifts_per_iteration != 0)
-                if (result.last_iteration_point_shifts <= min_point_shifts_per_iteration)
+            if (min_points_shifted_per_iteration != 0 || result.last_iteration_points_shifted == 0)
+                if (result.last_iteration_points_shifted <= min_points_shifted_per_iteration)
                     break;
-
-            auto current_time = std::chrono::high_resolution_clock::now();
-            std::chrono::duration<f64_t> elapsed_time = current_time - start_time;
-            result.runtime_seconds = elapsed_time.count();
             if (runtime_limit_seconds != 0)
                 if (result.runtime_seconds >= runtime_limit_seconds)
                     break;
 
+            printf("iteration: %d\n", (int)iterations);
+            printf("- points_shifted: %d\n", (int)points_shifted.load());
+            printf("- duration: %f secs\n", (float)result.runtime_seconds);
+
+            double aggregate_distance = 0.0;
+            for (std::size_t i = 0; i < points_count; i++)
+                aggregate_distance += point_to_centroid_distance_buffer[i];
+            printf("- aggregate_distance: %f\n", (float)aggregate_distance);
+            printf("- average distance: %f\n", (float)aggregate_distance / points_count);
+
             // For every centroid, recalculate the mean of all points assigned to it.
             // That part is problematic to parallelize on many-core-systems, because of the contention.
             // Alternatively, a tree-like approach can be used, where every core accumulates it's own partial sums.
@@ -2327,6 +2339,8 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
                     if (cluster_size > 0)
                         for (std::size_t i = 0; i < dimensions; i++)
                             centroid_precise_aggregated[i] /= static_cast<f64_t>(cluster_size);
+                    else
+                        printf("!!!! Empty cluster %d\n", (int)centroid_idx);
 
                 } else if (metric_kind == metric_kind_t::cos_k) {
                     // Normalize for Cosine distance
@@ -2340,12 +2354,21 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
                 }
 
                 // Quantize the centroid after normalization for further iterations
-                byte_t* quantized_centroid =
+                byte_t* centroid_quantized =
                     centroids_quantized_buffer.data() + centroid_idx * stride_per_vector_quantized;
                 if (!compress_precise(reinterpret_cast<byte_t*>(centroid_precise_aggregated), dimensions,
-                                      quantized_centroid))
-                    std::memcpy(quantized_centroid, reinterpret_cast<byte_t*>(centroid_precise_aggregated),
+                                      centroid_quantized))
+                    std::memcpy(centroid_quantized, reinterpret_cast<byte_t*>(centroid_precise_aggregated),
                                 bytes_per_vector_quantized);
+
+                // Let's print the centroids for debugging purposes.
+                printf("- centroid %d: ", (int)centroid_idx);
+                for (std::size_t j = 0; j < dimensions; j++) {
+                    if (j > 0)
+                        printf(", ");
+                    printf("%.2f", (float)centroid_precise_aggregated[j]);
+                }
+                printf("\n");
             }
         }
 
@@ -2354,6 +2377,10 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
         result.computed_distances = points_count * wanted_clusters * iterations;
         result.aggregate_distance =
             std::accumulate(point_to_centroid_distance_buffer.begin(), point_to_centroid_distance_buffer.end(), 0.0);
+        printf("iterations: %d\n", (int)result.iterations);
+        printf("aggregate_distance: %f\n", (float)result.aggregate_distance);
+        printf("average distance: %f\n", (float)result.aggregate_distance / points_count);
+        printf("centroids stride is %d\n", (int)centroids_stride_bytes);
 
         // We've finished all the iterations, now we can export the centroids back to the original precision.
         std::memcpy(point_to_centroid_index, point_to_centroid_index_buffer.data(), points_count * sizeof(std::size_t));
@@ -2361,7 +2388,7 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
                     points_count * sizeof(distance_t));
         for (std::size_t i = 0; i < wanted_clusters; i++) {
             byte_t const* quantized_centroid = centroids_quantized_buffer.data() + i * stride_per_vector_quantized;
-            byte_t* centroid = centroids_data + i * centroids_stride;
+            byte_t* centroid = centroids_data + i * centroids_stride_bytes;
             if (!decompress_points(quantized_centroid, dimensions, centroid))
                 std::memcpy(centroid, quantized_centroid, bytes_per_vector_quantized);
         }
diff --git a/python/lib.cpp b/python/lib.cpp
index f072fcba..e8774ce9 100644
--- a/python/lib.cpp
+++ b/python/lib.cpp
@@ -567,6 +567,10 @@ static py::tuple cluster_many_brute_force( //
     std::size_t dataset_stride = static_cast<std::size_t>(dataset_info.strides[0]);
     scalar_kind_t dataset_kind = numpy_string_to_kind(dataset_info.format);
     std::size_t bytes_per_scalar = bits_per_scalar_word(dataset_kind) / CHAR_BIT;
+    printf("dataset_stride: %d\n", (int)dataset_stride);
+    printf("dataset_dimensions: %d\n", (int)dataset_dimensions);
+    printf("dataset_kind: %d\n", (int)dataset_kind);
+    printf("bytes_per_scalar: %d\n", (int)bytes_per_scalar);
 
     std::vector<std::size_t> point_to_centroid_index(dataset_count, 0);
     std::vector<distance_t> point_to_centroid_distance(dataset_count, 0);
@@ -580,12 +584,12 @@ static py::tuple cluster_many_brute_force( //
     executor_default_t executor{threads};
     kmeans_clustering_t engine;
     engine.metric_kind = metric_kind;
-    engine.quantization_kind = scalar_kind_t::bf16_k;
+    engine.quantization_kind = scalar_kind_t::f32_k;
     engine.max_iterations = max_iterations;
 
     kmeans_clustering_result_t result = engine(                                           //
         reinterpret_cast<byte_t const*>(dataset_info.ptr), dataset_count, dataset_stride, //
-        centroids.data(), wanted, dataset_dimensions,                                     //
+        centroids.data(), wanted, dataset_dimensions * bytes_per_scalar,                  //
         point_to_centroid_index.data(), point_to_centroid_distance.data(), dataset_kind, dataset_dimensions, executor,
         [&](std::size_t passed, std::size_t total) { return PyErr_CheckSignals() == 0 && progress(passed, total); });
 

From 9d53d18e35e31197e945b1211f2302a9941c854d Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Sep 2024 17:02:58 +0000
Subject: [PATCH 10/32] Improve: Cleanup clustering

---
 cluster.py                        | 22 +++++++++++++++-------
 include/usearch/index_plugins.hpp | 21 ---------------------
 python/lib.cpp                    |  8 +++-----
 simsimd                           |  2 +-
 4 files changed, 19 insertions(+), 34 deletions(-)

diff --git a/cluster.py b/cluster.py
index 6f1362a8..8652167c 100644
--- a/cluster.py
+++ b/cluster.py
@@ -84,6 +84,9 @@ def evaluate_clustering_cosine(X, labels, centroids):
 
 
 def custom_faiss_clustering(X, k, max_iters=100):
+    # Docs: https://github.com/facebookresearch/faiss/wiki/Faiss-building-blocks:-clustering,-PCA,-quantization
+    # Header: https://github.com/facebookresearch/faiss/blob/main/faiss/Clustering.h
+    # Source: https://github.com/facebookresearch/faiss/blob/main/faiss/Clustering.cpp
     verbose = False
     d: int = X.shape[1]
     kmeans = faiss.Kmeans(d, k, niter=max_iters, verbose=verbose)
@@ -92,8 +95,13 @@ def custom_faiss_clustering(X, k, max_iters=100):
     return I.flatten(), kmeans.centroids
 
 
-def custom_usearch_clustering(X, k, max_iters=100):
-    assignments, distances, centroids = usearch.compiled.kmeans(X, k, max_iterations=max_iters, threads=192)
+def custom_usearch_clustering(X, k, max_iters=100, dtype="bf16"):
+    assignments, distances, centroids = usearch.compiled.kmeans(
+        X,
+        k,
+        max_iterations=max_iters,
+        # dtype=dtype,
+    )
     return assignments, centroids
 
 
@@ -101,6 +109,7 @@ def main():
     parser = argparse.ArgumentParser(description="Compare KMeans clustering algorithms")
     parser.add_argument("--vectors", type=str, required=True, help="Path to binary matrix file")
     parser.add_argument("-k", type=int, required=True, help="Number of centroids")
+    parser.add_argument("-i", type=int, help="Upper bound on number of iterations")
     parser.add_argument("-n", type=int, help="Upper bound on number of points to use")
     parser.add_argument(
         "--method",
@@ -112,14 +121,15 @@ def main():
 
     args = parser.parse_args()
 
+    max_iters = args.i
     X = load_matrix(args.vectors, count_rows=args.n)
     k = args.k
     method = args.method
 
     if method == "custom_usearch":
-        labels, centroids = custom_usearch_clustering(X, k)
+        labels, centroids = custom_usearch_clustering(X, k, max_iters=max_iters)
     elif method == "custom_faiss":
-        labels, centroids = custom_faiss_clustering(X, k)
+        labels, centroids = custom_faiss_clustering(X, k, max_iters=max_iters)
     else:
         if method == "numpy":
             assign_clusters_func = assign_clusters_numpy
@@ -129,8 +139,6 @@ def main():
             assign_clusters_func = assign_clusters_usearch
         labels, centroids = kmeans(X, k, assign_clusters_func=assign_clusters_func)
 
-    print("centroids: ", centroids.shape, centroids)
-
     quality = evaluate_clustering(X, labels, centroids)
     quality_cosine = evaluate_clustering_cosine(X, labels, centroids)
     print(f"Clustering quality (average distance to centroids): {quality:.4f}, cosine: {quality_cosine:.4f}")
@@ -139,7 +147,7 @@ def main():
     random_labels = np.random.randint(0, k, size=X.shape[0])
     random_quality = evaluate_clustering(X, random_labels, centroids)
     random_quality_cosine = evaluate_clustering_cosine(X, random_labels, centroids)
-    print(f"- while random assignment quality: {random_quality:.4f}, cosine: {random_quality_cosine:.4f}")
+    print(f"... while random assignment quality: {random_quality:.4f}, cosine: {random_quality_cosine:.4f}")
 
     cluster_sizes = np.unique(labels, return_counts=True)[1]
     cluster_sizes_mean = np.mean(cluster_sizes)
diff --git a/include/usearch/index_plugins.hpp b/include/usearch/index_plugins.hpp
index dcea75c4..fa6724ae 100644
--- a/include/usearch/index_plugins.hpp
+++ b/include/usearch/index_plugins.hpp
@@ -2282,15 +2282,9 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
                 if (result.runtime_seconds >= runtime_limit_seconds)
                     break;
 
-            printf("iteration: %d\n", (int)iterations);
-            printf("- points_shifted: %d\n", (int)points_shifted.load());
-            printf("- duration: %f secs\n", (float)result.runtime_seconds);
-
             double aggregate_distance = 0.0;
             for (std::size_t i = 0; i < points_count; i++)
                 aggregate_distance += point_to_centroid_distance_buffer[i];
-            printf("- aggregate_distance: %f\n", (float)aggregate_distance);
-            printf("- average distance: %f\n", (float)aggregate_distance / points_count);
 
             // For every centroid, recalculate the mean of all points assigned to it.
             // That part is problematic to parallelize on many-core-systems, because of the contention.
@@ -2339,8 +2333,6 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
                     if (cluster_size > 0)
                         for (std::size_t i = 0; i < dimensions; i++)
                             centroid_precise_aggregated[i] /= static_cast<f64_t>(cluster_size);
-                    else
-                        printf("!!!! Empty cluster %d\n", (int)centroid_idx);
 
                 } else if (metric_kind == metric_kind_t::cos_k) {
                     // Normalize for Cosine distance
@@ -2360,15 +2352,6 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
                                       centroid_quantized))
                     std::memcpy(centroid_quantized, reinterpret_cast<byte_t*>(centroid_precise_aggregated),
                                 bytes_per_vector_quantized);
-
-                // Let's print the centroids for debugging purposes.
-                printf("- centroid %d: ", (int)centroid_idx);
-                for (std::size_t j = 0; j < dimensions; j++) {
-                    if (j > 0)
-                        printf(", ");
-                    printf("%.2f", (float)centroid_precise_aggregated[j]);
-                }
-                printf("\n");
             }
         }
 
@@ -2377,10 +2360,6 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
         result.computed_distances = points_count * wanted_clusters * iterations;
         result.aggregate_distance =
             std::accumulate(point_to_centroid_distance_buffer.begin(), point_to_centroid_distance_buffer.end(), 0.0);
-        printf("iterations: %d\n", (int)result.iterations);
-        printf("aggregate_distance: %f\n", (float)result.aggregate_distance);
-        printf("average distance: %f\n", (float)result.aggregate_distance / points_count);
-        printf("centroids stride is %d\n", (int)centroids_stride_bytes);
 
         // We've finished all the iterations, now we can export the centroids back to the original precision.
         std::memcpy(point_to_centroid_index, point_to_centroid_index_buffer.data(), points_count * sizeof(std::size_t));
diff --git a/python/lib.cpp b/python/lib.cpp
index e8774ce9..2275b09e 100644
--- a/python/lib.cpp
+++ b/python/lib.cpp
@@ -554,6 +554,7 @@ static py::tuple cluster_many_brute_force( //
     std::size_t wanted,                    //
     std::size_t max_iterations,            //
     std::size_t threads,                   //
+    scalar_kind_t scalar_kind,             //
     metric_kind_t metric_kind,             //
     progress_func_t const& progress_func) {
 
@@ -567,10 +568,6 @@ static py::tuple cluster_many_brute_force( //
     std::size_t dataset_stride = static_cast<std::size_t>(dataset_info.strides[0]);
     scalar_kind_t dataset_kind = numpy_string_to_kind(dataset_info.format);
     std::size_t bytes_per_scalar = bits_per_scalar_word(dataset_kind) / CHAR_BIT;
-    printf("dataset_stride: %d\n", (int)dataset_stride);
-    printf("dataset_dimensions: %d\n", (int)dataset_dimensions);
-    printf("dataset_kind: %d\n", (int)dataset_kind);
-    printf("bytes_per_scalar: %d\n", (int)bytes_per_scalar);
 
     std::vector<std::size_t> point_to_centroid_index(dataset_count, 0);
     std::vector<distance_t> point_to_centroid_distance(dataset_count, 0);
@@ -584,7 +581,7 @@ static py::tuple cluster_many_brute_force( //
     executor_default_t executor{threads};
     kmeans_clustering_t engine;
     engine.metric_kind = metric_kind;
-    engine.quantization_kind = scalar_kind_t::f32_k;
+    engine.quantization_kind = scalar_kind;
     engine.max_iterations = max_iterations;
 
     kmeans_clustering_result_t result = engine(                                           //
@@ -1020,6 +1017,7 @@ PYBIND11_MODULE(compiled, m) {
           py::kw_only(),                                  //
           py::arg("max_iterations") = 100,                //
           py::arg("threads") = 0,                         //
+          py::arg("dtype") = scalar_kind_t::bf16_k,       //
           py::arg("metric_kind") = metric_kind_t::l2sq_k, //
           py::arg("progress") = nullptr                   //
     );
diff --git a/simsimd b/simsimd
index 96891ce9..f092facc 160000
--- a/simsimd
+++ b/simsimd
@@ -1 +1 @@
-Subproject commit 96891ce929e3ec6b3600f12c82c383cf1d076548
+Subproject commit f092faccdf2c058d29e7cd1d0108c25b6d951746

From e448ce365df50556aeb53baa83c49d3bcb248886 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Sep 2024 20:40:06 +0000
Subject: [PATCH 11/32] Fix: Type-casting

---
 cluster.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/cluster.py b/cluster.py
index 8652167c..88e2e41a 100644
--- a/cluster.py
+++ b/cluster.py
@@ -95,12 +95,15 @@ def custom_faiss_clustering(X, k, max_iters=100):
     return I.flatten(), kmeans.centroids
 
 
-def custom_usearch_clustering(X, k, max_iters=100, dtype="bf16"):
+def custom_usearch_clustering(X, k, max_iters=100, metric="l2sq", dtype="bf16"):
+    metric = usearch.index._normalize_metric(metric)
+    dtype = usearch.index._normalize_dtype(dtype, ndim=X.shape[1], metric=metric)
     assignments, distances, centroids = usearch.compiled.kmeans(
         X,
         k,
         max_iterations=max_iters,
-        # dtype=dtype,
+        metric_kind=metric,
+        dtype=dtype,
     )
     return assignments, centroids
 

From 1a6753bf90e75216d7132b60cd4f0b908e7f5c83 Mon Sep 17 00:00:00 2001
From: CCnut <C.C.nut@outlook.com>
Date: Sun, 29 Sep 2024 15:52:00 +0800
Subject: [PATCH 12/32] Make: Rust CI build and test

---
 .github/workflows/prerelease.yml | 45 +++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml
index bee4beac..64f2705f 100644
--- a/.github/workflows/prerelease.yml
+++ b/.github/workflows/prerelease.yml
@@ -101,11 +101,15 @@ jobs:
         run: npm test
 
         # Rust
+      - name: Set up Rust
+        run: |
+          rustup update stable
+          rustup default stable
+          rustc -vV
+      - name: Build Rust
+        run: cargo build
       - name: Test Rust
-        uses: actions-rs/toolchain@v1
-        with:
-          toolchain: stable
-          override: true
+        run: cargo test
 
         # Java
       - name: Setup Java
@@ -193,6 +197,17 @@ jobs:
       - name: Test Python
         run: pytest
 
+        # Rust
+      - name: Set up Rust
+        run: |
+          rustup update stable
+          rustup default stable
+          rustc -vV
+      - name: Build Rust
+        run: cargo build
+      - name: Test Rust
+        run: cargo test
+
         # C#
       - name: Setup .NET ${{ env.DOTNET_VERSION }}
         uses: actions/setup-dotnet@v3
@@ -260,6 +275,17 @@ jobs:
       - name: Test ObjC/Swift
         run: swift test
 
+        # Rust
+      - name: Set up Rust
+        run: |
+          rustup update stable
+          rustup default stable
+          rustc -vV
+      - name: Build Rust
+        run: cargo build
+      - name: Test Rust
+        run: cargo test
+
         # C#
       - name: Setup .NET ${{ env.DOTNET_VERSION }}
         uses: actions/setup-dotnet@v3
@@ -318,6 +344,17 @@ jobs:
       - name: Test JavaScript
         run: npm test
 
+        # Rust
+      - name: Set up Rust
+        run: |
+          rustup update stable
+          rustup default stable
+          rustc -vV
+      - name: Build Rust
+        run: cargo build
+      - name: Test Rust
+        run: cargo test
+
         # C#
       - name: Setup .NET ${{ env.DOTNET_VERSION }}
         uses: actions/setup-dotnet@v3

From f1c158f701431a368db2f1819e301eebac25267b Mon Sep 17 00:00:00 2001
From: CCnut <C.C.nut@outlook.com>
Date: Sun, 29 Sep 2024 23:19:21 +0800
Subject: [PATCH 13/32] Make: Android CI build and test

---
 .github/workflows/prerelease.yml | 64 ++++++++++++++++++++++++
 build.rs                         | 85 ++++++++++++++++++--------------
 2 files changed, 111 insertions(+), 38 deletions(-)

diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml
index 64f2705f..3472977f 100644
--- a/.github/workflows/prerelease.yml
+++ b/.github/workflows/prerelease.yml
@@ -11,6 +11,8 @@ env:
   PYTHONUTF8: 1
   PYTHON_VERSION: 3.11
   DOTNET_VERSION: 7.0.x
+  ANDROID_NDK_VERSION: 26.3.11579264
+  ANDROID_SDK_VERSION: 21
 
 # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
 permissions:
@@ -458,3 +460,65 @@ jobs:
         run: |
           test -e build_artifacts/libusearch_c.so
           test -e build_artifacts/libusearch_sqlite.so
+
+  test_ubuntu_android_ndk:
+    name: Android NDK Build
+    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        include:
+          - processor: armv7a
+            abi: armeabi-v7a
+            target: armv7-linux-androideabi
+          - processor: aarch64
+            abi: arm64-v8a
+            target: aarch64-linux-android
+    steps:
+      - uses: actions/checkout@v4
+      - run: git submodule update --init --recursive
+
+      - name: Install NDK ndk
+        run: |
+          ${ANDROID_HOME}/cmdline-tools/latest/bin/sdkmanager --install "ndk;${{ env.ANDROID_NDK_VERSION }}"
+
+      - name: Build C/C++
+        run: |
+          cmake -B build_artifacts \
+            -D CMAKE_BUILD_TYPE=RelWithDebInfo \
+            -D CMAKE_EXPORT_COMPILE_COMMANDS=1 \
+            -D CMAKE_TOOLCHAIN_FILE=${ANDROID_HOME}/ndk/${{ env.ANDROID_NDK_VERSION }}/build/cmake/android.toolchain.cmake \
+            -D CMAKE_ANDROID_STL_TYPE=c++_static \
+            -D ANDROID_PLATFORM=${{ env.ANDROID_SDK_VERSION }} \
+            -D ANDROID_ABI=${{ matrix.abi }} \
+            -D USEARCH_BUILD_LIB_C=1 \
+            -D USEARCH_BUILD_TEST_CPP=0 \
+            -D USEARCH_BUILD_BENCH_CPP=0
+
+          cmake --build build_artifacts --config RelWithDebInfo
+
+        # We can't run the produced builds, but we can make sure they exist
+      - name: Test artifacts presense
+        run: |
+          test -e build_artifacts/libusearch_c.so
+
+        # Rust
+      - name: Set up Rust
+        run: |
+          rustup update stable
+          rustup default stable
+          rustup target add ${{ matrix.target }}
+          rustc -vV
+
+      - name: Set up Rust Env
+        run: |
+          TOOLCHAIN=${ANDROID_HOME}/ndk/${{ env.ANDROID_NDK_VERSION }}/toolchains/llvm/prebuilt/linux-x86_64/bin/
+          NDK_CLANG=$(find ${TOOLCHAIN} -name "${{ matrix.processor }}*${{ env.ANDROID_SDK_VERSION }}-clang")
+          echo "CC_${{ matrix.target }}=${NDK_CLANG}" >> ${GITHUB_ENV}
+          echo "CXX_${{ matrix.target }}=${NDK_CLANG}++" >> ${GITHUB_ENV}
+          echo "AR_${{ matrix.target }}=${TOOLCHAIN}/llvm-ar" >> ${GITHUB_ENV}
+          echo "CARGO_${{ matrix.target }}=${NDK_CLANG}" >> ${GITHUB_ENV}
+          echo "CARGO_${{ matrix.target }}=${TOOLCHAIN}/llvm-ar" >> ${GITHUB_ENV}
+
+      - name: Build Rust
+        run: |
+          cargo build --target ${{ matrix.target }}
diff --git a/build.rs b/build.rs
index da4611e4..7958fe38 100644
--- a/build.rs
+++ b/build.rs
@@ -25,26 +25,8 @@ fn main() {
 
     // Define all possible SIMD targets as 1
     let target_arch = std::env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default();
-    let flags_to_try = match target_arch.as_str() {
-        "arm" | "aarch64" => vec![
-            "SIMSIMD_TARGET_SVE_BF16",
-            "SIMSIMD_TARGET_SVE_F16",
-            "SIMSIMD_TARGET_SVE_I8",
-            "SIMSIMD_TARGET_SVE",
-            "SIMSIMD_TARGET_NEON_BF16",
-            "SIMSIMD_TARGET_NEON_F16",
-            "SIMSIMD_TARGET_NEON_I8",
-            "SIMSIMD_TARGET_NEON",
-        ],
-        _ => vec![
-            "SIMSIMD_TARGET_SAPPHIRE",
-            "SIMSIMD_TARGET_GENOA",
-            "SIMSIMD_TARGET_ICE",
-            "SIMSIMD_TARGET_SKYLAKE",
-            "SIMSIMD_TARGET_HASWELL",
-        ],
-    };
 
+    let mut flags_to_try;
     if cfg!(feature = "simsimd") {
         build
             .file("simsimd/c/lib.c")
@@ -52,23 +34,40 @@ fn main() {
             .define("SIMSIMD_DYNAMIC_DISPATCH", "1")
             .define("SIMSIMD_NATIVE_BF16", "0")
             .define("SIMSIMD_NATIVE_F16", "0");
-
-        for flag in &flags_to_try {
-            build.define(flag, "1");
-        }
+        flags_to_try = match target_arch.as_str() {
+            "arm" | "aarch64" => vec![
+                "SIMSIMD_TARGET_NEON",
+                "SIMSIMD_TARGET_NEON_I8",
+                "SIMSIMD_TARGET_NEON_F16",
+                "SIMSIMD_TARGET_NEON_BF16",
+                "SIMSIMD_TARGET_SVE",
+                "SIMSIMD_TARGET_SVE_I8",
+                "SIMSIMD_TARGET_SVE_F16",
+                "SIMSIMD_TARGET_SVE_BF16",
+            ],
+            _ => vec![
+                "SIMSIMD_TARGET_HASWELL",
+                "SIMSIMD_TARGET_SKYLAKE",
+                "SIMSIMD_TARGET_ICE",
+                "SIMSIMD_TARGET_GENOA",
+                "SIMSIMD_TARGET_SAPPHIRE",
+            ],
+        };
     } else {
         build.define("USEARCH_USE_SIMSIMD", "0");
+        flags_to_try = vec![];
     }
 
+    let target_os = std::env::var("CARGO_CFG_TARGET_OS").unwrap();
     // Conditional compilation depending on the target operating system.
-    if cfg!(target_os = "linux") {
+    if target_os == "linux" || target_os == "android" {
         build
             .flag_if_supported("-std=c++17")
             .flag_if_supported("-O3")
             .flag_if_supported("-ffast-math")
             .flag_if_supported("-fdiagnostics-color=always")
             .flag_if_supported("-g1"); // Simplify debugging
-    } else if cfg!(target_os = "macos") {
+    } else if target_os == "macos" {
         build
             .flag_if_supported("-mmacosx-version-min=10.15")
             .flag_if_supported("-std=c++17")
@@ -76,7 +75,7 @@ fn main() {
             .flag_if_supported("-ffast-math")
             .flag_if_supported("-fcolor-diagnostics")
             .flag_if_supported("-g1"); // Simplify debugging
-    } else if cfg!(target_os = "windows") {
+    } else if target_os == "windows" {
         build
             .flag_if_supported("/std:c++17")
             .flag_if_supported("/O2")
@@ -90,26 +89,36 @@ fn main() {
             .define("_ALLOW_POINTER_TO_CONST_MISMATCH", None);
     }
 
-    let mut result = build.try_compile("usearch");
-    if result.is_err() {
-        print!("cargo:warning=Failed to compile with all SIMD backends...");
-        for flag in flags_to_try {
-            build.define(flag, "0");
-            result = build.try_compile("usearch");
-            if result.is_err() {
+    let base_build = build.clone();
+
+    let mut pop_flag = None;
+    loop {
+        let mut sub_build = base_build.clone();
+        for flag in &flags_to_try {
+            sub_build.define(flag, "1");
+        }
+        let result = sub_build.try_compile("usearch");
+        if result.is_err() {
+            if let Some(flag) = pop_flag {
                 println!(
-                    "cargo:warning=Failed to compile after disabling {}, trying next configuration...",
+                    "cargo:warning=Failed to compile after disabling {:?}, trying next configuration...",
                     flag
                 );
             } else {
-                break;
+                if !flags_to_try.is_empty() {
+                    print!("cargo:warning=Failed to compile with all SIMD backends...");
+                }
+            }
+
+            pop_flag = flags_to_try.pop();
+            if pop_flag.is_none() {
+                result.unwrap();
             }
+        } else {
+            break;
         }
     }
 
-    // Ensure one build has been successful
-    result.unwrap();
-
     println!("cargo:rerun-if-changed=rust/lib.rs");
     println!("cargo:rerun-if-changed=rust/lib.cpp");
     println!("cargo:rerun-if-changed=rust/lib.hpp");

From 189bb0be4dd6081f631dd275e68aaf2c8a861632 Mon Sep 17 00:00:00 2001
From: T-T <C.C.nut@outlook.com>
Date: Mon, 30 Sep 2024 00:12:37 +0800
Subject: [PATCH 14/32] Fix: Android build

---
 include/usearch/index.hpp         | 3 +++
 include/usearch/index_plugins.hpp | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/include/usearch/index.hpp b/include/usearch/index.hpp
index f026455a..c7bcdac9 100644
--- a/include/usearch/index.hpp
+++ b/include/usearch/index.hpp
@@ -27,6 +27,9 @@
 #define USEARCH_DEFINED_APPLE
 #elif defined(__linux__)
 #define USEARCH_DEFINED_LINUX
+#if defined(__ANDROID_API__)
+#define USEARCH_DEFINED_ANDROID
+#endif
 #endif
 
 // Inferring the compiler: Clang vs GCC
diff --git a/include/usearch/index_plugins.hpp b/include/usearch/index_plugins.hpp
index 349ee360..607ebb80 100644
--- a/include/usearch/index_plugins.hpp
+++ b/include/usearch/index_plugins.hpp
@@ -796,9 +796,9 @@ class aligned_allocator_gt {
         std::size_t alignment = alignment_ak;
 #if defined(USEARCH_DEFINED_WINDOWS)
         return (pointer)_aligned_malloc(length_bytes, alignment);
-#elif defined(USEARCH_DEFINED_APPLE)
+#elif defined(USEARCH_DEFINED_APPLE) || defined(USEARCH_DEFINED_ANDROID)
         // Apple Clang keeps complaining that `aligned_alloc` is only available
-        // with macOS 10.15 and newer, so let's use `posix_memalign` there.
+        // with macOS 10.15 and newer or Android API >= 28, so let's use `posix_memalign` there.
         void* result = nullptr;
         int status = posix_memalign(&result, alignment, length_bytes);
         return status == 0 ? (pointer)result : nullptr;

From 7425cd718081eee25ca74329864adf2ec3476e7c Mon Sep 17 00:00:00 2001
From: Abe Tomoaki <abe@clear-code.com>
Date: Thu, 10 Oct 2024 18:33:22 +0900
Subject: [PATCH 15/32] Fix: JavaScript Change the return value of
 index.count() to a number

The return value of index.count() was a boolean, so it was changed to a number.
---
 javascript/lib.cpp         |  2 +-
 javascript/usearch.test.js | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/javascript/lib.cpp b/javascript/lib.cpp
index 38856b3f..07346828 100644
--- a/javascript/lib.cpp
+++ b/javascript/lib.cpp
@@ -268,7 +268,7 @@ Napi::Value CompiledIndex::Count(Napi::CallbackInfo const& ctx) {
     std::size_t length = keys.ElementLength();
     Napi::Array result = Napi::Array::New(env, length);
     for (std::size_t i = 0; i < length; ++i)
-        result[i] = Napi::Boolean::New(env, native_->count(static_cast<default_key_t>(keys[i])));
+        result[i] = Napi::Number::New(env, native_->count(static_cast<default_key_t>(keys[i])));
     return result;
 }
 
diff --git a/javascript/usearch.test.js b/javascript/usearch.test.js
index 34c41523..607ddcc6 100644
--- a/javascript/usearch.test.js
+++ b/javascript/usearch.test.js
@@ -60,7 +60,27 @@ test("Expected results", () => {
     assertAlmostEqual(results.distances[0], new Float32Array([0]));
 });
 
+test('Expected count()', async (t) => {
+    const index = new usearch.Index({
+        metric: 'l2sq',
+        connectivity: 16,
+        dimensions: 3,
+    });
+    index.add(
+        [42n, 43n],
+        [new Float32Array([0.2, 0.6, 0.4]), new Float32Array([0.2, 0.6, 0.4])]
+    );
 
+    await t.test('Argument is a number', () => {
+        assert.equal(1, index.count(43n));
+    });
+    await t.test('Argument is a number (does not exist)', () => {
+        assert.equal(0, index.count(44n));
+    });
+    await t.test('Argument is an array', () => {
+        assert.deepEqual([1, 1, 0], index.count([42n, 43n, 44n]));
+    });
+});
 
 test('Operations with invalid values', () => {
     const indexBatch = new usearch.Index(2, 'l2sq');

From d6fd1eba21979cdb0fccce1257e76a99db600189 Mon Sep 17 00:00:00 2001
From: Abe Tomoaki <abe@clear-code.com>
Date: Thu, 10 Oct 2024 22:59:38 +0900
Subject: [PATCH 16/32] Fix: Raise exceptions from `add()` in JS (#486)

For example, if you try to add the same key, it aborts.

```
terminate called after throwing an instance of 'std::runtime_error'
  what():  Duplicate keys not allowed in high-level wrappers
Aborted (core dumped)
```

Improved error handling to throw JavaScript exceptions.
---
 javascript/lib.cpp         |  7 ++++++-
 javascript/usearch.test.js | 18 ++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/javascript/lib.cpp b/javascript/lib.cpp
index 07346828..3f576533 100644
--- a/javascript/lib.cpp
+++ b/javascript/lib.cpp
@@ -20,6 +20,8 @@
 using namespace unum::usearch;
 using namespace unum;
 
+using add_result_t = typename index_dense_t::add_result_t;
+
 class CompiledIndex : public Napi::ObjectWrap<CompiledIndex> {
   public:
     static Napi::Object Init(Napi::Env env, Napi::Object exports);
@@ -161,7 +163,10 @@ void CompiledIndex::Add(Napi::CallbackInfo const& ctx) {
     auto run_parallel = [&](auto vectors) {
         executor_stl_t executor;
         executor.fixed(tasks, [&](std::size_t /*thread_idx*/, std::size_t task_idx) {
-            native_->add(static_cast<default_key_t>(keys[task_idx]), vectors + task_idx * native_->dimensions());
+            add_result_t result = native_->add(static_cast<default_key_t>(keys[task_idx]),
+                                               vectors + task_idx * native_->dimensions());
+            if (!result)
+                Napi::Error::New(ctx.Env(), result.error.release()).ThrowAsJavaScriptException();
         });
     };
 
diff --git a/javascript/usearch.test.js b/javascript/usearch.test.js
index 607ddcc6..80747763 100644
--- a/javascript/usearch.test.js
+++ b/javascript/usearch.test.js
@@ -104,3 +104,21 @@ test('Operations with invalid values', () => {
         }
     );
 });
+
+test('Invalid operations', async (t) => {
+    await t.test('Add the same keys', () => {
+        const index = new usearch.Index({
+            metric: "l2sq",
+            connectivity: 16,
+            dimensions: 3,
+        });
+        index.add(42n, new Float32Array([0.2, 0.6, 0.4]));
+        assert.throws(
+            () => index.add(42n, new Float32Array([0.2, 0.6, 0.4])),
+            {
+                name: 'Error',
+                message: 'Duplicate keys not allowed in high-level wrappers'
+            }
+        );
+    });
+});

From 16dec632900581b17c62540208ee0ea0b016f30d Mon Sep 17 00:00:00 2001
From: Abe Tomoaki <abe@clear-code.com>
Date: Fri, 11 Oct 2024 10:29:54 +0900
Subject: [PATCH 17/32] Fix: Reserve after deserialization in JS (#484)

I got an error when I loaded and searched with load() or view().

Code Example:

```js
// Saved with `index.save('index.usearch');` in another script.
index.load('index.usearch');
const results = index.search(new Float32Array([0.2, 0.6, 0.4]), 10);
```
---
 javascript/lib.cpp         |  4 +++
 javascript/usearch.test.js | 50 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)

diff --git a/javascript/lib.cpp b/javascript/lib.cpp
index 3f576533..03307a1a 100644
--- a/javascript/lib.cpp
+++ b/javascript/lib.cpp
@@ -131,6 +131,8 @@ void CompiledIndex::Load(Napi::CallbackInfo const& ctx) {
         auto result = native_->load(path.c_str());
         if (!result)
             Napi::TypeError::New(ctx.Env(), result.error.release()).ThrowAsJavaScriptException();
+        if (!native_->try_reserve(ceil2(native_->size())))
+            Napi::Error::New(ctx.Env(), "Failed to reserve memory").ThrowAsJavaScriptException();
 
     } catch (...) {
         Napi::TypeError::New(ctx.Env(), "Loading failed").ThrowAsJavaScriptException();
@@ -144,6 +146,8 @@ void CompiledIndex::View(Napi::CallbackInfo const& ctx) {
         auto result = native_->view(path.c_str());
         if (!result)
             Napi::TypeError::New(ctx.Env(), result.error.release()).ThrowAsJavaScriptException();
+        if (!native_->try_reserve(ceil2(native_->size())))
+            Napi::Error::New(ctx.Env(), "Failed to reserve memory").ThrowAsJavaScriptException();
 
     } catch (...) {
         Napi::TypeError::New(ctx.Env(), "Memory-mapping failed").ThrowAsJavaScriptException();
diff --git a/javascript/usearch.test.js b/javascript/usearch.test.js
index 80747763..952cd926 100644
--- a/javascript/usearch.test.js
+++ b/javascript/usearch.test.js
@@ -1,5 +1,8 @@
 const test = require('node:test');
 const assert = require('node:assert');
+const fs = require('node:fs');
+const os = require('node:os');
+const path = require('node:path');
 const usearch = require('./dist/cjs/usearch.js');
 
 function assertAlmostEqual(actual, expected, tolerance = 1e-6) {
@@ -122,3 +125,50 @@ test('Invalid operations', async (t) => {
         );
     });
 });
+
+
+test('Serialization', async (t) => {
+    const indexPath = path.join(os.tmpdir(), 'usearch.test.index')
+
+    t.beforeEach(() => {
+        const index = new usearch.Index({
+            metric: "l2sq",
+            connectivity: 16,
+            dimensions: 3,
+        });
+        index.add(42n, new Float32Array([0.2, 0.6, 0.4]));
+        index.save(indexPath);
+    });
+
+    t.afterEach(() => {
+        fs.unlinkSync(indexPath);
+    });
+
+    await t.test('load', () => {
+        const index = new usearch.Index({
+            metric: "l2sq",
+            connectivity: 16,
+            dimensions: 3,
+        });
+        index.load(indexPath);
+        const results = index.search(new Float32Array([0.2, 0.6, 0.4]), 10);
+
+        assert.equal(index.size(), 1);
+        assert.deepEqual(results.keys, new BigUint64Array([42n]));
+        assertAlmostEqual(results.distances[0], new Float32Array([0]));
+    });
+
+    await t.test('view', () => {
+        const index = new usearch.Index({
+            metric: "l2sq",
+            connectivity: 16,
+            dimensions: 3,
+        });
+        index.view(indexPath);
+        const results = index.search(new Float32Array([0.2, 0.6, 0.4]), 10);
+
+        assert.equal(index.size(), 1);
+        assert.deepEqual(results.keys, new BigUint64Array([42n]));
+        assertAlmostEqual(results.distances[0], new Float32Array([0]));
+    });
+});

From 08c835db9baad6b8b72e0478c0013ef601a76b46 Mon Sep 17 00:00:00 2001
From: Abe Tomoaki <abe@clear-code.com>
Date: Mon, 14 Oct 2024 11:19:18 +0900
Subject: [PATCH 18/32] Fix: Skip JS `view()`  on Winodws (#504)

The test itself succeeds, but fails with the following error when deleting the index file created by save() in afterEach().

```
error: "EBUSY: resource busy or locked, unlink 'C:\\Users\\RUNNER~1\\AppData\\Local\\Temp\\usearch.test.index'"
```

Since it is only in Winodws that it fails, we will skip it on Winodws for now.
We will continue to investigate the solution.
---
 javascript/usearch.test.js | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/javascript/usearch.test.js b/javascript/usearch.test.js
index 952cd926..c21b80ae 100644
--- a/javascript/usearch.test.js
+++ b/javascript/usearch.test.js
@@ -158,7 +158,10 @@ test('Serialization', async (t) => {
         assertAlmostEqual(results.distances[0], new Float32Array([0]));
     });
 
-    await t.test('view', () => {
+    // todo: Skip as the test fails only on windows.
+    // The following error in afterEach().
+    // `error: "EBUSY: resource busy or locked, unlink`
+    await t.test('view', {skip: process.platform === 'win32'}, () => {
         const index = new usearch.Index({
             metric: "l2sq",
             connectivity: 16,

From c27c99d067e14faae87b56ea1ab106d225146523 Mon Sep 17 00:00:00 2001
From: Abe Tomoaki <abe@clear-code.com>
Date: Tue, 22 Oct 2024 22:52:04 +0900
Subject: [PATCH 19/32] Fix: Remove from a read-only index (#506)

The index read by `view()` is read-only.
When I did a `remove()` on that index, it crashed.

---------

Co-authored-by: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
---
 include/usearch/index_dense.hpp |   2 +
 javascript/lib.cpp              |   9 ++-
 javascript/usearch.test.js      | 122 ++++++++++++++++++++++++++------
 3 files changed, 108 insertions(+), 25 deletions(-)

diff --git a/include/usearch/index_dense.hpp b/include/usearch/index_dense.hpp
index 7c7b9b55..93b55877 100644
--- a/include/usearch/index_dense.hpp
+++ b/include/usearch/index_dense.hpp
@@ -1460,6 +1460,8 @@ class index_dense_gt {
     labeling_result_t remove(vector_key_t key) {
         usearch_assert_m(config().enable_key_lookups, "Key lookups are disabled");
         labeling_result_t result;
+        if (typed_->is_immutable())
+            return result.failed("Can't remove from an immutable index");
 
         unique_lock_t lookup_lock(slot_lookup_mutex_);
         auto matching_slots = slot_lookup_.equal_range(key_and_slot_t::any_slot(key));
diff --git a/javascript/lib.cpp b/javascript/lib.cpp
index 03307a1a..b259f0b0 100644
--- a/javascript/lib.cpp
+++ b/javascript/lib.cpp
@@ -254,11 +254,14 @@ Napi::Value CompiledIndex::Remove(Napi::CallbackInfo const& ctx) {
     Napi::Env env = ctx.Env();
     Napi::BigUint64Array keys = ctx[0].As<Napi::BigUint64Array>();
     std::size_t length = keys.ElementLength();
-    Napi::Array result = Napi::Array::New(env, length);
+    Napi::Array results = Napi::Array::New(env, length);
     for (std::size_t i = 0; i < length; ++i) {
-        result[i] = Napi::Number::New(env, native_->remove(static_cast<default_key_t>(keys[i])).completed);
+        auto result = native_->remove(static_cast<default_key_t>(keys[i]));
+        if (!result)
+            Napi::Error::New(ctx.Env(), result.error.release()).ThrowAsJavaScriptException();
+        results[i] = Napi::Number::New(env, result.completed);
     }
-    return result;
+    return results;
 }
 
 Napi::Value CompiledIndex::Contains(Napi::CallbackInfo const& ctx) {
diff --git a/javascript/usearch.test.js b/javascript/usearch.test.js
index c21b80ae..c04bc436 100644
--- a/javascript/usearch.test.js
+++ b/javascript/usearch.test.js
@@ -15,38 +15,84 @@ function assertAlmostEqual(actual, expected, tolerance = 1e-6) {
 }
 
 
-test('Single-entry operations', () => {
-    const index = new usearch.Index(2, 'l2sq');
+test('Single-entry operations', async (t) => {
+    await t.test('index info', () => {
+        const index = new usearch.Index(2, 'l2sq');
 
-    assert.equal(index.connectivity(), 16, 'connectivity should be 16');
-    assert.equal(index.dimensions(), 2, 'dimensions should be 2');
-    assert.equal(index.size(), 0, 'initial size should be 0');
+        assert.equal(index.connectivity(), 16, 'connectivity should be 16');
+        assert.equal(index.dimensions(), 2, 'dimensions should be 2');
+        assert.equal(index.size(), 0, 'initial size should be 0');
+    });
+
+    await t.test('add', () => {
+        const index = new usearch.Index(2, 'l2sq');
+
+        index.add(15n, new Float32Array([10, 20]));
+        index.add(16n, new Float32Array([10, 25]));
 
-    index.add(15n, new Float32Array([10, 20]));
-    index.add(16n, new Float32Array([10, 25]));
+        assert.equal(index.size(), 2, 'size after adding elements should be 2');
+        assert.equal(index.contains(15), true, 'entry must be present after insertion');
+
+        const results = index.search(new Float32Array([13, 14]), 2);
+
+        assert.deepEqual(results.keys, new BigUint64Array([15n, 16n]), 'keys should be 15 and 16');
+        assert.deepEqual(results.distances, new Float32Array([45, 130]), 'distances should be 45 and 130');
+    });
 
-    assert.equal(index.size(), 2, 'size after adding elements should be 2');
-    assert.equal(index.contains(15), true, 'entry must be present after insertion');
+    await t.test('remove', () => {
+        const index = new usearch.Index(2, 'l2sq');
 
-    const results = index.search(new Float32Array([13, 14]), 2);
+        index.add(15n, new Float32Array([10, 20]));
+        index.add(16n, new Float32Array([10, 25]));
+        index.add(25n, new Float32Array([20, 40]));
+        index.add(26n, new Float32Array([20, 45]));
 
-    assert.deepEqual(results.keys, new BigUint64Array([15n, 16n]), 'keys should be 15 and 16');
-    assert.deepEqual(results.distances, new Float32Array([45, 130]), 'distances should be 45 and 130');
+        assert.equal(index.remove(15n), 1);
+
+        assert.equal(index.size(), 3, 'size after remoing elements should be 3');
+        assert.equal(index.contains(15), false, 'entry must be absent after insertion');
+
+        const results = index.search(new Float32Array([13, 14]), 2);
+
+        assert.deepEqual(results.keys, new BigUint64Array([16n, 25n]), 'keys should not include 15');
+    });
 });
 
-test('Batch operations', () => {
-    const indexBatch = new usearch.Index(2, 'l2sq');
+test('Batch operations', async (t) => {
+    await t.test('add', () => {
+        const indexBatch = new usearch.Index(2, 'l2sq');
 
-    const keys = [15n, 16n];
-    const vectors = [new Float32Array([10, 20]), new Float32Array([10, 25])];
+        const keys = [15n, 16n];
+        const vectors = [new Float32Array([10, 20]), new Float32Array([10, 25])];
 
-    indexBatch.add(keys, vectors);
-    assert.equal(indexBatch.size(), 2, 'size after adding batch should be 2');
+        indexBatch.add(keys, vectors);
+        assert.equal(indexBatch.size(), 2, 'size after adding batch should be 2');
 
-    const results = indexBatch.search(new Float32Array([13, 14]), 2);
+        const results = indexBatch.search(new Float32Array([13, 14]), 2);
 
-    assert.deepEqual(results.keys, new BigUint64Array([15n, 16n]), 'keys should be 15 and 16');
-    assert.deepEqual(results.distances, new Float32Array([45, 130]), 'distances should be 45 and 130');
+        assert.deepEqual(results.keys, new BigUint64Array([15n, 16n]), 'keys should be 15 and 16');
+        assert.deepEqual(results.distances, new Float32Array([45, 130]), 'distances should be 45 and 130');
+    });
+
+    await t.test('remove', () => {
+        const indexBatch = new usearch.Index(2, 'l2sq');
+
+        const keys = [15n, 16n, 25n, 26n];
+        const vectors = [
+            new Float32Array([10, 20]),
+            new Float32Array([10, 25]),
+            new Float32Array([20, 40]),
+            new Float32Array([20, 45])
+        ];
+        indexBatch.add(keys, vectors);
+
+        assert.deepEqual(indexBatch.remove([15n, 25n]), [1, 1])
+        assert.equal(indexBatch.size(), 2, 'size after removing batch should be 2');
+
+        const results = indexBatch.search(new Float32Array([13, 14]), 2);
+
+        assert.deepEqual(results.keys, new BigUint64Array([16n, 26n]), 'keys should not include 15 and 25');
+    });
 });
 
 test("Expected results", () => {
@@ -161,7 +207,7 @@ test('Serialization', async (t) => {
     // todo: Skip as the test fails only on windows.
     // The following error in afterEach().
     // `error: "EBUSY: resource busy or locked, unlink`
-    await t.test('view', {skip: process.platform === 'win32'}, () => {
+    await t.test('view: Read data', {skip: process.platform === 'win32'}, () => {
         const index = new usearch.Index({
             metric: "l2sq",
             connectivity: 16,
@@ -174,4 +220,36 @@ test('Serialization', async (t) => {
         assert.deepEqual(results.keys, new BigUint64Array([42n]));
         assertAlmostEqual(results.distances[0], new Float32Array([0]));
     });
+
+    await t.test('view: Invalid operations: add', {skip: process.platform === 'win32'}, () => {
+        const index = new usearch.Index({
+            metric: "l2sq",
+            connectivity: 16,
+            dimensions: 3,
+        });
+        index.view(indexPath);
+        assert.throws(
+            () => index.add(43n, new Float32Array([0.2, 0.6, 0.4])),
+            {
+                name: 'Error',
+                message: "Can't add to an immutable index"
+            }
+        );
+    });
+
+    await t.test('view: Invalid operations: remove', {skip: process.platform === 'win32'}, () => {
+        const index = new usearch.Index({
+            metric: "l2sq",
+            connectivity: 16,
+            dimensions: 3,
+        });
+        index.view(indexPath);
+        assert.throws(
+            () => index.remove(42n),
+            {
+                name: 'Error',
+                message: "Can't remove to an immutable index"
+            }
+        );
+    });
 });

From 113a7862f80bf2eb347c559da8487c4be05a5cc4 Mon Sep 17 00:00:00 2001
From: Mikhail Bautin <552936+mbautin@users.noreply.github.com>
Date: Tue, 22 Oct 2024 06:53:13 -0700
Subject: [PATCH 20/32] Add: Metadata for observability (#508)

---------

Co-authored-by: Mikhail Bautin <mbautin@users.noreply.github.com>
---
 include/usearch/index.hpp       | 12 ++++++++++++
 include/usearch/index_dense.hpp |  3 +++
 2 files changed, 15 insertions(+)

diff --git a/include/usearch/index.hpp b/include/usearch/index.hpp
index ae10379b..13cd618d 100644
--- a/include/usearch/index.hpp
+++ b/include/usearch/index.hpp
@@ -3175,6 +3175,18 @@ class index_gt {
 
     std::size_t memory_usage_per_node(level_t level) const noexcept { return node_bytes_(level); }
 
+    double inverse_log_connectivity() const {
+        return pre_.inverse_log_connectivity;
+    }
+
+    std::size_t neighbors_base_bytes() const {
+        return pre_.neighbors_base_bytes;
+    }
+
+    std::size_t neighbors_bytes() const {
+        return pre_.neighbors_bytes;
+    }
+
 #if defined(USEARCH_USE_PRAGMA_REGION)
 #pragma endregion
 
diff --git a/include/usearch/index_dense.hpp b/include/usearch/index_dense.hpp
index 93b55877..3a5ab31f 100644
--- a/include/usearch/index_dense.hpp
+++ b/include/usearch/index_dense.hpp
@@ -693,6 +693,9 @@ class index_dense_gt {
     std::size_t max_level() const { return typed_->max_level(); }
     index_dense_config_t const& config() const { return config_; }
     index_limits_t const& limits() const { return typed_->limits(); }
+    double inverse_log_connectivity() const { return typed_->inverse_log_connectivity(); }
+    std::size_t neighbors_base_bytes() const { return typed_->neighbors_base_bytes(); }
+    std::size_t neighbors_bytes() const { return typed_->neighbors_bytes(); }
     bool multi() const { return config_.multi; }
     std::size_t currently_available_threads() const {
         std::unique_lock<std::mutex> available_threads_lock(available_threads_mutex_);

From aa2ddd7118215dcba3985103c19e8e89b8d0a8a9 Mon Sep 17 00:00:00 2001
From: Abe Tomoaki <abe@clear-code.com>
Date: Tue, 29 Oct 2024 04:26:05 +0900
Subject: [PATCH 21/32] Doc: Error message type (#509)

Related: GH-506
---
 javascript/usearch.test.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/javascript/usearch.test.js b/javascript/usearch.test.js
index c04bc436..5210a2bd 100644
--- a/javascript/usearch.test.js
+++ b/javascript/usearch.test.js
@@ -248,7 +248,7 @@ test('Serialization', async (t) => {
             () => index.remove(42n),
             {
                 name: 'Error',
-                message: "Can't remove to an immutable index"
+                message: "Can't remove from an immutable index"
             }
         );
     });

From 19a9d0bc813edb8d870a9b00bd992918294c5ab1 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 28 Oct 2024 19:31:38 +0000
Subject: [PATCH 22/32] Add: `kmeans` Python API

---
 include/usearch/index_plugins.hpp | 55 ++++++++++++------
 python/lib.cpp                    | 76 +++++++++++++++----------
 python/scripts/test_tooling.py    | 13 +++--
 python/usearch/index.py           | 94 +++++++++++++++++++++++++++++++
 4 files changed, 185 insertions(+), 53 deletions(-)

diff --git a/include/usearch/index_plugins.hpp b/include/usearch/index_plugins.hpp
index 7e78bb46..36e4b197 100644
--- a/include/usearch/index_plugins.hpp
+++ b/include/usearch/index_plugins.hpp
@@ -2168,14 +2168,30 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
     metric_kind_t metric_kind{metric_kind_t::l2sq_k};
     scalar_kind_t quantization_kind{scalar_kind_t::bf16_k};
 
+    static constexpr std::size_t max_iterations_default_k = 300;
+    static constexpr f64_t inertia_threshold_default_k = 1e-4;
+    static constexpr f64_t max_seconds_default_k = 60.0;
+    static constexpr f64_t min_shifts_default_k = 0.01;
+
     /// @brief Early-exit parameter - the maximum number of iterations to perform.
-    std::size_t max_iterations{};
+    std::size_t max_iterations{max_iterations_default_k};
     /// @brief Early-exit parameter - the threshold for the final inertia to terminate early.
-    f64_t final_inertia_threshold{};
+    f64_t inertia_threshold{inertia_threshold_default_k};
     /// @brief Early-exit parameter - the maximum runtime allowed in seconds.
-    f64_t runtime_limit_seconds{};
-    /// @brief Early-exit parameter - the minimum number of points that must change clusters per iteration.
-    std::size_t min_points_shifted_per_iteration{};
+    f64_t max_seconds{max_seconds_default_k};
+    /// @brief Early-exit parameter - the minimum share of points that must change clusters per iteration.
+    f64_t min_shifts{min_shifts_default_k};
+    /// @brief The random seed to use for centroid initialization.
+    std::uint64_t seed{0};
+
+    kmeans_clustering_gt(std::uint64_t seed) noexcept : seed(seed) {}
+    kmeans_clustering_gt() noexcept(false) {
+        std::random_device random_device;
+        seed = random_device();
+    }
+
+    kmeans_clustering_gt(kmeans_clustering_gt const&) = default;
+    kmeans_clustering_gt& operator=(kmeans_clustering_gt const&) = default;
 
     template <typename scalar_at, typename executor_at = dummy_executor_t, typename progress_at = dummy_progress_t>
     kmeans_clustering_result_t operator()( //
@@ -2269,8 +2285,8 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
         }
 
         // Initialize centroids with random points vectors.
-        std::random_device random_device;
-        std::mt19937 random_engine(random_device());
+        std::mt19937_64 random_engine;
+        random_engine.seed(seed);
         for (std::size_t i = 0; i < wanted_clusters; i++) {
             // Generate the random index of the points vector,
             // that is unique and not already used as a centroid.
@@ -2298,12 +2314,14 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
 
         auto start_time = std::chrono::high_resolution_clock::now();
         std::size_t iterations = 0;
+        std::size_t const min_points_shifted_per_iteration = static_cast<std::size_t>(min_shifts * points_count);
+        f64_t last_aggregate_distance = std::numeric_limits<f64_t>::max();
+
         while (iterations < max_iterations) {
             iterations++;
 
             // For every point, find the closest centroid.
             std::atomic<std::size_t> points_shifted = 0;
-            std::atomic<distance_t> total_inertia = 0;
             executor.dynamic(points_count, [&](std::size_t thread_idx, std::size_t points_idx) {
                 byte_t const* quantized_point =
                     points_quantized_buffer.data() + points_idx * stride_per_vector_quantized;
@@ -2327,31 +2345,32 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
                 }
 
                 closest_distance_ref = closest_distance_local;
-                // total_inertia.add(closest_distance_local * closest_distance_local, std::memory_order_relaxed);
                 return true;
             });
 
+            f64_t aggregate_distance = 0.0;
+            for (std::size_t i = 0; i < points_count; i++)
+                aggregate_distance += point_to_centroid_distance_buffer[i];
+            f64_t aggregate_distance_change =
+                std::abs(aggregate_distance - last_aggregate_distance) / last_aggregate_distance;
+
             auto current_time = std::chrono::high_resolution_clock::now();
             std::chrono::duration<f64_t> elapsed_time = current_time - start_time;
             result.runtime_seconds = elapsed_time.count();
-            result.last_iteration_inertia = total_inertia.load(std::memory_order_relaxed);
+            result.last_iteration_inertia = aggregate_distance_change;
             result.last_iteration_points_shifted = points_shifted.load(std::memory_order_relaxed);
 
             // Check for early-exit conditions
-            if (final_inertia_threshold != 0)
-                if (result.last_iteration_inertia <= final_inertia_threshold)
+            if (last_aggregate_distance != 0.0 && inertia_threshold != 0.0)
+                if (aggregate_distance_change <= inertia_threshold)
                     break;
             if (min_points_shifted_per_iteration != 0 || result.last_iteration_points_shifted == 0)
                 if (result.last_iteration_points_shifted <= min_points_shifted_per_iteration)
                     break;
-            if (runtime_limit_seconds != 0)
-                if (result.runtime_seconds >= runtime_limit_seconds)
+            if (max_seconds != 0)
+                if (result.runtime_seconds >= max_seconds)
                     break;
 
-            double aggregate_distance = 0.0;
-            for (std::size_t i = 0; i < points_count; i++)
-                aggregate_distance += point_to_centroid_distance_buffer[i];
-
             // For every centroid, recalculate the mean of all points assigned to it.
             // That part is problematic to parallelize on many-core-systems, because of the contention.
             // Alternatively, a tree-like approach can be used, where every core accumulates it's own partial sums.
diff --git a/python/lib.cpp b/python/lib.cpp
index 2275b09e..d5a6d024 100644
--- a/python/lib.cpp
+++ b/python/lib.cpp
@@ -553,6 +553,10 @@ static py::tuple cluster_many_brute_force( //
     py::buffer dataset,                    //
     std::size_t wanted,                    //
     std::size_t max_iterations,            //
+    double inertia_threshold,              //
+    double max_seconds,                    //
+    double min_shifts,                     //
+    std::uint64_t seed,                    //
     std::size_t threads,                   //
     scalar_kind_t scalar_kind,             //
     metric_kind_t metric_kind,             //
@@ -583,6 +587,9 @@ static py::tuple cluster_many_brute_force( //
     engine.metric_kind = metric_kind;
     engine.quantization_kind = scalar_kind;
     engine.max_iterations = max_iterations;
+    engine.min_shifts = min_shifts;
+    engine.max_seconds = max_seconds;
+    engine.inertia_threshold = inertia_threshold;
 
     kmeans_clustering_result_t result = engine(                                           //
         reinterpret_cast<byte_t const*>(dataset_info.ptr), dataset_count, dataset_stride, //
@@ -999,27 +1006,33 @@ PYBIND11_MODULE(compiled, m) {
         return index_metadata(meta);
     });
 
-    m.def("exact_search", &search_many_brute_force,                               //
-          py::arg("dataset"),                                                     //
-          py::arg("queries"),                                                     //
-          py::arg("count") = 10,                                                  //
-          py::kw_only(),                                                          //
-          py::arg("threads") = 0,                                                 //
-          py::arg("metric_kind") = metric_kind_t::cos_k,                          //
-          py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
-          py::arg("metric_pointer") = 0,                                          //
-          py::arg("progress") = nullptr                                           //
+    m.def(                                                                      //
+        "exact_search", &search_many_brute_force,                               //
+        py::arg("dataset"),                                                     //
+        py::arg("queries"),                                                     //
+        py::arg("count") = 10,                                                  //
+        py::kw_only(),                                                          //
+        py::arg("threads") = 0,                                                 //
+        py::arg("metric_kind") = metric_kind_t::cos_k,                          //
+        py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
+        py::arg("metric_pointer") = 0,                                          //
+        py::arg("progress") = nullptr                                           //
     );
 
-    m.def("kmeans", &cluster_many_brute_force,            //
-          py::arg("dataset"),                             //
-          py::arg("count") = 10,                          //
-          py::kw_only(),                                  //
-          py::arg("max_iterations") = 100,                //
-          py::arg("threads") = 0,                         //
-          py::arg("dtype") = scalar_kind_t::bf16_k,       //
-          py::arg("metric_kind") = metric_kind_t::l2sq_k, //
-          py::arg("progress") = nullptr                   //
+    m.def(                                                                               //
+        "kmeans", &cluster_many_brute_force,                                             //
+        py::arg("dataset"),                                                              //
+        py::arg("count") = 10,                                                           //
+        py::kw_only(),                                                                   //
+        py::arg("max_iterations") = kmeans_clustering_t::max_iterations_default_k,       //
+        py::arg("inertia_threshold") = kmeans_clustering_t::inertia_threshold_default_k, //
+        py::arg("max_seconds") = kmeans_clustering_t::max_seconds_default_k,             //
+        py::arg("min_shifts") = kmeans_clustering_t::min_shifts_default_k,               //
+        py::arg("seed") = 0,                                                             //
+        py::arg("threads") = 0,                                                          //
+        py::arg("dtype") = scalar_kind_t::bf16_k,                                        //
+        py::arg("metric_kind") = metric_kind_t::l2sq_k,                                  //
+        py::arg("progress") = nullptr                                                    //
     );
 
     m.def(
@@ -1035,18 +1048,19 @@ PYBIND11_MODULE(compiled, m) {
 
     auto i = py::class_<dense_index_py_t, std::shared_ptr<dense_index_py_t>>(m, "Index");
 
-    i.def(py::init(&make_index),                                                  //
-          py::kw_only(),                                                          //
-          py::arg("ndim") = 0,                                                    //
-          py::arg("dtype") = scalar_kind_t::f32_k,                                //
-          py::arg("connectivity") = default_connectivity(),                       //
-          py::arg("expansion_add") = default_expansion_add(),                     //
-          py::arg("expansion_search") = default_expansion_search(),               //
-          py::arg("metric_kind") = metric_kind_t::cos_k,                          //
-          py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
-          py::arg("metric_pointer") = 0,                                          //
-          py::arg("multi") = false,                                               //
-          py::arg("enable_key_lookups") = true                                    //
+    i.def(                                                                      //
+        py::init(&make_index),                                                  //
+        py::kw_only(),                                                          //
+        py::arg("ndim") = 0,                                                    //
+        py::arg("dtype") = scalar_kind_t::f32_k,                                //
+        py::arg("connectivity") = default_connectivity(),                       //
+        py::arg("expansion_add") = default_expansion_add(),                     //
+        py::arg("expansion_search") = default_expansion_search(),               //
+        py::arg("metric_kind") = metric_kind_t::cos_k,                          //
+        py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
+        py::arg("metric_pointer") = 0,                                          //
+        py::arg("multi") = false,                                               //
+        py::arg("enable_key_lookups") = true                                    //
     );
 
     i.def(                                                //
diff --git a/python/scripts/test_tooling.py b/python/scripts/test_tooling.py
index c7a52afb..5c9f3545 100644
--- a/python/scripts/test_tooling.py
+++ b/python/scripts/test_tooling.py
@@ -7,7 +7,7 @@
 from usearch.index import search
 from usearch.eval import random_vectors
 
-from usearch.index import Match, Matches, BatchMatches, Index, Indexes
+from usearch.index import Match, Matches, BatchMatches, Index, Indexes, kmeans
 
 
 dimensions = [3, 97, 256]
@@ -70,9 +70,7 @@ def test_exact_search(rows: int, cols: int, k: int, reordered: bool):
         reordered_keys = keys
 
     matches: BatchMatches = search(original, original[reordered_keys], k, exact=True)
-    top_matches = (
-        [int(m.keys[0]) for m in matches] if rows > 1 else [int(matches.keys[0])]
-    )
+    top_matches = [int(m.keys[0]) for m in matches] if rows > 1 else [int(matches.keys[0])]
     assert top_matches == list(reordered_keys)
 
     matches: Matches = search(original, original[-1], k, exact=True)
@@ -133,3 +131,10 @@ def test_multi_index():
     matches = indexes.search(vectors, 10)
     assert len(matches) == 3
     assert len(matches[0].keys) == 2
+
+
+def test_kmeans(count_vectors: int = 100, ndim: int = 10, count_clusters: int = 5):
+    X = np.random.rand(count_vectors, ndim)
+    assignments, distances, centroids = kmeans(X, count_clusters)
+    assert len(assignments) == count_vectors
+    assert ((assignments >= 0) & (assignments < count_clusters)).all()
diff --git a/python/usearch/index.py b/python/usearch/index.py
index fe344431..ef6e6e61 100644
--- a/python/usearch/index.py
+++ b/python/usearch/index.py
@@ -34,6 +34,7 @@
     index_dense_metadata_from_buffer as _index_dense_metadata_from_buffer,
     exact_search as _exact_search,
     hardware_acceleration as _hardware_acceleration,
+    kmeans as _kmeans,
 )
 
 # Precompiled symbols that will be exposed
@@ -1583,3 +1584,96 @@ def search_batch(query, **kwargs):
         threads=threads,
         progress=progress,
     )
+
+
+def kmeans(
+    X,
+    k,
+    metric: str = "l2sq",
+    dtype: str = "bf16",
+    max_iters: int = 300,
+    inertia_threshold: float = 1e-4,
+    max_seconds: float = 60.0,
+    min_shifts: float = 0.01,
+    seed: Optional[int] = None,
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Performs KMeans clustering on a dataset using the USearch library with mixed-precision support.
+
+    This function clusters the given dataset `X` into `k` clusters by iteratively assigning points
+    to the nearest centroids and updating the centroids based on the mean of the points assigned to them.
+    The algorithm supports mixed-precision types and early termination based on convergence criteria
+    like the number of iterations, inertia threshold, maximum runtime, and minimum point shifts.
+
+    Parameters
+    ----------
+    X : numpy.ndarray
+        The input data, where each row represents a data point and each column represents a feature.
+    k : int
+        The number of clusters to form.
+    metric : str, optional
+        The distance metric used to calculate the distance between points and centroids.
+        Default is "l2sq" (squared Euclidean distance). Cosine "cos" distance is also supported.
+    dtype : str, optional
+        The data type used for clustering calculations. Default is "bf16" (Brain Float 16).
+        Other supported types include "f32" (float32) and "f64" (float64), "f16" (float16),
+        "i8" (int8), and b1 (boolean) bit-packed vectors.
+    max_iters : int, optional
+        The maximum number of iterations the algorithm should run. Default is 300.
+    inertia_threshold : float, optional
+        The threshold for inertia (sum of squared distances to centroids) to terminate early.
+        When the change in inertia between iterations falls below this value, the algorithm stops.
+        Default is 1e-4.
+    max_seconds : float, optional
+        The maximum allowable runtime for the algorithm in seconds. If exceeded, the algorithm
+        terminates early. Default is 60.0 seconds.
+    min_shifts : float, optional
+        The minimum fraction of points that must change their assigned cluster between iterations
+        to continue. If fewer than this fraction of points change clusters, the algorithm terminates.
+        Default is 0.01 (1% of the total points).
+    seed : int, optional
+        The random seed used to initialize the centroids. Default is None.
+
+    Returns
+    -------
+    assignments : numpy.ndarray
+        An array containing the index of the assigned cluster for each point in the dataset.
+    distances : numpy.ndarray
+        An array containing the distance of each point to its assigned cluster centroid.
+    centroids : numpy.ndarray
+        The final centroids of the clusters.
+
+    Raises
+    ------
+    ValueError
+        If any of the input parameters are invalid, such as the number of clusters being greater
+        than the number of data points.
+
+    Notes
+    -----
+    This implementation utilizes mixed-precision computation to speed up the clustering process
+    while maintaining accuracy. It also incorporates early exit conditions to avoid unnecessary
+    computation when the clustering has stabilized, either by reaching a minimal inertia threshold,
+    exceeding the maximum runtime, or when very few points are changing clusters between iterations.
+
+    Example
+    -------
+    >>> X = np.random.rand(100, 10)
+    >>> k = 5
+    >>> assignments, distances, centroids = usearch.index.kmeans(X, k)
+    """
+    metric = _normalize_metric(metric)
+    dtype = _normalize_dtype(dtype, ndim=X.shape[1], metric=metric)
+    seed = np.random.randint(0, 2**32 - 1) if seed is None else seed
+    assignments, distances, centroids = _kmeans(
+        X,
+        k,
+        metric_kind=metric,
+        max_iterations=max_iters,
+        max_seconds=max_seconds,
+        min_shifts=min_shifts,
+        inertia_threshold=inertia_threshold,
+        dtype=dtype,
+        seed=seed,
+    )
+    return assignments, distances, centroids

From 730815ffa87b633f82c075d3e609f7824c3386aa Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 29 Oct 2024 03:29:10 +0400
Subject: [PATCH 23/32] Docs: Spelling

---
 .vscode/settings.json             |  2 ++
 BENCHMARKS.md                     | 37 ++++++++++++++++---------------
 include/usearch/index.hpp         |  8 +++++--
 include/usearch/index_plugins.hpp |  4 ++--
 python/usearch/eval.py            |  2 +-
 python/usearch/index.py           |  1 +
 6 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index a76dbcea..98bc319d 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -173,6 +173,7 @@
     "nlevels",
     "Numba",
     "numpy",
+    "NVME",
     "objc",
     "OPENMP",
     "preprocess",
@@ -198,6 +199,7 @@
     "uninitialize",
     "unumusearch",
     "usearch",
+    "usecase",
     "usecases",
     "Vardanian",
     "vectorize",
diff --git a/BENCHMARKS.md b/BENCHMARKS.md
index df9def46..5271a112 100644
--- a/BENCHMARKS.md
+++ b/BENCHMARKS.md
@@ -10,11 +10,11 @@ All major HNSW implementation share an identical list of hyper-parameters:
 
 The default values vary drastically.
 
-|  Library  | Connectivity | EF @ A | EF @ S |
-| :-------: | :----------: | :----: | :----: |
-| `hnswlib` |      16      |  200   |   10   |
-|  `FAISS`  |      32      |   40   |   16   |
-| `USearch` |      16      |  128   |   64   |
+| Library   | Connectivity | EF @ A | EF @ S |
+| :-------- | -----------: | -----: | -----: |
+| `hnswlib` |           16 |    200 |     10 |
+| `FAISS`   |           32 |     40 |     16 |
+| `USearch` |           16 |    128 |     64 |
 
 Below are the performance numbers for a benchmark running on the 64 cores of AWS `c7g.metal` "Graviton 3"-based instances.
 The main columns are:
@@ -26,27 +26,27 @@ The main columns are:
 ### Different "connectivity"
 
 | Vectors    | Connectivity | EF @ A | EF @ S | __Add__, QPS | __Search__, QPS | __Recall @1__ |
-| :--------- | :----------: | :----: | :----: | :----------: | :-------------: | ------------: |
-| `f32` x256 |      16      |  128   |   64   |    75'640    |     131'654     |         99.3% |
-| `f32` x256 |      12      |  128   |   64   |    81'747    |     149'728     |         99.0% |
-| `f32` x256 |      32      |  128   |   64   |    64'368    |     104'050     |         99.4% |
+| :--------- | -----------: | -----: | -----: | -----------: | --------------: | ------------: |
+| `f32` x256 |           16 |    128 |     64 |       75'640 |         131'654 |         99.3% |
+| `f32` x256 |           12 |    128 |     64 |       81'747 |         149'728 |         99.0% |
+| `f32` x256 |           32 |    128 |     64 |       64'368 |         104'050 |         99.4% |
 
 ### Different "expansion factors"
 
 | Vectors    | Connectivity | EF @ A | EF @ S | __Add__, QPS | __Search__, QPS | __Recall @1__ |
-| :--------- | :----------: | :----: | :----: | :----------: | :-------------: | ------------: |
-| `f32` x256 |      16      |  128   |   64   |    75'640    |     131'654     |         99.3% |
-| `f32` x256 |      16      |   64   |   32   |   128'644    |     228'422     |         97.2% |
-| `f32` x256 |      16      |  256   |  128   |    39'981    |     69'065      |         99.2% |
+| :--------- | -----------: | -----: | -----: | -----------: | --------------: | ------------: |
+| `f32` x256 |           16 |    128 |     64 |       75'640 |         131'654 |         99.3% |
+| `f32` x256 |           16 |     64 |     32 |      128'644 |         228'422 |         97.2% |
+| `f32` x256 |           16 |    256 |    128 |       39'981 |          69'065 |         99.2% |
 
 ### Different vectors "quantization"
 
 | Vectors      | Connectivity | EF @ A | EF @ S | __Add__, QPS | __Search__, QPS | __Recall @1__ |
-| :----------- | :----------: | :----: | :----: | :----------: | :-------------: | ------------: |
-| `f32` x256   |      16      |  128   |   64   |    87'995    |     171'856     |         99.1% |
-| `f16` x256   |      16      |  128   |   64   |    87'270    |     153'788     |         98.4% |
-| `f16` x256 ✳️ |      16      |  128   |   64   |    71'454    |     132'673     |         98.4% |
-| `i8` x256    |      16      |  128   |   64   |   115'923    |     274'653     |         98.9% |
+| :----------- | -----------: | -----: | -----: | -----------: | --------------: | ------------: |
+| `f32` x256   |           16 |    128 |     64 |       87'995 |         171'856 |         99.1% |
+| `f16` x256   |           16 |    128 |     64 |       87'270 |         153'788 |         98.4% |
+| `f16` x256 ✳️ |           16 |    128 |     64 |       71'454 |         132'673 |         98.4% |
+| `i8` x256    |           16 |    128 |     64 |      115'923 |         274'653 |         98.9% |
 
 As seen on the chart, for `f16` quantization, performance may differ depending on native hardware support for that numeric type.
 Also worth noting, 8-bit quantization results in almost no quantization loss and may perform better than `f16`.
@@ -61,6 +61,7 @@ Within this repository you will find two commonly used utilities:
 To achieve best highest results we suggest compiling locally for the target architecture.
 
 ```sh
+git submodule update --init --recursive
 cmake -USEARCH_BUILD_BENCH_CPP=1 -DUSEARCH_BUILD_TEST_C=1 -DUSEARCH_USE_OPENMP=1 -DUSEARCH_USE_SIMSIMD=1 -DCMAKE_BUILD_TYPE=RelWithDebInfo -B build_profile
 cmake --build build_profile --config RelWithDebInfo -j
 build_profile/bench_cpp --help
diff --git a/include/usearch/index.hpp b/include/usearch/index.hpp
index f026455a..b4240fad 100644
--- a/include/usearch/index.hpp
+++ b/include/usearch/index.hpp
@@ -957,6 +957,10 @@ class sorted_buffer_gt {
 
 /**
  *  @brief  Five-byte integer type to address node clouds with over 4B entries.
+ *
+ *  40 bits is enough to address a @b Trillion entries potentially colocated on 1 machine.
+ *  At roughly 5 bytes * 20 neighbors + 100 bytes per entry, this translates to 200 TB of data,
+ *  which is similar to a single-server capacity of modern NVME arrays.
  */
 class usearch_pack_m uint40_t {
     unsigned char octets[5];
@@ -1211,8 +1215,8 @@ class ring_gt {
     using element_t = element_at;
     using allocator_t = allocator_at;
 
-    static_assert(std::is_trivially_destructible<element_t>(), "This heap is designed for trivial structs");
-    static_assert(std::is_trivially_copy_constructible<element_t>(), "This heap is designed for trivial structs");
+    static_assert(std::is_trivially_destructible<element_t>(), "This ring is designed for trivial structs");
+    static_assert(std::is_trivially_copy_constructible<element_t>(), "This ring is designed for trivial structs");
 
     using value_type = element_t;
 
diff --git a/include/usearch/index_plugins.hpp b/include/usearch/index_plugins.hpp
index 349ee360..52978b90 100644
--- a/include/usearch/index_plugins.hpp
+++ b/include/usearch/index_plugins.hpp
@@ -65,11 +65,11 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wunused-variable"
 #pragma GCC diagnostic ignored "-Wunused-but-set-variable"
-#ifdef _MSC_VER
+#if defined(_MSC_VER)
 #pragma warning(push)
 #pragma warning(disable : 4101) // "Unused variables"
 #pragma warning(disable : 4068) // "Unknown pragmas", when MSVC tries to read GCC pragmas
-#endif // _MSC_VER
+#endif                          // _MSC_VER
 #include <simsimd/simsimd.h>
 #ifdef _MSC_VER
 #pragma warning(pop)
diff --git a/python/usearch/eval.py b/python/usearch/eval.py
index 4baee713..c4e95122 100644
--- a/python/usearch/eval.py
+++ b/python/usearch/eval.py
@@ -359,7 +359,7 @@ def count(self):
         return self.vectors.shape[0]
 
     def inplace_shuffle(self):
-        """Rorders the `vectors` and `keys`. Often used for robustness benchmarks."""
+        """Reorders the `vectors` and `keys`. Often used for robustness benchmarks."""
 
         new_order = np.arange(self.count)
         np.random.shuffle(new_order)
diff --git a/python/usearch/index.py b/python/usearch/index.py
index f73997bc..cd14b6ec 100644
--- a/python/usearch/index.py
+++ b/python/usearch/index.py
@@ -126,6 +126,7 @@ def _normalize_dtype(
         "i8": ScalarKind.I8,
         "b1": ScalarKind.B1,
         "b1x8": ScalarKind.B1,
+        "bits": ScalarKind.B1,
         "float64": ScalarKind.F64,
         "float32": ScalarKind.F32,
         "bfloat16": ScalarKind.BF16,

From 295b3d61b04eac2390b30bee20c85cece9ec133b Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 29 Oct 2024 03:59:52 +0400
Subject: [PATCH 24/32] Make: Bump SimSIMD

---
 pyproject.toml | 2 +-
 setup.py       | 2 +-
 simsimd        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a4419535..643796fe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@ requires = [
     "cmake>=3.22",
     "pybind11",
     "numpy",
-    "simsimd>=5.6.4",
+    "simsimd>=5.9.3,<6.0.0",
 ]
 build-backend = "setuptools.build_meta"
 
diff --git a/setup.py b/setup.py
index 39d59904..61c85d6d 100644
--- a/setup.py
+++ b/setup.py
@@ -162,7 +162,7 @@ def get_bool_env_w_name(name: str, preference: bool) -> tuple:
 ]
 if use_simsimd:
     include_dirs.append("simsimd/include")
-    install_requires.append("simsimd>=5.6.4")
+    install_requires.append("simsimd>=5.9.3,<6.0.0")
 if use_fp16lib:
     include_dirs.append("fp16/include")
 
diff --git a/simsimd b/simsimd
index 2de0445d..b6185e6e 160000
--- a/simsimd
+++ b/simsimd
@@ -1 +1 @@
-Subproject commit 2de0445d58f593b49cfebfda7ce838d25bb7588f
+Subproject commit b6185e6e3c62426d7bec1892b96c5658f06d5862

From 0dac789832dd165481595d73f254bcf720c4238f Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 29 Oct 2024 07:04:43 +0100
Subject: [PATCH 25/32] Make: Bump SimSIMD for Turin builds

---
 simsimd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/simsimd b/simsimd
index b6185e6e..79514c57 160000
--- a/simsimd
+++ b/simsimd
@@ -1 +1 @@
-Subproject commit b6185e6e3c62426d7bec1892b96c5658f06d5862
+Subproject commit 79514c57c6f353c308a4d01d3639861ab26a759f

From e057feb128743a3c14f77f9fd4913c6de3b8b1a9 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 29 Oct 2024 06:17:58 +0000
Subject: [PATCH 26/32] Improve: Clustering benchmarks

Remove `fire` dependency and
document usage.
---
 BENCHMARKS.md                                 |  13 +-
 CONTRIBUTING.md                               |   2 +-
 cluster.py => python/scripts/bench_cluster.py | 127 +++++++-----------
 python/scripts/bench_exact.py                 |  29 +++-
 python/usearch/index.py                       |   6 +-
 5 files changed, 91 insertions(+), 86 deletions(-)
 rename cluster.py => python/scripts/bench_cluster.py (59%)

diff --git a/BENCHMARKS.md b/BENCHMARKS.md
index df9def46..8b2241d2 100644
--- a/BENCHMARKS.md
+++ b/BENCHMARKS.md
@@ -58,6 +58,8 @@ Within this repository you will find two commonly used utilities:
 - `cpp/bench.cpp` the produces the `bench_cpp` binary for broad USearch benchmarks.
 - `python/bench.py` and `python/bench.ipynb` for interactive charts against FAISS.
 
+### C++ Benchmarking Utilities
+
 To achieve best highest results we suggest compiling locally for the target architecture.
 
 ```sh
@@ -146,11 +148,20 @@ build_profile/bench_cpp \
     --cos
 ```
 
-
 > Optional parameters include `connectivity`, `expansion_add`, `expansion_search`.
 
 For Python, jut open the Jupyter Notebook and start playing around.
 
+### Python Benchmarking Utilities
+
+Several benchmarking suites are available for Python: approximate search, exact search, and clustering.
+
+```sh
+python/scripts/bench.py --help
+python/scripts/bench_exact.py --help
+python/scripts/bench_cluster.py --help
+```
+
 ## Datasets
 
 BigANN benchmark is a good starting point, if you are searching for large collections of high-dimensional vectors.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index e91cced6..a25900c0 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -200,7 +200,7 @@ cibuildwheel --platform macos                   # works only on MacOS
 cibuildwheel --platform windows                 # works only on Windows
 ```
 
-You may need root previligies for multi-architecture builds:
+You may need root privileges for multi-architecture builds:
 
 ```sh
 sudo $(which cibuildwheel) --platform linux
diff --git a/cluster.py b/python/scripts/bench_cluster.py
similarity index 59%
rename from cluster.py
rename to python/scripts/bench_cluster.py
index 88e2e41a..2364ed7a 100644
--- a/cluster.py
+++ b/python/scripts/bench_cluster.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import os
 import argparse
 
@@ -6,51 +7,56 @@
 from tqdm import tqdm
 
 import usearch
-from usearch.index import Index
+from usearch.index import kmeans
 from usearch.io import load_matrix
 
 
-def initialize_centroids(X, k):
+def evaluate_clustering_euclidean(X, labels, centroids):
+    """Evaluate clustering quality as average distance to centroids"""
+    distances = np.linalg.norm(X - centroids[labels], axis=1)
+    return np.mean(distances)
+
+
+def evaluate_clustering_cosine(X, labels, centroids):
+    """Evaluate clustering quality as average cosine distance to centroids"""
+
+    # Normalize both data points and centroids
+    X_normalized = X / np.linalg.norm(X, axis=1, keepdims=True)
+    centroids_normalized = centroids / np.linalg.norm(centroids, axis=1, keepdims=True)
+
+    # Compute cosine similarity using dot product
+    cosine_similarities = np.sum(X_normalized * centroids_normalized[labels], axis=1)
+
+    # Convert cosine similarity to cosine distance
+    cosine_distances = 1 - cosine_similarities
+
+    # Return the average cosine distance
+    return np.mean(cosine_distances)
+
+
+def numpy_initialize_centroids(X, k):
     """Randomly choose k data points as initial centroids"""
     indices = np.random.choice(X.shape[0], k, replace=False)
     return X[indices]
 
 
-def assign_clusters_numpy(X, centroids):
+def numpy_assign_clusters(X, centroids):
     """Assign each data point to the nearest centroid (numpy NumPy implementation)"""
     distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
     return np.argmin(distances, axis=1)
 
 
-def assign_clusters_faiss(X, centroids):
-    """Assign each data point to the nearest centroid using FAISS"""
-    index = faiss.IndexFlatL2(centroids.shape[1])
-    index.add(centroids)
-    _, labels = index.search(X, 1)
-    return labels.flatten()
-
-
-def assign_clusters_usearch(X, centroids):
-    """Assign each data point to the nearest centroid using USearch"""
-    dim = centroids.shape[1]
-    index = Index(ndim=dim, metric="l2sq")
-    index.add(None, centroids)
-    results = index.search(X, 1)
-    labels = results.keys.flatten()
-    return labels
-
-
-def update_centroids(X, labels, k):
+def numpy_update_centroids(X, labels, k):
     """Compute new centroids as the mean of all data points assigned to each cluster"""
     return np.array([X[labels == i].mean(axis=0) for i in range(k)])
 
 
-def kmeans(X, k, max_iters=100, tol=1e-4, assign_clusters_func=assign_clusters_numpy):
-    centroids = initialize_centroids(X, k)
+def cluster_with_numpy(X, k, max_iters=100, tol=1e-4):
+    centroids = numpy_initialize_centroids(X, k)
 
     for i in tqdm(range(max_iters), desc="KMeans Iterations"):
-        labels = assign_clusters_func(X, centroids)
-        new_centroids = update_centroids(X, labels, k)
+        labels = numpy_assign_clusters(X, centroids)
+        new_centroids = numpy_update_centroids(X, labels, k)
 
         if np.linalg.norm(new_centroids - centroids) < tol:
             break
@@ -60,30 +66,7 @@ def kmeans(X, k, max_iters=100, tol=1e-4, assign_clusters_func=assign_clusters_n
     return labels, centroids
 
 
-def evaluate_clustering(X, labels, centroids):
-    """Evaluate clustering quality as average distance to centroids"""
-    distances = np.linalg.norm(X - centroids[labels], axis=1)
-    return np.mean(distances)
-
-
-def evaluate_clustering_cosine(X, labels, centroids):
-    """Evaluate clustering quality as average cosine distance to centroids"""
-
-    # Normalize both data points and centroids
-    X_normalized = X / np.linalg.norm(X, axis=1, keepdims=True)
-    centroids_normalized = centroids / np.linalg.norm(centroids, axis=1, keepdims=True)
-
-    # Compute cosine similarity using dot product
-    cosine_similarities = np.sum(X_normalized * centroids_normalized[labels], axis=1)
-
-    # Convert cosine similarity to cosine distance
-    cosine_distances = 1 - cosine_similarities
-
-    # Return the average cosine distance
-    return np.mean(cosine_distances)
-
-
-def custom_faiss_clustering(X, k, max_iters=100):
+def cluster_with_faiss(X, k, max_iters=100):
     # Docs: https://github.com/facebookresearch/faiss/wiki/Faiss-building-blocks:-clustering,-PCA,-quantization
     # Header: https://github.com/facebookresearch/faiss/blob/main/faiss/Clustering.h
     # Source: https://github.com/facebookresearch/faiss/blob/main/faiss/Clustering.cpp
@@ -95,29 +78,21 @@ def custom_faiss_clustering(X, k, max_iters=100):
     return I.flatten(), kmeans.centroids
 
 
-def custom_usearch_clustering(X, k, max_iters=100, metric="l2sq", dtype="bf16"):
-    metric = usearch.index._normalize_metric(metric)
-    dtype = usearch.index._normalize_dtype(dtype, ndim=X.shape[1], metric=metric)
-    assignments, distances, centroids = usearch.compiled.kmeans(
-        X,
-        k,
-        max_iterations=max_iters,
-        metric_kind=metric,
-        dtype=dtype,
-    )
+def cluster_with_usearch(X, k, max_iters=100):
+    assignments, _, centroids = kmeans(X, k, max_iterations=max_iters)
     return assignments, centroids
 
 
 def main():
     parser = argparse.ArgumentParser(description="Compare KMeans clustering algorithms")
     parser.add_argument("--vectors", type=str, required=True, help="Path to binary matrix file")
-    parser.add_argument("-k", type=int, required=True, help="Number of centroids")
-    parser.add_argument("-i", type=int, help="Upper bound on number of iterations")
+    parser.add_argument("-k", default=10, type=int, required=True, help="Number of centroids")
+    parser.add_argument("-i", default=100, type=int, help="Upper bound on number of iterations")
     parser.add_argument("-n", type=int, help="Upper bound on number of points to use")
     parser.add_argument(
         "--method",
         type=str,
-        choices=["numpy", "faiss", "usearch", "custom_usearch", "custom_faiss"],
+        choices=["numpy", "faiss", "usearch"],
         default="numpy",
         help="Clustering backend",
     )
@@ -129,26 +104,24 @@ def main():
     k = args.k
     method = args.method
 
-    if method == "custom_usearch":
-        labels, centroids = custom_usearch_clustering(X, k, max_iters=max_iters)
-    elif method == "custom_faiss":
-        labels, centroids = custom_faiss_clustering(X, k, max_iters=max_iters)
+    time_before = os.times()
+    if method == "usearch":
+        labels, centroids = cluster_with_usearch(X, k, max_iters=max_iters)
+    elif method == "faiss":
+        labels, centroids = cluster_with_faiss(X, k, max_iters=max_iters)
     else:
-        if method == "numpy":
-            assign_clusters_func = assign_clusters_numpy
-        elif method == "faiss":
-            assign_clusters_func = assign_clusters_faiss
-        elif method == "usearch":
-            assign_clusters_func = assign_clusters_usearch
-        labels, centroids = kmeans(X, k, assign_clusters_func=assign_clusters_func)
-
-    quality = evaluate_clustering(X, labels, centroids)
+        labels, centroids = cluster_with_numpy(X, k, max_iters=max_iters)
+    time_after = os.times()
+    time_duration = time_after[0] - time_before[0]
+    print(f"Time: {time_duration:.2f}s, {time_duration / max_iters:.2f}s per iteration")
+
+    quality = evaluate_clustering_euclidean(X, labels, centroids)
     quality_cosine = evaluate_clustering_cosine(X, labels, centroids)
     print(f"Clustering quality (average distance to centroids): {quality:.4f}, cosine: {quality_cosine:.4f}")
 
     # Let's compare it to some random uniform assignment
     random_labels = np.random.randint(0, k, size=X.shape[0])
-    random_quality = evaluate_clustering(X, random_labels, centroids)
+    random_quality = evaluate_clustering_euclidean(X, random_labels, centroids)
     random_quality_cosine = evaluate_clustering_cosine(X, random_labels, centroids)
     print(f"... while random assignment quality: {random_quality:.4f}, cosine: {random_quality_cosine:.4f}")
 
diff --git a/python/scripts/bench_exact.py b/python/scripts/bench_exact.py
index 87d7d2fa..f6e70189 100644
--- a/python/scripts/bench_exact.py
+++ b/python/scripts/bench_exact.py
@@ -1,13 +1,14 @@
+#!/usr/bin/env python3
+import argparse
 from time import time
 from typing import Literal
 
 from faiss import knn, METRIC_L2, METRIC_INNER_PRODUCT
-import fire
 
 from usearch.compiled import hardware_acceleration
 from usearch.eval import random_vectors
 
-# Supplemantary imports for CLI arguments normalization
+# Supplementary imports for CLI arguments normalization
 from usearch.index import (
     ScalarKind,
     MetricKind,
@@ -35,7 +36,7 @@ def run(
     k: int = 100,
     ndim: int = 256,
     dtype: Literal["b1", "i8", "f16", "bf16", "f32", "f64"] = "f32",
-    metric: Literal["cos", "ip", "l2sq"] = "ip",
+    metric: Literal["ip", "cos", "l2sq"] = "ip",
 ):
 
     metric: MetricKind = _normalize_metric(metric)
@@ -72,5 +73,25 @@ def run(
     print(f"FAISS:   {format_duration(duration)} ({calculate_throughput(duration, q)})")
 
 
+def main():
+    parser = argparse.ArgumentParser(description="Compare KMeans clustering algorithms")
+    parser.add_argument("--ndim", default=256, type=int, help="Number of vector dimensions")
+    parser.add_argument("-n", default=10**5, type=int, help="Number of random vectors in a haystack")
+    parser.add_argument("-q", default=10, type=int, help="Number of query vectors")
+    parser.add_argument("-k", default=100, type=int, required=True, help="Number of closest neighbors to search for")
+    parser.add_argument("--dtype", type=str, choices=["b1", "i8", "f16", "bf16", "f32", "f64"], default="f32")
+    parser.add_argument("--metric", type=str, choices=["ip", "cos", "l2sq"], default="ip")
+
+    args = parser.parse_args()
+    run(
+        n=args.n,
+        q=args.q,
+        k=args.k,
+        ndim=args.ndim,
+        dtype=args.dtype,
+        metric=args.metric,
+    )
+
+
 if __name__ == "__main__":
-    fire.Fire(run)
+    main()
diff --git a/python/usearch/index.py b/python/usearch/index.py
index ef6e6e61..02e578f0 100644
--- a/python/usearch/index.py
+++ b/python/usearch/index.py
@@ -1591,7 +1591,7 @@ def kmeans(
     k,
     metric: str = "l2sq",
     dtype: str = "bf16",
-    max_iters: int = 300,
+    max_iterations: int = 300,
     inertia_threshold: float = 1e-4,
     max_seconds: float = 60.0,
     min_shifts: float = 0.01,
@@ -1618,7 +1618,7 @@ def kmeans(
         The data type used for clustering calculations. Default is "bf16" (Brain Float 16).
         Other supported types include "f32" (float32) and "f64" (float64), "f16" (float16),
         "i8" (int8), and b1 (boolean) bit-packed vectors.
-    max_iters : int, optional
+    max_iterations : int, optional
         The maximum number of iterations the algorithm should run. Default is 300.
     inertia_threshold : float, optional
         The threshold for inertia (sum of squared distances to centroids) to terminate early.
@@ -1669,7 +1669,7 @@ def kmeans(
         X,
         k,
         metric_kind=metric,
-        max_iterations=max_iters,
+        max_iterations=max_iterations,
         max_seconds=max_seconds,
         min_shifts=min_shifts,
         inertia_threshold=inertia_threshold,

From 3f5a4ef805e37220bf9306834dc2d56cabe69b50 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 29 Oct 2024 19:24:53 +0000
Subject: [PATCH 27/32] Make: Dynamic dispatch in Java

---
 build.gradle | 15 +++++++++++++++
 simsimd      |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/build.gradle b/build.gradle
index 899477a1..4bf9e42a 100644
--- a/build.gradle
+++ b/build.gradle
@@ -2,6 +2,7 @@ import org.gradle.internal.jvm.Jvm
 
 plugins {
     id 'java-library'
+    id 'c'
     id 'cpp'
     id 'maven-publish'
     id 'signing'
@@ -66,6 +67,15 @@ model {
                         srcDirs "include", "fp16/include", "simsimd/include", "${Jvm.current().javaHome}/include"
                     }
                 }
+                c {
+                    source {
+                        srcDirs "simsimd/c/"
+                        include "**/*.c"
+                    }
+                    exportedHeaders {
+                        srcDirs "simsimd/include"
+                    }
+                }
             }
             binaries.withType(StaticLibraryBinarySpec) {
                 buildable = false
@@ -83,6 +93,11 @@ model {
                     cppCompiler.args "-I${Jvm.current().javaHome}/include/win32"
                     cppCompiler.args '/std:c++11'
                 }
+                cppCompiler.args '-DUSEARCH_USE_FP16LIB=1'
+                cppCompiler.args '-DUSEARCH_USE_SIMSIMD=1'
+                cppCompiler.args '-DSIMSIMD_DYNAMIC_DISPATCH=1'
+                cppCompiler.args '-DSIMSIMD_NATIVE_BF16=0'
+                cppCompiler.args '-DSIMSIMD_NATIVE_F16=0'
             }
         }
     }
diff --git a/simsimd b/simsimd
index 79514c57..935fef29 160000
--- a/simsimd
+++ b/simsimd
@@ -1 +1 @@
-Subproject commit 79514c57c6f353c308a4d01d3639861ab26a759f
+Subproject commit 935fef2964bc38e995c5f465b42259a35b8cf0d3

From f80ded9979b5bd350ce7c514acc2e3528586b79b Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 29 Oct 2024 19:26:00 +0000
Subject: [PATCH 28/32] Fix: Initializing `std::atomic`

Without this the the Java build on Ubuntu
with GCC fails.
---
 include/usearch/index_plugins.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/usearch/index_plugins.hpp b/include/usearch/index_plugins.hpp
index efb37be6..bc4e2a4c 100644
--- a/include/usearch/index_plugins.hpp
+++ b/include/usearch/index_plugins.hpp
@@ -2331,7 +2331,7 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
             iterations++;
 
             // For every point, find the closest centroid.
-            std::atomic<std::size_t> points_shifted = 0;
+            std::atomic<std::size_t> points_shifted{0};
             executor.dynamic(points_count, [&](std::size_t thread_idx, std::size_t points_idx) {
                 byte_t const* quantized_point =
                     points_quantized_buffer.data() + points_idx * stride_per_vector_quantized;

From fbcab99ed5d58c986313ba3a56500d4076029c57 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 29 Oct 2024 19:26:26 +0000
Subject: [PATCH 29/32] Fix: Narrowing static casts for Py build

---
 python/lib.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/python/lib.cpp b/python/lib.cpp
index 40d289a6..31ad4bb4 100644
--- a/python/lib.cpp
+++ b/python/lib.cpp
@@ -609,12 +609,13 @@ static py::tuple cluster_many_brute_force( //
     centroids_info.size = wanted * dataset_dimensions;
     centroids_info.format = dataset_info.format;
     centroids_info.ndim = 2;
-    centroids_info.shape = {wanted, dataset_dimensions};
-    centroids_info.strides = {dataset_dimensions * bytes_per_scalar, bytes_per_scalar};
+    centroids_info.shape = {static_cast<ssize_t>(wanted), static_cast<ssize_t>(dataset_dimensions)};
+    centroids_info.strides = {static_cast<ssize_t>(dataset_dimensions * bytes_per_scalar),
+                              static_cast<ssize_t>(bytes_per_scalar)};
 
     py::tuple results(3);
-    results[0] = py::array_t<std::size_t>({dataset_count}, point_to_centroid_index.data());
-    results[1] = py::array_t<distance_t>({dataset_count}, point_to_centroid_distance.data());
+    results[0] = py::array_t<std::size_t>({static_cast<ssize_t>(dataset_count)}, point_to_centroid_index.data());
+    results[1] = py::array_t<distance_t>({static_cast<ssize_t>(dataset_count)}, point_to_centroid_distance.data());
     results[2] = py::array(centroids_info);
 
     return results;

From e4afcf3ebdafc796beba00c66edb2f3b201f3ff7 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 29 Oct 2024 19:31:00 +0000
Subject: [PATCH 30/32] Fix: Avoid `std::accumulate`

The `<numeric>` header that contains that
function brings up to 2K lines of templates.
---
 include/usearch/index_plugins.hpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/usearch/index_plugins.hpp b/include/usearch/index_plugins.hpp
index bc4e2a4c..f07b4c4a 100644
--- a/include/usearch/index_plugins.hpp
+++ b/include/usearch/index_plugins.hpp
@@ -2453,8 +2453,9 @@ template <typename allocator_at = std::allocator<char>> class kmeans_clustering_
         // Export stats.
         result.iterations = iterations;
         result.computed_distances = points_count * wanted_clusters * iterations;
-        result.aggregate_distance =
-            std::accumulate(point_to_centroid_distance_buffer.begin(), point_to_centroid_distance_buffer.end(), 0.0);
+        result.aggregate_distance = 0;
+        for (distance_t distance : point_to_centroid_distance_buffer)
+            result.aggregate_distance += distance;
 
         // We've finished all the iterations, now we can export the centroids back to the original precision.
         std::memcpy(point_to_centroid_index, point_to_centroid_index_buffer.data(), points_count * sizeof(std::size_t));

From 6f132bb3ad353fec0eb855db0fa235774bbf34a6 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 29 Oct 2024 19:48:33 +0000
Subject: [PATCH 31/32] Fix: Missing `ssize_t` on Windows

---
 python/lib.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/lib.cpp b/python/lib.cpp
index 31ad4bb4..8cfc0d3f 100644
--- a/python/lib.cpp
+++ b/python/lib.cpp
@@ -24,6 +24,10 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
+#if defined(_WIN32)       //! On Windows, `ssize_t` is not defined by default
+typedef intptr_t ssize_t; //! Use `intptr_t` for a signed integer with the same width as `size_t`
+#endif
+
 #include <usearch/index_dense.hpp>
 #include <usearch/index_plugins.hpp>
 

From 512c6aa163c612821e120bfd7d788a8f25cf3c09 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 29 Oct 2024 19:59:18 +0000
Subject: [PATCH 32/32] Fix: Generating 64-bit unsigned seeds

The previous solution failed on Windows with:
> ValueError: high is out of bounds for int32
---
 python/usearch/index.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/usearch/index.py b/python/usearch/index.py
index 73179299..eaae1e88 100644
--- a/python/usearch/index.py
+++ b/python/usearch/index.py
@@ -1665,7 +1665,9 @@ def kmeans(
     """
     metric = _normalize_metric(metric)
     dtype = _normalize_dtype(dtype, ndim=X.shape[1], metric=metric)
-    seed = np.random.randint(0, 2**32 - 1) if seed is None else seed
+
+    # Generating a 64-bit unsigned integer in NumPy may be somewhat tricky.
+    seed = np.random.default_rng().integers(0, 2**64, dtype=np.uint64) if seed is None else seed
     assignments, distances, centroids = _kmeans(
         X,
         k,