Merge pull request #513 from unum-cloud/main-dev-cluster
Clustering
ashvardanian authored Oct 29, 2024
2 parents 0dac789 + 91c0bcb commit f9fc617
Showing 12 changed files with 754 additions and 38 deletions.
1 change: 1 addition & 0 deletions .vscode/settings.json
@@ -157,6 +157,7 @@
"ivdep",
"jaccard",
"Jemalloc",
"kmeans",
"Kullback",
"Leibler",
"libjemalloc",
13 changes: 12 additions & 1 deletion BENCHMARKS.md
@@ -58,6 +58,8 @@
Within this repository you will find two commonly used utilities:
- `cpp/bench.cpp`, which produces the `bench_cpp` binary for broad USearch benchmarks.
- `python/bench.py` and `python/bench.ipynb` for interactive charts against FAISS.

### C++ Benchmarking Utilities

To achieve the best results, we suggest compiling locally for the target architecture.

```sh
# … build commands collapsed in the diff view …
```

@@ -147,11 +149,20 @@

```sh
build_profile/bench_cpp \
    --cos
```

> Optional parameters include `connectivity`, `expansion_add`, and `expansion_search`; a hypothetical invocation is sketched below.
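
The flag names below are assumed to mirror those parameter names and may differ in the actual `bench_cpp` CLI:

```sh
build_profile/bench_cpp \
    --connectivity 16 \
    --expansion_add 128 \
    --expansion_search 64 \
    --cos
```
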
For Python, just open the Jupyter Notebook and start playing around.

### Python Benchmarking Utilities

Several benchmarking suites are available for Python: approximate search, exact search, and clustering.

```sh
python/scripts/bench.py --help
python/scripts/bench_exact.py --help
python/scripts/bench_cluster.py --help
```
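
For example, to cluster the first million points of a dataset into 1000 centroids with the USearch backend (the dataset path below is a placeholder):

```sh
python/scripts/bench_cluster.py --vectors datasets/base.fbin -n 1000000 -k 1000 -i 100 --method usearch
```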

## Datasets

The BigANN benchmark is a good starting point if you are looking for large collections of high-dimensional vectors.
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -200,7 +200,7 @@
cibuildwheel --platform macos # works only on MacOS
cibuildwheel --platform windows # works only on Windows
```

You may need root previligies for multi-architecture builds:
You may need root privileges for multi-architecture builds:

```sh
sudo $(which cibuildwheel) --platform linux
16 changes: 16 additions & 0 deletions cpp/test.cpp
@@ -1100,6 +1100,22 @@ int main(int, char**) {
test_uint40();
test_cosine<float, std::int64_t, uint40_t>(10, 10);

// Test plugins, like K-Means clustering.
{
std::size_t vectors_count = 1000, centroids_count = 10, dimensions = 256;
kmeans_clustering_t clustering;
clustering.max_iterations = 2;
std::vector<float> vectors(vectors_count * dimensions), centroids(centroids_count * dimensions);
matrix_slice_gt<float const> vectors_slice(vectors.data(), dimensions, vectors_count);
matrix_slice_gt<float> centroids_slice(centroids.data(), dimensions, centroids_count);
std::generate(vectors.begin(), vectors.end(), [] { return float(std::rand()) / float(INT_MAX); });
std::vector<std::size_t> assignments(vectors_count);
std::vector<distance_punned_t> distances(vectors_count);
auto clustering_result = clustering(vectors_slice, centroids_slice, {assignments.data(), assignments.size()},
{distances.data(), distances.size()});
expect(clustering_result);
}

// Exact search without constructing indexes.
// Great for validating the distance functions.
std::printf("Testing exact search\n");
2 changes: 1 addition & 1 deletion include/usearch/index.hpp
@@ -370,7 +370,7 @@ template <typename scalar_at, typename allocator_at = std::allocator<scalar_at>>
void reset() noexcept {
if (!std::is_trivially_destructible<scalar_at>::value)
for (std::size_t i = 0; i != size_; ++i)
destroy_at(data_ + i);
unum::usearch::destroy_at(data_ + i); //< Facing some symbol visibility/ambiguity issues
allocator_at{}.deallocate(data_, size_);
data_ = nullptr;
size_ = 0;
2 changes: 1 addition & 1 deletion include/usearch/index_dense.hpp
@@ -2254,4 +2254,4 @@ static join_result_t join( //
}

} // namespace usearch
} // namespace unum
} // namespace unum
348 changes: 342 additions & 6 deletions include/usearch/index_plugins.hpp

Large diffs are not rendered by default.
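
Since this file's diff is collapsed, here is a rough sketch of the configuration surface the new `kmeans_clustering_t` plugin appears to expose, inferred purely from the call sites in `cpp/test.cpp` and `python/lib.cpp` shown in this commit; the exact types, defaults, and signatures are assumptions, not the actual header:

```cpp
// Hypothetical reconstruction - the real definition lives in include/usearch/index_plugins.hpp.
struct kmeans_clustering_t {
    metric_kind_t metric_kind;       // distance to minimize, e.g. `metric_kind_t::l2sq_k`
    scalar_kind_t quantization_kind; // internal representation, e.g. `scalar_kind_t::bf16_k`
    std::size_t max_iterations;      // hard cap on the number of Lloyd's iterations
    double inertia_threshold;        // early exit once inertia stops improving
    double max_seconds;              // wall-clock budget for the whole run
    double min_shifts;               // early exit once few points change clusters

    // Typed overload, as exercised in `cpp/test.cpp`:
    //   clustering(vectors_slice, centroids_slice, assignments_span, distances_span);
    // Byte-level overload, as exercised in `python/lib.cpp`:
    //   clustering(dataset_ptr, count, stride, centroids_ptr, wanted, centroid_stride,
    //              indexes_ptr, distances_ptr, scalar_kind, dimensions, executor, progress);
};
```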

139 changes: 116 additions & 23 deletions python/lib.cpp
@@ -452,7 +452,7 @@ static py::tuple search_many_in_index( //
}

/**
* @brief Brute-force exact search implementation, compatible with
* @brief Brute-force @b exact search implementation, compatible with
* NumPy-like Tensors and other objects supporting Buffer Protocol.
*/
static py::tuple search_many_brute_force( //
@@ -545,6 +545,81 @@ static py::tuple search_many_brute_force( //
return results;
}

/**
* @brief Brute-force @b K-Means clustering, compatible with
* NumPy-like Tensors and other objects supporting Buffer Protocol.
*/
static py::tuple cluster_many_brute_force( //
py::buffer dataset, //
std::size_t wanted, //
std::size_t max_iterations, //
double inertia_threshold, //
double max_seconds, //
double min_shifts, //
std::uint64_t seed, //
std::size_t threads, //
scalar_kind_t scalar_kind, //
metric_kind_t metric_kind, //
progress_func_t const& progress_func) {

using distance_t = typename kmeans_clustering_t::distance_t;
py::buffer_info dataset_info = dataset.request();
if (dataset_info.ndim != 2)
throw std::invalid_argument("Expects a matrix (rank-2 tensor) of dataset to cluster!");

std::size_t dataset_count = static_cast<std::size_t>(dataset_info.shape[0]);
std::size_t dataset_dimensions = static_cast<std::size_t>(dataset_info.shape[1]);
std::size_t dataset_stride = static_cast<std::size_t>(dataset_info.strides[0]);
scalar_kind_t dataset_kind = numpy_string_to_kind(dataset_info.format);
std::size_t bytes_per_scalar = bits_per_scalar_word(dataset_kind) / CHAR_BIT;

std::vector<std::size_t> point_to_centroid_index(dataset_count, 0);
std::vector<distance_t> point_to_centroid_distance(dataset_count, 0);
std::vector<byte_t> centroids(wanted * dataset_dimensions * bytes_per_scalar, 0);

if (!threads)
threads = std::thread::hardware_concurrency();

// Dispatch brute-force K-Means clustering
progress_t progress{progress_func};
executor_default_t executor{threads};
kmeans_clustering_t engine;
engine.metric_kind = metric_kind;
engine.quantization_kind = scalar_kind;
engine.max_iterations = max_iterations;
engine.min_shifts = min_shifts;
engine.max_seconds = max_seconds;
engine.inertia_threshold = inertia_threshold;

kmeans_clustering_result_t result = engine( //
reinterpret_cast<byte_t const*>(dataset_info.ptr), dataset_count, dataset_stride, //
centroids.data(), wanted, dataset_dimensions * bytes_per_scalar, //
point_to_centroid_index.data(), point_to_centroid_distance.data(), dataset_kind, dataset_dimensions, executor,
[&](std::size_t passed, std::size_t total) { return PyErr_CheckSignals() == 0 && progress(passed, total); });

if (!result)
throw std::runtime_error(result.error.release());

// The following constructor doesn't seem to be documented, but it's used in the source code of `pybind11`:
// https://github.com/pybind/pybind11/blob/aeda49ed0b4e6e8abba7abc265ace86a6c26ba66/include/pybind11/numpy.h#L918-L919
// https://github.com/pybind/pybind11/blob/aeda49ed0b4e6e8abba7abc265ace86a6c26ba66/include/pybind11/buffer_info.h#L60-L75
py::buffer_info centroids_info;
centroids_info.ptr = reinterpret_cast<void*>(centroids.data());
centroids_info.itemsize = dataset_info.itemsize;
centroids_info.size = wanted * dataset_dimensions;
centroids_info.format = dataset_info.format;
centroids_info.ndim = 2;
centroids_info.shape = {wanted, dataset_dimensions};
centroids_info.strides = {dataset_dimensions * bytes_per_scalar, bytes_per_scalar};

py::tuple results(3);
results[0] = py::array_t<std::size_t>({dataset_count}, point_to_centroid_index.data());
results[1] = py::array_t<distance_t>({dataset_count}, point_to_centroid_distance.data());
results[2] = py::array(centroids_info);

return results;
}

template <typename scalar_at> struct rows_lookup_gt {
byte_t* data_;
std::size_t stride_;
@@ -936,16 +1011,33 @@ PYBIND11_MODULE(compiled, m) {
return index_metadata(meta);
});

m.def("exact_search", &search_many_brute_force, //
py::arg("dataset"), //
py::arg("queries"), //
py::arg("count") = 10, //
py::kw_only(), //
py::arg("threads") = 0, //
py::arg("metric_kind") = metric_kind_t::cos_k, //
py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
py::arg("metric_pointer") = 0, //
py::arg("progress") = nullptr //
m.def( //
"exact_search", &search_many_brute_force, //
py::arg("dataset"), //
py::arg("queries"), //
py::arg("count") = 10, //
py::kw_only(), //
py::arg("threads") = 0, //
py::arg("metric_kind") = metric_kind_t::cos_k, //
py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
py::arg("metric_pointer") = 0, //
py::arg("progress") = nullptr //
);

m.def( //
"kmeans", &cluster_many_brute_force, //
py::arg("dataset"), //
py::arg("count") = 10, //
py::kw_only(), //
py::arg("max_iterations") = kmeans_clustering_t::max_iterations_default_k, //
py::arg("inertia_threshold") = kmeans_clustering_t::inertia_threshold_default_k, //
py::arg("max_seconds") = kmeans_clustering_t::max_seconds_default_k, //
py::arg("min_shifts") = kmeans_clustering_t::min_shifts_default_k, //
py::arg("seed") = 0, //
py::arg("threads") = 0, //
py::arg("dtype") = scalar_kind_t::bf16_k, //
py::arg("metric_kind") = metric_kind_t::l2sq_k, //
py::arg("progress") = nullptr //
);

m.def(
@@ -961,18 +1053,19 @@

auto i = py::class_<dense_index_py_t, std::shared_ptr<dense_index_py_t>>(m, "Index");

i.def(py::init(&make_index), //
py::kw_only(), //
py::arg("ndim") = 0, //
py::arg("dtype") = scalar_kind_t::f32_k, //
py::arg("connectivity") = default_connectivity(), //
py::arg("expansion_add") = default_expansion_add(), //
py::arg("expansion_search") = default_expansion_search(), //
py::arg("metric_kind") = metric_kind_t::cos_k, //
py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
py::arg("metric_pointer") = 0, //
py::arg("multi") = false, //
py::arg("enable_key_lookups") = true //
i.def( //
py::init(&make_index), //
py::kw_only(), //
py::arg("ndim") = 0, //
py::arg("dtype") = scalar_kind_t::f32_k, //
py::arg("connectivity") = default_connectivity(), //
py::arg("expansion_add") = default_expansion_add(), //
py::arg("expansion_search") = default_expansion_search(), //
py::arg("metric_kind") = metric_kind_t::cos_k, //
py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
py::arg("metric_pointer") = 0, //
py::arg("multi") = false, //
py::arg("enable_key_lookups") = true //
);

i.def( //
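Taken together, the `kmeans` binding registered above can be exercised from Python roughly like this (a minimal sketch with a synthetic dataset; the three returned arrays mirror the tuple packed at the end of `cluster_many_brute_force`):

```py
import numpy as np
from usearch.index import kmeans

# 1000 random 256-dimensional points, the same shape as the C++ test above
vectors = np.random.rand(1000, 256).astype(np.float32)

# Returns per-point centroid indexes, per-point distances, and the centroids matrix
assignments, distances, centroids = kmeans(vectors, 10, max_iterations=2)
print(assignments.shape, distances.shape, centroids.shape)  # (1000,), (1000,), (10, 256)
```
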
136 changes: 136 additions & 0 deletions python/scripts/bench_cluster.py
@@ -0,0 +1,136 @@
#!/usr/bin/env python3
import os
import argparse

import numpy as np
import faiss
from tqdm import tqdm

import usearch
from usearch.index import kmeans
from usearch.io import load_matrix


def evaluate_clustering_euclidean(X, labels, centroids):
"""Evaluate clustering quality as average distance to centroids"""
distances = np.linalg.norm(X - centroids[labels], axis=1)
return np.mean(distances)


def evaluate_clustering_cosine(X, labels, centroids):
"""Evaluate clustering quality as average cosine distance to centroids"""

# Normalize both data points and centroids
X_normalized = X / np.linalg.norm(X, axis=1, keepdims=True)
centroids_normalized = centroids / np.linalg.norm(centroids, axis=1, keepdims=True)

# Compute cosine similarity using dot product
cosine_similarities = np.sum(X_normalized * centroids_normalized[labels], axis=1)

# Convert cosine similarity to cosine distance
cosine_distances = 1 - cosine_similarities

# Return the average cosine distance
return np.mean(cosine_distances)


def numpy_initialize_centroids(X, k):
"""Randomly choose k data points as initial centroids"""
indices = np.random.choice(X.shape[0], k, replace=False)
return X[indices]


def numpy_assign_clusters(X, centroids):
"""Assign each data point to the nearest centroid (numpy NumPy implementation)"""
distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
return np.argmin(distances, axis=1)


def numpy_update_centroids(X, labels, k):
"""Compute new centroids as the mean of all data points assigned to each cluster"""
return np.array([X[labels == i].mean(axis=0) for i in range(k)])


def cluster_with_numpy(X, k, max_iters=100, tol=1e-4):
centroids = numpy_initialize_centroids(X, k)

for i in tqdm(range(max_iters), desc="KMeans Iterations"):
labels = numpy_assign_clusters(X, centroids)
new_centroids = numpy_update_centroids(X, labels, k)

if np.linalg.norm(new_centroids - centroids) < tol:
break

centroids = new_centroids

return labels, centroids


def cluster_with_faiss(X, k, max_iters=100):
# Docs: https://github.com/facebookresearch/faiss/wiki/Faiss-building-blocks:-clustering,-PCA,-quantization
# Header: https://github.com/facebookresearch/faiss/blob/main/faiss/Clustering.h
# Source: https://github.com/facebookresearch/faiss/blob/main/faiss/Clustering.cpp
verbose = False
d: int = X.shape[1]
kmeans = faiss.Kmeans(d, k, niter=max_iters, verbose=verbose)
kmeans.train(X)
D, I = kmeans.index.search(X, 1)
return I.flatten(), kmeans.centroids


def cluster_with_usearch(X, k, max_iters=100):
assignments, _, centroids = kmeans(X, k, max_iterations=max_iters)
return assignments, centroids


def main():
parser = argparse.ArgumentParser(description="Compare KMeans clustering algorithms")
parser.add_argument("--vectors", type=str, required=True, help="Path to binary matrix file")
parser.add_argument("-k", default=10, type=int, required=True, help="Number of centroids")
parser.add_argument("-i", default=100, type=int, help="Upper bound on number of iterations")
parser.add_argument("-n", type=int, help="Upper bound on number of points to use")
parser.add_argument(
"--method",
type=str,
choices=["numpy", "faiss", "usearch"],
default="numpy",
help="Clustering backend",
)

args = parser.parse_args()

max_iters = args.i
X = load_matrix(args.vectors, count_rows=args.n)
k = args.k
method = args.method

time_before = os.times()
if method == "usearch":
labels, centroids = cluster_with_usearch(X, k, max_iters=max_iters)
elif method == "faiss":
labels, centroids = cluster_with_faiss(X, k, max_iters=max_iters)
else:
labels, centroids = cluster_with_numpy(X, k, max_iters=max_iters)
time_after = os.times()
time_duration = time_after[0] - time_before[0]
print(f"Time: {time_duration:.2f}s, {time_duration / max_iters:.2f}s per iteration")

quality = evaluate_clustering_euclidean(X, labels, centroids)
quality_cosine = evaluate_clustering_cosine(X, labels, centroids)
print(f"Clustering quality (average distance to centroids): {quality:.4f}, cosine: {quality_cosine:.4f}")

# Let's compare it to some random uniform assignment
random_labels = np.random.randint(0, k, size=X.shape[0])
random_quality = evaluate_clustering_euclidean(X, random_labels, centroids)
random_quality_cosine = evaluate_clustering_cosine(X, random_labels, centroids)
print(f"... while random assignment quality: {random_quality:.4f}, cosine: {random_quality_cosine:.4f}")

cluster_sizes = np.unique(labels, return_counts=True)[1]
cluster_sizes_mean = np.mean(cluster_sizes)
cluster_sizes_stddev = np.std(cluster_sizes)
print(f"Cluster sizes: {cluster_sizes_mean:.2f} ± {cluster_sizes_stddev:.2f}")
print(cluster_sizes)


if __name__ == "__main__":
main()