From 070cca28b7cf99e6bba69a023477a4e17ee48f8f Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Wed, 30 Nov 2022 13:08:17 -0800 Subject: [PATCH] Add python bindings for kmeans fit (#1016) As a first attempt at exposing the kmeans inertia - this adds python bindings for the kmeans::fit function, which returns the inertia as part of the results. This also adds support for accessing mdspan api's from cython directly Authors: - Ben Frederickson (https://github.com/benfred) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1016 --- cpp/CMakeLists.txt | 2 + cpp/include/raft_distance/kmeans.hpp | 20 ++ cpp/src/distance/kmeans_fit_double.cu | 34 +++ cpp/src/distance/kmeans_fit_float.cu | 34 +++ python/pylibraft/pylibraft/cluster/kmeans.pyx | 247 +++++++++++++++++- .../pylibraft/pylibraft/common/cai_wrapper.py | 16 ++ python/pylibraft/pylibraft/cpp/kmeans.pxd | 27 ++ .../pylibraft/pylibraft/cpp/kmeans_types.pxd | 44 ++++ python/pylibraft/pylibraft/cpp/mdspan.pxd | 43 +++ python/pylibraft/pylibraft/cpp/optional.pxd | 24 ++ .../pylibraft/pylibraft/test/test_kmeans.py | 33 ++- 11 files changed, 521 insertions(+), 3 deletions(-) create mode 100644 cpp/src/distance/kmeans_fit_double.cu create mode 100644 cpp/src/distance/kmeans_fit_float.cu create mode 100644 python/pylibraft/pylibraft/cpp/kmeans_types.pxd create mode 100644 python/pylibraft/pylibraft/cpp/mdspan.pxd create mode 100644 python/pylibraft/pylibraft/cpp/optional.pxd diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 672be75df5..1497fe412b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -284,6 +284,8 @@ if(RAFT_COMPILE_DIST_LIBRARY) src/distance/update_centroids_double.cu src/distance/cluster_cost_float.cu src/distance/cluster_cost_double.cu + src/distance/kmeans_fit_float.cu + src/distance/kmeans_fit_double.cu src/distance/specializations/detail/canberra.cu src/distance/specializations/detail/chebyshev.cu src/distance/specializations/detail/correlation.cu diff --git a/cpp/include/raft_distance/kmeans.hpp b/cpp/include/raft_distance/kmeans.hpp index a56021b110..c05aa281c4 100644 --- a/cpp/include/raft_distance/kmeans.hpp +++ b/cpp/include/raft_distance/kmeans.hpp @@ -14,9 +14,13 @@ * limitations under the License. */ +#include #include +#include #include +#include + namespace raft::cluster::kmeans::runtime { void update_centroids(raft::handle_t const& handle, @@ -41,6 +45,22 @@ void update_centroids(raft::handle_t const& handle, double* new_centroids, double* weight_per_cluster); +void fit(handle_t const& handle, + const KMeansParams& params, + raft::device_matrix_view X, + std::optional> sample_weight, + raft::device_matrix_view centroids, + raft::host_scalar_view inertia, + raft::host_scalar_view n_iter); + +void fit(handle_t const& handle, + const KMeansParams& params, + raft::device_matrix_view X, + std::optional> sample_weight, + raft::device_matrix_view centroids, + raft::host_scalar_view inertia, + raft::host_scalar_view n_iter); + void cluster_cost(raft::handle_t const& handle, const float* X, int n_samples, diff --git a/cpp/src/distance/kmeans_fit_double.cu b/cpp/src/distance/kmeans_fit_double.cu new file mode 100644 index 0000000000..cb5d6e8c8f --- /dev/null +++ b/cpp/src/distance/kmeans_fit_double.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace raft::cluster::kmeans::runtime { + +void fit(handle_t const& handle, + const KMeansParams& params, + raft::device_matrix_view X, + std::optional> sample_weight, + raft::device_matrix_view centroids, + raft::host_scalar_view inertia, + raft::host_scalar_view n_iter) +{ + raft::cluster::kmeans::fit( + handle, params, X, sample_weight, centroids, inertia, n_iter); +} +} // namespace raft::cluster::kmeans::runtime diff --git a/cpp/src/distance/kmeans_fit_float.cu b/cpp/src/distance/kmeans_fit_float.cu new file mode 100644 index 0000000000..c10234d705 --- /dev/null +++ b/cpp/src/distance/kmeans_fit_float.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace raft::cluster::kmeans::runtime { + +void fit(handle_t const& handle, + const KMeansParams& params, + raft::device_matrix_view X, + std::optional> sample_weight, + raft::device_matrix_view centroids, + raft::host_scalar_view inertia, + raft::host_scalar_view n_iter) +{ + raft::cluster::kmeans::fit( + handle, params, X, sample_weight, centroids, inertia, n_iter); +} +} // namespace raft::cluster::kmeans::runtime diff --git a/python/pylibraft/pylibraft/cluster/kmeans.pyx b/python/pylibraft/pylibraft/cluster/kmeans.pyx index 679523cef4..3432ebe0a0 100644 --- a/python/pylibraft/pylibraft/cluster/kmeans.pyx +++ b/python/pylibraft/pylibraft/cluster/kmeans.pyx @@ -22,20 +22,28 @@ import numpy as np from cython.operator cimport dereference as deref from libc.stdint cimport uintptr_t -from libcpp cimport bool, nullptr +from libcpp cimport nullptr -from pylibraft.common import Handle +from collections import namedtuple +from enum import IntEnum + +from pylibraft.common import Handle, cai_wrapper from pylibraft.common.handle import auto_sync_handle from pylibraft.common.handle cimport handle_t +from pylibraft.random.rng_state cimport RngState from pylibraft.common.input_validation import * from pylibraft.distance import DISTANCE_TYPES +from pylibraft.common.handle cimport handle_t +from pylibraft.cpp cimport kmeans as cpp_kmeans, kmeans_types from pylibraft.cpp.kmeans cimport ( cluster_cost as cpp_cluster_cost, update_centroids, ) +from pylibraft.cpp.mdspan cimport * +from pylibraft.cpp.optional cimport optional def is_c_cont(cai, dt): @@ -285,3 +293,238 @@ def cluster_cost(X, centroids, handle=None): return d_cost else: raise ValueError("dtype %s not supported" % x_dt) + + +class InitMethod(IntEnum): + """ Method for initializing kmeans """ + KMeansPlusPlus = kmeans_types.InitMethod.KMeansPlusPlus + Random = kmeans_types.InitMethod.Random + Array = kmeans_types.InitMethod.Array + + +cdef class KMeansParams: + """ Specifies hyper-parameters for the kmeans algorithm. + + Parameters + ---------- + n_clusters : int, optional + The number of clusters to form as well as the number of centroids + to generate + max_iter : int, optional + Maximum number of iterations of the k-means algorithm for a single run + tol : float, optional + Relative tolerance with regards to inertia to declare convergence + verbosity : int, optional + seed: int, optional + Seed to the random number generator. + metric : str, optional + Metric names to use for distance computation, see + :func:`pylibraft.distance.pairwise_distance` for valid values. + init : InitMethod, optional + n_init : int, optional + Number of instance k-means algorithm will be run with different seeds. + oversampling_factor : float, optional + Oversampling factor for use in the k-means algorithm + """ + cdef kmeans_types.KMeansParams c_obj + + def __init__(self, + n_clusters: Optional[int] = None, + max_iter: Optional[int] = None, + tol: Optional[float] = None, + verbosity: Optional[int] = None, + seed: Optional[int] = None, + metric: Optional[str] = None, + init: Optional[InitMethod] = None, + n_init: Optional[int] = None, + oversampling_factor: Optional[float] = None, + batch_samples: Optional[int] = None, + batch_centroids: Optional[int] = None, + inertia_check: Optional[bool] = None): + if n_clusters is not None: + self.c_obj.n_clusters = n_clusters + if max_iter is not None: + self.c_obj.max_iter = max_iter + if tol is not None: + self.c_obj.tol = tol + if verbosity is not None: + self.c_obj.verbosity = verbosity + if seed is not None: + self.c_obj.rng_state.seed = seed + if metric is not None: + distance = DISTANCE_TYPES.get(metric) + if distance is None: + valid_metrics = list(DISTANCE_TYPES.keys()) + raise ValueError(f"Unknown metric '{metric}'. Valid values " + f"are: {valid_metrics}") + self.c_obj.metric = distance + if init is not None: + self.c_obj.init = init + if n_init is not None: + self.c_obj.n_init = n_init + if oversampling_factor is not None: + self.c_obj.oversampling_factor = oversampling_factor + if batch_samples is not None: + self.c_obj.batch_samples = batch_samples + if batch_centroids is not None: + self.c_obj.batch_centroids = batch_centroids + if inertia_check is not None: + self.c_obj.inertia_check = inertia_check + + @property + def n_clusters(self): + return self.c_obj.n_clusters + + @property + def max_iter(self): + return self.c_obj.max_iter + + @property + def tol(self): + return self.c_obj.tol + + @property + def verbosity(self): + return self.c_obj.verbosity + + @property + def seed(self): + return self.c_obj.rng_state.seed + + @property + def init(self): + return InitMethod(self.c_obj.init) + + @property + def oversampling_factor(self): + return self.c_obj.oversampling_factor + + @property + def batch_samples(self): + return self.c_obj.batch_samples + + @property + def batch_centroids(self): + return self.c_obj.batch_centroids + + @property + def inertia_check(self): + return self.c_obj.inertia_check + +FitOutput = namedtuple("FitOutput", "centroids inertia n_iter") + + +@auto_sync_handle +def fit( + KMeansParams params, X, centroids=None, sample_weights=None, handle=None +): + """ + Find clusters with the k-means algorithm + + Parameters + ---------- + + params : KMeansParams + Parameters to use to fit KMeans model + X : Input CUDA array interface compliant matrix shape (m, k) + centroids : Optional writable CUDA array interface compliant matrix + shape (n_clusters, k) + sample_weights : Optional input CUDA array interface compliant matrix shape + (n_clusters, 1) default: None + {handle_docstring} + + Returns + ------- + centroids : raft.device_ndarray + The computed centroids for each cluster + inertia : float + Sum of squared distances of samples to their closest cluster center + n_iter : int + The number of iterations used to fit the model + + Examples + -------- + + .. code-block:: python + + import cupy as cp + + from pylibraft.cluster.kmeans import fit, KMeansParams + + n_samples = 5000 + n_features = 50 + n_clusters = 3 + + X = cp.random.random_sample((n_samples, n_features), + dtype=cp.float32) + + params = KMeansParams(n_clusters=n_clusters) + centroids, inertia, n_iter = fit(params, X) + """ + cdef handle_t *h = handle.getHandle() + + cdef float f_inertia = 0.0 + cdef double d_inertia = 0.0 + cdef int n_iter = 0 + + cdef optional[device_vector_view[const double, int]] d_sample_weights + cdef optional[device_vector_view[const float, int]] f_sample_weights + + X_cai = cai_wrapper(X) + dtype = X_cai.dtype + + if centroids is None: + centroids_shape = (params.n_clusters, X_cai.shape[1]) + centroids = device_ndarray.empty(centroids_shape, dtype=dtype) + centroids_cai = cai_wrapper(centroids) + + # validate inputs have are all c-contiguous, and have a consistent dtype + # and expected shape + X_cai.validate_shape_dtype(2) + centroids_cai.validate_shape_dtype(2, dtype) + if sample_weights is not None: + sample_weights_cai = cai_wrapper(sample_weights) + sample_weights_cai.validate_shape_dtype(1, dtype) + + if dtype == np.float64: + if sample_weights is not None: + d_sample_weights = make_device_vector_view( + sample_weights_cai.data, + sample_weights_cai.shape[0]) + + cpp_kmeans.fit( + deref(h), + params.c_obj, + make_device_matrix_view( + X_cai.data, + X_cai.shape[0], X_cai.shape[1]), + d_sample_weights, + make_device_matrix_view( + centroids_cai.data, + centroids_cai.shape[0], centroids_cai.shape[1]), + make_host_scalar_view[double, int](&d_inertia), + make_host_scalar_view[int, int](&n_iter)) + return FitOutput(centroids, d_inertia, n_iter) + + elif dtype == np.float32: + if sample_weights is not None: + f_sample_weights = make_device_vector_view( + sample_weights_cai.data, + sample_weights_cai.shape[0]) + + cpp_kmeans.fit( + deref(h), + params.c_obj, + make_device_matrix_view( + X_cai.data, + X_cai.shape[0], X_cai.shape[1]), + f_sample_weights, + make_device_matrix_view( + centroids_cai.data, + centroids_cai.shape[0], centroids_cai.shape[1]), + make_host_scalar_view[float, int](&f_inertia), + make_host_scalar_view[int, int](&n_iter)) + return FitOutput(centroids, f_inertia, n_iter) + + else: + raise ValueError(f"unhandled dtype {dtype}") diff --git a/python/pylibraft/pylibraft/common/cai_wrapper.py b/python/pylibraft/pylibraft/common/cai_wrapper.py index fdfc6b0b09..5851821f57 100644 --- a/python/pylibraft/pylibraft/common/cai_wrapper.py +++ b/python/pylibraft/pylibraft/common/cai_wrapper.py @@ -71,3 +71,19 @@ def data(self): Returns the data pointer of the underlying CUDA array interface """ return self.cai_["data"][0] + + def validate_shape_dtype(self, expected_dims=None, expected_dtype=None): + """Checks to see if the shape, dtype, and strides match expectations""" + if expected_dims is not None and len(self.shape) != expected_dims: + raise ValueError( + f"unexpected shape {self.shape} - " + f"expected {expected_dims} dimensions" + ) + + if expected_dtype is not None and self.dtype != expected_dtype: + raise ValueError( + f"invalid dtype {self.dtype}: expected " f"{expected_dtype}" + ) + + if not self.c_contiguous: + raise ValueError("input must be c-contiguous") diff --git a/python/pylibraft/pylibraft/cpp/kmeans.pxd b/python/pylibraft/pylibraft/cpp/kmeans.pxd index b263952522..9eb35bd71e 100644 --- a/python/pylibraft/pylibraft/cpp/kmeans.pxd +++ b/python/pylibraft/pylibraft/cpp/kmeans.pxd @@ -18,7 +18,16 @@ # cython: embedsignature = True # cython: language_level = 3 +import numpy as np + +from cython.operator cimport dereference as deref +from libc.stdint cimport uintptr_t +from libcpp cimport bool, nullptr + from pylibraft.common.handle cimport handle_t +from pylibraft.cpp.kmeans_types cimport KMeansParams +from pylibraft.cpp.mdspan cimport * +from pylibraft.cpp.optional cimport optional cdef extern from "raft_distance/kmeans.hpp" \ @@ -65,3 +74,21 @@ cdef extern from "raft_distance/kmeans.hpp" \ int n_clusters, const double * centroids, double * cost) except + + + cdef void fit( + const handle_t & handle, + const KMeansParams& params, + device_matrix_view[const float, int] X, + optional[device_vector_view[const float, int]] sample_weight, + device_matrix_view[float, int] inertia, + host_scalar_view[float, int] inertia, + host_scalar_view[int, int] n_iter) except + + + cdef void fit( + const handle_t & handle, + const KMeansParams& params, + device_matrix_view[const double, int] X, + optional[device_vector_view[const double, int]] sample_weight, + device_matrix_view[double, int] inertia, + host_scalar_view[double, int] inertia, + host_scalar_view[int, int] n_iter) except + diff --git a/python/pylibraft/pylibraft/cpp/kmeans_types.pxd b/python/pylibraft/pylibraft/cpp/kmeans_types.pxd new file mode 100644 index 0000000000..869d2cb5fd --- /dev/null +++ b/python/pylibraft/pylibraft/cpp/kmeans_types.pxd @@ -0,0 +1,44 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from libcpp cimport bool + +from pylibraft.distance.distance_type cimport DistanceType +from pylibraft.random.rng_state cimport RngState + + +cdef extern from "raft/cluster/kmeans_types.hpp" \ + namespace "raft::cluster::kmeans": + + ctypedef enum InitMethod 'raft::cluster::KMeansParams::InitMethod': + KMeansPlusPlus 'raft::cluster::kmeans::KMeansParams::InitMethod::KMeansPlusPlus' # noqa + Random 'raft::cluster::kmeans::KMeansParams::InitMethod::Random' + Array 'raft::cluster::kmeans::KMeansParams::InitMethod::Array' + + cdef cppclass KMeansParams: + KMeansParams() except + + int n_clusters + InitMethod init + int max_iter + double tol + int verbosity + RngState rng_state + DistanceType metric + int n_init + double oversampling_factor + int batch_samples + int batch_centroids + bool inertia_check diff --git a/python/pylibraft/pylibraft/cpp/mdspan.pxd b/python/pylibraft/pylibraft/cpp/mdspan.pxd new file mode 100644 index 0000000000..da6cac478d --- /dev/null +++ b/python/pylibraft/pylibraft/cpp/mdspan.pxd @@ -0,0 +1,43 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + + +cdef extern from "raft/core/device_mdspan.hpp" namespace "raft" nogil: + cdef cppclass device_vector_view[T, IndexType]: + pass + + cdef cppclass device_matrix_view[T, IndexType]: + pass + + cdef cppclass host_scalar_view[T, IndexType]: + pass + + cdef device_vector_view[T, IndexType] \ + make_device_vector_view[T, IndexType](T * ptr, + IndexType n) except + + + cdef device_matrix_view[T, IndexType] \ + make_device_matrix_view[T, IndexType](T * ptr, + IndexType rows, + IndexType cols) except + + + cdef host_scalar_view[T, IndexType] \ + make_host_scalar_view[T, IndexType](T * ptr) except + diff --git a/python/pylibraft/pylibraft/cpp/optional.pxd b/python/pylibraft/pylibraft/cpp/optional.pxd new file mode 100644 index 0000000000..a6dd8a2dcd --- /dev/null +++ b/python/pylibraft/pylibraft/cpp/optional.pxd @@ -0,0 +1,24 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# We're still using cython v0.29.x - which doesn't have std::optional +# support. Include the minimal definition here as suggested by +# https://github.com/cython/cython/issues/3293#issuecomment-1223058101 + +cdef extern from "" namespace "std" nogil: + cdef cppclass optional[T]: + optional() + optional& operator=[U](U&) diff --git a/python/pylibraft/pylibraft/test/test_kmeans.py b/python/pylibraft/pylibraft/test/test_kmeans.py index 44f60be310..e5e544d565 100644 --- a/python/pylibraft/pylibraft/test/test_kmeans.py +++ b/python/pylibraft/pylibraft/test/test_kmeans.py @@ -16,11 +16,42 @@ import numpy as np import pytest -from pylibraft.cluster.kmeans import cluster_cost, compute_new_centroids +from pylibraft.cluster.kmeans import ( + KMeansParams, + cluster_cost, + compute_new_centroids, + fit, +) from pylibraft.common import Handle, device_ndarray from pylibraft.distance import pairwise_distance +@pytest.mark.parametrize("n_rows", [100]) +@pytest.mark.parametrize("n_cols", [5, 25]) +@pytest.mark.parametrize("n_clusters", [5, 15]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_kmeans_fit(n_rows, n_cols, n_clusters, dtype): + # generate some random input points / centroids + X_host = np.random.random_sample((n_rows, n_cols)).astype(dtype) + centroids = device_ndarray(X_host[:n_clusters]) + X = device_ndarray(X_host) + + # compute the inertia, before fitting centroids + original_inertia = cluster_cost(X, centroids) + + params = KMeansParams(n_clusters=n_clusters, seed=42) + + # fit the centroids, make sure inertia has gone down + # TODO: once we have make_blobs exposed to python + # (https://github.com/rapidsai/raft/issues/1059) + # we should use that to test out the kmeans fit, like the C++ + # tests do right now + centroids, inertia, n_iter = fit(params, X, centroids) + assert inertia < original_inertia + assert n_iter >= 1 + assert np.allclose(cluster_cost(X, centroids), inertia, rtol=1e-6) + + @pytest.mark.parametrize("n_rows", [100]) @pytest.mark.parametrize("n_cols", [5, 25]) @pytest.mark.parametrize("n_clusters", [5, 15])