From b0c33a98f6018421ada16bdf8d21981ea67aaec8 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Mon, 14 Nov 2022 17:43:52 -0800 Subject: [PATCH 01/11] Add python bindings for kmeans fit As a first attempt at exposing the kmeans inertia - this adds python bindings for the kmeans::fit function, which returns the inertia as part of the results. This also adds support for accessing mdspan api's from cython directly --- cpp/CMakeLists.txt | 2 + cpp/include/raft_distance/kmeans.hpp | 21 ++- cpp/src/distance/kmeans_fit_double.cu | 33 ++++ cpp/src/distance/kmeans_fit_float.cu | 33 ++++ .../pylibraft/cluster/cpp_kmeans.pxd | 53 ++++++ python/pylibraft/pylibraft/cluster/kmeans.pyx | 166 +++++++++++++++++- .../pylibraft/cluster/kmeans_types.pxd | 34 ++++ .../pylibraft/pylibraft/common/CMakeLists.txt | 2 +- python/pylibraft/pylibraft/common/mdspan.pxd | 46 +++++ python/pylibraft/pylibraft/common/mdspan.pyx | 65 +++++++ .../pylibraft/pylibraft/common/optional.pxd | 40 +++++ .../pylibraft/pylibraft/test/test_kmeans.py | 18 +- 12 files changed, 508 insertions(+), 5 deletions(-) create mode 100644 cpp/src/distance/kmeans_fit_double.cu create mode 100644 cpp/src/distance/kmeans_fit_float.cu create mode 100644 python/pylibraft/pylibraft/cluster/cpp_kmeans.pxd create mode 100644 python/pylibraft/pylibraft/cluster/kmeans_types.pxd create mode 100644 python/pylibraft/pylibraft/common/mdspan.pxd create mode 100644 python/pylibraft/pylibraft/common/mdspan.pyx create mode 100644 python/pylibraft/pylibraft/common/optional.pxd diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index af08a1a2a4..21110f04a3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -285,6 +285,8 @@ if(RAFT_COMPILE_DIST_LIBRARY) src/distance/fused_l2_min_arg.cu src/distance/update_centroids_float.cu src/distance/update_centroids_double.cu + src/distance/kmeans_fit_float.cu + src/distance/kmeans_fit_double.cu src/distance/specializations/detail/canberra.cu src/distance/specializations/detail/chebyshev.cu 
src/distance/specializations/detail/correlation.cu diff --git a/cpp/include/raft_distance/kmeans.hpp b/cpp/include/raft_distance/kmeans.hpp index 19f92dd977..e160a3b230 100644 --- a/cpp/include/raft_distance/kmeans.hpp +++ b/cpp/include/raft_distance/kmeans.hpp @@ -14,9 +14,13 @@ * limitations under the License. */ +#include #include +#include #include +#include + namespace raft::cluster::kmeans::runtime { void update_centroids(raft::handle_t const& handle, @@ -41,4 +45,19 @@ void update_centroids(raft::handle_t const& handle, double* new_centroids, double* weight_per_cluster); -} // namespace raft::cluster::kmeans::runtime \ No newline at end of file +void fit(handle_t const& handle, + const KMeansParams& params, + raft::device_matrix_view X, + std::optional> sample_weight, + raft::device_matrix_view centroids, + raft::host_scalar_view inertia, + raft::host_scalar_view n_iter); + +void fit(handle_t const& handle, + const KMeansParams& params, + raft::device_matrix_view X, + std::optional> sample_weight, + raft::device_matrix_view centroids, + raft::host_scalar_view inertia, + raft::host_scalar_view n_iter); +} // namespace raft::cluster::kmeans::runtime diff --git a/cpp/src/distance/kmeans_fit_double.cu b/cpp/src/distance/kmeans_fit_double.cu new file mode 100644 index 0000000000..6d9a367d73 --- /dev/null +++ b/cpp/src/distance/kmeans_fit_double.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace raft::cluster::kmeans::runtime { + +void fit(handle_t const& handle, + const KMeansParams& params, + raft::device_matrix_view X, + std::optional> sample_weight, + raft::device_matrix_view centroids, + raft::host_scalar_view inertia, + raft::host_scalar_view n_iter) +{ + raft::cluster::kmeans::fit( + handle, params, X, sample_weight, centroids, inertia, n_iter); +} +} // namespace raft::cluster::kmeans::runtime diff --git a/cpp/src/distance/kmeans_fit_float.cu b/cpp/src/distance/kmeans_fit_float.cu new file mode 100644 index 0000000000..ff389f6886 --- /dev/null +++ b/cpp/src/distance/kmeans_fit_float.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +namespace raft::cluster::kmeans::runtime { + +void fit(handle_t const& handle, + const KMeansParams& params, + raft::device_matrix_view X, + std::optional> sample_weight, + raft::device_matrix_view centroids, + raft::host_scalar_view inertia, + raft::host_scalar_view n_iter) +{ + raft::cluster::kmeans::fit( + handle, params, X, sample_weight, centroids, inertia, n_iter); +} +} // namespace raft::cluster::kmeans::runtime diff --git a/python/pylibraft/pylibraft/cluster/cpp_kmeans.pxd b/python/pylibraft/pylibraft/cluster/cpp_kmeans.pxd new file mode 100644 index 0000000000..1f651d2ad4 --- /dev/null +++ b/python/pylibraft/pylibraft/cluster/cpp_kmeans.pxd @@ -0,0 +1,53 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +import numpy as np + +from cython.operator cimport dereference as deref +from libc.stdint cimport uintptr_t +from libcpp cimport bool, nullptr + +from pylibraft.common.handle cimport handle_t +from pylibraft.common.optional cimport optional +from pylibraft.common.mdspan cimport * + +cimport pylibraft.cluster.kmeans_types as kmeans_types + + +cdef extern from "raft_distance/kmeans.hpp" \ + namespace "raft::cluster::kmeans::runtime" nogil: + + cdef void fit( + const handle_t & handle, + const kmeans_types.KMeansParams& params, + device_matrix_view[const float, int] X, + optional[device_vector_view[const float, int]] sample_weight, + device_matrix_view[float, int] inertia, + host_scalar_view[float, int] inertia, + host_scalar_view[int, int] n_iter) except + + + cdef void fit( + const handle_t & handle, + const kmeans_types.KMeansParams& params, + device_matrix_view[const double, int] X, + optional[device_vector_view[const double, int]] sample_weight, + device_matrix_view[double, int] inertia, + host_scalar_view[double, int] inertia, + host_scalar_view[int, int] n_iter) except + diff --git a/python/pylibraft/pylibraft/cluster/kmeans.pyx b/python/pylibraft/pylibraft/cluster/kmeans.pyx index 732a78585d..390cd3c5dd 100644 --- a/python/pylibraft/pylibraft/cluster/kmeans.pyx +++ b/python/pylibraft/pylibraft/cluster/kmeans.pyx @@ -17,16 +17,23 @@ # distutils: language = c++ # cython: embedsignature = True # cython: language_level = 3 +from collections import namedtuple import numpy as np from cython.operator cimport dereference as deref from libc.stdint cimport uintptr_t -from libcpp cimport bool, nullptr +from libcpp cimport nullptr -from pylibraft.common import Handle +from enum import IntEnum + +from pylibraft.common import Handle, device_ndarray from pylibraft.common.handle import auto_sync_handle + +from pylibraft.cluster cimport cpp_kmeans, 
kmeans_types from pylibraft.common.handle cimport handle_t +from pylibraft.common.mdspan cimport * +from pylibraft.common.optional cimport optional from pylibraft.common.input_validation import * from pylibraft.distance import DISTANCE_TYPES @@ -220,3 +227,158 @@ def compute_new_centroids(X, weight_per_cluster_ptr) else: raise ValueError("dtype %s not supported" % x_dt) + + +class InitMethod(IntEnum): + KMeansPlusPlus = kmeans_types.InitMethod.KMeansPlusPlus + Random = kmeans_types.InitMethod.Random + Array = kmeans_types.InitMethod.Array + + +cdef class KMeansParams: + cdef kmeans_types.KMeansParams c_obj + + def __init__(self, + n_clusters: Optional[int] = None, + max_iter: Optional[int] = None, + tol: Optional[float] = None, + init: Optional[InitMethod] = None, + n_init: Optional[int] = None, + oversampling_factor: Optional[float] = None, + batch_samples: Optional[int] = None, + batch_centroids: Optional[int] = None, + inertia_check: Optional[bool] = None): + if n_clusters is not None: + self.c_obj.n_clusters = n_clusters + if max_iter is not None: + self.c_obj.max_iter = max_iter + if tol is not None: + self.c_obj.tol = tol + if init is not None: + self.c_obj.init = init + if n_init is not None: + self.c_obj.n_init = n_init + if oversampling_factor is not None: + self.c_obj.oversampling_factor = oversampling_factor + if batch_samples is not None: + self.c_obj.batch_samples = batch_samples + if batch_centroids is not None: + self.c_obj.batch_centroids = batch_centroids + if inertia_check is not None: + self.c_obj.inertia_check = inertia_check + + # TODO: distance metric/ verbosity level (?) 
/ rng state + + @property + def n_clusters(self): + return self.c_obj.n_clusters + + @property + def max_iter(self): + return self.c_obj.max_iter + + @property + def tol(self): + return self.c_obj.tol + + @property + def init(self): + return InitMethod(self.c_obj.init) + + @property + def oversampling_factor(self): + return self.c_obj.oversampling_factor + + @property + def batch_samples(self): + return self.c_obj.batch_samples + + @property + def batch_centroids(self): + return self.c_obj.batch_centroids + + @property + def inertia_check(self): + return self.c_obj.inertia_check + +KMeansOutput = namedtuple("FitOutput", "centroids inertia n_iter") + + +@auto_sync_handle +def fit( + KMeansParams params, X, centroids=None, sample_weights=None, handle=None +): + + """ + Fit kmeans + + Parameters + ---------- + + X : Input CUDA array interface compliant matrix shape (m, k) + centroids : Optional writable CUDA array interface compliant matrix + shape (n_clusters, k) + sample_weights : Optional input CUDA array interface compliant matrix shape + (n_clusters, 1) default: None + {handle_docstring} + + Examples + -------- + + .. 
code-block:: python + + import cupy as cp + + from pylibraft.cluster.kmeans import fit, KMeansParams + + n_samples = 5000 + n_features = 50 + n_clusters = 3 + + X = cp.random.random_sample((n_samples, n_features), + dtype=cp.float32) + + params = KMeansParams(n_clusters=n_clusters) + centroids, inertia, n_iter = fit(params, X) + """ + cdef handle_t *h = handle.getHandle() + + x_cai = X.__cuda_array_interface__ + dtype = np.dtype(x_cai["typestr"]) + + cdef float f_inertia = 0.0 + cdef double d_inertia = 0.0 + cdef int n_iter = 0 + + # TODO: convert sampleweights (when != None) to device_vector_view + cdef optional[device_vector_view[const double, int]] d_sample_weights + cdef optional[device_vector_view[const float, int]] f_sample_weights + + if centroids is None: + centroids_shape = (params.n_clusters, x_cai["shape"][1]) + centroids = device_ndarray.empty(centroids_shape, dtype=dtype) + + if dtype == np.float64: + cpp_kmeans.fit( + deref(h), + params.c_obj, + const_device_matrix_view_from_array[double](X, NULL), + d_sample_weights, + device_matrix_view_from_array[double](centroids, NULL), + make_host_scalar_view[double, int](&d_inertia), + make_host_scalar_view[int, int](&n_iter)) + return KMeansOutput(centroids, d_inertia, n_iter) + + elif dtype == np.float32: + cpp_kmeans.fit( + deref(h), + params.c_obj, + const_device_matrix_view_from_array[float](X, NULL), + f_sample_weights, + device_matrix_view_from_array[float](centroids, NULL), + make_host_scalar_view[float, int](&f_inertia), + make_host_scalar_view[int, int](&n_iter)) + return KMeansOutput(centroids, f_inertia, n_iter) + + else: + raise ValueError(f"unhandled dtype {dtype}") diff --git a/python/pylibraft/pylibraft/cluster/kmeans_types.pxd b/python/pylibraft/pylibraft/cluster/kmeans_types.pxd new file mode 100644 index 0000000000..d6f1c2c45a --- /dev/null +++ b/python/pylibraft/pylibraft/cluster/kmeans_types.pxd @@ -0,0 +1,34 @@ +# TODO: expose this function from raft +# cdef void kmeans_fit[ElementType, 
IndexType]( +# const handle_t & handle, +# const KMeansParams& params, +# device_matrix_view[const ElementType, IndexType] X, +# optional[device_vector_view[const ElementType, IndexType]] sample_weight, +# device_matrix_view[ElementType, IndexType] inertia, +# host_scalar_view[ElementType] inertia, +# host_scalar_view[IndexType] n_iter) except + +from libcpp cimport bool + +from pylibraft.random.rng_state cimport RngState + +cdef extern from "raft/cluster/kmeans_types.hpp" \ + namespace "raft::cluster::kmeans": + + ctypedef enum InitMethod 'raft::cluster::KMeansParams::InitMethod': + KMeansPlusPlus 'raft::cluster::kmeans::KMeansParams::InitMethod::KMeansPlusPlus' + Random 'raft::cluster::kmeans::KMeansParams::InitMethod::Random' + Array 'raft::cluster::kmeans::KMeansParams::InitMethod::Array' + + cdef cppclass KMeansParams: + KMeansParams() except + + int n_clusters + InitMethod init + int max_iter + double tol + int verbosity + RngState rng_state + int n_init + double oversampling_factor + int batch_samples + int batch_centroids + bool inertia_check diff --git a/python/pylibraft/pylibraft/common/CMakeLists.txt b/python/pylibraft/pylibraft/common/CMakeLists.txt index 3b49cef429..6eb31ef5e2 100644 --- a/python/pylibraft/pylibraft/common/CMakeLists.txt +++ b/python/pylibraft/pylibraft/common/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= # Set the list of Cython files to build -set(cython_sources cuda.pyx handle.pyx interruptible.pyx) +set(cython_sources cuda.pyx handle.pyx interruptible.pyx mdspan.pyx) set(linked_libraries raft::raft) # Build all of the Cython targets diff --git a/python/pylibraft/pylibraft/common/mdspan.pxd b/python/pylibraft/pylibraft/common/mdspan.pxd new file mode 100644 index 0000000000..fe8b7c93c6 --- /dev/null +++ b/python/pylibraft/pylibraft/common/mdspan.pxd @@ -0,0 +1,46 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + + +cdef extern from "raft/core/device_mdspan.hpp" namespace "raft" nogil: + cdef cppclass device_vector_view[ElementType, IndexType]: + pass + + cdef cppclass device_matrix_view[ElementType, IndexType]: + pass + + cdef cppclass host_scalar_view[ElementType, IndexType]: + pass + + cdef device_vector_view[ElementType, IndexType] make_device_vector_view[ElementType, IndexType](ElementType * ptr, IndexType n) except + + + cdef device_matrix_view[ElementType, IndexType] make_device_matrix_view[ElementType, IndexType](ElementType * ptr, IndexType rows, IndexType cols) except + + + cdef host_scalar_view[ElementType, IndexType] make_host_scalar_view[ElementType, IndexType](ElementType * ptr) except + + + +ctypedef fused ElementType: + float + double + + +cdef device_matrix_view[ElementType, int] device_matrix_view_from_array(arr, ElementType * p) except + +cdef device_matrix_view[const ElementType, int] const_device_matrix_view_from_array(arr, ElementType * p) except + diff --git a/python/pylibraft/pylibraft/common/mdspan.pyx b/python/pylibraft/pylibraft/common/mdspan.pyx new file mode 100644 index 0000000000..4fb0b64594 --- /dev/null +++ b/python/pylibraft/pylibraft/common/mdspan.pyx @@ -0,0 +1,65 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t +import numpy as np + +from pylibraft.common.input_validation import is_c_contiguous + +from pylibraft.common.mdspan cimport * + + + +cdef _validate_array_interface(cai, expected_shape, ElementType * p) except +: + """ checks an array interface dictionary to see if the shape, dtype, and strides + match expectations """ + shape = cai["shape"] + if len(shape) != expected_shape: + raise ValueError(f"unexpected shape {shape} - expected {expected_shape} elements") + + dt = np.dtype(cai["typestr"]) + if dt.itemsize != sizeof(ElementType): + raise ValueError(f"invalid dtype {dt}: has itemsize {dt.itemsize} but function expects {sizeof(ElementType)}") + + if not is_c_contiguous(cai, dt): + raise ValueError("input must be c-contiguous") + + +cdef device_matrix_view[ElementType, int] device_matrix_view_from_array(arr, ElementType * p) except +: + """ Transform an CAI array to a device_matrix_view """ + # need to have the ElementType as one of the params, otherwise this crashes cython compiler =( + cai = arr.__cuda_array_interface__ + _validate_array_interface(cai, 2, p) + rows, cols = cai["shape"] + ptr = cai["data"][0] + return make_device_matrix_view(ptr, rows, cols) + + +cdef device_matrix_view[const ElementType, int] const_device_matrix_view_from_array(arr, ElementType * 
p) except +: + """ Transform an CAI array to a device_matrix_view with a const ElementType""" + # I couldn't make cython accept a FusedType that distiguishes between a const/non-const + # ElementType - meaning that we have some duplicated logic from the device_matrix_view_from_array + # function here + cai = arr.__cuda_array_interface__ + _validate_array_interface(cai, 2, p) + rows, cols = cai["shape"] + ptr = cai["data"][0] + return make_device_matrix_view(ptr, rows, cols) diff --git a/python/pylibraft/pylibraft/common/optional.pxd b/python/pylibraft/pylibraft/common/optional.pxd new file mode 100644 index 0000000000..53ec4cc592 --- /dev/null +++ b/python/pylibraft/pylibraft/common/optional.pxd @@ -0,0 +1,40 @@ +# polyfill for libcpp.optional +# +# We're still using cython v0.29.x - which doesn't have std::optional +# support. Include this definition here as suggested by +# https://github.com/cython/cython/issues/3293#issuecomment-1223058101 + +from libcpp cimport bool + +cdef extern from "" namespace "std" nogil: + cdef cppclass nullopt_t: + nullopt_t() + + cdef nullopt_t nullopt + + cdef cppclass optional[T]: + ctypedef T value_type + optional() + optional(nullopt_t) + optional(optional&) except + + optional(T&) except + + bool has_value() + T& value() + T& value_or[U](U& default_value) + void swap(optional&) + void reset() + T& emplace(...) + T& operator*() + #T* operator->() # Not Supported + optional& operator=(optional&) + optional& operator=[U](U&) + bool operator bool() + bool operator!() + bool operator==[U](optional&, U&) + bool operator!=[U](optional&, U&) + bool operator<[U](optional&, U&) + bool operator>[U](optional&, U&) + bool operator<=[U](optional&, U&) + bool operator>=[U](optional&, U&) + + optional[T] make_optional[T](...) 
except + diff --git a/python/pylibraft/pylibraft/test/test_kmeans.py b/python/pylibraft/pylibraft/test/test_kmeans.py index 58028e90e8..1b6203afa9 100644 --- a/python/pylibraft/pylibraft/test/test_kmeans.py +++ b/python/pylibraft/pylibraft/test/test_kmeans.py @@ -16,11 +16,27 @@ import numpy as np import pytest -from pylibraft.cluster.kmeans import compute_new_centroids +from pylibraft.cluster.kmeans import KMeansParams, compute_new_centroids, fit from pylibraft.common import Handle, device_ndarray from pylibraft.distance import pairwise_distance +@pytest.mark.parametrize("n_rows", [100]) +@pytest.mark.parametrize("n_cols", [5, 25]) +@pytest.mark.parametrize("n_clusters", [5, 15]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_kmeans_fit(n_rows, n_cols, n_clusters, dtype): + X = np.random.random_sample((n_rows, n_cols)).astype(dtype) + + # TODO: test out some different options on this + # TODO: use a fixed RNG state on params + params = KMeansParams(n_clusters=n_clusters) + + centroids, inertia, n_iter = fit(params, device_ndarray(X)) + + # TODO: validate that centroids are reasonable ... 
somehow + + @pytest.mark.parametrize("n_rows", [100]) @pytest.mark.parametrize("n_cols", [5, 25]) @pytest.mark.parametrize("n_clusters", [5, 15]) From efb0b3047a99c4e98ef73e1d1a19094179be7aec Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Tue, 15 Nov 2022 13:13:38 -0800 Subject: [PATCH 02/11] style --- .../pylibraft/cluster/cpp_kmeans.pxd | 5 ++- .../pylibraft/cluster/kmeans_types.pxd | 28 +++++++++------ python/pylibraft/pylibraft/common/mdspan.pxd | 20 +++++++---- python/pylibraft/pylibraft/common/mdspan.pyx | 35 +++++++++++-------- .../pylibraft/pylibraft/common/optional.pxd | 34 +++++++++--------- 5 files changed, 71 insertions(+), 51 deletions(-) diff --git a/python/pylibraft/pylibraft/cluster/cpp_kmeans.pxd b/python/pylibraft/pylibraft/cluster/cpp_kmeans.pxd index 1f651d2ad4..14cd1ebc30 100644 --- a/python/pylibraft/pylibraft/cluster/cpp_kmeans.pxd +++ b/python/pylibraft/pylibraft/cluster/cpp_kmeans.pxd @@ -24,11 +24,10 @@ from cython.operator cimport dereference as deref from libc.stdint cimport uintptr_t from libcpp cimport bool, nullptr +cimport pylibraft.cluster.kmeans_types as kmeans_types from pylibraft.common.handle cimport handle_t -from pylibraft.common.optional cimport optional from pylibraft.common.mdspan cimport * - -cimport pylibraft.cluster.kmeans_types as kmeans_types +from pylibraft.common.optional cimport optional cdef extern from "raft_distance/kmeans.hpp" \ diff --git a/python/pylibraft/pylibraft/cluster/kmeans_types.pxd b/python/pylibraft/pylibraft/cluster/kmeans_types.pxd index d6f1c2c45a..e5a7fcebc9 100644 --- a/python/pylibraft/pylibraft/cluster/kmeans_types.pxd +++ b/python/pylibraft/pylibraft/cluster/kmeans_types.pxd @@ -1,21 +1,29 @@ -# TODO: expose this function from raft -# cdef void kmeans_fit[ElementType, IndexType]( -# const handle_t & handle, -# const KMeansParams& params, -# device_matrix_view[const ElementType, IndexType] X, -# optional[device_vector_view[const ElementType, IndexType]] sample_weight, -# 
device_matrix_view[ElementType, IndexType] inertia, -# host_scalar_view[ElementType] inertia, -# host_scalar_view[IndexType] n_iter) except + +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + from libcpp cimport bool from pylibraft.random.rng_state cimport RngState + cdef extern from "raft/cluster/kmeans_types.hpp" \ namespace "raft::cluster::kmeans": ctypedef enum InitMethod 'raft::cluster::KMeansParams::InitMethod': - KMeansPlusPlus 'raft::cluster::kmeans::KMeansParams::InitMethod::KMeansPlusPlus' + KMeansPlusPlus 'raft::cluster::kmeans::KMeansParams::InitMethod::KMeansPlusPlus' # noqa Random 'raft::cluster::kmeans::KMeansParams::InitMethod::Random' Array 'raft::cluster::kmeans::KMeansParams::InitMethod::Array' diff --git a/python/pylibraft/pylibraft/common/mdspan.pxd b/python/pylibraft/pylibraft/common/mdspan.pxd index fe8b7c93c6..188fab3da6 100644 --- a/python/pylibraft/pylibraft/common/mdspan.pxd +++ b/python/pylibraft/pylibraft/common/mdspan.pxd @@ -30,17 +30,25 @@ cdef extern from "raft/core/device_mdspan.hpp" namespace "raft" nogil: cdef cppclass host_scalar_view[ElementType, IndexType]: pass - cdef device_vector_view[ElementType, IndexType] make_device_vector_view[ElementType, IndexType](ElementType * ptr, IndexType n) except + - - cdef device_matrix_view[ElementType, IndexType] make_device_matrix_view[ElementType, IndexType](ElementType * ptr, IndexType rows, IndexType cols) except + + cdef 
device_vector_view[ElementType, IndexType] + make_device_vector_view[ElementType, IndexType]( + ElementType * ptr, IndexType n + ) except + - cdef host_scalar_view[ElementType, IndexType] make_host_scalar_view[ElementType, IndexType](ElementType * ptr) except + + cdef device_matrix_view[ElementType, IndexType] + make_device_matrix_view[ElementType, IndexType]( + ElementType * ptr, IndexType rows, IndexType cols + ) except + + cdef host_scalar_view[ElementType, IndexType] + make_host_scalar_view[ElementType, IndexType](ElementType * ptr) except + ctypedef fused ElementType: float double +cdef device_matrix_view[ElementType, int] +device_matrix_view_from_array(arr, ElementType * p) except + -cdef device_matrix_view[ElementType, int] device_matrix_view_from_array(arr, ElementType * p) except + -cdef device_matrix_view[const ElementType, int] const_device_matrix_view_from_array(arr, ElementType * p) except + +cdef device_matrix_view[const ElementType, int] +const_device_matrix_view_from_array(arr, ElementType * p) except + diff --git a/python/pylibraft/pylibraft/common/mdspan.pyx b/python/pylibraft/pylibraft/common/mdspan.pyx index 4fb0b64594..ced7497856 100644 --- a/python/pylibraft/pylibraft/common/mdspan.pyx +++ b/python/pylibraft/pylibraft/common/mdspan.pyx @@ -20,6 +20,7 @@ # cython: language_level = 3 from libc.stdint cimport uintptr_t + import numpy as np from pylibraft.common.input_validation import is_c_contiguous @@ -27,25 +28,29 @@ from pylibraft.common.input_validation import is_c_contiguous from pylibraft.common.mdspan cimport * - cdef _validate_array_interface(cai, expected_shape, ElementType * p) except +: - """ checks an array interface dictionary to see if the shape, dtype, and strides - match expectations """ + """ checks an array interface dictionary to see if the shape, dtype, and + strides match expectations """ shape = cai["shape"] if len(shape) != expected_shape: - raise ValueError(f"unexpected shape {shape} - expected {expected_shape} elements") 
+ raise ValueError(f"unexpected shape {shape} - " + f"expected {expected_shape} elements") dt = np.dtype(cai["typestr"]) if dt.itemsize != sizeof(ElementType): - raise ValueError(f"invalid dtype {dt}: has itemsize {dt.itemsize} but function expects {sizeof(ElementType)}") + raise ValueError(f"invalid dtype {dt}: has itemsize {dt.itemsize} but" + f" function expects {sizeof(ElementType)}") if not is_c_contiguous(cai, dt): raise ValueError("input must be c-contiguous") -cdef device_matrix_view[ElementType, int] device_matrix_view_from_array(arr, ElementType * p) except +: - """ Transform an CAI array to a device_matrix_view """ - # need to have the ElementType as one of the params, otherwise this crashes cython compiler =( +cdef device_matrix_view[ElementType, int] device_matrix_view_from_array( + arr, ElementType * p +) except +: + """ Transform a CAI array to a device_matrix_view """ + # need to have the ElementType as one of the parameters, otherwise this + # crashes the cython compiler =( cai = arr.__cuda_array_interface__ _validate_array_interface(cai, 2, p) rows, cols = cai["shape"] @@ -53,13 +58,15 @@ cdef device_matrix_view[ElementType, int] device_matrix_view_from_array(arr, Ele return make_device_matrix_view(ptr, rows, cols) -cdef device_matrix_view[const ElementType, int] const_device_matrix_view_from_array(arr, ElementType * p) except +: - """ Transform an CAI array to a device_matrix_view with a const ElementType""" - # I couldn't make cython accept a FusedType that distiguishes between a const/non-const - # ElementType - meaning that we have some duplicated logic from the device_matrix_view_from_array - # function here +cdef device_matrix_view[const ElementType, int] +const_device_matrix_view_from_array(arr, ElementType * p) except +: + """ Transform a CAI array to a device_matrix_view with a const element""" + # I couldn't make cython accept a FusedType that distiguishes between a + # const/non-const ElementType - meaning that we have some duplicated + 
# logic from the device_matrix_view_from_array function here cai = arr.__cuda_array_interface__ _validate_array_interface(cai, 2, p) rows, cols = cai["shape"] ptr = cai["data"][0] - return make_device_matrix_view(ptr, rows, cols) + return make_device_matrix_view(ptr, + rows, cols) diff --git a/python/pylibraft/pylibraft/common/optional.pxd b/python/pylibraft/pylibraft/common/optional.pxd index 53ec4cc592..269620d946 100644 --- a/python/pylibraft/pylibraft/common/optional.pxd +++ b/python/pylibraft/pylibraft/common/optional.pxd @@ -1,3 +1,18 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # polyfill for libcpp.optional # # We're still using cython v0.29.x - which doesn't have std::optional @@ -6,6 +21,7 @@ from libcpp cimport bool + cdef extern from "" namespace "std" nogil: cdef cppclass nullopt_t: nullopt_t() @@ -18,23 +34,5 @@ cdef extern from "" namespace "std" nogil: optional(nullopt_t) optional(optional&) except + optional(T&) except + - bool has_value() - T& value() - T& value_or[U](U& default_value) - void swap(optional&) - void reset() - T& emplace(...) 
- T& operator*() - #T* operator->() # Not Supported - optional& operator=(optional&) - optional& operator=[U](U&) - bool operator bool() - bool operator!() - bool operator==[U](optional&, U&) - bool operator!=[U](optional&, U&) - bool operator<[U](optional&, U&) - bool operator>[U](optional&, U&) - bool operator<=[U](optional&, U&) - bool operator>=[U](optional&, U&) optional[T] make_optional[T](...) except + From 31d232c7271e18b8e00aad63ab2dcbb6cc08f17f Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Tue, 15 Nov 2022 14:09:08 -0800 Subject: [PATCH 03/11] fix --- python/pylibraft/pylibraft/common/mdspan.pxd | 33 ++++++++++---------- python/pylibraft/pylibraft/common/mdspan.pyx | 4 +-- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/python/pylibraft/pylibraft/common/mdspan.pxd b/python/pylibraft/pylibraft/common/mdspan.pxd index 188fab3da6..a37f836341 100644 --- a/python/pylibraft/pylibraft/common/mdspan.pxd +++ b/python/pylibraft/pylibraft/common/mdspan.pxd @@ -21,34 +21,33 @@ cdef extern from "raft/core/device_mdspan.hpp" namespace "raft" nogil: - cdef cppclass device_vector_view[ElementType, IndexType]: + cdef cppclass device_vector_view[T, IndexType]: pass - cdef cppclass device_matrix_view[ElementType, IndexType]: + cdef cppclass device_matrix_view[T, IndexType]: pass - cdef cppclass host_scalar_view[ElementType, IndexType]: + cdef cppclass host_scalar_view[T, IndexType]: pass - cdef device_vector_view[ElementType, IndexType] - make_device_vector_view[ElementType, IndexType]( - ElementType * ptr, IndexType n - ) except + + cdef device_vector_view[T, IndexType] \ + make_device_vector_view[T, IndexType](T * ptr, + IndexType n) except + - cdef device_matrix_view[ElementType, IndexType] - make_device_matrix_view[ElementType, IndexType]( - ElementType * ptr, IndexType rows, IndexType cols - ) except + + cdef device_matrix_view[T, IndexType] \ + make_device_matrix_view[T, IndexType](T * ptr, + IndexType rows, + IndexType cols) except + - cdef 
host_scalar_view[ElementType, IndexType] - make_host_scalar_view[ElementType, IndexType](ElementType * ptr) except + + cdef host_scalar_view[T, IndexType] \ + make_host_scalar_view[T, IndexType](T * ptr) except + ctypedef fused ElementType: float double -cdef device_matrix_view[ElementType, int] -device_matrix_view_from_array(arr, ElementType * p) except + +cdef device_matrix_view[ElementType, int] \ + device_matrix_view_from_array(arr, ElementType * p) except + -cdef device_matrix_view[const ElementType, int] -const_device_matrix_view_from_array(arr, ElementType * p) except + +cdef device_matrix_view[const ElementType, int] \ + const_device_matrix_view_from_array(arr, ElementType * p) except + diff --git a/python/pylibraft/pylibraft/common/mdspan.pyx b/python/pylibraft/pylibraft/common/mdspan.pyx index ced7497856..5cdbaad3bb 100644 --- a/python/pylibraft/pylibraft/common/mdspan.pyx +++ b/python/pylibraft/pylibraft/common/mdspan.pyx @@ -58,8 +58,8 @@ cdef device_matrix_view[ElementType, int] device_matrix_view_from_array( return make_device_matrix_view(ptr, rows, cols) -cdef device_matrix_view[const ElementType, int] -const_device_matrix_view_from_array(arr, ElementType * p) except +: +cdef device_matrix_view[const ElementType, int] \ + const_device_matrix_view_from_array(arr, ElementType * p) except +: """ Transform a CAI array to a device_matrix_view with a const element""" # I couldn't make cython accept a FusedType that distiguishes between a # const/non-const ElementType - meaning that we have some duplicated From 6d26221da73f07514f8157598d00f7e379f8983e Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Tue, 15 Nov 2022 14:56:17 -0800 Subject: [PATCH 04/11] pass sample_weights --- python/pylibraft/pylibraft/cluster/kmeans.pyx | 11 +++++++++- python/pylibraft/pylibraft/common/mdspan.pxd | 6 ++++++ python/pylibraft/pylibraft/common/mdspan.pyx | 21 +++++++++++++++++++ .../pylibraft/pylibraft/common/optional.pxd | 19 +++-------------- 4 files changed, 40 
insertions(+), 17 deletions(-) diff --git a/python/pylibraft/pylibraft/cluster/kmeans.pyx b/python/pylibraft/pylibraft/cluster/kmeans.pyx index 390cd3c5dd..fd058b6835 100644 --- a/python/pylibraft/pylibraft/cluster/kmeans.pyx +++ b/python/pylibraft/pylibraft/cluster/kmeans.pyx @@ -38,6 +38,8 @@ from pylibraft.common.optional cimport optional from pylibraft.common.input_validation import * from pylibraft.distance import DISTANCE_TYPES +ctypedef const float const_float + def is_c_cont(cai, dt): return "strides" not in cai or \ @@ -350,7 +352,6 @@ def fit( cdef double d_inertia = 0.0 cdef int n_iter = 0 - # TODO: convert sampleweights (when != None) to device_vector_view cdef optional[device_vector_view[const double, int]] d_sample_weights cdef optional[device_vector_view[const float, int]] f_sample_weights @@ -359,6 +360,10 @@ def fit( centroids = device_ndarray.empty(centroids_shape, dtype=dtype) if dtype == np.float64: + if sample_weights is not None: + d_sample_weights = const_device_vector_view_from_array[double]( + sample_weights, NULL) + cpp_kmeans.fit( deref(h), params.c_obj, @@ -370,6 +375,10 @@ def fit( return KMeansOutput(centroids, d_inertia, n_iter) elif dtype == np.float32: + if sample_weights is not None: + f_sample_weights = const_device_vector_view_from_array[float]( + sample_weights, NULL) + cpp_kmeans.fit( deref(h), params.c_obj, diff --git a/python/pylibraft/pylibraft/common/mdspan.pxd b/python/pylibraft/pylibraft/common/mdspan.pxd index a37f836341..be6757c59f 100644 --- a/python/pylibraft/pylibraft/common/mdspan.pxd +++ b/python/pylibraft/pylibraft/common/mdspan.pxd @@ -51,3 +51,9 @@ cdef device_matrix_view[ElementType, int] \ cdef device_matrix_view[const ElementType, int] \ const_device_matrix_view_from_array(arr, ElementType * p) except + + +cdef device_vector_view[ElementType, int] \ + device_vector_view_from_array(arr, ElementType * p) except + + +cdef device_vector_view[const ElementType, int] \ + const_device_vector_view_from_array(arr, 
ElementType * p) except + diff --git a/python/pylibraft/pylibraft/common/mdspan.pyx b/python/pylibraft/pylibraft/common/mdspan.pyx index 5cdbaad3bb..907da00e7d 100644 --- a/python/pylibraft/pylibraft/common/mdspan.pyx +++ b/python/pylibraft/pylibraft/common/mdspan.pyx @@ -70,3 +70,24 @@ cdef device_matrix_view[const ElementType, int] \ ptr = cai["data"][0] return make_device_matrix_view(ptr, rows, cols) + + +cdef device_vector_view[ElementType, int] \ + device_vector_view_from_array(arr, ElementType * p) except +: + """ Transform a CAI array to a device_vector_view """ + cai = arr.__cuda_array_interface__ + _validate_array_interface(cai, 1, p) + elements, = cai["shape"] + ptr = cai["data"][0] + return make_device_vector_view(ptr, cai["shape"][0]) + + +cdef device_vector_view[const ElementType, int] \ + const_device_vector_view_from_array(arr, ElementType * p) except +: + """ Transform a CAI array to a device_vector_view with a const element""" + cai = arr.__cuda_array_interface__ + _validate_array_interface(cai, 1, p) + elements, = cai["shape"] + ptr = cai["data"][0] + return make_device_vector_view(ptr, + cai["shape"][0]) diff --git a/python/pylibraft/pylibraft/common/optional.pxd b/python/pylibraft/pylibraft/common/optional.pxd index 269620d946..9811304231 100644 --- a/python/pylibraft/pylibraft/common/optional.pxd +++ b/python/pylibraft/pylibraft/common/optional.pxd @@ -13,26 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -# polyfill for libcpp.optional + # # We're still using cython v0.29.x - which doesn't have std::optional -# support. Include this definition here as suggested by +# support. 
Include the minimal definition here as suggested by # https://github.com/cython/cython/issues/3293#issuecomment-1223058101 -from libcpp cimport bool - - cdef extern from "" namespace "std" nogil: - cdef cppclass nullopt_t: - nullopt_t() - - cdef nullopt_t nullopt - cdef cppclass optional[T]: - ctypedef T value_type optional() - optional(nullopt_t) - optional(optional&) except + - optional(T&) except + - - optional[T] make_optional[T](...) except + + optional& operator=[U](U&) From 3183345d38445190dc1727628e7c400ac61d9533 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Fri, 18 Nov 2022 14:52:51 -0800 Subject: [PATCH 05/11] Use make_device_matrix_view directly Rather than wrap in a function using FusedTypes, use make_device_matrix_view directly --- python/pylibraft/pylibraft/cluster/kmeans.pyx | 47 +++++++--- .../pylibraft/pylibraft/common/CMakeLists.txt | 2 +- .../pylibraft/pylibraft/common/cai_wrapper.py | 15 +++ python/pylibraft/pylibraft/common/mdspan.pyx | 93 ------------------- python/pylibraft/pylibraft/cpp/__init__.pxd | 0 python/pylibraft/pylibraft/cpp/__init__.py | 0 .../cpp_kmeans.pxd => cpp/kmeans.pxd} | 7 +- .../{cluster => cpp}/kmeans_types.pxd | 0 .../pylibraft/{common => cpp}/mdspan.pxd | 16 ---- .../pylibraft/{common => cpp}/optional.pxd | 1 - 10 files changed, 53 insertions(+), 128 deletions(-) delete mode 100644 python/pylibraft/pylibraft/common/mdspan.pyx create mode 100644 python/pylibraft/pylibraft/cpp/__init__.pxd create mode 100644 python/pylibraft/pylibraft/cpp/__init__.py rename python/pylibraft/pylibraft/{cluster/cpp_kmeans.pxd => cpp/kmeans.pxd} (91%) rename python/pylibraft/pylibraft/{cluster => cpp}/kmeans_types.pxd (100%) rename python/pylibraft/pylibraft/{common => cpp}/mdspan.pxd (74%) rename python/pylibraft/pylibraft/{common => cpp}/optional.pxd (99%) diff --git a/python/pylibraft/pylibraft/cluster/kmeans.pyx b/python/pylibraft/pylibraft/cluster/kmeans.pyx index fd058b6835..a98c395da3 100644 --- 
a/python/pylibraft/pylibraft/cluster/kmeans.pyx +++ b/python/pylibraft/pylibraft/cluster/kmeans.pyx @@ -27,10 +27,10 @@ from libcpp cimport nullptr from enum import IntEnum -from pylibraft.common import Handle, device_ndarray +from pylibraft.common import Handle, cai_wrapper, device_ndarray from pylibraft.common.handle import auto_sync_handle -from pylibraft.cluster cimport cpp_kmeans, kmeans_types +from pylibraft.cpp cimport kmeans as cpp_kmeans, kmeans_types from pylibraft.common.handle cimport handle_t from pylibraft.common.mdspan cimport * from pylibraft.common.optional cimport optional @@ -345,9 +345,6 @@ def fit( """ cdef handle_t *h = handle.getHandle() - x_cai = X.__cuda_array_interface__ - dtype = np.dtype(x_cai["typestr"]) - cdef float f_inertia = 0.0 cdef double d_inertia = 0.0 cdef int n_iter = 0 @@ -355,36 +352,58 @@ def fit( cdef optional[device_vector_view[const double, int]] d_sample_weights cdef optional[device_vector_view[const float, int]] f_sample_weights + X_cai = cai_wrapper(X) + dtype = X_cai.dtype + if centroids is None: - centroids_shape = (params.n_clusters, x_cai["shape"][1]) + centroids_shape = (params.n_clusters, X_cai.shape[1]) centroids = device_ndarray.empty(centroids_shape, dtype=dtype) + centroids_cai = cai_wrapper(centroids) + + # validate inputs have are all c-contiguious, and have a consistent dtype + # and expected shape + X_cai.validate(2) + centroids_cai.validate(2, dtype) + if sample_weights is not None: + sample_weights_cai = cai_wrapper(sample_weights) + sample_weights_cai.validate(1, dtype) if dtype == np.float64: if sample_weights is not None: - d_sample_weights = const_device_vector_view_from_array[double]( - sample_weights, NULL) + d_sample_weights = make_device_vector_view( + sample_weights_cai.data, + sample_weights_cai.shape[0]) cpp_kmeans.fit( deref(h), params.c_obj, - const_device_matrix_view_from_array[double](X, NULL), + make_device_matrix_view( + X_cai.data, + X_cai.shape[0], X_cai.shape[1]), d_sample_weights, 
- device_matrix_view_from_array[double](centroids, NULL), + make_device_matrix_view( + centroids_cai.data, + centroids_cai.shape[0], centroids_cai.shape[1]), make_host_scalar_view[double, int](&d_inertia), make_host_scalar_view[int, int](&n_iter)) return KMeansOutput(centroids, d_inertia, n_iter) elif dtype == np.float32: if sample_weights is not None: - f_sample_weights = const_device_vector_view_from_array[float]( - sample_weights, NULL) + f_sample_weights = make_device_vector_view( + sample_weights_cai.data, + sample_weights_cai.shape[0]) cpp_kmeans.fit( deref(h), params.c_obj, - const_device_matrix_view_from_array[float](X, NULL), + make_device_matrix_view( + X_cai.data, + X_cai.shape[0], X_cai.shape[1]), f_sample_weights, - device_matrix_view_from_array[float](centroids, NULL), + make_device_matrix_view( + centroids_cai.data, + centroids_cai.shape[0], centroids_cai.shape[1]), make_host_scalar_view[float, int](&f_inertia), make_host_scalar_view[int, int](&n_iter)) return KMeansOutput(centroids, f_inertia, n_iter) diff --git a/python/pylibraft/pylibraft/common/CMakeLists.txt b/python/pylibraft/pylibraft/common/CMakeLists.txt index 6eb31ef5e2..3b49cef429 100644 --- a/python/pylibraft/pylibraft/common/CMakeLists.txt +++ b/python/pylibraft/pylibraft/common/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= # Set the list of Cython files to build -set(cython_sources cuda.pyx handle.pyx interruptible.pyx mdspan.pyx) +set(cython_sources cuda.pyx handle.pyx interruptible.pyx) set(linked_libraries raft::raft) # Build all of the Cython targets diff --git a/python/pylibraft/pylibraft/common/cai_wrapper.py b/python/pylibraft/pylibraft/common/cai_wrapper.py index fdfc6b0b09..970e395e5b 100644 --- a/python/pylibraft/pylibraft/common/cai_wrapper.py +++ b/python/pylibraft/pylibraft/common/cai_wrapper.py @@ -71,3 +71,18 @@ def data(self): Returns the data pointer of the underlying CUDA array interface """ return 
self.cai_["data"][0] + + def validate(self, expected_dims=None, expected_dtype=None): + """checks to see if the shape, dtype, and strides match expectations""" + if expected_dims is not None and len(self.shape) != expected_dims: + raise ValueError( + f"unexpected shape {self.shape} - " f"expected {expected_dims} dimensions" + ) + + if expected_dtype is not None and self.dtype != expected_dtype: + raise ValueError( + f"invalid dtype {self.dtype}: expected " f"{expected_dtype}" + ) + + if not self.c_contiguous: + raise ValueError("input must be c-contiguous") diff --git a/python/pylibraft/pylibraft/common/mdspan.pyx b/python/pylibraft/pylibraft/common/mdspan.pyx deleted file mode 100644 index 907da00e7d..0000000000 --- a/python/pylibraft/pylibraft/common/mdspan.pyx +++ /dev/null @@ -1,93 +0,0 @@ -# -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from libc.stdint cimport uintptr_t - -import numpy as np - -from pylibraft.common.input_validation import is_c_contiguous - -from pylibraft.common.mdspan cimport * - - -cdef _validate_array_interface(cai, expected_shape, ElementType * p) except +: - """ checks an array interface dictionary to see if the shape, dtype, and - strides match expectations """ - shape = cai["shape"] - if len(shape) != expected_shape: - raise ValueError(f"unexpected shape {shape} - " - f"expected {expected_shape} elements") - - dt = np.dtype(cai["typestr"]) - if dt.itemsize != sizeof(ElementType): - raise ValueError(f"invalid dtype {dt}: has itemsize {dt.itemsize} but" - f" function expects {sizeof(ElementType)}") - - if not is_c_contiguous(cai, dt): - raise ValueError("input must be c-contiguous") - - -cdef device_matrix_view[ElementType, int] device_matrix_view_from_array( - arr, ElementType * p -) except +: - """ Transform a CAI array to a device_matrix_view """ - # need to have the ElementType as one of the parameters, otherwise this - # crashes the cython compiler =( - cai = arr.__cuda_array_interface__ - _validate_array_interface(cai, 2, p) - rows, cols = cai["shape"] - ptr = cai["data"][0] - return make_device_matrix_view(ptr, rows, cols) - - -cdef device_matrix_view[const ElementType, int] \ - const_device_matrix_view_from_array(arr, ElementType * p) except +: - """ Transform a CAI array to a device_matrix_view with a const element""" - # I couldn't make cython accept a FusedType that distiguishes between a - # const/non-const ElementType - meaning that we have some duplicated - # logic from the device_matrix_view_from_array function here - cai = arr.__cuda_array_interface__ - _validate_array_interface(cai, 2, p) - rows, cols = cai["shape"] - ptr = cai["data"][0] - return make_device_matrix_view(ptr, - rows, cols) - - -cdef device_vector_view[ElementType, 
int] \ - device_vector_view_from_array(arr, ElementType * p) except +: - """ Transform a CAI array to a device_vector_view """ - cai = arr.__cuda_array_interface__ - _validate_array_interface(cai, 1, p) - elements, = cai["shape"] - ptr = cai["data"][0] - return make_device_vector_view(ptr, cai["shape"][0]) - - -cdef device_vector_view[const ElementType, int] \ - const_device_vector_view_from_array(arr, ElementType * p) except +: - """ Transform a CAI array to a device_vector_view with a const element""" - cai = arr.__cuda_array_interface__ - _validate_array_interface(cai, 1, p) - elements, = cai["shape"] - ptr = cai["data"][0] - return make_device_vector_view(ptr, - cai["shape"][0]) diff --git a/python/pylibraft/pylibraft/cpp/__init__.pxd b/python/pylibraft/pylibraft/cpp/__init__.pxd new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/pylibraft/pylibraft/cpp/__init__.py b/python/pylibraft/pylibraft/cpp/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/pylibraft/pylibraft/cluster/cpp_kmeans.pxd b/python/pylibraft/pylibraft/cpp/kmeans.pxd similarity index 91% rename from python/pylibraft/pylibraft/cluster/cpp_kmeans.pxd rename to python/pylibraft/pylibraft/cpp/kmeans.pxd index 14cd1ebc30..a9604cab7b 100644 --- a/python/pylibraft/pylibraft/cluster/cpp_kmeans.pxd +++ b/python/pylibraft/pylibraft/cpp/kmeans.pxd @@ -24,7 +24,8 @@ from cython.operator cimport dereference as deref from libc.stdint cimport uintptr_t from libcpp cimport bool, nullptr -cimport pylibraft.cluster.kmeans_types as kmeans_types +from pylibraft.cpp.kmeans_types cimport KMeansParams + from pylibraft.common.handle cimport handle_t from pylibraft.common.mdspan cimport * from pylibraft.common.optional cimport optional @@ -35,7 +36,7 @@ cdef extern from "raft_distance/kmeans.hpp" \ cdef void fit( const handle_t & handle, - const kmeans_types.KMeansParams& params, + const KMeansParams& params, device_matrix_view[const float, int] X, 
optional[device_vector_view[const float, int]] sample_weight, device_matrix_view[float, int] inertia, @@ -44,7 +45,7 @@ cdef extern from "raft_distance/kmeans.hpp" \ cdef void fit( const handle_t & handle, - const kmeans_types.KMeansParams& params, + const KMeansParams& params, device_matrix_view[const double, int] X, optional[device_vector_view[const double, int]] sample_weight, device_matrix_view[double, int] inertia, diff --git a/python/pylibraft/pylibraft/cluster/kmeans_types.pxd b/python/pylibraft/pylibraft/cpp/kmeans_types.pxd similarity index 100% rename from python/pylibraft/pylibraft/cluster/kmeans_types.pxd rename to python/pylibraft/pylibraft/cpp/kmeans_types.pxd diff --git a/python/pylibraft/pylibraft/common/mdspan.pxd b/python/pylibraft/pylibraft/cpp/mdspan.pxd similarity index 74% rename from python/pylibraft/pylibraft/common/mdspan.pxd rename to python/pylibraft/pylibraft/cpp/mdspan.pxd index be6757c59f..da6cac478d 100644 --- a/python/pylibraft/pylibraft/common/mdspan.pxd +++ b/python/pylibraft/pylibraft/cpp/mdspan.pxd @@ -41,19 +41,3 @@ cdef extern from "raft/core/device_mdspan.hpp" namespace "raft" nogil: cdef host_scalar_view[T, IndexType] \ make_host_scalar_view[T, IndexType](T * ptr) except + - -ctypedef fused ElementType: - float - double - -cdef device_matrix_view[ElementType, int] \ - device_matrix_view_from_array(arr, ElementType * p) except + - -cdef device_matrix_view[const ElementType, int] \ - const_device_matrix_view_from_array(arr, ElementType * p) except + - -cdef device_vector_view[ElementType, int] \ - device_vector_view_from_array(arr, ElementType * p) except + - -cdef device_vector_view[const ElementType, int] \ - const_device_vector_view_from_array(arr, ElementType * p) except + diff --git a/python/pylibraft/pylibraft/common/optional.pxd b/python/pylibraft/pylibraft/cpp/optional.pxd similarity index 99% rename from python/pylibraft/pylibraft/common/optional.pxd rename to python/pylibraft/pylibraft/cpp/optional.pxd index 
9811304231..a6dd8a2dcd 100644 --- a/python/pylibraft/pylibraft/common/optional.pxd +++ b/python/pylibraft/pylibraft/cpp/optional.pxd @@ -14,7 +14,6 @@ # limitations under the License. -# # We're still using cython v0.29.x - which doesn't have std::optional # support. Include the minimal definition here as suggested by # https://github.com/cython/cython/issues/3293#issuecomment-1223058101 From 0aad8d0059e3704d163c582c48877af4d79ec323 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Fri, 18 Nov 2022 15:08:40 -0800 Subject: [PATCH 06/11] style --- cpp/src/distance/kmeans_fit_float.cu | 1 + python/pylibraft/pylibraft/cluster/kmeans.pyx | 6 ++---- python/pylibraft/pylibraft/common/cai_wrapper.py | 5 +++-- python/pylibraft/pylibraft/cpp/kmeans.pxd | 3 +-- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/cpp/src/distance/kmeans_fit_float.cu b/cpp/src/distance/kmeans_fit_float.cu index ff389f6886..eb89610eea 100644 --- a/cpp/src/distance/kmeans_fit_float.cu +++ b/cpp/src/distance/kmeans_fit_float.cu @@ -15,6 +15,7 @@ */ #include +// #include #include namespace raft::cluster::kmeans::runtime { diff --git a/python/pylibraft/pylibraft/cluster/kmeans.pyx b/python/pylibraft/pylibraft/cluster/kmeans.pyx index a98c395da3..1d7fca0f4f 100644 --- a/python/pylibraft/pylibraft/cluster/kmeans.pyx +++ b/python/pylibraft/pylibraft/cluster/kmeans.pyx @@ -30,16 +30,14 @@ from enum import IntEnum from pylibraft.common import Handle, cai_wrapper, device_ndarray from pylibraft.common.handle import auto_sync_handle -from pylibraft.cpp cimport kmeans as cpp_kmeans, kmeans_types from pylibraft.common.handle cimport handle_t from pylibraft.common.mdspan cimport * from pylibraft.common.optional cimport optional +from pylibraft.cpp cimport kmeans as cpp_kmeans, kmeans_types from pylibraft.common.input_validation import * from pylibraft.distance import DISTANCE_TYPES -ctypedef const float const_float - def is_c_cont(cai, dt): return "strides" not in cai or \ @@ -360,7 +358,7 @@ def 
fit( centroids = device_ndarray.empty(centroids_shape, dtype=dtype) centroids_cai = cai_wrapper(centroids) - # validate inputs have are all c-contiguious, and have a consistent dtype + # validate inputs have are all c-contiguous, and have a consistent dtype # and expected shape X_cai.validate(2) centroids_cai.validate(2, dtype) diff --git a/python/pylibraft/pylibraft/common/cai_wrapper.py b/python/pylibraft/pylibraft/common/cai_wrapper.py index 970e395e5b..78d8f4dfeb 100644 --- a/python/pylibraft/pylibraft/common/cai_wrapper.py +++ b/python/pylibraft/pylibraft/common/cai_wrapper.py @@ -73,10 +73,11 @@ def data(self): return self.cai_["data"][0] def validate(self, expected_dims=None, expected_dtype=None): - """checks to see if the shape, dtype, and strides match expectations""" + """Checks to see if the shape, dtype, and strides match expectations""" if expected_dims is not None and len(self.shape) != expected_dims: raise ValueError( - f"unexpected shape {self.shape} - " f"expected {expected_dims} dimensions" + f"unexpected shape {self.shape} - " + f"expected {expected_dims} dimensions" ) if expected_dtype is not None and self.dtype != expected_dtype: diff --git a/python/pylibraft/pylibraft/cpp/kmeans.pxd b/python/pylibraft/pylibraft/cpp/kmeans.pxd index a9604cab7b..99a265797e 100644 --- a/python/pylibraft/pylibraft/cpp/kmeans.pxd +++ b/python/pylibraft/pylibraft/cpp/kmeans.pxd @@ -24,11 +24,10 @@ from cython.operator cimport dereference as deref from libc.stdint cimport uintptr_t from libcpp cimport bool, nullptr -from pylibraft.cpp.kmeans_types cimport KMeansParams - from pylibraft.common.handle cimport handle_t from pylibraft.common.mdspan cimport * from pylibraft.common.optional cimport optional +from pylibraft.cpp.kmeans_types cimport KMeansParams cdef extern from "raft_distance/kmeans.hpp" \ From 732e43f212e7451d64971719d7a1c18acfa1dde1 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Mon, 28 Nov 2022 13:01:57 -0800 Subject: [PATCH 07/11] use distance 
specializations to speed up compile time --- cpp/src/distance/kmeans_fit_double.cu | 1 + cpp/src/distance/kmeans_fit_float.cu | 2 +- python/pylibraft/pylibraft/cpp/kmeans.pxd | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/distance/kmeans_fit_double.cu b/cpp/src/distance/kmeans_fit_double.cu index 6d9a367d73..cb5d6e8c8f 100644 --- a/cpp/src/distance/kmeans_fit_double.cu +++ b/cpp/src/distance/kmeans_fit_double.cu @@ -15,6 +15,7 @@ */ #include +#include #include namespace raft::cluster::kmeans::runtime { diff --git a/cpp/src/distance/kmeans_fit_float.cu b/cpp/src/distance/kmeans_fit_float.cu index eb89610eea..c10234d705 100644 --- a/cpp/src/distance/kmeans_fit_float.cu +++ b/cpp/src/distance/kmeans_fit_float.cu @@ -15,7 +15,7 @@ */ #include -// #include +#include #include namespace raft::cluster::kmeans::runtime { diff --git a/python/pylibraft/pylibraft/cpp/kmeans.pxd b/python/pylibraft/pylibraft/cpp/kmeans.pxd index d3402d61ee..9eb35bd71e 100644 --- a/python/pylibraft/pylibraft/cpp/kmeans.pxd +++ b/python/pylibraft/pylibraft/cpp/kmeans.pxd @@ -25,9 +25,9 @@ from libc.stdint cimport uintptr_t from libcpp cimport bool, nullptr from pylibraft.common.handle cimport handle_t -from pylibraft.common.mdspan cimport * -from pylibraft.common.optional cimport optional from pylibraft.cpp.kmeans_types cimport KMeansParams +from pylibraft.cpp.mdspan cimport * +from pylibraft.cpp.optional cimport optional cdef extern from "raft_distance/kmeans.hpp" \ From 998486873a7a90cd4ea22aba8f579879c7b486c7 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Mon, 28 Nov 2022 13:40:33 -0800 Subject: [PATCH 08/11] update docstring --- python/pylibraft/pylibraft/cluster/kmeans.pyx | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/python/pylibraft/pylibraft/cluster/kmeans.pyx b/python/pylibraft/pylibraft/cluster/kmeans.pyx index 14b4e69046..c989a62019 100644 --- a/python/pylibraft/pylibraft/cluster/kmeans.pyx +++ 
b/python/pylibraft/pylibraft/cluster/kmeans.pyx @@ -363,7 +363,7 @@ cdef class KMeansParams: def inertia_check(self): return self.c_obj.inertia_check -KMeansOutput = namedtuple("FitOutput", "centroids inertia n_iter") +FitOutput = namedtuple("FitOutput", "centroids inertia n_iter") @auto_sync_handle @@ -372,11 +372,13 @@ def fit( ): """ - Fit kmeans + Find clusters with the k-means algorithm Parameters ---------- + params : KMeansParams + Parameters to use to fit KMeans model X : Input CUDA array interface compliant matrix shape (m, k) centroids : Optional writable CUDA array interface compliant matrix shape (n_clusters, k) @@ -384,6 +386,15 @@ def fit( (n_clusters, 1) default: None {handle_docstring} + Returns + ------- + centroids : raft.device_ndarray + The computed centroids for each cluster + inertia : float + Sum of squared distances of samples to their closest cluster center + n_iter : int + The number of iterations used to fit the model + Examples -------- @@ -446,7 +457,7 @@ def fit( centroids_cai.shape[0], centroids_cai.shape[1]), make_host_scalar_view[double, int](&d_inertia), make_host_scalar_view[int, int](&n_iter)) - return KMeansOutput(centroids, d_inertia, n_iter) + return FitOutput(centroids, d_inertia, n_iter) elif dtype == np.float32: if sample_weights is not None: @@ -466,7 +477,7 @@ def fit( centroids_cai.shape[0], centroids_cai.shape[1]), make_host_scalar_view[float, int](&f_inertia), make_host_scalar_view[int, int](&n_iter)) - return KMeansOutput(centroids, f_inertia, n_iter) + return FitOutput(centroids, f_inertia, n_iter) else: raise ValueError(f"unhandled dtype {dtype}") From 5994b52c20a069082ba28a35d8769e6c108cc9f6 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Mon, 28 Nov 2022 15:46:06 -0800 Subject: [PATCH 09/11] Test inertia / fix issues --- python/pylibraft/pylibraft/cluster/kmeans.pyx | 5 +++-- python/pylibraft/pylibraft/test/test_kmeans.py | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git 
a/python/pylibraft/pylibraft/cluster/kmeans.pyx b/python/pylibraft/pylibraft/cluster/kmeans.pyx index c989a62019..c3775c4d1d 100644 --- a/python/pylibraft/pylibraft/cluster/kmeans.pyx +++ b/python/pylibraft/pylibraft/cluster/kmeans.pyx @@ -23,8 +23,10 @@ import numpy as np from cython.operator cimport dereference as deref from libc.stdint cimport uintptr_t from libcpp cimport nullptr +from collections import namedtuple +from enum import IntEnum -from pylibraft.common import Handle +from pylibraft.common import Handle, cai_wrapper from pylibraft.common.handle import auto_sync_handle from pylibraft.common.handle cimport handle_t @@ -370,7 +372,6 @@ FitOutput = namedtuple("FitOutput", "centroids inertia n_iter") def fit( KMeansParams params, X, centroids=None, sample_weights=None, handle=None ): - """ Find clusters with the k-means algorithm diff --git a/python/pylibraft/pylibraft/test/test_kmeans.py b/python/pylibraft/pylibraft/test/test_kmeans.py index 525033afeb..1d4e8e2352 100644 --- a/python/pylibraft/pylibraft/test/test_kmeans.py +++ b/python/pylibraft/pylibraft/test/test_kmeans.py @@ -31,15 +31,23 @@ @pytest.mark.parametrize("n_clusters", [5, 15]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_kmeans_fit(n_rows, n_cols, n_clusters, dtype): - X = np.random.random_sample((n_rows, n_cols)).astype(dtype) + # generate some random input points / centroids + X_host = np.random.random_sample((n_rows, n_cols)).astype(dtype) + centroids = device_ndarray(X_host[:n_clusters]) + X = device_ndarray(X_host) + + # compute the inertia, before fitting centroids + original_inertia = cluster_cost(X, centroids) # TODO: test out some different options on this # TODO: use a fixed RNG state on params params = KMeansParams(n_clusters=n_clusters) - centroids, inertia, n_iter = fit(params, device_ndarray(X)) - - # TODO: validate that centroids are reasonable ... 
somehow + # fit the centroids, make sure inertia has gone down + centroids, inertia, n_iter = fit(params, X, centroids) + assert inertia < original_inertia + assert n_iter >= 1 + assert np.allclose(cluster_cost(X, centroids), inertia, rtol=1e-6) @pytest.mark.parametrize("n_rows", [100]) From 89a5c1533a4ffc20339b36ca11128ff617807a7d Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Tue, 29 Nov 2022 12:03:58 -0800 Subject: [PATCH 10/11] build fix --- python/pylibraft/pylibraft/cluster/kmeans.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pylibraft/pylibraft/cluster/kmeans.pyx b/python/pylibraft/pylibraft/cluster/kmeans.pyx index c3775c4d1d..752ede08a2 100644 --- a/python/pylibraft/pylibraft/cluster/kmeans.pyx +++ b/python/pylibraft/pylibraft/cluster/kmeans.pyx @@ -23,6 +23,7 @@ import numpy as np from cython.operator cimport dereference as deref from libc.stdint cimport uintptr_t from libcpp cimport nullptr + from collections import namedtuple from enum import IntEnum @@ -35,13 +36,13 @@ from pylibraft.common.input_validation import * from pylibraft.distance import DISTANCE_TYPES from pylibraft.common.handle cimport handle_t -from pylibraft.common.mdspan cimport * -from pylibraft.common.optional cimport optional from pylibraft.cpp cimport kmeans as cpp_kmeans, kmeans_types from pylibraft.cpp.kmeans cimport ( cluster_cost as cpp_cluster_cost, update_centroids, ) +from pylibraft.cpp.mdspan cimport * +from pylibraft.cpp.optional cimport optional def is_c_cont(cai, dt): From fb3db0fb056789255ba895d4613426ddc0e90239 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Wed, 30 Nov 2022 11:31:48 -0800 Subject: [PATCH 11/11] Add parameters / code review feedback * Add seed/metric/verbosity to python kmeans params * Add a todo about make_blobs testing * rename cai_wrapper.validate to validate_shape_dtype --- python/pylibraft/pylibraft/cluster/kmeans.pyx | 55 +++++++++++++++++-- .../pylibraft/pylibraft/common/cai_wrapper.py | 2 +- 
.../pylibraft/pylibraft/cpp/kmeans_types.pxd | 2 + .../pylibraft/pylibraft/test/test_kmeans.py | 8 ++- 4 files changed, 58 insertions(+), 9 deletions(-) diff --git a/python/pylibraft/pylibraft/cluster/kmeans.pyx b/python/pylibraft/pylibraft/cluster/kmeans.pyx index 752ede08a2..3432ebe0a0 100644 --- a/python/pylibraft/pylibraft/cluster/kmeans.pyx +++ b/python/pylibraft/pylibraft/cluster/kmeans.pyx @@ -31,6 +31,7 @@ from pylibraft.common import Handle, cai_wrapper from pylibraft.common.handle import auto_sync_handle from pylibraft.common.handle cimport handle_t +from pylibraft.random.rng_state cimport RngState from pylibraft.common.input_validation import * from pylibraft.distance import DISTANCE_TYPES @@ -295,18 +296,45 @@ def cluster_cost(X, centroids, handle=None): class InitMethod(IntEnum): + """ Method for initializing kmeans """ KMeansPlusPlus = kmeans_types.InitMethod.KMeansPlusPlus Random = kmeans_types.InitMethod.Random Array = kmeans_types.InitMethod.Array cdef class KMeansParams: + """ Specifies hyper-parameters for the kmeans algorithm. + + Parameters + ---------- + n_clusters : int, optional + The number of clusters to form as well as the number of centroids + to generate + max_iter : int, optional + Maximum number of iterations of the k-means algorithm for a single run + tol : float, optional + Relative tolerance with regards to inertia to declare convergence + verbosity : int, optional + seed : int, optional + Seed to the random number generator. + metric : str, optional + Metric name to use for distance computation, see + :func:`pylibraft.distance.pairwise_distance` for valid values. + init : InitMethod, optional + n_init : int, optional + Number of times the k-means algorithm will be run with different seeds. 
+ oversampling_factor : float, optional + Oversampling factor for use in the k-means algorithm + """ cdef kmeans_types.KMeansParams c_obj def __init__(self, n_clusters: Optional[int] = None, max_iter: Optional[int] = None, tol: Optional[float] = None, + verbosity: Optional[int] = None, + seed: Optional[int] = None, + metric: Optional[str] = None, init: Optional[InitMethod] = None, n_init: Optional[int] = None, oversampling_factor: Optional[float] = None, @@ -319,6 +347,17 @@ cdef class KMeansParams: self.c_obj.max_iter = max_iter if tol is not None: self.c_obj.tol = tol + if verbosity is not None: + self.c_obj.verbosity = verbosity + if seed is not None: + self.c_obj.rng_state.seed = seed + if metric is not None: + distance = DISTANCE_TYPES.get(metric) + if distance is None: + valid_metrics = list(DISTANCE_TYPES.keys()) + raise ValueError(f"Unknown metric '{metric}'. Valid values " + f"are: {valid_metrics}") + self.c_obj.metric = distance if init is not None: self.c_obj.init = init if n_init is not None: @@ -332,8 +371,6 @@ cdef class KMeansParams: if inertia_check is not None: self.c_obj.inertia_check = inertia_check - # TODO: distance metric/ verbosity level (?) 
/ rng state - @property def n_clusters(self): return self.c_obj.n_clusters @@ -346,6 +383,14 @@ cdef class KMeansParams: def tol(self): return self.c_obj.tol + @property + def verbosity(self): + return self.c_obj.verbosity + + @property + def seed(self): + return self.c_obj.rng_state.seed + @property def init(self): return InitMethod(self.c_obj.init) @@ -435,11 +480,11 @@ def fit( # validate inputs have are all c-contiguous, and have a consistent dtype # and expected shape - X_cai.validate(2) - centroids_cai.validate(2, dtype) + X_cai.validate_shape_dtype(2) + centroids_cai.validate_shape_dtype(2, dtype) if sample_weights is not None: sample_weights_cai = cai_wrapper(sample_weights) - sample_weights_cai.validate(1, dtype) + sample_weights_cai.validate_shape_dtype(1, dtype) if dtype == np.float64: if sample_weights is not None: diff --git a/python/pylibraft/pylibraft/common/cai_wrapper.py b/python/pylibraft/pylibraft/common/cai_wrapper.py index 78d8f4dfeb..5851821f57 100644 --- a/python/pylibraft/pylibraft/common/cai_wrapper.py +++ b/python/pylibraft/pylibraft/common/cai_wrapper.py @@ -72,7 +72,7 @@ def data(self): """ return self.cai_["data"][0] - def validate(self, expected_dims=None, expected_dtype=None): + def validate_shape_dtype(self, expected_dims=None, expected_dtype=None): """Checks to see if the shape, dtype, and strides match expectations""" if expected_dims is not None and len(self.shape) != expected_dims: raise ValueError( diff --git a/python/pylibraft/pylibraft/cpp/kmeans_types.pxd b/python/pylibraft/pylibraft/cpp/kmeans_types.pxd index e5a7fcebc9..869d2cb5fd 100644 --- a/python/pylibraft/pylibraft/cpp/kmeans_types.pxd +++ b/python/pylibraft/pylibraft/cpp/kmeans_types.pxd @@ -16,6 +16,7 @@ from libcpp cimport bool +from pylibraft.distance.distance_type cimport DistanceType from pylibraft.random.rng_state cimport RngState @@ -35,6 +36,7 @@ cdef extern from "raft/cluster/kmeans_types.hpp" \ double tol int verbosity RngState rng_state + DistanceType 
metric int n_init double oversampling_factor int batch_samples diff --git a/python/pylibraft/pylibraft/test/test_kmeans.py b/python/pylibraft/pylibraft/test/test_kmeans.py index 1d4e8e2352..e5e544d565 100644 --- a/python/pylibraft/pylibraft/test/test_kmeans.py +++ b/python/pylibraft/pylibraft/test/test_kmeans.py @@ -39,11 +39,13 @@ def test_kmeans_fit(n_rows, n_cols, n_clusters, dtype): # compute the inertia, before fitting centroids original_inertia = cluster_cost(X, centroids) - # TODO: test out some different options on this - # TODO: use a fixed RNG state on params - params = KMeansParams(n_clusters=n_clusters) + params = KMeansParams(n_clusters=n_clusters, seed=42) # fit the centroids, make sure inertia has gone down + # TODO: once we have make_blobs exposed to python + # (https://github.com/rapidsai/raft/issues/1059) + # we should use that to test out the kmeans fit, like the C++ + # tests do right now centroids, inertia, n_iter = fit(params, X, centroids) assert inertia < original_inertia assert n_iter >= 1