From 3c4e43cff01b6ab617db5b1f839dd5103a66f398 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 5 Dec 2022 20:29:45 -0500 Subject: [PATCH 01/19] Initial port of auto-find-k from nvgraph --- .../cluster/detail/kmeans_auto_find_k.cuh | 180 ++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh diff --git a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh new file mode 100644 index 0000000000..9cd2afaba4 --- /dev/null +++ b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +#include +#include + +namespace raft::cluster::kmeans::detail { +template +void kmeans_find_clusters(const raft::handle_t& handle, + raft::device_matrix_view X, + raft::device_matrix_view centroids, + raft::device_vector_view labels, + raft::host_scalar_view k_star, + raft::host_scalar_view residual, + raft::host_scalar_view maxiter, + idx_t n, + idx_t d, + idx_t kmin, + idx_t kmax, + value_t tol) +{ + idx_t n = X.extent(0); + idx_t d = X.extent(1); + + RAFT_EXPECTS(n >= 1, "n must be >= 1"); + RAFT_EXPECTS(d >= 1, "d must be >= 1"); + RAFT_EXPECTS(kmin >= 1, "kmin must be >= 1"); + RAFT_EXPECTS(kmax <= n, "kmax must be <= number of data samples in X"); + RAFT_EXPECTS(tol >= 0, "tolerance must be >= 0"); + RAFT_EXPECTS(maxiter >= 0, "maxiter must be >= 0"); + // Allocate memory + // Device memory + + auto clusterSizes = raft::make_device_vector(kmax); + auto data_dots = raft::make_device_vector(n); + auto centroid_dots = raft::device_vector(kmax); + auto work = raft::make_device_vector(n * max(kmax, d)); + auto work_int = raft::make_device_vector work_int(2 * d * n); + + idx_t* clusterSizes_ptr = clusterSizes.data_handle(); + // idx_t *codes_ptr = codes.data().get(); + idx_t* work_int_ptr = work_int.data_handle(); + value_t* data_dots_ptr = data_dots.data_handle(); + value_t* centroid_dots_ptr = centroid_dots.data_handle(); + value_t* work_ptr = work.data_handle(); + + // Host memory + auto results = raft::make_host_vector(kmax + 1); + auto clusterDispersion = raft::make_host_vector(kmax + 1); + + // Loop to find *best* k + // Perform k-means in binary search + int left = kmin; // must be at least 2 + int right = kmax; // int(floor(len(data)/2)) #assumption of clusters of size 2 at least + int mid = int(floor((right + left) / 2)); + int oldmid = mid; + int tests = 0; + int iters = 0; + value_t objective[3]; // 0= left of mid, 1= right of mid + if (left == 1) left = 2; // at least do 2 clusters + + auto centroids_view = + raft::make_device_matrix_view(centroids.view(), left, d); + raft::cluster::kmeans_fit(handle, X.view(), std::nullptr, centroids_view, residual, maxiter); + + // TODO: Compute clustersizes + + results[left] = *residual; + + clusterDispersion[left] = + raft::stats::dispersion(handle, centroids_view, clusterSizes.view(), std::nullopt, n); + // eval right edge + iters = 0; + results[right] = 1e20; + while (results[right] > results[left] && tests < 3) { + centroids_view = + raft::make_device_matrix_view(centroids.view(), right, d); + raft::cluster::kmeans_fit(handle, X.view(), std::nullptr, centroids_view, residual, maxiter); + + // TODO: Compute cluster sizes + results[right] = *residual; + clusterDispersion[right] = + raft::stats::dispersion(handle, centroids_view, clusterSizes.view(), std::nullopt, n); + tests += 1; + } + + objective[0] = (n - left) / (left - 1) * clusterDispersion[left] / results[left]; + objective[1] = (n - right) / (right - 1) * clusterDispersion[right] / results[right]; + // printf(" L=%g,%g,R=%g,%g : resid,objectives\n", results[left], objective[0], results[right], + // objective[1]); binary search + while (left < right - 1) { + results[mid] = 1e20; + tests = 0; + iters = 0; + while (results[mid] > results[left] && tests < 3) { + centroids_view = + raft::make_device_matrix_view(centroids.view(), mid, d); + raft::cluster::kmeans_fit(handle, X.view(), std::nullptr, centroids_view, residual, maxiter); + + // TODO: Compute cluster sizes + results[mid] = *residual; + clusterDispersion[mid] = + raft::stats::dispersion(handle, centroids_view, clusterSizes.view(), std::nullopt, n); + + if (results[mid] > results[left] && (mid + 1) < right) { + mid += 1; + results[mid] = 1e20; + } else if (results[mid] > results[left] && (mid - 1) > left) { + mid -= 1; + results[mid] = 1e20; + } + tests += 1; + } + // objective[0] =abs(results[left]-results[mid]) /(results[left]-minres); + // objective[0] /= mid-left; + // objective[1] =abs(results[mid] -results[right])/(results[mid]-minres); + // objective[1] /= right-mid; + + // maximize Calinski-Harabasz Index, minimize resid/ cluster + objective[0] = (n - left) / (left - 1) * clusterDispersion[left] / results[left]; + objective[1] = (n - right) / (right - 1) * clusterDispersion[right] / results[right]; + objective[2] = (n - mid) / (mid - 1) * clusterDispersion[mid] / results[mid]; + // yes, overwriting the above temporary results is what I want + // printf(" L=%g M=%g R=%g : objectives\n", objective[0], objective[2], objective[1]); + objective[0] = (objective[2] - objective[0]) / (mid - left); + objective[1] = (objective[1] - objective[2]) / (right - mid); + + // printf(" L=%g,R=%g : d obj/ d k \n", objective[0], objective[1]); + // printf(" left, mid, right, res_left, res_mid, res_right\n"); + // printf(" %d, %d, %d, %g, %g, %g\n", left, mid, right, results[left], results[mid], + // results[right]); + if (objective[0] > 0 && objective[1] < 0) { + // our point is in the left-of-mid side + right = mid; + } else { + left = mid; + } + oldmid = mid; + mid = int(floor((right + left) / 2)); + } + *k_star = right; + objective[0] = (n - left) / (left - 1) * clusterDispersion[left] / results[left]; + objective[1] = (n - oldmid) / (oldmid - 1) * clusterDispersion[oldmid] / results[oldmid]; + // objective[0] =abs(results[left]-results[mid]) /(results[left]-minres); + // objective[0] /= mid-left; + // objective[1] =abs(results[mid] -results[right])/(results[mid]-minres); + // objective[1] /= right-mid; + if (objective[1] < objective[0]) { *k_star = left; } + + // if k_star isn't what we just ran, re-run to get correct centroids and dist data on return-> + // this saves memory + if (*k_star != oldmid) { + centroids_view = + raft::make_device_matrix_view(centroids.view(), *k_star, d); + raft::cluster::kmeans_fit(handle, X.view(), std::nullptr, centroids_view, residual, maxiter); + } + + *maxiter = iters; +} +} // namespace raft::cluster::kmeans::detail \ No newline at end of file From c56415ab303e4605c1a3fb6decc58ab7ab0fe059 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 5 Dec 2022 20:34:03 -0500 Subject: [PATCH 02/19] Removing unneeded arguments --- cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh index 9cd2afaba4..648013fa58 100644 --- a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh @@ -33,11 +33,9 @@ void kmeans_find_clusters(const raft::handle_t& handle, raft::host_scalar_view k_star, raft::host_scalar_view residual, raft::host_scalar_view maxiter, - idx_t n, - idx_t d, - idx_t kmin, idx_t kmax, - value_t tol) + idx_t kmin = 0, + value_t tol = 1e-3) { idx_t n = X.extent(0); idx_t d = X.extent(1); From 46cc28de7949adb6041d2ccbfd0db2a46ab1d4f0 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 5 Dec 2022 20:34:38 -0500 Subject: [PATCH 03/19] Fixing kmin default --- cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh index 648013fa58..af50d924ac 100644 --- a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh @@ -34,7 +34,7 @@ void kmeans_find_clusters(const raft::handle_t& handle, raft::host_scalar_view residual, raft::host_scalar_view maxiter, idx_t kmax, - idx_t kmin = 0, + idx_t kmin = 1, value_t tol = 1e-3) { idx_t n = X.extent(0); From 08cd31d242e8335aad31e431c83f35aab543437d Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 5 Dec 2022 20:35:39 -0500 Subject: [PATCH 04/19] Rename function --- .../cluster/detail/kmeans_auto_find_k.cuh | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh index af50d924ac..9f5149ffed 100644 --- a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh @@ -25,17 +25,18 @@ #include namespace raft::cluster::kmeans::detail { + template -void kmeans_find_clusters(const raft::handle_t& handle, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_vector_view labels, - raft::host_scalar_view k_star, - raft::host_scalar_view residual, - raft::host_scalar_view maxiter, - idx_t kmax, - idx_t kmin = 1, - value_t tol = 1e-3) +void find_k(const raft::handle_t& handle, + raft::device_matrix_view X, + raft::device_matrix_view centroids, + raft::device_vector_view labels, + raft::host_scalar_view k_star, + raft::host_scalar_view residual, + raft::host_scalar_view maxiter, + idx_t kmax, + idx_t kmin = 1, + value_t tol = 1e-3) { idx_t n = X.extent(0); idx_t d = X.extent(1); From 3bdafdd5f8afbc5bfbb1186c6fd8b8520be45997 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 6 Dec 2022 12:31:51 -0500 Subject: [PATCH 05/19] Adding countLabels to auto-find k --- .../cluster/detail/kmeans_auto_find_k.cuh | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh index 9f5149ffed..d3ef6cc8ee 100644 --- a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh @@ -50,18 +50,12 @@ void find_k(const raft::handle_t& handle, // Allocate memory // Device memory - auto clusterSizes = raft::make_device_vector(kmax); - auto data_dots = raft::make_device_vector(n); - auto centroid_dots = raft::device_vector(kmax); - auto work = raft::make_device_vector(n * max(kmax, d)); - auto work_int = raft::make_device_vector work_int(2 * d * n); + auto clusterSizes = raft::make_device_vector(kmax); + auto labels = raft::make_device_vector(n); + + rmm::device_uvector workspace(0, handle.get_stream()); idx_t* clusterSizes_ptr = clusterSizes.data_handle(); - // idx_t *codes_ptr = codes.data().get(); - idx_t* work_int_ptr = work_int.data_handle(); - value_t* data_dots_ptr = data_dots.data_handle(); - value_t* centroid_dots_ptr = centroid_dots.data_handle(); - value_t* work_ptr = work.data_handle(); // Host memory auto results = raft::make_host_vector(kmax + 1); @@ -80,23 +74,26 @@ void find_k(const raft::handle_t& handle, auto centroids_view = raft::make_device_matrix_view(centroids.view(), left, d); - raft::cluster::kmeans_fit(handle, X.view(), std::nullptr, centroids_view, residual, maxiter); + raft::cluster::kmeans_fit_predict( + handle, X.view(), std::nullptr, centroids_view, labels.view(), residual, maxiter); - // TODO: Compute clustersizes + detail::countLabels(handle, labels.view(), clusterSizes.data_handle(), n, left, workspace); results[left] = *residual; clusterDispersion[left] = raft::stats::dispersion(handle, centroids_view, clusterSizes.view(), std::nullopt, n); - // eval right edge + // eval right edge0 iters = 0; results[right] = 1e20; while (results[right] > results[left] && tests < 3) { centroids_view = raft::make_device_matrix_view(centroids.view(), right, d); - raft::cluster::kmeans_fit(handle, X.view(), std::nullptr, centroids_view, residual, maxiter); + raft::cluster::kmeans_fit_predict( + handle, X.view(), std::nullptr, centroids_view, labels.view(), residual, maxiter); + + detail::countLabels(handle, labels.view(), clusterSizes.data_handle(), n, right, workspace); - // TODO: Compute cluster sizes results[right] = *residual; clusterDispersion[right] = raft::stats::dispersion(handle, centroids_view, clusterSizes.view(), std::nullopt, n); @@ -116,7 +113,8 @@ void find_k(const raft::handle_t& handle, raft::make_device_matrix_view(centroids.view(), mid, d); raft::cluster::kmeans_fit(handle, X.view(), std::nullptr, centroids_view, residual, maxiter); - // TODO: Compute cluster sizes + detail::countLabels(handle, labels.view(), clusterSizes.data_handle(), n, mid, workspace); + results[mid] = *residual; clusterDispersion[mid] = raft::stats::dispersion(handle, centroids_view, clusterSizes.view(), std::nullopt, n); From 2456df21a5ea10785baec8ab93fa689aefde30a0 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 6 Dec 2022 12:33:09 -0500 Subject: [PATCH 06/19] Cleanup --- .../raft/cluster/detail/kmeans_auto_find_k.cuh | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh index d3ef6cc8ee..b31a71d3c7 100644 --- a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh @@ -102,8 +102,6 @@ void find_k(const raft::handle_t& handle, objective[0] = (n - left) / (left - 1) * clusterDispersion[left] / results[left]; objective[1] = (n - right) / (right - 1) * clusterDispersion[right] / results[right]; - // printf(" L=%g,%g,R=%g,%g : resid,objectives\n", results[left], objective[0], results[right], - // objective[1]); binary search while (left < right - 1) { results[mid] = 1e20; tests = 0; @@ -128,24 +126,14 @@ void find_k(const raft::handle_t& handle, } tests += 1; } - // objective[0] =abs(results[left]-results[mid]) /(results[left]-minres); - // objective[0] /= mid-left; - // objective[1] =abs(results[mid] -results[right])/(results[mid]-minres); - // objective[1] /= right-mid; // maximize Calinski-Harabasz Index, minimize resid/ cluster objective[0] = (n - left) / (left - 1) * clusterDispersion[left] / results[left]; objective[1] = (n - right) / (right - 1) * clusterDispersion[right] / results[right]; objective[2] = (n - mid) / (mid - 1) * clusterDispersion[mid] / results[mid]; - // yes, overwriting the above temporary results is what I want - // printf(" L=%g M=%g R=%g : objectives\n", objective[0], objective[2], objective[1]); objective[0] = (objective[2] - objective[0]) / (mid - left); objective[1] = (objective[1] - objective[2]) / (right - mid); - // printf(" L=%g,R=%g : d obj/ d k \n", objective[0], objective[1]); - // printf(" left, mid, right, res_left, res_mid, res_right\n"); - // printf(" %d, %d, %d, %g, %g, %g\n", left, mid, right, results[left], results[mid], - // results[right]); if (objective[0] > 0 && objective[1] < 0) { // our point is in the left-of-mid side right = mid; @@ -158,10 +146,7 @@ void find_k(const raft::handle_t& handle, *k_star = right; objective[0] = (n - left) / (left - 1) * clusterDispersion[left] / results[left]; objective[1] = (n - oldmid) / (oldmid - 1) * clusterDispersion[oldmid] / results[oldmid]; - // objective[0] =abs(results[left]-results[mid]) /(results[left]-minres); - // objective[0] /= mid-left; - // objective[1] =abs(results[mid] -results[right])/(results[mid]-minres); - // objective[1] /= right-mid; + if (objective[1] < objective[0]) { *k_star = left; } // if k_star isn't what we just ran, re-run to get correct centroids and dist data on return-> From f9666bbb2b163d42d02fc86dd227991d5c93aad4 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 6 Dec 2022 17:02:38 -0500 Subject: [PATCH 07/19] Adding to public api --- cpp/include/raft/cluster/kmeans.cuh | 33 +++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index d64815244b..2283d63282 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -260,6 +261,38 @@ void transform(const raft::handle_t& handle, handle, params, X, centroids, n_samples, n_features, X_new); } +/** + * Automatically find the optimal value of k using a binary search. + * This method maximizes the Calinski-Harabasz Index while minimizing the per-cluster residuals. + * @tparam idx_t + * @tparam value_t + * @param handle raft handle + * @param X input observations (shape n_samples, n_dims) + * @param best_centroids Best centroids returned from auto-find + * @param best_labels + * @param best_k + * @param residual + * @param maxiter + * @param kmax + * @param kmin + * @param tol + */ +template +void find_k(const raft::handle_t& handle, + raft::device_matrix_view X, + raft::device_matrix_view best_centroids, + raft::device_vector_view best_labels, + raft::host_scalar_view best_k, + raft::host_scalar_view residual, + raft::host_scalar_view maxiter, + idx_t kmax, + idx_t kmin = 1, + value_t tol = 1e-3) +{ + detail::find_k( + handle, X, best_centroids, best_labels, best_k, residual, maxiter, kmax, kmin, tol); +} + /** * @brief Select centroids according to a sampling operation * From 565a68d12f151d318f2bf4a85a390a20f86aa93a Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 8 Dec 2022 10:34:23 -0500 Subject: [PATCH 08/19] auto find k builds. Need to add test --- .../cluster/detail/kmeans_auto_find_k.cuh | 60 +++++++++++-------- cpp/include/raft/cluster/kmeans.cuh | 12 ++-- 2 files changed, 40 insertions(+), 32 deletions(-) diff --git a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh index b31a71d3c7..0fcd297646 100644 --- a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh @@ -19,24 +19,25 @@ #include #include +#include + #include #include #include -namespace raft::cluster::kmeans::detail { +namespace raft::cluster::detail { template void find_k(const raft::handle_t& handle, raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_vector_view labels, - raft::host_scalar_view k_star, + raft::host_scalar_view best_k, raft::host_scalar_view residual, - raft::host_scalar_view maxiter, + raft::host_scalar_view n_iter, idx_t kmax, - idx_t kmin = 1, - value_t tol = 1e-3) + idx_t kmin = 1, + idx_t maxiter = 100, + value_t tol = 1e-3) { idx_t n = X.extent(0); idx_t d = X.extent(1); @@ -50,6 +51,7 @@ void find_k(const raft::handle_t& handle, // Allocate memory // Device memory + auto centroids = raft::make_device_matrix(kmax, X.extent(1)); auto clusterSizes = raft::make_device_vector(kmax); auto labels = raft::make_device_vector(n); @@ -68,14 +70,19 @@ void find_k(const raft::handle_t& handle, int mid = int(floor((right + left) / 2)); int oldmid = mid; int tests = 0; - int iters = 0; value_t objective[3]; // 0= left of mid, 1= right of mid if (left == 1) left = 2; // at least do 2 clusters + KMeansParams params; + params.max_iter = maxiter; + params.tol = tol; + auto centroids_view = raft::make_device_matrix_view(centroids.view(), left, d); - raft::cluster::kmeans_fit_predict( - handle, X.view(), std::nullptr, centroids_view, labels.view(), residual, maxiter); + + params.n_clusters = left; + kmeans_fit_predict( + handle, params, X.view(), std::nullopt, centroids_view, labels.view(), residual, n_iter); detail::countLabels(handle, labels.view(), clusterSizes.data_handle(), n, left, workspace); @@ -84,17 +91,18 @@ void find_k(const raft::handle_t& handle, clusterDispersion[left] = raft::stats::dispersion(handle, centroids_view, clusterSizes.view(), std::nullopt, n); // eval right edge0 - iters = 0; results[right] = 1e20; while (results[right] > results[left] && tests < 3) { centroids_view = raft::make_device_matrix_view(centroids.view(), right, d); - raft::cluster::kmeans_fit_predict( - handle, X.view(), std::nullptr, centroids_view, labels.view(), residual, maxiter); + + params.n_clusters = right; + kmeans_fit_predict( + handle, params, X.view(), std::nullopt, centroids_view, labels.view(), residual, n_iter); detail::countLabels(handle, labels.view(), clusterSizes.data_handle(), n, right, workspace); - results[right] = *residual; + results[right] = residual[0]; clusterDispersion[right] = raft::stats::dispersion(handle, centroids_view, clusterSizes.view(), std::nullopt, n); tests += 1; @@ -105,11 +113,13 @@ void find_k(const raft::handle_t& handle, while (left < right - 1) { results[mid] = 1e20; tests = 0; - iters = 0; while (results[mid] > results[left] && tests < 3) { centroids_view = raft::make_device_matrix_view(centroids.view(), mid, d); - raft::cluster::kmeans_fit(handle, X.view(), std::nullptr, centroids_view, residual, maxiter); + + params.n_clusters = mid; + kmeans_fit_predict( + handle, params, X.view(), std::nullopt, centroids_view, labels.view(), residual, n_iter); detail::countLabels(handle, labels.view(), clusterSizes.data_handle(), n, mid, workspace); @@ -143,20 +153,20 @@ void find_k(const raft::handle_t& handle, oldmid = mid; mid = int(floor((right + left) / 2)); } - *k_star = right; + best_k[0] = right; objective[0] = (n - left) / (left - 1) * clusterDispersion[left] / results[left]; objective[1] = (n - oldmid) / (oldmid - 1) * clusterDispersion[oldmid] / results[oldmid]; - if (objective[1] < objective[0]) { *k_star = left; } + if (objective[1] < objective[0]) { best_k[0] = left; } - // if k_star isn't what we just ran, re-run to get correct centroids and dist data on return-> + // if best_k isn't what we just ran, re-run to get correct centroids and dist data on return-> // this saves memory - if (*k_star != oldmid) { + if (best_k[0] != oldmid) { centroids_view = - raft::make_device_matrix_view(centroids.view(), *k_star, d); - raft::cluster::kmeans_fit(handle, X.view(), std::nullptr, centroids_view, residual, maxiter); + raft::make_device_matrix_view(centroids.view(), best_k[0], d); + params.n_clusters = best_k[0]; + kmeans_fit_predict( + handle, params, X.view(), std::nullopt, centroids_view, labels.view(), residual, n_iter); } - - *maxiter = iters; } -} // namespace raft::cluster::kmeans::detail \ No newline at end of file +} // namespace raft::cluster::detail \ No newline at end of file diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index 2283d63282..f4aeecfdea 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -280,17 +280,15 @@ void transform(const raft::handle_t& handle, template void find_k(const raft::handle_t& handle, raft::device_matrix_view X, - raft::device_matrix_view best_centroids, - raft::device_vector_view best_labels, raft::host_scalar_view best_k, raft::host_scalar_view residual, - raft::host_scalar_view maxiter, + raft::host_scalar_view n_iter, idx_t kmax, - idx_t kmin = 1, - value_t tol = 1e-3) + idx_t kmin = 1, + idx_t maxiter = 100, + value_t tol = 1e-3) { - detail::find_k( - handle, X, best_centroids, best_labels, best_k, residual, maxiter, kmax, kmin, tol); + detail::find_k(handle, X, best_k, residual, n_iter, kmax, kmin, maxiter, tol); } /** From bef9b427c283de8bdef82a9b9841305afff96d23 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 8 Dec 2022 10:46:31 -0500 Subject: [PATCH 09/19] Adding docs --- cpp/include/raft/cluster/kmeans.cuh | 50 +++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index f4aeecfdea..ceb3bab524 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -263,32 +263,56 @@ void transform(const raft::handle_t& handle, /** * Automatically find the optimal value of k using a binary search. - * This method maximizes the Calinski-Harabasz Index while minimizing the per-cluster residuals. - * @tparam idx_t - * @tparam value_t + * This method maximizes the Calinski-Harabasz Index while minimizing the per-cluster inertia. + * + * @code{.cpp} + * #include + * #include + * #include + * + * #include + * + * using namespace raft::cluster; + * + * raft::handle_t handle; + * int n_samples = 100, n_features = 15, n_clusters = 10; + * auto X = raft::make_device_matrix(handle, n_samples, n_features); + * auto labels = raft::make_device_vector(handle, n_samples); + * + * raft::random::make_blobs(handle, X, labels, n_clusters); + * + * auto best_k = raft::make_host_scalar(); + * auto n_iter = raft::make_host_scalar(); + * auto inertia = raft::make_host_scalar(); + * + * kmeans::find_k(handle, X, best_k, inertia, n_iter, n_clusters+1); + * + * @endcode + * + * @tparam idx_t indexing type (should be integral) + * @tparam value_t value type (should be floating point) * @param handle raft handle * @param X input observations (shape n_samples, n_dims) - * @param best_centroids Best centroids returned from auto-find - * @param best_labels - * @param best_k - * @param residual - * @param maxiter - * @param kmax - * @param kmin - * @param tol + * @param best_k best k found from binary search + * @param inertia inertia of best k found + * @param n_iter number of iterations used to find best k + * @param kmax maximum k to try in search + * @param kmin minimum k to try in search (should be >= 1) + * @param maxiter maximum number of iterations to run + * @param tol tolerance for early stopping convergence */ template void find_k(const raft::handle_t& handle, raft::device_matrix_view X, raft::host_scalar_view best_k, - raft::host_scalar_view residual, + raft::host_scalar_view inertia, raft::host_scalar_view n_iter, idx_t kmax, idx_t kmin = 1, idx_t maxiter = 100, value_t tol = 1e-3) { - detail::find_k(handle, X, best_k, residual, n_iter, kmax, kmin, maxiter, tol); + detail::find_k(handle, X, best_k, inertia, n_iter, kmax, kmin, maxiter, tol); } /** From efb71ddec0eb96ececcfa05bc89dc368c2845a1c Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 8 Dec 2022 11:03:51 -0500 Subject: [PATCH 10/19] Adding googletest --- cpp/include/raft/cluster/kmeans.cuh | 2 +- cpp/test/CMakeLists.txt | 4 +- cpp/test/cluster/kmeans_find_k.cu | 146 ++++++++++++++++++++++++++++ 3 files changed, 149 insertions(+), 3 deletions(-) create mode 100644 cpp/test/cluster/kmeans_find_k.cu diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index ceb3bab524..eca597fb94 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -285,7 +285,7 @@ void transform(const raft::handle_t& handle, * auto n_iter = raft::make_host_scalar(); * auto inertia = raft::make_host_scalar(); * - * kmeans::find_k(handle, X, best_k, inertia, n_iter, n_clusters+1); + * kmeans::find_k(handle, X, best_k.view(), inertia.view(), n_iter.view(), n_clusters+1); * * @endcode * diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index a75eb3bff6..63f5eb7de5 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -77,8 +77,8 @@ endfunction() if(BUILD_TESTS) ConfigureTest( - NAME CLUSTER_TEST PATH test/cluster/kmeans.cu test/cluster_solvers.cu test/cluster/linkage.cu - OPTIONAL DIST NN + NAME CLUSTER_TEST PATH test/cluster/kmeans.cu test/cluster/kmeans_find_k.cu + test/cluster_solvers.cu test/cluster/linkage.cu OPTIONAL DIST NN ) ConfigureTest( diff --git a/cpp/test/cluster/kmeans_find_k.cu b/cpp/test/cluster/kmeans_find_k.cu new file mode 100644 index 0000000000..e7b19b874f --- /dev/null +++ b/cpp/test/cluster/kmeans_find_k.cu @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined RAFT_DISTANCE_COMPILED && defined RAFT_NN_COMPILED +#include +#endif + +namespace raft { + +template +struct KmeansFindKInputs { + int n_row; + int n_col; + int n_clusters; + T tol; + bool weighted; +}; + +template +class KmeansFindKTest : public ::testing::TestWithParam> { + protected: + KmeansFindKTest() : stream(handle.get_stream()) {} + + void basicTest() + { + testparams = ::testing::TestWithParam>::GetParam(); + + int n_samples = testparams.n_row; + int n_features = testparams.n_col; + + auto X = raft::make_device_matrix(handle, n_samples, n_features); + auto labels = raft::make_device_vector(handle, n_samples); + + raft::random::make_blobs(X.data_handle(), + labels.data_handle(), + n_samples, + n_features, + params.n_clusters, + stream, + true, + nullptr, + nullptr, + T(1.0), + false, + (T)-10.0f, + (T)10.0f, + (uint64_t)1234); + + // std::optional> d_sw = std::nullopt; + // auto d_centroids_view = + // raft::make_device_matrix_view(d_centroids.data(), + // params.n_clusters, n_features); + // if (testparams.weighted) { + // d_sample_weight.resize(n_samples, stream); + // d_sw = std::make_optional( + // raft::make_device_vector_view(d_sample_weight.data(), + // n_samples)); + // thrust::fill(thrust::cuda::par.on(stream), + // d_sample_weight.data(), + // d_sample_weight.data() + n_samples, + // 1); + // } + // + auto best_k = raft::make_host_scalar(); + auto inertia = raft::make_host_scalar(); + auto n_iter = raft::make_host_scalar(); + + auto X_view = + raft::make_device_matrix_view(X.data_handle(), X.extent(0), X.extent(1)); + + raft::cluster::kmeans::find_k( + handle, X_view, best_k.view(), inertia.view(), n_iter.view(), testparams.n_clusters + 2); + + handle.sync_stream(stream); + + assert(best_k[0] == testparams.n_clusters); + } + + void SetUp() override { basicTest(); } + + protected: + raft::handle_t handle; + cudaStream_t stream; + KmeansFindKInputs testparams; +}; + +const std::vector> inputsf2 = {{1000, 32, 5, 0.0001f, true}, + {1000, 32, 5, 0.0001f, false}, + {1000, 100, 20, 0.0001f, true}, + {1000, 100, 20, 0.0001f, false}, + {10000, 32, 10, 0.0001f, true}, + {10000, 32, 10, 0.0001f, false}, + {10000, 100, 50, 0.0001f, true}, + {10000, 100, 50, 0.0001f, false}, + {10000, 500, 100, 0.0001f, true}, + {10000, 500, 100, 0.0001f, false}}; + +const std::vector> inputsd2 = {{1000, 32, 5, 0.0001, true}, + {1000, 32, 5, 0.0001, false}, + {1000, 100, 20, 0.0001, true}, + {1000, 100, 20, 0.0001, false}, + {10000, 32, 10, 0.0001, true}, + {10000, 32, 10, 0.0001, false}, + {10000, 100, 50, 0.0001, true}, + {10000, 100, 50, 0.0001, false}, + {10000, 500, 100, 0.0001, true}, + {10000, 500, 100, 0.0001, false}}; + +typedef KmeansFindKTest KmeansFindKTestF; +TEST_P(KmeansFindKTestF, Result) { ASSERT_TRUE(score == 1.0); } + +typedef KmeansFindKTest KmeansFindKTestD; +TEST_P(KmeansFindKTestD, Result) { ASSERT_TRUE(score == 1.0); } + +INSTANTIATE_TEST_CASE_P(KmeansFindKTests, KmeansFindKTestF, ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_CASE_P(KmeansFindKTests, KmeansFindKTestD, ::testing::ValuesIn(inputsd2)); + +} // namespace raft From 45b9541ccb433a72d3b8e9dc64717ee1f5060889 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 8 Dec 2022 12:23:47 -0500 Subject: [PATCH 11/19] Getting auto-find-k tests to build --- cpp/include/raft/cluster/detail/kmeans.cuh | 22 +-- .../cluster/detail/kmeans_auto_find_k.cuh | 133 +++++++++++------- cpp/include/raft/cluster/kmeans.cuh | 8 +- cpp/test/cluster/kmeans_find_k.cu | 20 ++- 4 files changed, 109 insertions(+), 74 deletions(-) diff --git a/cpp/include/raft/cluster/detail/kmeans.cuh b/cpp/include/raft/cluster/detail/kmeans.cuh index 2d3481b4e1..16bef7f720 100644 --- a/cpp/include/raft/cluster/detail/kmeans.cuh +++ b/cpp/include/raft/cluster/detail/kmeans.cuh @@ -840,9 +840,9 @@ void initScalableKMeansPlusPlus(const raft::handle_t& handle, template void kmeans_fit(handle_t const& handle, const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - raft::device_matrix_view centroids, + raft::device_matrix_view X, + std::optional> sample_weight, + raft::device_matrix_view centroids, raft::host_scalar_view inertia, raft::host_scalar_view n_iter) { @@ -1004,10 +1004,10 @@ void kmeans_fit(handle_t const& handle, template void kmeans_predict(handle_t const& handle, const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - raft::device_matrix_view centroids, - raft::device_vector_view labels, + raft::device_matrix_view X, + std::optional> sample_weight, + raft::device_matrix_view centroids, + raft::device_vector_view labels, bool normalize_weight, raft::host_scalar_view inertia) { @@ -1149,10 +1149,10 @@ void kmeans_predict(handle_t const& handle, template void kmeans_fit_predict(handle_t const& handle, const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - std::optional> centroids, - raft::device_vector_view labels, + raft::device_matrix_view X, + std::optional> sample_weight, + std::optional> centroids, + raft::device_vector_view labels, raft::host_scalar_view inertia, raft::host_scalar_view n_iter) { diff --git a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh index 0fcd297646..6c9bb979dd 100644 --- a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh @@ -29,9 +29,9 @@ namespace raft::cluster::detail { template -void find_k(const raft::handle_t& handle, +void find_k(raft::handle_t const& handle, raft::device_matrix_view X, - raft::host_scalar_view best_k, + raft::host_scalar_view best_k, raft::host_scalar_view residual, raft::host_scalar_view n_iter, idx_t kmax, @@ -51,9 +51,9 @@ void find_k(const raft::handle_t& handle, // Allocate memory // Device memory - auto centroids = raft::make_device_matrix(kmax, X.extent(1)); - auto clusterSizes = raft::make_device_vector(kmax); - auto labels = raft::make_device_vector(n); + auto centroids = raft::make_device_matrix(handle, kmax, X.extent(1)); + auto clusterSizes = raft::make_device_vector(handle, kmax); + auto labels = raft::make_device_vector(handle, n); rmm::device_uvector workspace(0, handle.get_stream()); @@ -63,6 +63,10 @@ void find_k(const raft::handle_t& handle, auto results = raft::make_host_vector(kmax + 1); auto clusterDispersion = raft::make_host_vector(kmax + 1); + auto clusterDispertionView = clusterDispersion.view(); + auto resultsView = results.view(); + auto labels_view = raft::make_device_matrix_view(labels.data_handle(), n, d); + // Loop to find *best* k // Perform k-means in binary search int left = kmin; // must be at least 2 @@ -78,69 +82,97 @@ void find_k(const raft::handle_t& handle, params.tol = tol; auto centroids_view = - raft::make_device_matrix_view(centroids.view(), left, d); + raft::make_device_matrix_view(centroids.data_handle(), left, d); + + auto cluster_sizes_view = + raft::make_device_vector_view(clusterSizes_ptr, left); params.n_clusters = left; - kmeans_fit_predict( - handle, params, X.view(), std::nullopt, centroids_view, labels.view(), residual, n_iter); + kmeans_fit_predict(handle, + params, + X, + std::nullopt, + std::make_optional(centroids.view()), + labels.view(), + residual, + n_iter); - detail::countLabels(handle, labels.view(), clusterSizes.data_handle(), n, left, workspace); + detail::countLabels(handle, labels.data_handle(), clusterSizes.data_handle(), n, left, workspace); - results[left] = *residual; + resultsView[left] = residual[0]; - clusterDispersion[left] = - raft::stats::dispersion(handle, centroids_view, clusterSizes.view(), std::nullopt, n); + clusterDispertionView[left] = + raft::stats::cluster_dispersion(handle, centroids_view, cluster_sizes_view, std::nullopt, n); // eval right edge0 - results[right] = 1e20; - while (results[right] > results[left] && tests < 3) { + resultsView[right] = 1e20; + while (resultsView[right] > resultsView[left] && tests < 3) { centroids_view = - raft::make_device_matrix_view(centroids.view(), right, d); - - params.n_clusters = right; - kmeans_fit_predict( - handle, params, X.view(), std::nullopt, centroids_view, labels.view(), residual, n_iter); + raft::make_device_matrix_view(centroids.data_handle(), right, d); + cluster_sizes_view = raft::make_device_vector_view(clusterSizes_ptr, right); - detail::countLabels(handle, labels.view(), clusterSizes.data_handle(), n, right, workspace); + labels_view = raft::make_device_matrix_view(labels.data_handle(), left, d); - results[right] = residual[0]; - clusterDispersion[right] = - raft::stats::dispersion(handle, centroids_view, clusterSizes.view(), std::nullopt, n); + params.n_clusters = right; + kmeans_fit_predict(handle, + params, + X, + std::nullopt, + std::make_optional(centroids.view()), + labels.view(), + residual, + n_iter); + + detail::countLabels( + handle, labels.data_handle(), clusterSizes.data_handle(), n, right, workspace); + + resultsView[right] = residual[0]; + clusterDispertionView[right] = + raft::stats::cluster_dispersion(handle, centroids_view, cluster_sizes_view, std::nullopt, n); tests += 1; } - objective[0] = (n - left) / (left - 1) * clusterDispersion[left] / results[left]; - objective[1] = (n - right) / (right - 1) * clusterDispersion[right] / results[right]; + objective[0] = (n - left) / (left - 1) * clusterDispertionView[left] / resultsView[left]; + objective[1] = (n - right) / (right - 1) * clusterDispertionView[right] / resultsView[right]; while (left < right - 1) { - results[mid] = 1e20; - tests = 0; - while (results[mid] > results[left] && tests < 3) { + resultsView[mid] = 1e20; + tests = 0; + while (resultsView[mid] > resultsView[left] && tests < 3) { centroids_view = - raft::make_device_matrix_view(centroids.view(), mid, d); + raft::make_device_matrix_view(centroids.data_handle(), mid, d); + cluster_sizes_view = raft::make_device_vector_view(clusterSizes_ptr, mid); params.n_clusters = mid; - kmeans_fit_predict( - handle, params, X.view(), std::nullopt, centroids_view, labels.view(), residual, n_iter); - detail::countLabels(handle, labels.view(), clusterSizes.data_handle(), n, mid, workspace); + kmeans_fit_predict(handle, + params, + X, + std::nullopt, + std::make_optional(centroids.view()), + labels.view(), + residual, + n_iter); + + detail::countLabels( + handle, labels.data_handle(), clusterSizes.data_handle(), n, mid, workspace); - results[mid] = *residual; - clusterDispersion[mid] = - raft::stats::dispersion(handle, centroids_view, clusterSizes.view(), std::nullopt, n); + resultsView[mid] = residual[0]; + clusterDispertionView[mid] = raft::stats::cluster_dispersion( + handle, centroids_view, cluster_sizes_view, std::nullopt, n); - if (results[mid] > results[left] && (mid + 1) < right) { + if (resultsView[mid] > resultsView[left] && (mid + 1) < right) { mid += 1; - results[mid] = 1e20; - } else if (results[mid] > results[left] && (mid - 1) > left) { + resultsView[mid] = 1e20; + } else if (resultsView[mid] > resultsView[left] && (mid - 1) > left) { mid -= 1; - results[mid] = 1e20; + resultsView[mid] = 1e20; } tests += 1; } // maximize Calinski-Harabasz Index, minimize resid/ cluster - objective[0] = (n - left) / (left - 1) * clusterDispersion[left] / results[left]; - objective[1] = (n - right) / (right - 1) * clusterDispersion[right] / results[right]; - objective[2] = (n - mid) / (mid - 1) * clusterDispersion[mid] / results[mid]; + objective[0] = (n - left) / (left - 1) * clusterDispertionView[left] / resultsView[left]; + objective[1] = (n - right) / (right - 1) * clusterDispertionView[right] / resultsView[right]; + objective[2] = (n - mid) / (mid - 1) * clusterDispertionView[mid] / resultsView[mid]; objective[0] = (objective[2] - objective[0]) / (mid - left); objective[1] = (objective[1] - objective[2]) / (right - mid); @@ -154,8 +186,8 @@ void find_k(const raft::handle_t& handle, mid = int(floor((right + left) / 2)); } best_k[0] = right; - objective[0] = (n - left) / (left - 1) * clusterDispersion[left] / results[left]; - objective[1] = (n - oldmid) / (oldmid - 1) * clusterDispersion[oldmid] / results[oldmid]; + objective[0] = (n - left) / (left - 1) * clusterDispertionView[left] / resultsView[left]; + objective[1] = (n - oldmid) / (oldmid - 1) * clusterDispertionView[oldmid] / resultsView[oldmid]; if (objective[1] < objective[0]) { best_k[0] = left; } @@ -163,10 +195,17 @@ void find_k(const raft::handle_t& handle, // this saves memory if (best_k[0] != oldmid) { centroids_view = - raft::make_device_matrix_view(centroids.view(), best_k[0], d); + raft::make_device_matrix_view(centroids.data_handle(), best_k[0], d); + params.n_clusters = best_k[0]; - kmeans_fit_predict( - handle, params, X.view(), std::nullopt, centroids_view, labels.view(), residual, n_iter); + kmeans_fit_predict(handle, + params, + X, + std::nullopt, + std::make_optional(centroids.view()), + labels.view(), + residual, + n_iter); } } } // namespace raft::cluster::detail \ No newline at end of file diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index eca597fb94..e504cca869 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -281,9 +281,9 @@ void transform(const raft::handle_t& handle, * * raft::random::make_blobs(handle, X, labels, n_clusters); * - * auto best_k = raft::make_host_scalar(); - * auto n_iter = raft::make_host_scalar(); - * auto inertia = raft::make_host_scalar(); + * auto best_k = raft::make_host_scalar(0); + * auto n_iter = raft::make_host_scalar(0); + * auto inertia = raft::make_host_scalar(0); * * kmeans::find_k(handle, X, best_k.view(), inertia.view(), n_iter.view(), n_clusters+1); * @@ -304,7 +304,7 @@ void transform(const raft::handle_t& handle, template void find_k(const raft::handle_t& handle, raft::device_matrix_view X, - raft::host_scalar_view best_k, + raft::host_scalar_view best_k, raft::host_scalar_view inertia, raft::host_scalar_view n_iter, idx_t kmax, diff --git a/cpp/test/cluster/kmeans_find_k.cu b/cpp/test/cluster/kmeans_find_k.cu index e7b19b874f..a52fea0aa9 100644 --- a/cpp/test/cluster/kmeans_find_k.cu +++ b/cpp/test/cluster/kmeans_find_k.cu @@ -23,10 +23,7 @@ #include #include #include -#include #include -#include -#include #if defined RAFT_DISTANCE_COMPILED && defined RAFT_NN_COMPILED #include @@ -54,6 +51,7 @@ class KmeansFindKTest : public ::testing::TestWithParam> { int n_samples = testparams.n_row; int n_features = testparams.n_col; + int n_clusters = testparams.n_clusters; auto X = raft::make_device_matrix(handle, n_samples, n_features); auto labels = raft::make_device_vector(handle, n_samples); @@ -62,7 +60,7 @@ class KmeansFindKTest : public ::testing::TestWithParam> { labels.data_handle(), n_samples, n_features, - params.n_clusters, + n_clusters, stream, true, nullptr, @@ -88,19 +86,16 @@ class KmeansFindKTest : public ::testing::TestWithParam> { // 1); // } // - auto best_k = raft::make_host_scalar(); - auto inertia = raft::make_host_scalar(); - auto n_iter = raft::make_host_scalar(); + auto inertia = raft::make_host_scalar(0); + auto n_iter = raft::make_host_scalar(0); auto X_view = raft::make_device_matrix_view(X.data_handle(), X.extent(0), X.extent(1)); raft::cluster::kmeans::find_k( - handle, X_view, best_k.view(), inertia.view(), n_iter.view(), testparams.n_clusters + 2); + handle, X_view, best_k.view(), inertia.view(), n_iter.view(), n_clusters + 2); handle.sync_stream(stream); - - assert(best_k[0] == testparams.n_clusters); } void SetUp() override { basicTest(); } @@ -109,6 +104,7 @@ class KmeansFindKTest : public ::testing::TestWithParam> { raft::handle_t handle; cudaStream_t stream; KmeansFindKInputs testparams; + raft::host_scalar best_k; }; const std::vector> inputsf2 = {{1000, 32, 5, 0.0001f, true}, @@ -134,10 +130,10 @@ const std::vector> inputsd2 = {{1000, 32, 5, 0.0001, t {10000, 500, 100, 0.0001, false}}; typedef KmeansFindKTest KmeansFindKTestF; -TEST_P(KmeansFindKTestF, Result) { ASSERT_TRUE(score == 1.0); } +TEST_P(KmeansFindKTestF, Result) { ASSERT_TRUE(best_k.view()[0] == testparams.n_clusters); } typedef KmeansFindKTest KmeansFindKTestD; -TEST_P(KmeansFindKTestD, Result) { ASSERT_TRUE(score == 1.0); } +TEST_P(KmeansFindKTestD, Result) { ASSERT_TRUE(best_k.view()[0] == testparams.n_clusters); } INSTANTIATE_TEST_CASE_P(KmeansFindKTests, KmeansFindKTestF, ::testing::ValuesIn(inputsf2)); From 1d6ba6c8521f907637b6f100025076ff61a9c9fa Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 8 Dec 2022 18:33:58 -0500 Subject: [PATCH 12/19] Troubleshooting segmentation faults --- .../cluster/detail/kmeans_auto_find_k.cuh | 70 ++++++++++++++----- 1 file changed, 53 insertions(+), 17 deletions(-) diff --git a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh index 6c9bb979dd..6b8a82fd79 100644 --- a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh @@ -19,6 +19,8 @@ #include #include +#include + #include #include @@ -65,7 +67,6 @@ void find_k(raft::handle_t const& handle, auto clusterDispertionView = clusterDispersion.view(); auto resultsView = results.view(); - auto labels_view = raft::make_device_matrix_view(labels.data_handle(), n, d); // Loop to find *best* k // Perform k-means in binary search @@ -81,9 +82,14 @@ void find_k(raft::handle_t const& handle, params.max_iter = maxiter; params.tol = tol; - auto centroids_view = + RAFT_LOG_INFO("Running left."); + + auto centroids_const_view = raft::make_device_matrix_view(centroids.data_handle(), left, d); + auto centroids_view = + raft::make_device_matrix_view(centroids.data_handle(), left, d); + auto cluster_sizes_view = raft::make_device_vector_view(clusterSizes_ptr, left); @@ -92,32 +98,37 @@ void find_k(raft::handle_t const& handle, params, X, std::nullopt, - std::make_optional(centroids.view()), + std::make_optional(centroids_view), labels.view(), residual, n_iter); detail::countLabels(handle, labels.data_handle(), clusterSizes.data_handle(), n, left, workspace); + RAFT_LOG_INFO("Done left."); + resultsView[left] = residual[0]; - clusterDispertionView[left] = - raft::stats::cluster_dispersion(handle, centroids_view, cluster_sizes_view, std::nullopt, n); + clusterDispertionView[left] = raft::stats::cluster_dispersion( + handle, centroids_const_view, cluster_sizes_view, std::nullopt, n); // eval right edge0 resultsView[right] = 1e20; while (resultsView[right] > resultsView[left] && tests < 3) { - centroids_view = + RAFT_LOG_INFO("Running right."); + + centroids_const_view = raft::make_device_matrix_view(centroids.data_handle(), right, d); - cluster_sizes_view = raft::make_device_vector_view(clusterSizes_ptr, right); + centroids_view = + raft::make_device_matrix_view(centroids.data_handle(), right, d); - labels_view = raft::make_device_matrix_view(labels.data_handle(), left, d); + cluster_sizes_view = raft::make_device_vector_view(clusterSizes_ptr, right); params.n_clusters = right; kmeans_fit_predict(handle, params, X, std::nullopt, - std::make_optional(centroids.view()), + std::make_optional(centroids_view), labels.view(), residual, n_iter); @@ -125,9 +136,11 @@ void find_k(raft::handle_t const& handle, detail::countLabels( handle, labels.data_handle(), clusterSizes.data_handle(), n, right, workspace); - resultsView[right] = residual[0]; - clusterDispertionView[right] = - raft::stats::cluster_dispersion(handle, centroids_view, cluster_sizes_view, std::nullopt, n); + resultsView[right] = residual[0]; + clusterDispertionView[right] = raft::stats::cluster_dispersion( + handle, centroids_const_view, cluster_sizes_view, std::nullopt, n); + RAFT_LOG_INFO("Running done."); + tests += 1; } @@ -137,8 +150,13 @@ void find_k(raft::handle_t const& handle, resultsView[mid] = 1e20; tests = 0; while (resultsView[mid] > resultsView[left] && tests < 3) { - centroids_view = + RAFT_LOG_INFO("Running mid."); + + centroids_const_view = raft::make_device_matrix_view(centroids.data_handle(), mid, d); + centroids_view = + raft::make_device_matrix_view(centroids.data_handle(), mid, d); + cluster_sizes_view = raft::make_device_vector_view(clusterSizes_ptr, mid); params.n_clusters = mid; @@ -147,7 +165,7 @@ void find_k(raft::handle_t const& handle, params, X, std::nullopt, - std::make_optional(centroids.view()), + std::make_optional(centroids_view), labels.view(), residual, n_iter); @@ -157,7 +175,9 @@ void find_k(raft::handle_t const& handle, resultsView[mid] = residual[0]; clusterDispertionView[mid] = raft::stats::cluster_dispersion( - handle, centroids_view, cluster_sizes_view, std::nullopt, n); + handle, centroids_const_view, cluster_sizes_view, std::nullopt, n); + + RAFT_LOG_INFO("Done."); if (resultsView[mid] > resultsView[left] && (mid + 1) < right) { mid += 1; @@ -169,6 +189,8 @@ void find_k(raft::handle_t const& handle, tests += 1; } + RAFT_LOG_INFO("S3etting objective things"); + // maximize Calinski-Harabasz Index, minimize resid/ cluster objective[0] = (n - left) / (left - 1) * clusterDispertionView[left] / resultsView[left]; objective[1] = (n - right) / (right - 1) * clusterDispertionView[right] / resultsView[right]; @@ -176,6 +198,8 @@ void find_k(raft::handle_t const& handle, objective[0] = (objective[2] - objective[0]) / (mid - left); objective[1] = (objective[1] - objective[2]) / (right - mid); + RAFT_LOG_INFO("One"); + if (objective[0] > 0 && objective[1] < 0) { // our point is in the left-of-mid side right = mid; @@ -185,27 +209,39 @@ void find_k(raft::handle_t const& handle, oldmid = mid; mid = int(floor((right + left) / 2)); } + + RAFT_LOG_INFO("Two"); + best_k[0] = right; objective[0] = (n - left) / (left - 1) * clusterDispertionView[left] / resultsView[left]; + + RAFT_LOG_INFO("TWO21"); objective[1] = (n - oldmid) / (oldmid - 1) * clusterDispertionView[oldmid] / resultsView[oldmid]; + RAFT_LOG_INFO("TWO2"); if (objective[1] < objective[0]) { best_k[0] = left; } + RAFT_LOG_INFO("Three"); + // if best_k isn't what we just ran, re-run to get correct centroids and dist data on return-> // this saves memory if (best_k[0] != oldmid) { + RAFT_LOG_INFO("Running final."); + centroids_view = - raft::make_device_matrix_view(centroids.data_handle(), best_k[0], d); + raft::make_device_matrix_view(centroids.data_handle(), best_k[0], d); params.n_clusters = best_k[0]; kmeans_fit_predict(handle, params, X, std::nullopt, - std::make_optional(centroids.view()), + std::make_optional(centroids_view), labels.view(), residual, n_iter); + + RAFT_LOG_INFO("Done."); } } } // namespace raft::cluster::detail \ No newline at end of file From 4c136eadc3e5c53fe6c6b516d0173c3fad5e8b07 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 9 Dec 2022 12:49:19 -0500 Subject: [PATCH 13/19] The tests pass! --- .../cluster/detail/kmeans_auto_find_k.cuh | 28 ------------------- cpp/test/cluster/kmeans_find_k.cu | 17 +---------- 2 files changed, 1 insertion(+), 44 deletions(-) diff --git a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh index 6b8a82fd79..6b83a173ff 100644 --- a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh @@ -82,8 +82,6 @@ void find_k(raft::handle_t const& handle, params.max_iter = maxiter; params.tol = tol; - RAFT_LOG_INFO("Running left."); - auto centroids_const_view = raft::make_device_matrix_view(centroids.data_handle(), left, d); @@ -104,9 +102,6 @@ void find_k(raft::handle_t const& handle, n_iter); detail::countLabels(handle, labels.data_handle(), clusterSizes.data_handle(), n, left, workspace); - - RAFT_LOG_INFO("Done left."); - resultsView[left] = residual[0]; clusterDispertionView[left] = raft::stats::cluster_dispersion( @@ -114,8 +109,6 @@ void find_k(raft::handle_t const& handle, // eval right edge0 resultsView[right] = 1e20; while (resultsView[right] > resultsView[left] && tests < 3) { - RAFT_LOG_INFO("Running right."); - centroids_const_view = raft::make_device_matrix_view(centroids.data_handle(), right, d); centroids_view = @@ -139,7 +132,6 @@ void find_k(raft::handle_t const& handle, resultsView[right] = residual[0]; clusterDispertionView[right] = raft::stats::cluster_dispersion( handle, centroids_const_view, cluster_sizes_view, std::nullopt, n); - RAFT_LOG_INFO("Running done."); tests += 1; } @@ -150,8 +142,6 @@ void find_k(raft::handle_t const& handle, resultsView[mid] = 1e20; tests = 0; while (resultsView[mid] > resultsView[left] && tests < 3) { - RAFT_LOG_INFO("Running mid."); - centroids_const_view = raft::make_device_matrix_view(centroids.data_handle(), mid, d); centroids_view = @@ -177,8 +167,6 @@ void find_k(raft::handle_t const& handle, clusterDispertionView[mid] = raft::stats::cluster_dispersion( handle, centroids_const_view, cluster_sizes_view, std::nullopt, n); - RAFT_LOG_INFO("Done."); - if (resultsView[mid] > resultsView[left] && (mid + 1) < right) { mid += 1; resultsView[mid] = 1e20; @@ -189,8 +177,6 @@ void find_k(raft::handle_t const& handle, tests += 1; } - RAFT_LOG_INFO("S3etting objective things"); - // maximize Calinski-Harabasz Index, minimize resid/ cluster objective[0] = (n - left) / (left - 1) * clusterDispertionView[left] / resultsView[left]; objective[1] = (n - right) / (right - 1) * clusterDispertionView[right] / resultsView[right]; @@ -198,8 +184,6 @@ void find_k(raft::handle_t const& handle, objective[0] = (objective[2] - objective[0]) / (mid - left); objective[1] = (objective[1] - objective[2]) / (right - mid); - RAFT_LOG_INFO("One"); - if (objective[0] > 0 && objective[1] < 0) { // our point is in the left-of-mid side right = mid; @@ -210,24 +194,14 @@ void find_k(raft::handle_t const& handle, mid = int(floor((right + left) / 2)); } - RAFT_LOG_INFO("Two"); - best_k[0] = right; objective[0] = (n - left) / (left - 1) * clusterDispertionView[left] / resultsView[left]; - - RAFT_LOG_INFO("TWO21"); objective[1] = (n - oldmid) / (oldmid - 1) * clusterDispertionView[oldmid] / resultsView[oldmid]; - - RAFT_LOG_INFO("TWO2"); if (objective[1] < objective[0]) { best_k[0] = left; } - RAFT_LOG_INFO("Three"); - // if best_k isn't what we just ran, re-run to get correct centroids and dist data on return-> // this saves memory if (best_k[0] != oldmid) { - RAFT_LOG_INFO("Running final."); - centroids_view = raft::make_device_matrix_view(centroids.data_handle(), best_k[0], d); @@ -240,8 +214,6 @@ void find_k(raft::handle_t const& handle, labels.view(), residual, n_iter); - - RAFT_LOG_INFO("Done."); } } } // namespace raft::cluster::detail \ No newline at end of file diff --git a/cpp/test/cluster/kmeans_find_k.cu b/cpp/test/cluster/kmeans_find_k.cu index a52fea0aa9..e19ec6317a 100644 --- a/cpp/test/cluster/kmeans_find_k.cu +++ b/cpp/test/cluster/kmeans_find_k.cu @@ -43,7 +43,7 @@ struct KmeansFindKInputs { template class KmeansFindKTest : public ::testing::TestWithParam> { protected: - KmeansFindKTest() : stream(handle.get_stream()) {} + KmeansFindKTest() : stream(handle.get_stream()), best_k(raft::make_host_scalar(0)) {} void basicTest() { @@ -71,21 +71,6 @@ class KmeansFindKTest : public ::testing::TestWithParam> { (T)10.0f, (uint64_t)1234); - // std::optional> d_sw = std::nullopt; - // auto d_centroids_view = - // raft::make_device_matrix_view(d_centroids.data(), - // params.n_clusters, n_features); - // if (testparams.weighted) { - // d_sample_weight.resize(n_samples, stream); - // d_sw = std::make_optional( - // raft::make_device_vector_view(d_sample_weight.data(), - // n_samples)); - // thrust::fill(thrust::cuda::par.on(stream), - // d_sample_weight.data(), - // d_sample_weight.data() + n_samples, - // 1); - // } - // auto inertia = raft::make_host_scalar(0); auto n_iter = raft::make_host_scalar(0); From 5c785286bd07781f3509e794520d875a2722797e Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 12 Dec 2022 11:34:16 -0500 Subject: [PATCH 14/19] Making within-cluster variance super small --- cpp/test/cluster/kmeans_find_k.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/test/cluster/kmeans_find_k.cu b/cpp/test/cluster/kmeans_find_k.cu index e19ec6317a..0a4ae08e8f 100644 --- a/cpp/test/cluster/kmeans_find_k.cu +++ b/cpp/test/cluster/kmeans_find_k.cu @@ -65,7 +65,7 @@ class KmeansFindKTest : public ::testing::TestWithParam> { true, nullptr, nullptr, - T(1.0), + T(.00001), false, (T)-10.0f, (T)10.0f, From 77ec3ee6e0d7026c0bd502c5e147765763b080c8 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 16 Feb 2023 17:27:19 -0500 Subject: [PATCH 15/19] Some fixes and updates --- cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh | 6 +++--- cpp/include/raft/cluster/kmeans.cuh | 2 +- cpp/test/cluster/kmeans_find_k.cu | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh index 6b83a173ff..143ae1db1b 100644 --- a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,13 +25,13 @@ #include -#include +#include #include namespace raft::cluster::detail { template -void find_k(raft::handle_t const& handle, +void find_k(raft::device_resources const& handle, raft::device_matrix_view X, raft::host_scalar_view best_k, raft::host_scalar_view residual, diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index 425d6f6ed0..da5f0458ad 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -303,7 +303,7 @@ void transform(raft::device_resources const& handle, * @param tol tolerance for early stopping convergence */ template -void find_k(const raft::handle_t& handle, +void find_k(raft::device_resources const& handle, raft::device_matrix_view X, raft::host_scalar_view best_k, raft::host_scalar_view inertia, diff --git a/cpp/test/cluster/kmeans_find_k.cu b/cpp/test/cluster/kmeans_find_k.cu index 0a4ae08e8f..213c2e810c 100644 --- a/cpp/test/cluster/kmeans_find_k.cu +++ b/cpp/test/cluster/kmeans_find_k.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include @@ -86,7 +86,7 @@ class KmeansFindKTest : public ::testing::TestWithParam> { void SetUp() override { basicTest(); } protected: - raft::handle_t handle; + raft::device_resources handle; cudaStream_t stream; KmeansFindKInputs testparams; raft::host_scalar best_k; From abe5a1d1fff46a8a8aad333ccc7f0971fc8a4ff3 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 16 Feb 2023 19:49:16 -0500 Subject: [PATCH 16/19] linking in kmeans_auto_find_k googletest --- cpp/test/CMakeLists.txt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 073681cd65..8071c1e8aa 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -78,8 +78,17 @@ endfunction() if(BUILD_TESTS) ConfigureTest( - NAME CLUSTER_TEST PATH test/cluster/kmeans.cu test/cluster/kmeans_balanced.cu - test/cluster/cluster_solvers.cu test/cluster/linkage.cu OPTIONAL DIST NN + NAME + CLUSTER_TEST + PATH + test/cluster/kmeans.cu + test/cluster/kmeans_balanced.cu + test/cluster/cluster_solvers.cu + test/cluster/linkage.cu + test/cluster/kmeans_find_k.cu + OPTIONAL + DIST + NN ) ConfigureTest( From 15ee9fe79d2dcc7a140339e33f296d2d0aad45e6 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 16 Feb 2023 21:43:32 -0500 Subject: [PATCH 17/19] Build fixes --- .../cluster/detail/kmeans_auto_find_k.cuh | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh index 143ae1db1b..d12d556236 100644 --- a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh @@ -92,14 +92,14 @@ void find_k(raft::device_resources const& handle, raft::make_device_vector_view(clusterSizes_ptr, left); params.n_clusters = left; - kmeans_fit_predict(handle, - params, - X, - std::nullopt, - std::make_optional(centroids_view), - labels.view(), - residual, - n_iter); + raft::cluster::detail::kmeans_fit_predict(handle, + params, + X, + std::nullopt, + std::make_optional(centroids_view), + labels.view(), + residual, + n_iter); detail::countLabels(handle, labels.data_handle(), clusterSizes.data_handle(), n, left, workspace); resultsView[left] = residual[0]; @@ -117,14 +117,14 @@ void find_k(raft::device_resources const& handle, cluster_sizes_view = raft::make_device_vector_view(clusterSizes_ptr, right); params.n_clusters = right; - kmeans_fit_predict(handle, - params, - X, - std::nullopt, - std::make_optional(centroids_view), - labels.view(), - residual, - n_iter); + raft::cluster::detail::kmeans_fit_predict(handle, + params, + X, + std::nullopt, + std::make_optional(centroids_view), + labels.view(), + residual, + n_iter); detail::countLabels( handle, labels.data_handle(), clusterSizes.data_handle(), n, right, workspace); @@ -151,14 +151,14 @@ void find_k(raft::device_resources const& handle, params.n_clusters = mid; - kmeans_fit_predict(handle, - params, - X, - std::nullopt, - std::make_optional(centroids_view), - labels.view(), - residual, - n_iter); + raft::cluster::detail::kmeans_fit_predict(handle, + params, + X, + std::nullopt, + std::make_optional(centroids_view), + labels.view(), + residual, + n_iter); detail::countLabels( handle, labels.data_handle(), clusterSizes.data_handle(), n, mid, workspace); @@ -206,14 +206,14 @@ void find_k(raft::device_resources const& handle, raft::make_device_matrix_view(centroids.data_handle(), best_k[0], d); params.n_clusters = best_k[0]; - kmeans_fit_predict(handle, - params, - X, - std::nullopt, - std::make_optional(centroids_view), - labels.view(), - residual, - n_iter); + raft::cluster::detail::kmeans_fit_predict(handle, + params, + X, + std::nullopt, + std::make_optional(centroids_view), + labels.view(), + residual, + n_iter); } } } // namespace raft::cluster::detail \ No newline at end of file From 058b6e448df071a9d56c70f78615d76484dc4ff8 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 17 Feb 2023 17:16:28 -0500 Subject: [PATCH 18/19] Hopefully tests pass this timey --- .../cluster/detail/kmeans_auto_find_k.cuh | 6 +-- cpp/test/cluster/kmeans_find_k.cu | 43 ++++++++++++------- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh index d12d556236..8ec435b5d9 100644 --- a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh @@ -39,7 +39,7 @@ void find_k(raft::device_resources const& handle, idx_t kmax, idx_t kmin = 1, idx_t maxiter = 100, - value_t tol = 1e-3) + value_t tol = 1e-2) { idx_t n = X.extent(0); idx_t d = X.extent(1); @@ -72,7 +72,7 @@ void find_k(raft::device_resources const& handle, // Perform k-means in binary search int left = kmin; // must be at least 2 int right = kmax; // int(floor(len(data)/2)) #assumption of clusters of size 2 at least - int mid = int(floor((right + left) / 2)); + int mid = ((unsigned int)left + (unsigned int)right) >> 1; int oldmid = mid; int tests = 0; value_t objective[3]; // 0= left of mid, 1= right of mid @@ -191,7 +191,7 @@ void find_k(raft::device_resources const& handle, left = mid; } oldmid = mid; - mid = int(floor((right + left) / 2)); + mid = ((unsigned int)right + (unsigned int)left) >> 1; } best_k[0] = right; diff --git a/cpp/test/cluster/kmeans_find_k.cu b/cpp/test/cluster/kmeans_find_k.cu index 213c2e810c..5ac4ebd293 100644 --- a/cpp/test/cluster/kmeans_find_k.cu +++ b/cpp/test/cluster/kmeans_find_k.cu @@ -65,7 +65,7 @@ class KmeansFindKTest : public ::testing::TestWithParam> { true, nullptr, nullptr, - T(.00001), + T(.001), false, (T)-10.0f, (T)10.0f, @@ -77,8 +77,8 @@ class KmeansFindKTest : public ::testing::TestWithParam> { auto X_view = raft::make_device_matrix_view(X.data_handle(), X.extent(0), X.extent(1)); - raft::cluster::kmeans::find_k( - handle, X_view, best_k.view(), inertia.view(), n_iter.view(), n_clusters + 2); + raft::cluster::kmeans::find_k( + handle, X_view, best_k.view(), inertia.view(), n_iter.view(), n_clusters); handle.sync_stream(stream); } @@ -92,16 +92,16 @@ class KmeansFindKTest : public ::testing::TestWithParam> { raft::host_scalar best_k; }; -const std::vector> inputsf2 = {{1000, 32, 5, 0.0001f, true}, - {1000, 32, 5, 0.0001f, false}, - {1000, 100, 20, 0.0001f, true}, - {1000, 100, 20, 0.0001f, false}, - {10000, 32, 10, 0.0001f, true}, - {10000, 32, 10, 0.0001f, false}, - {10000, 100, 50, 0.0001f, true}, - {10000, 100, 50, 0.0001f, false}, - {10000, 500, 100, 0.0001f, true}, - {10000, 500, 100, 0.0001f, false}}; +const std::vector> inputsf2 = {{1000, 32, 8, 0.001f, true}, + {1000, 32, 8, 0.001f, false}, + {1000, 100, 20, 0.001f, true}, + {1000, 100, 20, 0.001f, false}, + {10000, 32, 10, 0.001f, true}, + {10000, 32, 10, 0.001f, false}, + {10000, 100, 50, 0.001f, true}, + {10000, 100, 50, 0.001f, false}, + {10000, 500, 100, 0.001f, true}, + {10000, 500, 100, 0.001f, false}}; const std::vector> inputsd2 = {{1000, 32, 5, 0.0001, true}, {1000, 32, 5, 0.0001, false}, @@ -115,10 +115,23 @@ const std::vector> inputsd2 = {{1000, 32, 5, 0.0001, t {10000, 500, 100, 0.0001, false}}; typedef KmeansFindKTest KmeansFindKTestF; -TEST_P(KmeansFindKTestF, Result) { ASSERT_TRUE(best_k.view()[0] == testparams.n_clusters); } +TEST_P(KmeansFindKTestF, Result) +{ + if (best_k.view()[0] != testparams.n_clusters) { + std::cout << best_k.view()[0] << " " << testparams.n_clusters << std::endl; + } + ASSERT_TRUE(best_k.view()[0] == testparams.n_clusters); +} typedef KmeansFindKTest KmeansFindKTestD; -TEST_P(KmeansFindKTestD, Result) { ASSERT_TRUE(best_k.view()[0] == testparams.n_clusters); } +TEST_P(KmeansFindKTestD, Result) +{ + if (best_k.view()[0] != testparams.n_clusters) { + std::cout << best_k.view()[0] << " " << testparams.n_clusters << std::endl; + } + + ASSERT_TRUE(best_k.view()[0] == testparams.n_clusters); +} INSTANTIATE_TEST_CASE_P(KmeansFindKTests, KmeansFindKTestF, ::testing::ValuesIn(inputsf2)); From c77d09e6ba4d9d7b9ab98ce8b44a6e902db433ac Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 18 Feb 2023 16:19:34 -0500 Subject: [PATCH 19/19] Implementing review feedback --- .../cluster/detail/kmeans_auto_find_k.cuh | 145 ++++++++++-------- 1 file changed, 79 insertions(+), 66 deletions(-) diff --git a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh index 8ec435b5d9..edc74a085f 100644 --- a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh @@ -30,6 +30,41 @@ namespace raft::cluster::detail { +template +void compute_dispersion(raft::device_resources const& handle, + raft::device_matrix_view X, + KMeansParams& params, + raft::device_matrix_view centroids_view, + raft::device_vector_view labels, + raft::device_vector_view clusterSizes, + rmm::device_uvector& workspace, + raft::host_vector_view clusterDispertionView, + raft::host_vector_view resultsView, + raft::host_scalar_view residual, + raft::host_scalar_view n_iter, + int val, + idx_t n, + idx_t d) +{ + auto centroids_const_view = + raft::make_device_matrix_view(centroids_view.data_handle(), val, d); + + idx_t* clusterSizes_ptr = clusterSizes.data_handle(); + auto cluster_sizes_view = + raft::make_device_vector_view(clusterSizes_ptr, val); + + params.n_clusters = val; + + raft::cluster::detail::kmeans_fit_predict( + handle, params, X, std::nullopt, std::make_optional(centroids_view), labels, residual, n_iter); + + detail::countLabels(handle, labels.data_handle(), clusterSizes.data_handle(), n, val, workspace); + + resultsView[val] = residual[0]; + clusterDispertionView[val] = raft::stats::cluster_dispersion( + handle, centroids_const_view, cluster_sizes_view, std::nullopt, n); +} + template void find_k(raft::device_resources const& handle, raft::device_matrix_view X, @@ -75,63 +110,49 @@ void find_k(raft::device_resources const& handle, int mid = ((unsigned int)left + (unsigned int)right) >> 1; int oldmid = mid; int tests = 0; - value_t objective[3]; // 0= left of mid, 1= right of mid + double objective[3]; // 0= left of mid, 1= right of mid if (left == 1) left = 2; // at least do 2 clusters KMeansParams params; params.max_iter = maxiter; params.tol = tol; - auto centroids_const_view = - raft::make_device_matrix_view(centroids.data_handle(), left, d); - auto centroids_view = raft::make_device_matrix_view(centroids.data_handle(), left, d); + compute_dispersion(handle, + X, + params, + centroids_view, + labels.view(), + clusterSizes.view(), + workspace, + clusterDispertionView, + resultsView, + residual, + n_iter, + left, + n, + d); - auto cluster_sizes_view = - raft::make_device_vector_view(clusterSizes_ptr, left); - - params.n_clusters = left; - raft::cluster::detail::kmeans_fit_predict(handle, - params, - X, - std::nullopt, - std::make_optional(centroids_view), - labels.view(), - residual, - n_iter); - - detail::countLabels(handle, labels.data_handle(), clusterSizes.data_handle(), n, left, workspace); - resultsView[left] = residual[0]; - - clusterDispertionView[left] = raft::stats::cluster_dispersion( - handle, centroids_const_view, cluster_sizes_view, std::nullopt, n); // eval right edge0 resultsView[right] = 1e20; while (resultsView[right] > resultsView[left] && tests < 3) { - centroids_const_view = - raft::make_device_matrix_view(centroids.data_handle(), right, d); centroids_view = raft::make_device_matrix_view(centroids.data_handle(), right, d); - - cluster_sizes_view = raft::make_device_vector_view(clusterSizes_ptr, right); - - params.n_clusters = right; - raft::cluster::detail::kmeans_fit_predict(handle, - params, - X, - std::nullopt, - std::make_optional(centroids_view), - labels.view(), - residual, - n_iter); - - detail::countLabels( - handle, labels.data_handle(), clusterSizes.data_handle(), n, right, workspace); - - resultsView[right] = residual[0]; - clusterDispertionView[right] = raft::stats::cluster_dispersion( - handle, centroids_const_view, cluster_sizes_view, std::nullopt, n); + compute_dispersion(handle, + X, + params, + centroids_view, + labels.view(), + clusterSizes.view(), + workspace, + clusterDispertionView, + resultsView, + residual, + n_iter, + right, + n, + d); tests += 1; } @@ -142,30 +163,22 @@ void find_k(raft::device_resources const& handle, resultsView[mid] = 1e20; tests = 0; while (resultsView[mid] > resultsView[left] && tests < 3) { - centroids_const_view = - raft::make_device_matrix_view(centroids.data_handle(), mid, d); centroids_view = raft::make_device_matrix_view(centroids.data_handle(), mid, d); - - cluster_sizes_view = raft::make_device_vector_view(clusterSizes_ptr, mid); - - params.n_clusters = mid; - - raft::cluster::detail::kmeans_fit_predict(handle, - params, - X, - std::nullopt, - std::make_optional(centroids_view), - labels.view(), - residual, - n_iter); - - detail::countLabels( - handle, labels.data_handle(), clusterSizes.data_handle(), n, mid, workspace); - - resultsView[mid] = residual[0]; - clusterDispertionView[mid] = raft::stats::cluster_dispersion( - handle, centroids_const_view, cluster_sizes_view, std::nullopt, n); + compute_dispersion(handle, + X, + params, + centroids_view, + labels.view(), + clusterSizes.view(), + workspace, + clusterDispertionView, + resultsView, + residual, + n_iter, + mid, + n, + d); if (resultsView[mid] > resultsView[left] && (mid + 1) < right) { mid += 1; @@ -202,7 +215,7 @@ void find_k(raft::device_resources const& handle, // if best_k isn't what we just ran, re-run to get correct centroids and dist data on return-> // this saves memory if (best_k[0] != oldmid) { - centroids_view = + auto centroids_view = raft::make_device_matrix_view(centroids.data_handle(), best_k[0], d); params.n_clusters = best_k[0];