From e378d365a7ca35a197fff54f7bfc86cc6d31320f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 12 Feb 2024 16:29:09 -0500 Subject: [PATCH 01/13] Fixing cusparse aligned address issue and adding note --- cpp/include/raft/sparse/linalg/spmm.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/include/raft/sparse/linalg/spmm.hpp b/cpp/include/raft/sparse/linalg/spmm.hpp index 1e815ba521..b0814093e9 100644 --- a/cpp/include/raft/sparse/linalg/spmm.hpp +++ b/cpp/include/raft/sparse/linalg/spmm.hpp @@ -72,6 +72,10 @@ void spmm(raft::resources const& handle, detail::spmm(handle, trans_x, trans_y, is_row_major, alpha, descr_x, descr_y, beta, descr_z); + // WARNING: Do not remove the following copy unless you can, with certainty, say that + // the underlying cuSPARSE issue affecting CUA 12.2+ has been resolved. + raft::copy( + z.data_handle(), z_tmp.data_handle(), z_tmp.size(), raft::resource::get_cuda_stream(handle)); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroySpMat(descr_x)); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descr_y)); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descr_z)); From 1a05bb94211ba89a1600b27ac76f56d2db9e79b3 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 12 Feb 2024 16:45:17 -0500 Subject: [PATCH 02/13] Adding reproducer for strange cusparse 12.2 bug. This will at least make sure we fail until there's a better fix --- cpp/test/CMakeLists.txt | 4 +- .../cluster/cluster_solvers_deprecated.cu | 2 +- cpp/test/cluster/spectral.cu | 102 ++++++++++++++++++ 3 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 cpp/test/cluster/spectral.cu diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 931530b66a..e36e86345a 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -101,9 +101,11 @@ if(BUILD_TESTS) PATH test/cluster/kmeans.cu test/cluster/kmeans_balanced.cu + test/cluster/kmeans_find_k.cu test/cluster/cluster_solvers.cu test/cluster/linkage.cu - test/cluster/kmeans_find_k.cu + test/cluster/spectral.cu + LIB EXPLICIT_INSTANTIATE_ONLY ) diff --git a/cpp/test/cluster/cluster_solvers_deprecated.cu b/cpp/test/cluster/cluster_solvers_deprecated.cu index 5af8fda5ff..a95b013852 100644 --- a/cpp/test/cluster/cluster_solvers_deprecated.cu +++ b/cpp/test/cluster/cluster_solvers_deprecated.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/cluster/spectral.cu b/cpp/test/cluster/spectral.cu new file mode 100644 index 0000000000..1b9b633dce --- /dev/null +++ b/cpp/test/cluster/spectral.cu @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace cluster { + +TEST(Raft, Spectral) +{ + raft::handle_t handle; + + std::vector h_offsets({0, 2, 4, 7, 10, 12, 14}); + std::vector h_indices({1, 2, 0, 2, 0, 1, 3, 2, 4, 5, 3, 5, 3, 4}); + std::vector h_values( + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + std::vector expected_clustering({1, 1, 1, 0, 0, 0}); + + int32_t n_clusters{2}; + int32_t n_eigenvectors{2}; + int32_t evs_max_it{100}; + int32_t kmean_max_it{100}; + int32_t restartIter_lanczos = 15 + n_eigenvectors; + float evs_tol{0.001}; + float kmean_tol{0.001}; + unsigned long long seed1{1234567}; + unsigned long long seed2{12345678}; + bool reorthog{false}; + + rmm::device_uvector offsets(h_offsets.size(), handle.get_stream()); + rmm::device_uvector indices(h_indices.size(), handle.get_stream()); + rmm::device_uvector values(h_indices.size(), handle.get_stream()); + rmm::device_uvector clustering(expected_clustering.size(), handle.get_stream()); + rmm::device_uvector eigenvalues(n_eigenvectors, handle.get_stream()); + rmm::device_uvector eigenvectors(n_eigenvectors * expected_clustering.size(), + handle.get_stream()); + + rmm::device_uvector exp_dev(expected_clustering.size()); + + raft::update_device( + exp_dev.data(), expected_clustering.data(), expected_clustering.size(), handle.get_stream()); + + raft::update_device(offsets.data(), h_offsets.data(), h_offsets.size(), handle.get_stream()); + raft::update_device(indices.data(), h_indices.data(), h_indices.size(), handle.get_stream()); + raft::update_device(values.data(), h_values.data(), h_values.size(), handle.get_stream()); + + raft::print_device_vector(" offsets", offsets.data(), offsets.size(), std::cout); + raft::print_device_vector(" indices", indices.data(), indices.size(), std::cout); + raft::print_device_vector(" values", values.data(), values.size(), std::cout); + + raft::spectral::matrix::sparse_matrix_t const matrix{ + handle, + offsets.data(), + indices.data(), + values.data(), + static_cast(offsets.size() - 1), + static_cast(indices.size())}; + + raft::spectral::eigen_solver_config_t eig_cfg{ + n_eigenvectors, evs_max_it, restartIter_lanczos, evs_tol, reorthog, seed1}; + raft::spectral::lanczos_solver_t eig_solver{eig_cfg}; + + raft::spectral::cluster_solver_config_t clust_cfg{ + n_clusters, kmean_max_it, kmean_tol, seed2}; + raft::spectral::kmeans_solver_t cluster_solver{clust_cfg}; + + raft::spectral::partition(handle, + matrix, + eig_solver, + cluster_solver, + clustering.data(), + eigenvalues.data(), + eigenvectors.data()); + + ASSERT_TRUE(devArrMatch(expected_clustering.data(), + exp_dev.data(), + exp_dev.size(), + 1, + raft::Compare(), + stream)); +} + +} // namespace cluster +} // namespace raft \ No newline at end of file From 3bfa018bcad4ebf2b58e76972907a19b2c9c3379 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 12 Feb 2024 16:47:33 -0500 Subject: [PATCH 03/13] Adding description to spectral test --- cpp/test/cluster/spectral.cu | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/test/cluster/spectral.cu b/cpp/test/cluster/spectral.cu index 1b9b633dce..7e1ce9eec0 100644 --- a/cpp/test/cluster/spectral.cu +++ b/cpp/test/cluster/spectral.cu @@ -24,6 +24,13 @@ namespace raft { namespace cluster { +/** + * Warning: There appears to be a CUDA 12.2 bug in cusparse that causes an + * alignment issue. We've fixed the bug in our code through a workaround + * (see raft/sparse/linalg/spmm.hpp for fix). This test is meant to fail + * in the case where the fix is accidentally reverted, so that it doesn't + * break any downstream libraries that depend on RAFT + */ TEST(Raft, Spectral) { raft::handle_t handle; From 87813ae33dc4320e4bf792bb3ed6bd4e0b3c0abd Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 12 Feb 2024 16:49:13 -0500 Subject: [PATCH 04/13] Update cpp/include/raft/sparse/linalg/spmm.hpp Co-authored-by: Bradley Dice --- cpp/include/raft/sparse/linalg/spmm.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/sparse/linalg/spmm.hpp b/cpp/include/raft/sparse/linalg/spmm.hpp index b0814093e9..9f55a00c73 100644 --- a/cpp/include/raft/sparse/linalg/spmm.hpp +++ b/cpp/include/raft/sparse/linalg/spmm.hpp @@ -73,7 +73,7 @@ void spmm(raft::resources const& handle, detail::spmm(handle, trans_x, trans_y, is_row_major, alpha, descr_x, descr_y, beta, descr_z); // WARNING: Do not remove the following copy unless you can, with certainty, say that - // the underlying cuSPARSE issue affecting CUA 12.2+ has been resolved. + // the underlying cuSPARSE issue affecting CUDA 12.2+ has been resolved. raft::copy( z.data_handle(), z_tmp.data_handle(), z_tmp.size(), raft::resource::get_cuda_stream(handle)); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroySpMat(descr_x)); From 3a8e048840d0b67fc2083db373d9a60f3dc5393f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 12 Feb 2024 16:50:17 -0500 Subject: [PATCH 05/13] Removing unnecessary prints in google test --- cpp/test/cluster/spectral.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/test/cluster/spectral.cu b/cpp/test/cluster/spectral.cu index 7e1ce9eec0..97fc43af37 100644 --- a/cpp/test/cluster/spectral.cu +++ b/cpp/test/cluster/spectral.cu @@ -69,10 +69,6 @@ TEST(Raft, Spectral) raft::update_device(indices.data(), h_indices.data(), h_indices.size(), handle.get_stream()); raft::update_device(values.data(), h_values.data(), h_values.size(), handle.get_stream()); - raft::print_device_vector(" offsets", offsets.data(), offsets.size(), std::cout); - raft::print_device_vector(" indices", indices.data(), indices.size(), std::cout); - raft::print_device_vector(" values", values.data(), values.size(), std::cout); - raft::spectral::matrix::sparse_matrix_t const matrix{ handle, offsets.data(), From 0e483b1f37f3d4f809e3b6fbad299ae4c00dfb82 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 12 Feb 2024 17:23:00 -0500 Subject: [PATCH 06/13] Fixing build errors --- cpp/include/raft/sparse/linalg/spmm.hpp | 6 ++++-- cpp/test/cluster/spectral.cu | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/sparse/linalg/spmm.hpp b/cpp/include/raft/sparse/linalg/spmm.hpp index 9f55a00c73..dd661c71ac 100644 --- a/cpp/include/raft/sparse/linalg/spmm.hpp +++ b/cpp/include/raft/sparse/linalg/spmm.hpp @@ -74,8 +74,10 @@ void spmm(raft::resources const& handle, // WARNING: Do not remove the following copy unless you can, with certainty, say that // the underlying cuSPARSE issue affecting CUDA 12.2+ has been resolved. - raft::copy( - z.data_handle(), z_tmp.data_handle(), z_tmp.size(), raft::resource::get_cuda_stream(handle)); + raft::copy(z.data_handle(), + z_tmp_view.data_handle(), + z_tmp_view.size(), + raft::resource::get_cuda_stream(handle)); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroySpMat(descr_x)); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descr_y)); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descr_z)); diff --git a/cpp/test/cluster/spectral.cu b/cpp/test/cluster/spectral.cu index 97fc43af37..580564ccbc 100644 --- a/cpp/test/cluster/spectral.cu +++ b/cpp/test/cluster/spectral.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "../test_utils.cuh" + #include #include #include @@ -60,7 +62,7 @@ TEST(Raft, Spectral) rmm::device_uvector eigenvectors(n_eigenvectors * expected_clustering.size(), handle.get_stream()); - rmm::device_uvector exp_dev(expected_clustering.size()); + rmm::device_uvector exp_dev(expected_clustering.size(), handle.get_stream()); raft::update_device( exp_dev.data(), expected_clustering.data(), expected_clustering.size(), handle.get_stream()); @@ -98,7 +100,7 @@ TEST(Raft, Spectral) exp_dev.size(), 1, raft::Compare(), - stream)); + handle.get_stream())); } } // namespace cluster From d68a41359fb6691252c31f7d4c3aa9633934b9e4 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 12 Feb 2024 17:23:19 -0500 Subject: [PATCH 07/13] Fixing style --- cpp/test/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index e36e86345a..f34a122a65 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -101,11 +101,10 @@ if(BUILD_TESTS) PATH test/cluster/kmeans.cu test/cluster/kmeans_balanced.cu - test/cluster/kmeans_find_k.cu + test/cluster/kmeans_find_k.cu test/cluster/cluster_solvers.cu test/cluster/linkage.cu test/cluster/spectral.cu - LIB EXPLICIT_INSTANTIATE_ONLY ) From dbfa7ddb3c0008852d7a2c0c672af19aaf3f679d Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 13 Feb 2024 14:03:36 +0100 Subject: [PATCH 08/13] Fix ANN bench ground truth generation for k>1024 --- cpp/include/raft/neighbors/detail/knn_merge_parts.cuh | 3 +++ .../src/raft-ann-bench/generate_groundtruth/__main__.py | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/knn_merge_parts.cuh b/cpp/include/raft/neighbors/detail/knn_merge_parts.cuh index c8ff03741c..bc64fc9181 100644 --- a/cpp/include/raft/neighbors/detail/knn_merge_parts.cuh +++ b/cpp/include/raft/neighbors/detail/knn_merge_parts.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -168,5 +169,7 @@ inline void knn_merge_parts(const value_t* inK, else if (k <= 1024) knn_merge_parts_impl( inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); + else + THROW("Unimplemented for k=%d, knn_merge_parts works for k<=1024", k); } } // namespace raft::neighbors::detail diff --git a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py index f4d97edea5..2250618f58 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py @@ -67,12 +67,13 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): queries, k, metric=metric, - handle=handle, - global_id_offset=i, # shift neighbor index by offset i + handle=handle ) handle.sync() D, Ind = cp.asarray(D), cp.asarray(Ind) + Ind += i # shift neighbor index by offset i + if distances is None: distances = D indices = Ind From 485460932fe14c7c522630343015c248b9bd1ded Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 13 Feb 2024 14:35:57 +0100 Subject: [PATCH 09/13] Fix style --- .../raft-ann-bench/generate_groundtruth/__main__.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py index 2250618f58..a5ebb76635 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -62,18 +62,12 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): X = cp.asarray(dataset[i : i + n_batch, :], cp.float32) - D, Ind = knn( - X, - queries, - k, - metric=metric, - handle=handle - ) + D, Ind = knn(X, queries, k, metric=metric, handle=handle) handle.sync() D, Ind = cp.asarray(D), cp.asarray(Ind) Ind += i # shift neighbor index by offset i - + if distances is None: distances = D indices = Ind From 22ed7f5bd4cd22a0fe6b0c0fbd063ae1742fdd6e Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 13 Feb 2024 22:47:54 -0500 Subject: [PATCH 10/13] Fixing cluster test so it fails --- cpp/test/cluster/spectral.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/test/cluster/spectral.cu b/cpp/test/cluster/spectral.cu index 580564ccbc..3cc5df1e7c 100644 --- a/cpp/test/cluster/spectral.cu +++ b/cpp/test/cluster/spectral.cu @@ -95,7 +95,7 @@ TEST(Raft, Spectral) eigenvalues.data(), eigenvectors.data()); - ASSERT_TRUE(devArrMatch(expected_clustering.data(), + ASSERT_TRUE(devArrMatch(clustering.data(), exp_dev.data(), exp_dev.size(), 1, From feb0e135aaf541c77338c7dc2cf678ffd5e618b9 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 13 Feb 2024 22:55:21 -0500 Subject: [PATCH 11/13] Fixing workaround --- cpp/include/raft/sparse/linalg/spmm.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/sparse/linalg/spmm.hpp b/cpp/include/raft/sparse/linalg/spmm.hpp index dd661c71ac..3780b54568 100644 --- a/cpp/include/raft/sparse/linalg/spmm.hpp +++ b/cpp/include/raft/sparse/linalg/spmm.hpp @@ -60,11 +60,15 @@ void spmm(raft::resources const& handle, { bool is_row_major = detail::is_row_major(y, z); + // WARNING: The following copy is working around a bug in cusparse which causes an alignment issue + // and incorrect results. This bug is fixed in CUDA 12.5+ so this workaround shouldn't be removed + // until that version is supported. + auto z_tmp = raft::make_device_matrix(handle, z.extent(0), z.extent(1)); auto z_tmp_view = is_row_major ? raft::make_device_strided_matrix_view( - z.data_handle(), z.extent(0), z.extent(1), z.stride(0)) + z_tmp.data_handle(), z.extent(0), z.extent(1), z.stride(0)) : raft::make_device_strided_matrix_view( - z.data_handle(), z.extent(0), z.extent(1), z.stride(1)); + z_tmp.data_handle(), z.extent(0), z.extent(1), z.stride(1)); auto descr_x = detail::create_descriptor(x); auto descr_y = detail::create_descriptor(y); From 9a7a0f42a34a020c6c2a3938bb6bff93374be7c2 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 14 Feb 2024 06:22:20 +0100 Subject: [PATCH 12/13] Update spmm.hpp Co-authored-by: Paul Taylor <178183+trxcllnt@users.noreply.github.com> --- cpp/include/raft/sparse/linalg/spmm.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/raft/sparse/linalg/spmm.hpp b/cpp/include/raft/sparse/linalg/spmm.hpp index 3780b54568..ee243f45dd 100644 --- a/cpp/include/raft/sparse/linalg/spmm.hpp +++ b/cpp/include/raft/sparse/linalg/spmm.hpp @@ -64,6 +64,7 @@ void spmm(raft::resources const& handle, // and incorrect results. This bug is fixed in CUDA 12.5+ so this workaround shouldn't be removed // until that version is supported. auto z_tmp = raft::make_device_matrix(handle, z.extent(0), z.extent(1)); + raft::copy(z_tmp.data_handle(), z.data_handle(), z.size(), raft::resource::get_cuda_stream(handle)); auto z_tmp_view = is_row_major ? raft::make_device_strided_matrix_view( z_tmp.data_handle(), z.extent(0), z.extent(1), z.stride(0)) From 5e7b602acbdb789104aa2c8980adae157b2fe087 Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Wed, 14 Feb 2024 09:19:45 -0800 Subject: [PATCH 13/13] Update cpp/include/raft/sparse/linalg/spmm.hpp --- cpp/include/raft/sparse/linalg/spmm.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/sparse/linalg/spmm.hpp b/cpp/include/raft/sparse/linalg/spmm.hpp index ee243f45dd..d0727755b6 100644 --- a/cpp/include/raft/sparse/linalg/spmm.hpp +++ b/cpp/include/raft/sparse/linalg/spmm.hpp @@ -64,7 +64,9 @@ void spmm(raft::resources const& handle, // and incorrect results. This bug is fixed in CUDA 12.5+ so this workaround shouldn't be removed // until that version is supported. auto z_tmp = raft::make_device_matrix(handle, z.extent(0), z.extent(1)); - raft::copy(z_tmp.data_handle(), z.data_handle(), z.size(), raft::resource::get_cuda_stream(handle)); + raft::copy( + z_tmp.data_handle(), z.data_handle(), z.size(), raft::resource::get_cuda_stream(handle)); + auto z_tmp_view = is_row_major ? raft::make_device_strided_matrix_view( z_tmp.data_handle(), z.extent(0), z.extent(1), z.stride(0))