From e378d365a7ca35a197fff54f7bfc86cc6d31320f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 12 Feb 2024 16:29:09 -0500 Subject: [PATCH 1/7] Fixing cusparse aligned address issue and adding note --- cpp/include/raft/sparse/linalg/spmm.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/include/raft/sparse/linalg/spmm.hpp b/cpp/include/raft/sparse/linalg/spmm.hpp index 1e815ba521..b0814093e9 100644 --- a/cpp/include/raft/sparse/linalg/spmm.hpp +++ b/cpp/include/raft/sparse/linalg/spmm.hpp @@ -72,6 +72,10 @@ void spmm(raft::resources const& handle, detail::spmm(handle, trans_x, trans_y, is_row_major, alpha, descr_x, descr_y, beta, descr_z); + // WARNING: Do not remove the following copy unless you can, with certainty, say that + // the underlying cuSPARSE issue affecting CUA 12.2+ has been resolved. + raft::copy( + z.data_handle(), z_tmp.data_handle(), z_tmp.size(), raft::resource::get_cuda_stream(handle)); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroySpMat(descr_x)); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descr_y)); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descr_z)); From 1a05bb94211ba89a1600b27ac76f56d2db9e79b3 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 12 Feb 2024 16:45:17 -0500 Subject: [PATCH 2/7] Adding reproducer for strange cusparse 12.2 bug. This will at least make sure we fail until there's a better fix --- cpp/test/CMakeLists.txt | 4 +- .../cluster/cluster_solvers_deprecated.cu | 2 +- cpp/test/cluster/spectral.cu | 102 ++++++++++++++++++ 3 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 cpp/test/cluster/spectral.cu diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 931530b66a..e36e86345a 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -101,9 +101,11 @@ if(BUILD_TESTS) PATH test/cluster/kmeans.cu test/cluster/kmeans_balanced.cu + test/cluster/kmeans_find_k.cu test/cluster/cluster_solvers.cu test/cluster/linkage.cu - test/cluster/kmeans_find_k.cu + test/cluster/spectral.cu + LIB EXPLICIT_INSTANTIATE_ONLY ) diff --git a/cpp/test/cluster/cluster_solvers_deprecated.cu b/cpp/test/cluster/cluster_solvers_deprecated.cu index 5af8fda5ff..a95b013852 100644 --- a/cpp/test/cluster/cluster_solvers_deprecated.cu +++ b/cpp/test/cluster/cluster_solvers_deprecated.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/cluster/spectral.cu b/cpp/test/cluster/spectral.cu new file mode 100644 index 0000000000..1b9b633dce --- /dev/null +++ b/cpp/test/cluster/spectral.cu @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace cluster { + +TEST(Raft, Spectral) +{ + raft::handle_t handle; + + std::vector h_offsets({0, 2, 4, 7, 10, 12, 14}); + std::vector h_indices({1, 2, 0, 2, 0, 1, 3, 2, 4, 5, 3, 5, 3, 4}); + std::vector h_values( + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + std::vector expected_clustering({1, 1, 1, 0, 0, 0}); + + int32_t n_clusters{2}; + int32_t n_eigenvectors{2}; + int32_t evs_max_it{100}; + int32_t kmean_max_it{100}; + int32_t restartIter_lanczos = 15 + n_eigenvectors; + float evs_tol{0.001}; + float kmean_tol{0.001}; + unsigned long long seed1{1234567}; + unsigned long long seed2{12345678}; + bool reorthog{false}; + + rmm::device_uvector offsets(h_offsets.size(), handle.get_stream()); + rmm::device_uvector indices(h_indices.size(), handle.get_stream()); + rmm::device_uvector values(h_indices.size(), handle.get_stream()); + rmm::device_uvector clustering(expected_clustering.size(), handle.get_stream()); + rmm::device_uvector eigenvalues(n_eigenvectors, handle.get_stream()); + rmm::device_uvector eigenvectors(n_eigenvectors * expected_clustering.size(), + handle.get_stream()); + + rmm::device_uvector exp_dev(expected_clustering.size()); + + raft::update_device( + exp_dev.data(), expected_clustering.data(), expected_clustering.size(), handle.get_stream()); + + raft::update_device(offsets.data(), h_offsets.data(), h_offsets.size(), handle.get_stream()); + raft::update_device(indices.data(), h_indices.data(), h_indices.size(), handle.get_stream()); + raft::update_device(values.data(), h_values.data(), h_values.size(), handle.get_stream()); + + raft::print_device_vector(" offsets", offsets.data(), offsets.size(), std::cout); + raft::print_device_vector(" indices", indices.data(), indices.size(), std::cout); + raft::print_device_vector(" values", values.data(), values.size(), std::cout); + + raft::spectral::matrix::sparse_matrix_t const matrix{ + handle, + offsets.data(), + indices.data(), + values.data(), + static_cast(offsets.size() - 1), + static_cast(indices.size())}; + + raft::spectral::eigen_solver_config_t eig_cfg{ + n_eigenvectors, evs_max_it, restartIter_lanczos, evs_tol, reorthog, seed1}; + raft::spectral::lanczos_solver_t eig_solver{eig_cfg}; + + raft::spectral::cluster_solver_config_t clust_cfg{ + n_clusters, kmean_max_it, kmean_tol, seed2}; + raft::spectral::kmeans_solver_t cluster_solver{clust_cfg}; + + raft::spectral::partition(handle, + matrix, + eig_solver, + cluster_solver, + clustering.data(), + eigenvalues.data(), + eigenvectors.data()); + + ASSERT_TRUE(devArrMatch(expected_clustering.data(), + exp_dev.data(), + exp_dev.size(), + 1, + raft::Compare(), + stream)); +} + +} // namespace cluster +} // namespace raft \ No newline at end of file From 3bfa018bcad4ebf2b58e76972907a19b2c9c3379 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 12 Feb 2024 16:47:33 -0500 Subject: [PATCH 3/7] Adding description to spectral test --- cpp/test/cluster/spectral.cu | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/test/cluster/spectral.cu b/cpp/test/cluster/spectral.cu index 1b9b633dce..7e1ce9eec0 100644 --- a/cpp/test/cluster/spectral.cu +++ b/cpp/test/cluster/spectral.cu @@ -24,6 +24,13 @@ namespace raft { namespace cluster { +/** + * Warning: There appears to be a CUDA 12.2 bug in cusparse that causes an + * alignment issue. We've fixed the bug in our code through a workaround + * (see raft/sparse/linalg/spmm.hpp for fix). This test is meant to fail + * in the case where the fix is accidentally reverted, so that it doesn't + * break any downstream libraries that depend on RAFT + */ TEST(Raft, Spectral) { raft::handle_t handle; From 87813ae33dc4320e4bf792bb3ed6bd4e0b3c0abd Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 12 Feb 2024 16:49:13 -0500 Subject: [PATCH 4/7] Update cpp/include/raft/sparse/linalg/spmm.hpp Co-authored-by: Bradley Dice --- cpp/include/raft/sparse/linalg/spmm.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/sparse/linalg/spmm.hpp b/cpp/include/raft/sparse/linalg/spmm.hpp index b0814093e9..9f55a00c73 100644 --- a/cpp/include/raft/sparse/linalg/spmm.hpp +++ b/cpp/include/raft/sparse/linalg/spmm.hpp @@ -73,7 +73,7 @@ void spmm(raft::resources const& handle, detail::spmm(handle, trans_x, trans_y, is_row_major, alpha, descr_x, descr_y, beta, descr_z); // WARNING: Do not remove the following copy unless you can, with certainty, say that - // the underlying cuSPARSE issue affecting CUA 12.2+ has been resolved. + // the underlying cuSPARSE issue affecting CUDA 12.2+ has been resolved. raft::copy( z.data_handle(), z_tmp.data_handle(), z_tmp.size(), raft::resource::get_cuda_stream(handle)); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroySpMat(descr_x)); From 3a8e048840d0b67fc2083db373d9a60f3dc5393f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 12 Feb 2024 16:50:17 -0500 Subject: [PATCH 5/7] Removing unnecessary prints in google test --- cpp/test/cluster/spectral.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/test/cluster/spectral.cu b/cpp/test/cluster/spectral.cu index 7e1ce9eec0..97fc43af37 100644 --- a/cpp/test/cluster/spectral.cu +++ b/cpp/test/cluster/spectral.cu @@ -69,10 +69,6 @@ TEST(Raft, Spectral) raft::update_device(indices.data(), h_indices.data(), h_indices.size(), handle.get_stream()); raft::update_device(values.data(), h_values.data(), h_values.size(), handle.get_stream()); - raft::print_device_vector(" offsets", offsets.data(), offsets.size(), std::cout); - raft::print_device_vector(" indices", indices.data(), indices.size(), std::cout); - raft::print_device_vector(" values", values.data(), values.size(), std::cout); - raft::spectral::matrix::sparse_matrix_t const matrix{ handle, offsets.data(), From 0e483b1f37f3d4f809e3b6fbad299ae4c00dfb82 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 12 Feb 2024 17:23:00 -0500 Subject: [PATCH 6/7] Fixing build errors --- cpp/include/raft/sparse/linalg/spmm.hpp | 6 ++++-- cpp/test/cluster/spectral.cu | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/sparse/linalg/spmm.hpp b/cpp/include/raft/sparse/linalg/spmm.hpp index 9f55a00c73..dd661c71ac 100644 --- a/cpp/include/raft/sparse/linalg/spmm.hpp +++ b/cpp/include/raft/sparse/linalg/spmm.hpp @@ -74,8 +74,10 @@ void spmm(raft::resources const& handle, // WARNING: Do not remove the following copy unless you can, with certainty, say that // the underlying cuSPARSE issue affecting CUDA 12.2+ has been resolved. - raft::copy( - z.data_handle(), z_tmp.data_handle(), z_tmp.size(), raft::resource::get_cuda_stream(handle)); + raft::copy(z.data_handle(), + z_tmp_view.data_handle(), + z_tmp_view.size(), + raft::resource::get_cuda_stream(handle)); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroySpMat(descr_x)); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descr_y)); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descr_z)); diff --git a/cpp/test/cluster/spectral.cu b/cpp/test/cluster/spectral.cu index 97fc43af37..580564ccbc 100644 --- a/cpp/test/cluster/spectral.cu +++ b/cpp/test/cluster/spectral.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "../test_utils.cuh" + #include #include #include @@ -60,7 +62,7 @@ TEST(Raft, Spectral) rmm::device_uvector eigenvectors(n_eigenvectors * expected_clustering.size(), handle.get_stream()); - rmm::device_uvector exp_dev(expected_clustering.size()); + rmm::device_uvector exp_dev(expected_clustering.size(), handle.get_stream()); raft::update_device( exp_dev.data(), expected_clustering.data(), expected_clustering.size(), handle.get_stream()); @@ -98,7 +100,7 @@ TEST(Raft, Spectral) exp_dev.size(), 1, raft::Compare(), - stream)); + handle.get_stream())); } } // namespace cluster From d68a41359fb6691252c31f7d4c3aa9633934b9e4 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 12 Feb 2024 17:23:19 -0500 Subject: [PATCH 7/7] Fixing style --- cpp/test/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index e36e86345a..f34a122a65 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -101,11 +101,10 @@ if(BUILD_TESTS) PATH test/cluster/kmeans.cu test/cluster/kmeans_balanced.cu - test/cluster/kmeans_find_k.cu + test/cluster/kmeans_find_k.cu test/cluster/cluster_solvers.cu test/cluster/linkage.cu test/cluster/spectral.cu - LIB EXPLICIT_INSTANTIATE_ONLY )