From e145ed145f1e1bfff59fea4729d77a118cc0f262 Mon Sep 17 00:00:00 2001 From: Mickael Ide Date: Wed, 22 May 2024 22:30:21 +0200 Subject: [PATCH 1/7] Add workaround for syevd stream + test --- cpp/include/raft/linalg/detail/eig.cuh | 26 ++++++++++++++++++-------- cpp/test/linalg/eig.cu | 13 +++++++++++++ 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/cpp/include/raft/linalg/detail/eig.cuh b/cpp/include/raft/linalg/detail/eig.cuh index 2a4cfd52ec..b23152ba03 100644 --- a/cpp/include/raft/linalg/detail/eig.cuh +++ b/cpp/include/raft/linalg/detail/eig.cuh @@ -91,6 +91,13 @@ void eigDC(raft::resources const& handle, #if CUDART_VERSION < 11010 eigDC_legacy(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream); #else + + // Use a new stream instead of `cudaStreamPerThread` to avoid cusolver bug # 4580093. + cudaStream_t stream_new; + cudaEvent_t sync_event; + RAFT_CUDA_TRY(cudaStreamCreate(&stream_new)); + RAFT_CUDA_TRY(cudaEventCreate(&sync_event)); + cusolverDnHandle_t cusolverH = resource::get_cusolver_dn_handle(handle); cusolverDnParams_t dn_params = nullptr; @@ -108,15 +115,13 @@ void eigDC(raft::resources const& handle, eig_vals, &workspaceDevice, &workspaceHost, - stream)); + stream_new)); - rmm::device_uvector d_work(workspaceDevice / sizeof(math_t), stream); - rmm::device_scalar d_dev_info(stream); + rmm::device_uvector d_work(workspaceDevice / sizeof(math_t), stream_new); + rmm::device_scalar d_dev_info(stream_new); std::vector h_work(workspaceHost / sizeof(math_t)); - raft::matrix::copy(handle, - make_device_matrix_view(in, n_rows, n_cols), - make_device_matrix_view(eig_vectors, n_rows, n_cols)); + raft::copy(eig_vectors, in, n_rows * n_cols, stream_new); RAFT_CUSOLVER_TRY(cusolverDnxsyevd(cusolverH, dn_params, @@ -131,14 +136,19 @@ void eigDC(raft::resources const& handle, h_work.data(), workspaceHost, d_dev_info.data(), - stream)); + stream_new)); RAFT_CUDA_TRY(cudaGetLastError()); RAFT_CUSOLVER_TRY(cusolverDnDestroyParams(dn_params)); - int dev_info = d_dev_info.value(stream); + int dev_info = d_dev_info.value(stream_new); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. " "This usually occurs when some of the features do not vary enough."); + + // Synchronize the created stream with the original stream before return + RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream_new)); + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_event)); + RAFT_CUDA_TRY(cudaStreamDestroy(stream_new)); #endif } diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu index 460b99aaa0..1d9384d351 100644 --- a/cpp/test/linalg/eig.cu +++ b/cpp/test/linalg/eig.cu @@ -156,6 +156,19 @@ class EigTest : public ::testing::TestWithParam> { eig_vals_large, eig_vals_jacobi_large; }; +TEST(Raft, EigStream) +{ + // Separate test to check eig_dc stream workaround for CUDA 12+ + raft::resources handle; + auto n_rows = 5000; + auto cov_matrix_large = raft::make_device_matrix(handle, n_rows, n_rows); + auto eig_vectors_large = raft::make_device_matrix(handle, n_rows, n_rows); + auto eig_vals_large = raft::make_device_vector(handle, n_rows); + + raft::linalg::eig_dc(handle, raft::make_const_mdspan(cov_matrix_large.view()), eig_vectors_large.view(), eig_vals_large.view()); + raft::resource::sync_stream(handle, raft::resource::get_cuda_stream(handle)); +} + const std::vector> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; const std::vector> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}}; From 604350d00ccc8334947103a426379cc52e3abe43 Mon Sep 17 00:00:00 2001 From: Mickael Ide Date: Wed, 22 May 2024 22:41:20 +0200 Subject: [PATCH 2/7] Fix style --- cpp/test/linalg/eig.cu | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu index 1d9384d351..19a69a263b 100644 --- a/cpp/test/linalg/eig.cu +++ b/cpp/test/linalg/eig.cu @@ -161,12 +161,17 @@ TEST(Raft, EigStream) // Separate test to check eig_dc stream workaround for CUDA 12+ raft::resources handle; auto n_rows = 5000; - auto cov_matrix_large = raft::make_device_matrix(handle, n_rows, n_rows); - auto eig_vectors_large = raft::make_device_matrix(handle, n_rows, n_rows); + auto cov_matrix_large = + raft::make_device_matrix(handle, n_rows, n_rows); + auto eig_vectors_large = + raft::make_device_matrix(handle, n_rows, n_rows); auto eig_vals_large = raft::make_device_vector(handle, n_rows); - raft::linalg::eig_dc(handle, raft::make_const_mdspan(cov_matrix_large.view()), eig_vectors_large.view(), eig_vals_large.view()); - raft::resource::sync_stream(handle, raft::resource::get_cuda_stream(handle)); + raft::linalg::eig_dc(handle, + raft::make_const_mdspan(cov_matrix_large.view()), + eig_vectors_large.view(), + eig_vals_large.view()); + raft::resource::sync_stream(handle, raft::resource::get_cuda_stream(handle)); } const std::vector> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; From 9f7f4cde63d4468c93ac0d3007079f78ba0d9ef8 Mon Sep 17 00:00:00 2001 From: Mickael Ide Date: Wed, 5 Jun 2024 19:12:36 +0200 Subject: [PATCH 3/7] Destroy event after use --- cpp/include/raft/linalg/detail/eig.cuh | 1 + cpp/test/linalg/eig.cu | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/include/raft/linalg/detail/eig.cuh b/cpp/include/raft/linalg/detail/eig.cuh index b23152ba03..9a146912b2 100644 --- a/cpp/include/raft/linalg/detail/eig.cuh +++ b/cpp/include/raft/linalg/detail/eig.cuh @@ -148,6 +148,7 @@ void eigDC(raft::resources const& handle, // Synchronize the created stream with the original stream before return RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream_new)); RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_event)); + RAFT_CUDA_TRY(cudaEventDestroy(sync_event)); RAFT_CUDA_TRY(cudaStreamDestroy(stream_new)); #endif } diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu index 19a69a263b..3ff117cf08 100644 --- a/cpp/test/linalg/eig.cu +++ b/cpp/test/linalg/eig.cu @@ -161,16 +161,16 @@ TEST(Raft, EigStream) // Separate test to check eig_dc stream workaround for CUDA 12+ raft::resources handle; auto n_rows = 5000; - auto cov_matrix_large = + auto cov_matrix_stream = raft::make_device_matrix(handle, n_rows, n_rows); - auto eig_vectors_large = + auto eig_vectors_stream = raft::make_device_matrix(handle, n_rows, n_rows); - auto eig_vals_large = raft::make_device_vector(handle, n_rows); + auto eig_vals_stream = raft::make_device_vector(handle, n_rows); raft::linalg::eig_dc(handle, - raft::make_const_mdspan(cov_matrix_large.view()), - eig_vectors_large.view(), - eig_vals_large.view()); + raft::make_const_mdspan(cov_matrix_stream.view()), + eig_vectors_stream.view(), + eig_vals_stream.view()); raft::resource::sync_stream(handle, raft::resource::get_cuda_stream(handle)); } From e21b1067d3c2692e49e55d02deb574178846bd3b Mon Sep 17 00:00:00 2001 From: Mickael Ide Date: Mon, 15 Jul 2024 11:59:33 -0700 Subject: [PATCH 4/7] Add synchronization on stream start --- cpp/include/raft/linalg/detail/eig.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/include/raft/linalg/detail/eig.cuh b/cpp/include/raft/linalg/detail/eig.cuh index 9a146912b2..a3ea0a3e18 100644 --- a/cpp/include/raft/linalg/detail/eig.cuh +++ b/cpp/include/raft/linalg/detail/eig.cuh @@ -97,6 +97,8 @@ void eigDC(raft::resources const& handle, cudaEvent_t sync_event; RAFT_CUDA_TRY(cudaStreamCreate(&stream_new)); RAFT_CUDA_TRY(cudaEventCreate(&sync_event)); + RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream)); + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_new, sync_event)); cusolverDnHandle_t cusolverH = resource::get_cusolver_dn_handle(handle); From d57a3bb1aab2b822760773d69da2eed98a9b5167 Mon Sep 17 00:00:00 2001 From: Mickael Ide Date: Mon, 22 Jul 2024 15:38:20 +0200 Subject: [PATCH 5/7] Address reviews, limit CTX version, use common resources --- cpp/include/raft/linalg/detail/eig.cuh | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/cpp/include/raft/linalg/detail/eig.cuh b/cpp/include/raft/linalg/detail/eig.cuh index a3ea0a3e18..733146f212 100644 --- a/cpp/include/raft/linalg/detail/eig.cuh +++ b/cpp/include/raft/linalg/detail/eig.cuh @@ -19,10 +19,12 @@ #include "cusolver_wrappers.hpp" #include +#include #include #include #include +#include #include #include @@ -90,16 +92,19 @@ void eigDC(raft::resources const& handle, { #if CUDART_VERSION < 11010 eigDC_legacy(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream); -#else + return; +#endif +#if CUDART_VERSION <= 12041 // Use a new stream instead of `cudaStreamPerThread` to avoid cusolver bug # 4580093. - cudaStream_t stream_new; - cudaEvent_t sync_event; - RAFT_CUDA_TRY(cudaStreamCreate(&stream_new)); - RAFT_CUDA_TRY(cudaEventCreate(&sync_event)); + rmm::cuda_stream stream_new_wrapper; + cudaStream_t stream_new = stream_new_wrapper.value(); + cudaEvent_t sync_event = resource::detail::get_cuda_stream_sync_event(handle); RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream)); RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_new, sync_event)); - +#else + cudaStream_t stream_new = stream; +#endif cusolverDnHandle_t cusolverH = resource::get_cusolver_dn_handle(handle); cusolverDnParams_t dn_params = nullptr; @@ -147,11 +152,10 @@ void eigDC(raft::resources const& handle, "eig.cuh: eigensolver couldn't converge to a solution. " "This usually occurs when some of the features do not vary enough."); +#if CUDART_VERSION <= 12041 // Synchronize the created stream with the original stream before return RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream_new)); RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_event)); - RAFT_CUDA_TRY(cudaEventDestroy(sync_event)); - RAFT_CUDA_TRY(cudaStreamDestroy(stream_new)); #endif } From 6313df7f82980e27a1ed120bd09e281ce5404138 Mon Sep 17 00:00:00 2001 From: Mickael Ide Date: Mon, 22 Jul 2024 16:52:29 +0200 Subject: [PATCH 6/7] Fix version --- cpp/include/raft/linalg/detail/eig.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/linalg/detail/eig.cuh b/cpp/include/raft/linalg/detail/eig.cuh index 733146f212..d0a6b7c555 100644 --- a/cpp/include/raft/linalg/detail/eig.cuh +++ b/cpp/include/raft/linalg/detail/eig.cuh @@ -95,7 +95,7 @@ void eigDC(raft::resources const& handle, return; #endif -#if CUDART_VERSION <= 12041 +#if CUDART_VERSION <= 12040 // Use a new stream instead of `cudaStreamPerThread` to avoid cusolver bug # 4580093. rmm::cuda_stream stream_new_wrapper; cudaStream_t stream_new = stream_new_wrapper.value(); From 389976efa560f072b6100aeb720c4994adec5c96 Mon Sep 17 00:00:00 2001 From: Mickael Ide Date: Mon, 22 Jul 2024 17:07:42 +0200 Subject: [PATCH 7/7] Fix version --- cpp/include/raft/linalg/detail/eig.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/linalg/detail/eig.cuh b/cpp/include/raft/linalg/detail/eig.cuh index d0a6b7c555..ba7ed3dcdf 100644 --- a/cpp/include/raft/linalg/detail/eig.cuh +++ b/cpp/include/raft/linalg/detail/eig.cuh @@ -152,7 +152,7 @@ void eigDC(raft::resources const& handle, "eig.cuh: eigensolver couldn't converge to a solution. " "This usually occurs when some of the features do not vary enough."); -#if CUDART_VERSION <= 12041 +#if CUDART_VERSION <= 12040 // Synchronize the created stream with the original stream before return RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream_new)); RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_event));