-
Notifications
You must be signed in to change notification settings - Fork 197
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add workaround for syevd in CUDA 12.0 #2332
Changes from 10 commits
e145ed1
bba530a
604350d
f713ced
c165331
75ea3a9
9f7f4cd
3b966ea
e21b106
e2bd6a8
d57a3bb
6313df7
389976e
bbbf424
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -91,6 +91,15 @@ void eigDC(raft::resources const& handle, | |
#if CUDART_VERSION < 11010 | ||
eigDC_legacy(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream); | ||
#else | ||
|
||
// Use a new stream instead of `cudaStreamPerThread` to avoid cusolver bug # 4580093. | ||
cudaStream_t stream_new; | ||
cudaEvent_t sync_event; | ||
RAFT_CUDA_TRY(cudaStreamCreate(&stream_new)); | ||
lowener marked this conversation as resolved.
Show resolved
Hide resolved
|
||
RAFT_CUDA_TRY(cudaEventCreate(&sync_event)); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think in this case it would be justified to use the resource::detail::get_cuda_stream_sync_event instead of manually managing the resource. You can also use the stream from the stream pool resource, but there is a small problem with it: raft/rmm would create 16 streams by default instead of one :) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please note that we shouldn't be calling detail APIs in any namespaces outside of the immediate namespace where those detail APIs reside. Please expose this function if it's going to be used publicly. |
||
RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream)); | ||
RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_new, sync_event)); | ||
|
||
cusolverDnHandle_t cusolverH = resource::get_cusolver_dn_handle(handle); | ||
|
||
cusolverDnParams_t dn_params = nullptr; | ||
|
@@ -108,15 +117,13 @@ void eigDC(raft::resources const& handle, | |
eig_vals, | ||
&workspaceDevice, | ||
&workspaceHost, | ||
stream)); | ||
stream_new)); | ||
|
||
rmm::device_uvector<math_t> d_work(workspaceDevice / sizeof(math_t), stream); | ||
rmm::device_scalar<int> d_dev_info(stream); | ||
rmm::device_uvector<math_t> d_work(workspaceDevice / sizeof(math_t), stream_new); | ||
rmm::device_scalar<int> d_dev_info(stream_new); | ||
std::vector<math_t> h_work(workspaceHost / sizeof(math_t)); | ||
|
||
raft::matrix::copy(handle, | ||
make_device_matrix_view<const math_t>(in, n_rows, n_cols), | ||
make_device_matrix_view<math_t>(eig_vectors, n_rows, n_cols)); | ||
raft::copy(eig_vectors, in, n_rows * n_cols, stream_new); | ||
|
||
RAFT_CUSOLVER_TRY(cusolverDnxsyevd(cusolverH, | ||
dn_params, | ||
|
@@ -131,14 +138,20 @@ void eigDC(raft::resources const& handle, | |
h_work.data(), | ||
workspaceHost, | ||
d_dev_info.data(), | ||
stream)); | ||
stream_new)); | ||
|
||
RAFT_CUDA_TRY(cudaGetLastError()); | ||
RAFT_CUSOLVER_TRY(cusolverDnDestroyParams(dn_params)); | ||
int dev_info = d_dev_info.value(stream); | ||
int dev_info = d_dev_info.value(stream_new); | ||
ASSERT(dev_info == 0, | ||
"eig.cuh: eigensolver couldn't converge to a solution. " | ||
"This usually occurs when some of the features do not vary enough."); | ||
|
||
// Synchronize the created stream with the original stream before return | ||
RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream_new)); | ||
RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_event)); | ||
RAFT_CUDA_TRY(cudaEventDestroy(sync_event)); | ||
RAFT_CUDA_TRY(cudaStreamDestroy(stream_new)); | ||
#endif | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IIUC, the cuSOLVER bug is fixed in CUDA toolkit 12.4.1.003. It would be great if we applied this workaround only when building against an earlier CUDA version.