Skip to content

Commit

Permalink
Guard CUDA runtime APIs with error checking (#12531)
Browse files Browse the repository at this point in the history
This PR adds missing error checking for CUDA runtime APIs in libcudf.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: #12531
  • Loading branch information
PointKernel authored Jan 23, 2023
1 parent 37fa3c2 commit 27a4aa8
Show file tree
Hide file tree
Showing 15 changed files with 92 additions and 88 deletions.
4 changes: 2 additions & 2 deletions cpp/benchmarks/io/json/nested_json.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,8 @@ auto make_test_json_data(cudf::size_type string_size, rmm::cuda_stream_view stre
auto d_scalar = cudf::strings::repeat_string(d_string_scalar, repeat_times);

auto data = const_cast<char*>(d_scalar->data());
cudaMemsetAsync(data, '[', 1, stream.value());
cudaMemsetAsync(data + d_scalar->size() - 1, ']', 1, stream.value());
CUDF_CUDA_TRY(cudaMemsetAsync(data, '[', 1, stream.value()));
CUDF_CUDA_TRY(cudaMemsetAsync(data + d_scalar->size() - 1, ']', 1, stream.value()));

return d_scalar;
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/io/text/multibyte_split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,8 @@ static void bench_multibyte_split(nvbench::state& state,
}
if (source_type == data_chunk_source_type::host_pinned) {
host_pinned_input.resize(static_cast<std::size_t>(device_input.size()));
cudaMemcpy(
host_pinned_input.data(), device_input.data(), host_pinned_input.size(), cudaMemcpyDefault);
CUDF_CUDA_TRY(cudaMemcpy(
host_pinned_input.data(), device_input.data(), host_pinned_input.size(), cudaMemcpyDefault));
}

auto source = [&] {
Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/string/json.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -198,7 +198,7 @@ void BM_case(benchmark::State& state, std::string query_arg)
for (auto _ : state) {
cuda_event_timer raii(state, true);
auto result = cudf::strings::get_json_object(scv, json_path);
cudaStreamSynchronize(0);
CUDF_CUDA_TRY(cudaStreamSynchronize(0));
}

// this isn't strictly 100% accurate. a given query isn't necessarily
Expand Down
2 changes: 1 addition & 1 deletion cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -886,7 +886,7 @@ thrown exception includes a description of the CUDA error code in its `what()` message
Example:

```c++
CUDA_TRY( cudaMemcpy(&dst, &src, num_bytes) );
CUDF_CUDA_TRY( cudaMemcpy(&dst, &src, num_bytes) );
```
## Compile-Time Conditions
Expand Down
20 changes: 10 additions & 10 deletions cpp/src/io/json/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -166,18 +166,18 @@ std::vector<std::string> create_key_strings(char const* h_data,
{
auto const num_cols = sorted_info.num_rows();
std::vector<uint64_t> h_offsets(num_cols);
cudaMemcpyAsync(h_offsets.data(),
sorted_info.column(0).data<uint64_t>(),
sizeof(uint64_t) * num_cols,
cudaMemcpyDefault,
stream.value());
CUDF_CUDA_TRY(cudaMemcpyAsync(h_offsets.data(),
sorted_info.column(0).data<uint64_t>(),
sizeof(uint64_t) * num_cols,
cudaMemcpyDefault,
stream.value()));

std::vector<uint16_t> h_lens(num_cols);
cudaMemcpyAsync(h_lens.data(),
sorted_info.column(1).data<uint16_t>(),
sizeof(uint16_t) * num_cols,
cudaMemcpyDefault,
stream.value());
CUDF_CUDA_TRY(cudaMemcpyAsync(h_lens.data(),
sorted_info.column(1).data<uint16_t>(),
sizeof(uint16_t) * num_cols,
cudaMemcpyDefault,
stream.value()));

std::vector<std::string> names(num_cols);
std::transform(h_offsets.cbegin(),
Expand Down
32 changes: 17 additions & 15 deletions cpp/src/io/orc/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1322,16 +1322,18 @@ writer::impl::encoded_footer_statistics writer::impl::finish_statistic_blobs(
auto const chunk_bytes = stripes_per_col * sizeof(statistics_chunk);
auto const merge_bytes = stripes_per_col * sizeof(statistics_merge_group);
for (size_t col = 0; col < num_columns; ++col) {
cudaMemcpyAsync(stat_chunks.data() + (num_stripes * col) + num_entries_seen,
per_chunk_stats.stripe_stat_chunks[i].data() + col * stripes_per_col,
chunk_bytes,
cudaMemcpyDefault,
stream);
cudaMemcpyAsync(stats_merge.device_ptr() + (num_stripes * col) + num_entries_seen,
per_chunk_stats.stripe_stat_merge[i].device_ptr() + col * stripes_per_col,
merge_bytes,
cudaMemcpyDefault,
stream);
CUDF_CUDA_TRY(
cudaMemcpyAsync(stat_chunks.data() + (num_stripes * col) + num_entries_seen,
per_chunk_stats.stripe_stat_chunks[i].data() + col * stripes_per_col,
chunk_bytes,
cudaMemcpyDefault,
stream.value()));
CUDF_CUDA_TRY(
cudaMemcpyAsync(stats_merge.device_ptr() + (num_stripes * col) + num_entries_seen,
per_chunk_stats.stripe_stat_merge[i].device_ptr() + col * stripes_per_col,
merge_bytes,
cudaMemcpyDefault,
stream.value()));
}
num_entries_seen += stripes_per_col;
}
Expand All @@ -1346,11 +1348,11 @@ writer::impl::encoded_footer_statistics writer::impl::finish_statistic_blobs(
}

auto d_file_stats_merge = stats_merge.device_ptr(num_stripe_blobs);
cudaMemcpyAsync(d_file_stats_merge,
file_stats_merge.data(),
num_file_blobs * sizeof(statistics_merge_group),
cudaMemcpyDefault,
stream);
CUDF_CUDA_TRY(cudaMemcpyAsync(d_file_stats_merge,
file_stats_merge.data(),
num_file_blobs * sizeof(statistics_merge_group),
cudaMemcpyDefault,
stream.value()));

auto file_stat_chunks = stat_chunks.data() + num_stripe_blobs;
detail::merge_group_statistics<detail::io_file_format::ORC>(
Expand Down
10 changes: 5 additions & 5 deletions cpp/src/io/parquet/reader_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,11 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)

// the final offset for a list at level N is the size of its child
int offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size;
cudaMemcpyAsync(static_cast<int32_t*>(out_buf.data()) + (out_buf.size - 1),
&offset,
sizeof(offset),
cudaMemcpyDefault,
_stream.value());
CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast<int32_t*>(out_buf.data()) + (out_buf.size - 1),
&offset,
sizeof(offset),
cudaMemcpyDefault,
_stream.value()));
out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED;
}
}
Expand Down
27 changes: 14 additions & 13 deletions cpp/src/io/parquet/reader_impl_preprocess.cu
Original file line number Diff line number Diff line change
Expand Up @@ -817,10 +817,11 @@ void print_cumulative_page_info(hostdevice_vector<gpu::PageInfo>& pages,

std::vector<int> schemas(pages.size());
std::vector<int> h_page_index(pages.size());
cudaMemcpy(h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault);
CUDF_CUDA_TRY(cudaMemcpy(
h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault));
std::vector<cumulative_row_info> h_cinfo(pages.size());
cudaMemcpy(
h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault);
CUDF_CUDA_TRY(cudaMemcpy(
h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault));
auto schema_iter = cudf::detail::make_counting_transform_iterator(
0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; });
thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin());
Expand Down Expand Up @@ -1107,11 +1108,11 @@ std::vector<gpu::chunk_read_info> compute_splits(hostdevice_vector<gpu::PageInfo
return a.row_count < b.row_count;
});

std::vector<cumulative_row_info> h_c_info_sorted(c_info_sorted.size());
cudaMemcpy(h_c_info_sorted.data(),
c_info_sorted.data(),
sizeof(cumulative_row_info) * c_info_sorted.size(),
cudaMemcpyDefault);
// std::vector<cumulative_row_info> h_c_info_sorted(c_info_sorted.size());
// CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(),
// c_info_sorted.data(),
// sizeof(cumulative_row_info) * c_info_sorted.size(),
// cudaMemcpyDefault));
// print_cumulative_row_info(h_c_info_sorted, "raw");

// generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per
Expand Down Expand Up @@ -1147,11 +1148,11 @@ std::vector<gpu::chunk_read_info> compute_splits(hostdevice_vector<gpu::PageInfo

// bring back to the cpu
std::vector<cumulative_row_info> h_aggregated_info(aggregated_info.size());
cudaMemcpyAsync(h_aggregated_info.data(),
aggregated_info.data(),
sizeof(cumulative_row_info) * c_info.size(),
cudaMemcpyDefault,
stream);
CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(),
aggregated_info.data(),
sizeof(cumulative_row_info) * c_info.size(),
cudaMemcpyDefault,
stream.value()));
stream.synchronize();

return find_splits(h_aggregated_info, num_rows, chunk_read_limit);
Expand Down
26 changes: 13 additions & 13 deletions cpp/src/io/text/multibyte_split.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -312,23 +312,23 @@ namespace detail {
// Fan work out from `stream` onto each stream in `streams`: record an event on
// `stream` and make every target stream wait on it, so work later enqueued on
// the forked streams is ordered after everything already enqueued on `stream`.
// Note: the diff rendering had merged the pre- and post-change lines; this is
// the post-change version, with every CUDA runtime call guarded by
// CUDF_CUDA_TRY so failures surface as exceptions instead of being ignored.
void fork_stream(std::vector<rmm::cuda_stream_view> streams, rmm::cuda_stream_view stream)
{
  cudaEvent_t event;
  CUDF_CUDA_TRY(cudaEventCreate(&event));
  CUDF_CUDA_TRY(cudaEventRecord(event, stream));
  for (uint32_t i = 0; i < streams.size(); i++) {
    CUDF_CUDA_TRY(cudaStreamWaitEvent(streams[i], event, 0));
  }
  // Destroying the event is safe even while streams still wait on it: CUDA
  // defers resource release until all captured work has completed.
  CUDF_CUDA_TRY(cudaEventDestroy(event));
}

// Join previously forked streams back into `stream`: for each forked stream,
// record an event capturing its pending work and make `stream` wait on it.
// Reusing a single event across iterations is valid because cudaEventRecord
// overwrites the previous capture and the wait is enqueued before the next
// record. Note: the diff rendering had merged the pre- and post-change lines;
// this is the post-change version, with every CUDA runtime call guarded by
// CUDF_CUDA_TRY so failures surface as exceptions instead of being ignored.
void join_stream(std::vector<rmm::cuda_stream_view> streams, rmm::cuda_stream_view stream)
{
  cudaEvent_t event;
  CUDF_CUDA_TRY(cudaEventCreate(&event));
  for (uint32_t i = 0; i < streams.size(); i++) {
    CUDF_CUDA_TRY(cudaEventRecord(event, streams[i]));
    CUDF_CUDA_TRY(cudaStreamWaitEvent(stream, event, 0));
  }
  CUDF_CUDA_TRY(cudaEventDestroy(event));
}

std::vector<rmm::cuda_stream_view> get_streams(int32_t count, rmm::cuda_stream_pool& stream_pool)
Expand Down Expand Up @@ -417,7 +417,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
fork_stream(streams, stream);

cudaEvent_t last_launch_event;
cudaEventCreate(&last_launch_event);
CUDF_CUDA_TRY(cudaEventCreate(&last_launch_event));

auto& read_stream = streams[0];
auto& scan_stream = streams[1];
Expand Down Expand Up @@ -451,7 +451,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
tile_multistates,
tile_offsets);

cudaStreamWaitEvent(scan_stream.value(), last_launch_event);
CUDF_CUDA_TRY(cudaStreamWaitEvent(scan_stream.value(), last_launch_event));

if (delimiter.size() == 1) {
// the single-byte case allows for a much more efficient kernel, so we special-case it
Expand Down Expand Up @@ -525,15 +525,15 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
char_storage.advance_output(output_size, scan_stream);
}

cudaEventRecord(last_launch_event, scan_stream.value());
CUDF_CUDA_TRY(cudaEventRecord(last_launch_event, scan_stream.value()));

std::swap(read_stream, scan_stream);
base_tile_idx += tiles_in_launch;
chunk_offset += chunk->size();
chunk = std::move(next_chunk);
}

cudaEventDestroy(last_launch_event);
CUDF_CUDA_TRY(cudaEventDestroy(last_launch_event));

join_stream(streams, stream);

Expand Down
10 changes: 5 additions & 5 deletions cpp/src/io/utilities/file_io_utilities.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -202,11 +202,11 @@ std::future<size_t> cufile_input_impl::read_async(size_t offset,
rmm::cuda_stream_view stream)
{
int device;
cudaGetDevice(&device);
CUDF_CUDA_TRY(cudaGetDevice(&device));

auto read_slice = [device, gds_read = shim->read, file_handle = cf_file.handle()](
void* dst, size_t size, size_t offset) -> ssize_t {
cudaSetDevice(device);
CUDF_CUDA_TRY(cudaSetDevice(device));
auto read_size = gds_read(file_handle, dst, size, offset, 0);
CUDF_EXPECTS(read_size != -1, "cuFile error reading from a file");
return read_size;
Expand Down Expand Up @@ -234,11 +234,11 @@ cufile_output_impl::cufile_output_impl(std::string const& filepath)
std::future<void> cufile_output_impl::write_async(void const* data, size_t offset, size_t size)
{
int device;
cudaGetDevice(&device);
CUDF_CUDA_TRY(cudaGetDevice(&device));

auto write_slice = [device, gds_write = shim->write, file_handle = cf_file.handle()](
void const* src, size_t size, size_t offset) -> void {
cudaSetDevice(device);
CUDF_CUDA_TRY(cudaSetDevice(device));
auto write_size = gds_write(file_handle, src, size, offset, 0);
CUDF_EXPECTS(write_size != -1 and write_size == static_cast<decltype(write_size)>(size),
"cuFile error writing to a file");
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/lists/dremel.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -126,7 +126,8 @@ dremel_data get_dremel_data(column_view h_col,
std::unique_ptr<column> empty_list_offset_col;
if (has_empty_list_offsets) {
empty_list_offset_col = make_fixed_width_column(data_type(type_id::INT32), 1);
cudaMemsetAsync(empty_list_offset_col->mutable_view().head(), 0, sizeof(size_type), stream);
CUDF_CUDA_TRY(cudaMemsetAsync(
empty_list_offset_col->mutable_view().head(), 0, sizeof(size_type), stream.value()));
std::function<column_view(column_view const&)> normalize_col = [&](column_view const& col) {
auto children = [&]() -> std::vector<column_view> {
if (col.type().id() == type_id::LIST) {
Expand Down
4 changes: 2 additions & 2 deletions cpp/tests/io/comp/decomp_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ struct DecompressTest : public cudf::test::BaseFixture {
inf_stat.host_to_device(stream);

static_cast<Decompressor*>(this)->dispatch(inf_in, inf_out, inf_stat);
cudaMemcpyAsync(
decompressed->data(), dst.data(), dst.size(), cudaMemcpyDefault, stream.value());
CUDF_CUDA_TRY(cudaMemcpyAsync(
decompressed->data(), dst.data(), dst.size(), cudaMemcpyDefault, stream.value()));
inf_stat.device_to_host(stream, true);
ASSERT_EQ(inf_stat[0].status, cudf::io::compression_status::SUCCESS);
}
Expand Down
22 changes: 11 additions & 11 deletions cpp/tests/io/fst/logical_stack_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -204,17 +204,17 @@ TEST_F(LogicalStackTest, GroundTruth)
hostdevice_vector<SymbolT> top_of_stack_gpu{string_size, stream_view};
cudf::device_span<SymbolOffsetT> d_stack_op_idx_span{d_stack_op_indexes};

cudaMemcpyAsync(d_stack_ops.data(),
stack_symbols.data(),
stack_symbols.size() * sizeof(SymbolT),
cudaMemcpyDefault,
stream.value());

cudaMemcpyAsync(d_stack_op_indexes.data(),
stack_op_indexes.data(),
stack_op_indexes.size() * sizeof(SymbolOffsetT),
cudaMemcpyDefault,
stream.value());
CUDF_CUDA_TRY(cudaMemcpyAsync(d_stack_ops.data(),
stack_symbols.data(),
stack_symbols.size() * sizeof(SymbolT),
cudaMemcpyDefault,
stream.value()));

CUDF_CUDA_TRY(cudaMemcpyAsync(d_stack_op_indexes.data(),
stack_op_indexes.data(),
stack_op_indexes.size() * sizeof(SymbolOffsetT),
cudaMemcpyDefault,
stream.value()));

// Run algorithm
fst::sparse_stack_op_to_top_of_stack<StackLevelT>(d_stack_ops.data(),
Expand Down
2 changes: 1 addition & 1 deletion cpp/tests/io/text/data_chunk_source_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ struct DataChunkSourceTest : public BaseFixture {
// Copies a device-resident data chunk into a host std::string for comparison
// in tests. Uses the synchronous cudaMemcpy (with cudaMemcpyDefault letting
// the driver infer direction), so the result is complete on return. Note: the
// diff rendering had merged the pre- and post-change lines; this is the
// post-change version, with the copy guarded by CUDF_CUDA_TRY so a failed
// copy throws instead of silently returning a zero-filled string.
std::string chunk_to_host(const cudf::io::text::device_data_chunk& chunk)
{
  std::string result(chunk.size(), '\0');
  CUDF_CUDA_TRY(cudaMemcpy(result.data(), chunk.data(), chunk.size(), cudaMemcpyDefault));
  return result;
}

Expand Down
8 changes: 4 additions & 4 deletions cpp/tests/utilities/tdigest_utilities.cu
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,10 @@ std::unique_ptr<column> make_expected_tdigest_column(std::vector<expected_tdiges
std::vector<offset_type> h_offsets{0, tdigest.mean.size()};
auto offsets =
cudf::make_fixed_width_column(data_type{type_id::INT32}, 2, mask_state::UNALLOCATED);
cudaMemcpy(offsets->mutable_view().begin<offset_type>(),
h_offsets.data(),
sizeof(offset_type) * 2,
cudaMemcpyDefault);
CUDF_CUDA_TRY(cudaMemcpy(offsets->mutable_view().begin<offset_type>(),
h_offsets.data(),
sizeof(offset_type) * 2,
cudaMemcpyDefault));

auto list = cudf::make_lists_column(1, std::move(offsets), std::move(tdigests), 0, {});

Expand Down

0 comments on commit 27a4aa8

Please sign in to comment.