Merge pull request #266 from PointKernel/guard-cuda-apis

Guard CUDA runtime APIs with error checking
NVIDIA · Jan 31, 2023 · 8523ab1 · 8523ab1
2 parents 5943222 + e851556
commit 8523ab1
Show file tree

Hide file tree

Showing 10 changed files with 63 additions and 66 deletions.
diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -95,16 +95,16 @@ static void BM_static_map_insert(::benchmark::State& state)
     map_type map{size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 
     cudaEvent_t start, stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
+    CUCO_CUDA_TRY(cudaEventCreate(&start));
+    CUCO_CUDA_TRY(cudaEventCreate(&stop));
 
-    cudaEventRecord(start);
+    CUCO_CUDA_TRY(cudaEventRecord(start));
     map.insert(d_pairs.begin(), d_pairs.end());
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
+    CUCO_CUDA_TRY(cudaEventRecord(stop));
+    CUCO_CUDA_TRY(cudaEventSynchronize(stop));
 
     float ms;
-    cudaEventElapsedTime(&ms, start, stop);
+    CUCO_CUDA_TRY(cudaEventElapsedTime(&ms, start, stop));
 
     state.SetIterationTime(ms / 1000);
   }
@@ -148,7 +148,7 @@ static void BM_static_map_search_all(::benchmark::State& state)
     map.find(d_keys.begin(), d_keys.end(), d_results.begin());
     // TODO: get rid of sync and rewrite the benchmark with `nvbench`
     // once https://github.com/NVIDIA/nvbench/pull/80 is merged
-    cudaDeviceSynchronize();
+    CUCO_CUDA_TRY(cudaDeviceSynchronize());
   }
 
   state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
@@ -195,7 +195,7 @@ static void BM_static_map_search_none(::benchmark::State& state)
     map.find(d_keys.begin(), d_keys.end(), d_results.begin());
     // TODO: get rid of sync and rewrite the benchmark with `nvbench`
     // once https://github.com/NVIDIA/nvbench/pull/80 is merged
-    cudaDeviceSynchronize();
+    CUCO_CUDA_TRY(cudaDeviceSynchronize());
   }
 
   state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *

diff --git a/benchmarks/reduce_by_key/reduce_by_key.cu b/benchmarks/reduce_by_key/reduce_by_key.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include <cuco/detail/error.hpp>
+
 #include <benchmark/benchmark.h>
 
 #include <thrust/device_vector.h>
@@ -75,7 +77,7 @@ static void BM_thrust(::benchmark::State& state)
     thrust::device_vector<Value> values(state.range(0));
     state.ResumeTiming();
     thrust_reduce_by_key(keys.begin(), keys.end(), values.begin());
-    cudaDeviceSynchronize();
+    CUCO_CUDA_TRY(cudaDeviceSynchronize());
   }
 }
 BENCHMARK_TEMPLATE(BM_thrust, int32_t, int32_t)

diff --git a/benchmarks/synchronization.hpp b/benchmarks/synchronization.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,24 +16,15 @@
 
 #pragma once
 
+#include <cuco/detail/error.hpp>
+
 // Google Benchmark library
 #include <benchmark/benchmark.h>
 
 #include <cuda_runtime_api.h>
 
 #include <stdexcept>
 
-#define BENCH_CUDA_TRY(call)                                                         \
-  do {                                                                               \
-    auto const status = (call);                                                      \
-    if (cudaSuccess != status) { throw std::runtime_error("CUDA error detected."); } \
-  } while (0)
-
-#define BENCH_ASSERT_CUDA_SUCCESS(expr) \
-  do {                                  \
-    cudaError_t const status = (expr);  \
-    assert(cudaSuccess == status);      \
-  } while (0)
 /**
  * @brief  This class serves as a wrapper for using `cudaEvent_t` as the user
  * defined timer within the framework of google benchmark
@@ -90,24 +81,24 @@ class cuda_event_timer {
     // flush all of L2$
     if (flush_l2_cache) {
       int current_device = 0;
-      BENCH_CUDA_TRY(cudaGetDevice(&current_device));
+      CUCO_CUDA_TRY(cudaGetDevice(&current_device));
 
       int l2_cache_bytes = 0;
-      BENCH_CUDA_TRY(
+      CUCO_CUDA_TRY(
         cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));
 
       if (l2_cache_bytes > 0) {
         const int memset_value = 0;
         int* l2_cache_buffer   = nullptr;
-        BENCH_CUDA_TRY(cudaMalloc(&l2_cache_buffer, l2_cache_bytes));
-        BENCH_CUDA_TRY(cudaMemsetAsync(l2_cache_buffer, memset_value, l2_cache_bytes, stream_));
-        BENCH_CUDA_TRY(cudaFree(l2_cache_buffer));
+        CUCO_CUDA_TRY(cudaMalloc(&l2_cache_buffer, l2_cache_bytes));
+        CUCO_CUDA_TRY(cudaMemsetAsync(l2_cache_buffer, memset_value, l2_cache_bytes, stream_));
+        CUCO_CUDA_TRY(cudaFree(l2_cache_buffer));
       }
     }
 
-    BENCH_CUDA_TRY(cudaEventCreate(&start_));
-    BENCH_CUDA_TRY(cudaEventCreate(&stop_));
-    BENCH_CUDA_TRY(cudaEventRecord(start_, stream_));
+    CUCO_CUDA_TRY(cudaEventCreate(&start_));
+    CUCO_CUDA_TRY(cudaEventCreate(&stop_));
+    CUCO_CUDA_TRY(cudaEventRecord(start_, stream_));
   }
 
   cuda_event_timer() = delete;
@@ -118,13 +109,13 @@ class cuda_event_timer {
    */
   ~cuda_event_timer()
   {
-    BENCH_ASSERT_CUDA_SUCCESS(cudaEventRecord(stop_, stream_));
-    BENCH_ASSERT_CUDA_SUCCESS(cudaEventSynchronize(stop_));
+    CUCO_ASSERT_CUDA_SUCCESS(cudaEventRecord(stop_, stream_));
+    CUCO_ASSERT_CUDA_SUCCESS(cudaEventSynchronize(stop_));
     float milliseconds = 0.0f;
-    BENCH_ASSERT_CUDA_SUCCESS(cudaEventElapsedTime(&milliseconds, start_, stop_));
+    CUCO_ASSERT_CUDA_SUCCESS(cudaEventElapsedTime(&milliseconds, start_, stop_));
     p_state->SetIterationTime(milliseconds / (1000.0f));
-    BENCH_ASSERT_CUDA_SUCCESS(cudaEventDestroy(start_));
-    BENCH_ASSERT_CUDA_SUCCESS(cudaEventDestroy(stop_));
+    CUCO_ASSERT_CUDA_SUCCESS(cudaEventDestroy(start_));
+    CUCO_ASSERT_CUDA_SUCCESS(cudaEventDestroy(stop_));
   }
 
  private:

diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -167,7 +167,7 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::count(
   auto view            = get_device_view();
   auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size);
 
-  cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
+  CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
   std::size_t h_counter;
 
   detail::count<block_size, cg_size(), is_outer>
@@ -198,7 +198,7 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::count_
   auto view            = get_device_view();
   auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size);
 
-  cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
+  CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
   std::size_t h_counter;
 
   detail::count<block_size, cg_size(), is_outer>
@@ -229,7 +229,7 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_c
   auto view            = get_device_view();
   auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);
 
-  cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
+  CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
   std::size_t h_counter;
 
   detail::pair_count<block_size, cg_size(), is_outer>
@@ -260,7 +260,7 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_c
   auto view            = get_device_view();
   auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);
 
-  cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
+  CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
   std::size_t h_counter;
 
   detail::pair_count<block_size, cg_size(), is_outer>
@@ -307,7 +307,7 @@ OutputIt static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::retrieve(
                                                                 KeyEqual>,
                                                block_size);
 
-  cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
+  CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
   std::size_t h_counter;
 
   detail::retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
@@ -357,7 +357,7 @@ OutputIt static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::retrieve_
                                                                 KeyEqual>,
                                                block_size);
 
-  cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
+  CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
   std::size_t h_counter;
 
   detail::retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
@@ -403,7 +403,7 @@ static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_retrieve(
   }();
   auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);
 
-  cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
+  CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
   std::size_t h_counter;
 
   detail::pair_retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
@@ -453,7 +453,7 @@ static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_retrieve_oute
   }();
   auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);
 
-  cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
+  CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
   std::size_t h_counter;
 
   detail::pair_retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>

diff --git a/include/cuco/detail/utils.hpp b/include/cuco/detail/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,8 @@
 
 #pragma once
 
+#include <cuco/detail/error.hpp>
+
 #include <iterator>
 #include <type_traits>
 
@@ -43,11 +45,12 @@ template <typename Kernel>
 auto get_grid_size(Kernel kernel, std::size_t block_size, std::size_t dynamic_smem_bytes = 0)
 {
   int grid_size{-1};
-  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&grid_size, kernel, block_size, dynamic_smem_bytes);
+  CUCO_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    &grid_size, kernel, block_size, dynamic_smem_bytes));
   int dev_id{-1};
-  cudaGetDevice(&dev_id);
+  CUCO_CUDA_TRY(cudaGetDevice(&dev_id));
   int num_sms{-1};
-  cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id);
+  CUCO_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
   grid_size *= num_sms;
   return grid_size;
 }

diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu
@@ -66,7 +66,7 @@ TEMPLATE_TEST_CASE_SIG("Parallel insert-or-update",
   static constexpr int Blocks  = 1024;
   static constexpr int Threads = 128;
   parallel_sum<<<Blocks, Threads>>>(m.get_device_mutable_view());
-  cudaDeviceSynchronize();
+  CUCO_CUDA_TRY(cudaDeviceSynchronize());
 
   thrust::device_vector<Key> d_keys(Iters);
   thrust::device_vector<Value> d_values(Iters);

diff --git a/tests/static_map/key_sentinel_test.cu b/tests/static_map/key_sentinel_test.cu
@@ -49,7 +49,7 @@ TEMPLATE_TEST_CASE_SIG(
   for (int i = 0; i < SIZE; i++) {
     h_A[i] = i;
   }
-  cudaMemcpyToSymbol(A, h_A, SIZE * sizeof(int));
+  CUCO_CUDA_TRY(cudaMemcpyToSymbol(A, h_A, SIZE * sizeof(int)));
 
   auto pairs_begin = thrust::make_transform_iterator(
     thrust::make_counting_iterator<T>(0),

diff --git a/tests/static_map/stream_test.cu b/tests/static_map/stream_test.cu
@@ -38,7 +38,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream",
                        (int64_t, int64_t))
 {
   cudaStream_t stream;
-  cudaStreamCreate(&stream);
+  CUCO_CUDA_TRY(cudaStreamCreate(&stream));
 
   constexpr std::size_t num_keys{500'000};
   cuco::static_map<Key, Value> map{1'000'000,
@@ -67,7 +67,6 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream",
 
     map.insert(pairs_begin, pairs_begin + num_keys, hash_fn, equal_fn, stream);
     map.find(d_keys.begin(), d_keys.end(), d_results.begin(), hash_fn, equal_fn, stream);
-    // cudaStreamSynchronize(stream);
     auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin()));
 
     REQUIRE(cuco::test::all_of(
@@ -87,5 +86,5 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream",
     REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), thrust::identity{}, stream));
   }
 
-  cudaStreamDestroy(stream);
+  CUCO_CUDA_TRY(cudaStreamDestroy(stream));
 }
diff --git a/tests/static_multimap/pair_function_test.cu b/tests/static_multimap/pair_function_test.cu
@@ -43,7 +43,7 @@ template <typename Key, typename Value, typename Map, typename PairIt>
 __inline__ void test_pair_functions(Map& map, PairIt pair_begin, std::size_t num_pairs)
 {
   map.insert(pair_begin, pair_begin + num_pairs);
-  cudaStreamSynchronize(0);
+  CUCO_CUDA_TRY(cudaStreamSynchronize(0));
 
   auto res = map.get_size();
   REQUIRE(res == num_pairs);

diff --git a/tests/utils.hpp b/tests/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 
 #include <utils.cuh>
 
+#include <cuco/detail/error.hpp>
+
 #include <thrust/functional.h>
 
 #include <cooperative_groups.h>
@@ -39,19 +41,19 @@ int count_if(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0)
   auto const grid_size = (size + block_size - 1) / block_size;
 
   int* count;
-  cudaMallocManaged(&count, sizeof(int));
+  CUCO_CUDA_TRY(cudaMallocManaged(&count, sizeof(int)));
 
   *count = 0;
   int device_id;
-  cudaGetDevice(&device_id);
-  cudaMemPrefetchAsync(count, sizeof(int), device_id, stream);
+  CUCO_CUDA_TRY(cudaGetDevice(&device_id));
+  CUCO_CUDA_TRY(cudaMemPrefetchAsync(count, sizeof(int), device_id, stream));
 
   detail::count_if<<<grid_size, block_size, 0, stream>>>(begin, end, count, p);
-  cudaStreamSynchronize(stream);
+  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
 
   auto res = *count;
 
-  cudaFree(count);
+  CUCO_CUDA_TRY(cudaFree(count));
 
   return res;
 }
@@ -85,19 +87,19 @@ bool equal(Iterator1 begin1, Iterator1 end1, Iterator2 begin2, Predicate p, cuda
   auto const grid_size = (size + block_size - 1) / block_size;
 
   int* count;
-  cudaMallocManaged(&count, sizeof(int));
+  CUCO_CUDA_TRY(cudaMallocManaged(&count, sizeof(int)));
 
   *count = 0;
   int device_id;
-  cudaGetDevice(&device_id);
-  cudaMemPrefetchAsync(count, sizeof(int), device_id, stream);
+  CUCO_CUDA_TRY(cudaGetDevice(&device_id));
+  CUCO_CUDA_TRY(cudaMemPrefetchAsync(count, sizeof(int), device_id, stream));
 
   detail::count_if<<<grid_size, block_size, 0, stream>>>(begin1, end1, begin2, count, p);
-  cudaStreamSynchronize(stream);
+  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
 
   auto res = *count;
 
-  cudaFree(count);
+  CUCO_CUDA_TRY(cudaFree(count));
 
   return res == size;
 }