Skip to content

Commit

Permalink
Merge pull request #266 from PointKernel/guard-cuda-apis
Browse files Browse the repository at this point in the history
Guard CUDA runtime APIs with error checking
  • Loading branch information
PointKernel authored Jan 31, 2023
2 parents 5943222 + e851556 commit 8523ab1
Show file tree
Hide file tree
Showing 10 changed files with 63 additions and 66 deletions.
18 changes: 9 additions & 9 deletions benchmarks/hash_table/static_map_bench.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -95,16 +95,16 @@ static void BM_static_map_insert(::benchmark::State& state)
map_type map{size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
CUCO_CUDA_TRY(cudaEventCreate(&start));
CUCO_CUDA_TRY(cudaEventCreate(&stop));

cudaEventRecord(start);
CUCO_CUDA_TRY(cudaEventRecord(start));
map.insert(d_pairs.begin(), d_pairs.end());
cudaEventRecord(stop);
cudaEventSynchronize(stop);
CUCO_CUDA_TRY(cudaEventRecord(stop));
CUCO_CUDA_TRY(cudaEventSynchronize(stop));

float ms;
cudaEventElapsedTime(&ms, start, stop);
CUCO_CUDA_TRY(cudaEventElapsedTime(&ms, start, stop));

state.SetIterationTime(ms / 1000);
}
Expand Down Expand Up @@ -148,7 +148,7 @@ static void BM_static_map_search_all(::benchmark::State& state)
map.find(d_keys.begin(), d_keys.end(), d_results.begin());
// TODO: get rid of sync and rewrite the benchmark with `nvbench`
// once https://github.com/NVIDIA/nvbench/pull/80 is merged
cudaDeviceSynchronize();
CUCO_CUDA_TRY(cudaDeviceSynchronize());
}

state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
Expand Down Expand Up @@ -195,7 +195,7 @@ static void BM_static_map_search_none(::benchmark::State& state)
map.find(d_keys.begin(), d_keys.end(), d_results.begin());
// TODO: get rid of sync and rewrite the benchmark with `nvbench`
// once https://github.com/NVIDIA/nvbench/pull/80 is merged
cudaDeviceSynchronize();
CUCO_CUDA_TRY(cudaDeviceSynchronize());
}

state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
Expand Down
6 changes: 4 additions & 2 deletions benchmarks/reduce_by_key/reduce_by_key.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,6 +14,8 @@
* limitations under the License.
*/

#include <cuco/detail/error.hpp>

#include <benchmark/benchmark.h>

#include <thrust/device_vector.h>
Expand Down Expand Up @@ -75,7 +77,7 @@ static void BM_thrust(::benchmark::State& state)
thrust::device_vector<Value> values(state.range(0));
state.ResumeTiming();
thrust_reduce_by_key(keys.begin(), keys.end(), values.begin());
cudaDeviceSynchronize();
CUCO_CUDA_TRY(cudaDeviceSynchronize());
}
}
BENCHMARK_TEMPLATE(BM_thrust, int32_t, int32_t)
Expand Down
41 changes: 16 additions & 25 deletions benchmarks/synchronization.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,24 +16,15 @@

#pragma once

#include <cuco/detail/error.hpp>

// Google Benchmark library
#include <benchmark/benchmark.h>

#include <cuda_runtime_api.h>

#include <stdexcept>

#define BENCH_CUDA_TRY(call) \
do { \
auto const status = (call); \
if (cudaSuccess != status) { throw std::runtime_error("CUDA error detected."); } \
} while (0)

#define BENCH_ASSERT_CUDA_SUCCESS(expr) \
do { \
cudaError_t const status = (expr); \
assert(cudaSuccess == status); \
} while (0)
/**
* @brief This class serves as a wrapper for using `cudaEvent_t` as the user
* defined timer within the framework of google benchmark
Expand Down Expand Up @@ -90,24 +81,24 @@ class cuda_event_timer {
// flush all of L2$
if (flush_l2_cache) {
int current_device = 0;
BENCH_CUDA_TRY(cudaGetDevice(&current_device));
CUCO_CUDA_TRY(cudaGetDevice(&current_device));

int l2_cache_bytes = 0;
BENCH_CUDA_TRY(
CUCO_CUDA_TRY(
cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));

if (l2_cache_bytes > 0) {
const int memset_value = 0;
int* l2_cache_buffer = nullptr;
BENCH_CUDA_TRY(cudaMalloc(&l2_cache_buffer, l2_cache_bytes));
BENCH_CUDA_TRY(cudaMemsetAsync(l2_cache_buffer, memset_value, l2_cache_bytes, stream_));
BENCH_CUDA_TRY(cudaFree(l2_cache_buffer));
CUCO_CUDA_TRY(cudaMalloc(&l2_cache_buffer, l2_cache_bytes));
CUCO_CUDA_TRY(cudaMemsetAsync(l2_cache_buffer, memset_value, l2_cache_bytes, stream_));
CUCO_CUDA_TRY(cudaFree(l2_cache_buffer));
}
}

BENCH_CUDA_TRY(cudaEventCreate(&start_));
BENCH_CUDA_TRY(cudaEventCreate(&stop_));
BENCH_CUDA_TRY(cudaEventRecord(start_, stream_));
CUCO_CUDA_TRY(cudaEventCreate(&start_));
CUCO_CUDA_TRY(cudaEventCreate(&stop_));
CUCO_CUDA_TRY(cudaEventRecord(start_, stream_));
}

cuda_event_timer() = delete;
Expand All @@ -118,13 +109,13 @@ class cuda_event_timer {
*/
~cuda_event_timer()
{
BENCH_ASSERT_CUDA_SUCCESS(cudaEventRecord(stop_, stream_));
BENCH_ASSERT_CUDA_SUCCESS(cudaEventSynchronize(stop_));
CUCO_ASSERT_CUDA_SUCCESS(cudaEventRecord(stop_, stream_));
CUCO_ASSERT_CUDA_SUCCESS(cudaEventSynchronize(stop_));
float milliseconds = 0.0f;
BENCH_ASSERT_CUDA_SUCCESS(cudaEventElapsedTime(&milliseconds, start_, stop_));
CUCO_ASSERT_CUDA_SUCCESS(cudaEventElapsedTime(&milliseconds, start_, stop_));
p_state->SetIterationTime(milliseconds / (1000.0f));
BENCH_ASSERT_CUDA_SUCCESS(cudaEventDestroy(start_));
BENCH_ASSERT_CUDA_SUCCESS(cudaEventDestroy(stop_));
CUCO_ASSERT_CUDA_SUCCESS(cudaEventDestroy(start_));
CUCO_ASSERT_CUDA_SUCCESS(cudaEventDestroy(stop_));
}

private:
Expand Down
18 changes: 9 additions & 9 deletions include/cuco/detail/static_multimap/static_multimap.inl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -167,7 +167,7 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::count(
auto view = get_device_view();
auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size);

cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;

detail::count<block_size, cg_size(), is_outer>
Expand Down Expand Up @@ -198,7 +198,7 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::count_
auto view = get_device_view();
auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size);

cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;

detail::count<block_size, cg_size(), is_outer>
Expand Down Expand Up @@ -229,7 +229,7 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_c
auto view = get_device_view();
auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);

cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;

detail::pair_count<block_size, cg_size(), is_outer>
Expand Down Expand Up @@ -260,7 +260,7 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_c
auto view = get_device_view();
auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);

cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;

detail::pair_count<block_size, cg_size(), is_outer>
Expand Down Expand Up @@ -307,7 +307,7 @@ OutputIt static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::retrieve(
KeyEqual>,
block_size);

cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;

detail::retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
Expand Down Expand Up @@ -357,7 +357,7 @@ OutputIt static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::retrieve_
KeyEqual>,
block_size);

cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;

detail::retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
Expand Down Expand Up @@ -403,7 +403,7 @@ static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_retrieve(
}();
auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);

cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;

detail::pair_retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
Expand Down Expand Up @@ -453,7 +453,7 @@ static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_retrieve_oute
}();
auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);

cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;

detail::pair_retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
Expand Down
11 changes: 7 additions & 4 deletions include/cuco/detail/utils.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,6 +15,8 @@

#pragma once

#include <cuco/detail/error.hpp>

#include <iterator>
#include <type_traits>

Expand Down Expand Up @@ -43,11 +45,12 @@ template <typename Kernel>
auto get_grid_size(Kernel kernel, std::size_t block_size, std::size_t dynamic_smem_bytes = 0)
{
int grid_size{-1};
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&grid_size, kernel, block_size, dynamic_smem_bytes);
CUCO_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&grid_size, kernel, block_size, dynamic_smem_bytes));
int dev_id{-1};
cudaGetDevice(&dev_id);
CUCO_CUDA_TRY(cudaGetDevice(&dev_id));
int num_sms{-1};
cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id);
CUCO_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
grid_size *= num_sms;
return grid_size;
}
Expand Down
2 changes: 1 addition & 1 deletion tests/static_map/insert_and_find_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ TEMPLATE_TEST_CASE_SIG("Parallel insert-or-update",
static constexpr int Blocks = 1024;
static constexpr int Threads = 128;
parallel_sum<<<Blocks, Threads>>>(m.get_device_mutable_view());
cudaDeviceSynchronize();
CUCO_CUDA_TRY(cudaDeviceSynchronize());

thrust::device_vector<Key> d_keys(Iters);
thrust::device_vector<Value> d_values(Iters);
Expand Down
2 changes: 1 addition & 1 deletion tests/static_map/key_sentinel_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ TEMPLATE_TEST_CASE_SIG(
for (int i = 0; i < SIZE; i++) {
h_A[i] = i;
}
cudaMemcpyToSymbol(A, h_A, SIZE * sizeof(int));
CUCO_CUDA_TRY(cudaMemcpyToSymbol(A, h_A, SIZE * sizeof(int)));

auto pairs_begin = thrust::make_transform_iterator(
thrust::make_counting_iterator<T>(0),
Expand Down
5 changes: 2 additions & 3 deletions tests/static_map/stream_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream",
(int64_t, int64_t))
{
cudaStream_t stream;
cudaStreamCreate(&stream);
CUCO_CUDA_TRY(cudaStreamCreate(&stream));

constexpr std::size_t num_keys{500'000};
cuco::static_map<Key, Value> map{1'000'000,
Expand Down Expand Up @@ -67,7 +67,6 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream",

map.insert(pairs_begin, pairs_begin + num_keys, hash_fn, equal_fn, stream);
map.find(d_keys.begin(), d_keys.end(), d_results.begin(), hash_fn, equal_fn, stream);
// cudaStreamSynchronize(stream);
auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin()));

REQUIRE(cuco::test::all_of(
Expand All @@ -87,5 +86,5 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream",
REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), thrust::identity{}, stream));
}

cudaStreamDestroy(stream);
CUCO_CUDA_TRY(cudaStreamDestroy(stream));
}
2 changes: 1 addition & 1 deletion tests/static_multimap/pair_function_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ template <typename Key, typename Value, typename Map, typename PairIt>
__inline__ void test_pair_functions(Map& map, PairIt pair_begin, std::size_t num_pairs)
{
map.insert(pair_begin, pair_begin + num_pairs);
cudaStreamSynchronize(0);
CUCO_CUDA_TRY(cudaStreamSynchronize(0));

auto res = map.get_size();
REQUIRE(res == num_pairs);
Expand Down
24 changes: 13 additions & 11 deletions tests/utils.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,6 +18,8 @@

#include <utils.cuh>

#include <cuco/detail/error.hpp>

#include <thrust/functional.h>

#include <cooperative_groups.h>
Expand All @@ -39,19 +41,19 @@ int count_if(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0)
auto const grid_size = (size + block_size - 1) / block_size;

int* count;
cudaMallocManaged(&count, sizeof(int));
CUCO_CUDA_TRY(cudaMallocManaged(&count, sizeof(int)));

*count = 0;
int device_id;
cudaGetDevice(&device_id);
cudaMemPrefetchAsync(count, sizeof(int), device_id, stream);
CUCO_CUDA_TRY(cudaGetDevice(&device_id));
CUCO_CUDA_TRY(cudaMemPrefetchAsync(count, sizeof(int), device_id, stream));

detail::count_if<<<grid_size, block_size, 0, stream>>>(begin, end, count, p);
cudaStreamSynchronize(stream);
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));

auto res = *count;

cudaFree(count);
CUCO_CUDA_TRY(cudaFree(count));

return res;
}
Expand Down Expand Up @@ -85,19 +87,19 @@ bool equal(Iterator1 begin1, Iterator1 end1, Iterator2 begin2, Predicate p, cuda
auto const grid_size = (size + block_size - 1) / block_size;

int* count;
cudaMallocManaged(&count, sizeof(int));
CUCO_CUDA_TRY(cudaMallocManaged(&count, sizeof(int)));

*count = 0;
int device_id;
cudaGetDevice(&device_id);
cudaMemPrefetchAsync(count, sizeof(int), device_id, stream);
CUCO_CUDA_TRY(cudaGetDevice(&device_id));
CUCO_CUDA_TRY(cudaMemPrefetchAsync(count, sizeof(int), device_id, stream));

detail::count_if<<<grid_size, block_size, 0, stream>>>(begin1, end1, begin2, count, p);
cudaStreamSynchronize(stream);
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));

auto res = *count;

cudaFree(count);
CUCO_CUDA_TRY(cudaFree(count));

return res == size;
}
Expand Down

0 comments on commit 8523ab1

Please sign in to comment.