Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Guard CUDA runtime APIs with error checking #266

Merged
merged 1 commit into from
Jan 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions benchmarks/hash_table/static_map_bench.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -95,16 +95,16 @@ static void BM_static_map_insert(::benchmark::State& state)
map_type map{size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
CUCO_CUDA_TRY(cudaEventCreate(&start));
CUCO_CUDA_TRY(cudaEventCreate(&stop));

cudaEventRecord(start);
CUCO_CUDA_TRY(cudaEventRecord(start));
map.insert(d_pairs.begin(), d_pairs.end());
cudaEventRecord(stop);
cudaEventSynchronize(stop);
CUCO_CUDA_TRY(cudaEventRecord(stop));
CUCO_CUDA_TRY(cudaEventSynchronize(stop));

float ms;
cudaEventElapsedTime(&ms, start, stop);
CUCO_CUDA_TRY(cudaEventElapsedTime(&ms, start, stop));

state.SetIterationTime(ms / 1000);
}
Expand Down Expand Up @@ -148,7 +148,7 @@ static void BM_static_map_search_all(::benchmark::State& state)
map.find(d_keys.begin(), d_keys.end(), d_results.begin());
// TODO: get rid of sync and rewrite the benchmark with `nvbench`
// once https://github.com/NVIDIA/nvbench/pull/80 is merged
cudaDeviceSynchronize();
CUCO_CUDA_TRY(cudaDeviceSynchronize());
}

state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
Expand Down Expand Up @@ -195,7 +195,7 @@ static void BM_static_map_search_none(::benchmark::State& state)
map.find(d_keys.begin(), d_keys.end(), d_results.begin());
// TODO: get rid of sync and rewrite the benchmark with `nvbench`
// once https://github.com/NVIDIA/nvbench/pull/80 is merged
cudaDeviceSynchronize();
CUCO_CUDA_TRY(cudaDeviceSynchronize());
}

state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
Expand Down
6 changes: 4 additions & 2 deletions benchmarks/reduce_by_key/reduce_by_key.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,6 +14,8 @@
* limitations under the License.
*/

#include <cuco/detail/error.hpp>

#include <benchmark/benchmark.h>

#include <thrust/device_vector.h>
Expand Down Expand Up @@ -75,7 +77,7 @@ static void BM_thrust(::benchmark::State& state)
thrust::device_vector<Value> values(state.range(0));
state.ResumeTiming();
thrust_reduce_by_key(keys.begin(), keys.end(), values.begin());
cudaDeviceSynchronize();
CUCO_CUDA_TRY(cudaDeviceSynchronize());
}
}
BENCHMARK_TEMPLATE(BM_thrust, int32_t, int32_t)
Expand Down
41 changes: 16 additions & 25 deletions benchmarks/synchronization.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,24 +16,15 @@

#pragma once

#include <cuco/detail/error.hpp>

// Google Benchmark library
#include <benchmark/benchmark.h>

#include <cuda_runtime_api.h>

#include <stdexcept>

#define BENCH_CUDA_TRY(call) \
do { \
auto const status = (call); \
if (cudaSuccess != status) { throw std::runtime_error("CUDA error detected."); } \
} while (0)

#define BENCH_ASSERT_CUDA_SUCCESS(expr) \
do { \
cudaError_t const status = (expr); \
assert(cudaSuccess == status); \
} while (0)
/**
* @brief This class serves as a wrapper for using `cudaEvent_t` as the user
* defined timer within the framework of google benchmark
Expand Down Expand Up @@ -90,24 +81,24 @@ class cuda_event_timer {
// flush all of L2$
if (flush_l2_cache) {
int current_device = 0;
BENCH_CUDA_TRY(cudaGetDevice(&current_device));
CUCO_CUDA_TRY(cudaGetDevice(&current_device));

int l2_cache_bytes = 0;
BENCH_CUDA_TRY(
CUCO_CUDA_TRY(
cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));

if (l2_cache_bytes > 0) {
const int memset_value = 0;
int* l2_cache_buffer = nullptr;
BENCH_CUDA_TRY(cudaMalloc(&l2_cache_buffer, l2_cache_bytes));
BENCH_CUDA_TRY(cudaMemsetAsync(l2_cache_buffer, memset_value, l2_cache_bytes, stream_));
BENCH_CUDA_TRY(cudaFree(l2_cache_buffer));
CUCO_CUDA_TRY(cudaMalloc(&l2_cache_buffer, l2_cache_bytes));
CUCO_CUDA_TRY(cudaMemsetAsync(l2_cache_buffer, memset_value, l2_cache_bytes, stream_));
CUCO_CUDA_TRY(cudaFree(l2_cache_buffer));
}
}

BENCH_CUDA_TRY(cudaEventCreate(&start_));
BENCH_CUDA_TRY(cudaEventCreate(&stop_));
BENCH_CUDA_TRY(cudaEventRecord(start_, stream_));
CUCO_CUDA_TRY(cudaEventCreate(&start_));
CUCO_CUDA_TRY(cudaEventCreate(&stop_));
CUCO_CUDA_TRY(cudaEventRecord(start_, stream_));
}

cuda_event_timer() = delete;
Expand All @@ -118,13 +109,13 @@ class cuda_event_timer {
*/
~cuda_event_timer()
{
BENCH_ASSERT_CUDA_SUCCESS(cudaEventRecord(stop_, stream_));
BENCH_ASSERT_CUDA_SUCCESS(cudaEventSynchronize(stop_));
CUCO_ASSERT_CUDA_SUCCESS(cudaEventRecord(stop_, stream_));
CUCO_ASSERT_CUDA_SUCCESS(cudaEventSynchronize(stop_));
float milliseconds = 0.0f;
BENCH_ASSERT_CUDA_SUCCESS(cudaEventElapsedTime(&milliseconds, start_, stop_));
CUCO_ASSERT_CUDA_SUCCESS(cudaEventElapsedTime(&milliseconds, start_, stop_));
p_state->SetIterationTime(milliseconds / (1000.0f));
BENCH_ASSERT_CUDA_SUCCESS(cudaEventDestroy(start_));
BENCH_ASSERT_CUDA_SUCCESS(cudaEventDestroy(stop_));
CUCO_ASSERT_CUDA_SUCCESS(cudaEventDestroy(start_));
CUCO_ASSERT_CUDA_SUCCESS(cudaEventDestroy(stop_));
}

private:
Expand Down
18 changes: 9 additions & 9 deletions include/cuco/detail/static_multimap/static_multimap.inl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -167,7 +167,7 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::count(
auto view = get_device_view();
auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size);

cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;

detail::count<block_size, cg_size(), is_outer>
Expand Down Expand Up @@ -198,7 +198,7 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::count_
auto view = get_device_view();
auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size);

cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;

detail::count<block_size, cg_size(), is_outer>
Expand Down Expand Up @@ -229,7 +229,7 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_c
auto view = get_device_view();
auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);

cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;

detail::pair_count<block_size, cg_size(), is_outer>
Expand Down Expand Up @@ -260,7 +260,7 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_c
auto view = get_device_view();
auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);

cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;

detail::pair_count<block_size, cg_size(), is_outer>
Expand Down Expand Up @@ -307,7 +307,7 @@ OutputIt static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::retrieve(
KeyEqual>,
block_size);

cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;

detail::retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
Expand Down Expand Up @@ -357,7 +357,7 @@ OutputIt static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::retrieve_
KeyEqual>,
block_size);

cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;

detail::retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
Expand Down Expand Up @@ -403,7 +403,7 @@ static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_retrieve(
}();
auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);

cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;

detail::pair_retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
Expand Down Expand Up @@ -453,7 +453,7 @@ static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_retrieve_oute
}();
auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);

cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream);
CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;

detail::pair_retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
Expand Down
11 changes: 7 additions & 4 deletions include/cuco/detail/utils.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,6 +15,8 @@

#pragma once

#include <cuco/detail/error.hpp>

#include <iterator>
#include <type_traits>

Expand Down Expand Up @@ -43,11 +45,12 @@ template <typename Kernel>
auto get_grid_size(Kernel kernel, std::size_t block_size, std::size_t dynamic_smem_bytes = 0)
{
int grid_size{-1};
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&grid_size, kernel, block_size, dynamic_smem_bytes);
CUCO_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&grid_size, kernel, block_size, dynamic_smem_bytes));
int dev_id{-1};
cudaGetDevice(&dev_id);
CUCO_CUDA_TRY(cudaGetDevice(&dev_id));
int num_sms{-1};
cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id);
CUCO_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
grid_size *= num_sms;
return grid_size;
}
Expand Down
2 changes: 1 addition & 1 deletion tests/static_map/insert_and_find_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ TEMPLATE_TEST_CASE_SIG("Parallel insert-or-update",
static constexpr int Blocks = 1024;
static constexpr int Threads = 128;
parallel_sum<<<Blocks, Threads>>>(m.get_device_mutable_view());
cudaDeviceSynchronize();
CUCO_CUDA_TRY(cudaDeviceSynchronize());
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does catch2 handle exceptions or failing assertions? Will the test case just fail or will this nuke the whole test suite? As a workaround, we could wrap CUDA calls with a catch2 clause, e.g., REQUIRE(cudaSuccess == cudaDeviceSynchronize())

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

REQUIRE would return if a test case fails while CHECK won't stop at the first failure and will complete the whole test suite.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can also wrap CUCO_CUDA_TRY(...) with REQUIRE_NOTHROW(CUCO_CUDA_TRY(...))

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does catch2 handle exceptions or failing assertions? Will the test case just fail or will this nuke the whole test suite?

Just did a quick test, throwing an error will fail the current test case but other tests in the test suite will continue. So the behavior is similar to CHECK_NOTHROW. In that sense, wrapping those calls with catch2 macros is not necessary.


thrust::device_vector<Key> d_keys(Iters);
thrust::device_vector<Value> d_values(Iters);
Expand Down
2 changes: 1 addition & 1 deletion tests/static_map/key_sentinel_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ TEMPLATE_TEST_CASE_SIG(
for (int i = 0; i < SIZE; i++) {
h_A[i] = i;
}
cudaMemcpyToSymbol(A, h_A, SIZE * sizeof(int));
CUCO_CUDA_TRY(cudaMemcpyToSymbol(A, h_A, SIZE * sizeof(int)));

auto pairs_begin = thrust::make_transform_iterator(
thrust::make_counting_iterator<T>(0),
Expand Down
5 changes: 2 additions & 3 deletions tests/static_map/stream_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream",
(int64_t, int64_t))
{
cudaStream_t stream;
cudaStreamCreate(&stream);
CUCO_CUDA_TRY(cudaStreamCreate(&stream));

constexpr std::size_t num_keys{500'000};
cuco::static_map<Key, Value> map{1'000'000,
Expand Down Expand Up @@ -67,7 +67,6 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream",

map.insert(pairs_begin, pairs_begin + num_keys, hash_fn, equal_fn, stream);
map.find(d_keys.begin(), d_keys.end(), d_results.begin(), hash_fn, equal_fn, stream);
// cudaStreamSynchronize(stream);
auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin()));

REQUIRE(cuco::test::all_of(
Expand All @@ -87,5 +86,5 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream",
REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), thrust::identity{}, stream));
}

cudaStreamDestroy(stream);
CUCO_CUDA_TRY(cudaStreamDestroy(stream));
}
2 changes: 1 addition & 1 deletion tests/static_multimap/pair_function_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ template <typename Key, typename Value, typename Map, typename PairIt>
__inline__ void test_pair_functions(Map& map, PairIt pair_begin, std::size_t num_pairs)
{
map.insert(pair_begin, pair_begin + num_pairs);
cudaStreamSynchronize(0);
CUCO_CUDA_TRY(cudaStreamSynchronize(0));

auto res = map.get_size();
REQUIRE(res == num_pairs);
Expand Down
24 changes: 13 additions & 11 deletions tests/utils.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,6 +18,8 @@

#include <utils.cuh>

#include <cuco/detail/error.hpp>

#include <thrust/functional.h>

#include <cooperative_groups.h>
Expand All @@ -39,19 +41,19 @@ int count_if(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0)
auto const grid_size = (size + block_size - 1) / block_size;

int* count;
cudaMallocManaged(&count, sizeof(int));
CUCO_CUDA_TRY(cudaMallocManaged(&count, sizeof(int)));

*count = 0;
int device_id;
cudaGetDevice(&device_id);
cudaMemPrefetchAsync(count, sizeof(int), device_id, stream);
CUCO_CUDA_TRY(cudaGetDevice(&device_id));
CUCO_CUDA_TRY(cudaMemPrefetchAsync(count, sizeof(int), device_id, stream));

detail::count_if<<<grid_size, block_size, 0, stream>>>(begin, end, count, p);
cudaStreamSynchronize(stream);
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));

auto res = *count;

cudaFree(count);
CUCO_CUDA_TRY(cudaFree(count));

return res;
}
Expand Down Expand Up @@ -85,19 +87,19 @@ bool equal(Iterator1 begin1, Iterator1 end1, Iterator2 begin2, Predicate p, cuda
auto const grid_size = (size + block_size - 1) / block_size;

int* count;
cudaMallocManaged(&count, sizeof(int));
CUCO_CUDA_TRY(cudaMallocManaged(&count, sizeof(int)));

*count = 0;
int device_id;
cudaGetDevice(&device_id);
cudaMemPrefetchAsync(count, sizeof(int), device_id, stream);
CUCO_CUDA_TRY(cudaGetDevice(&device_id));
CUCO_CUDA_TRY(cudaMemPrefetchAsync(count, sizeof(int), device_id, stream));

detail::count_if<<<grid_size, block_size, 0, stream>>>(begin1, end1, begin2, count, p);
cudaStreamSynchronize(stream);
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));

auto res = *count;

cudaFree(count);
CUCO_CUDA_TRY(cudaFree(count));

return res == size;
}
Expand Down