Skip to content

Commit

Permalink
Improve error handling in GPU callbacks
Browse files Browse the repository at this point in the history
Update the callbacks used on the CUDA and HIP/ROCm backends to match the
original CUDA implementation: in case of asynchronous errors, throw and
immediately catch an exception so that GDB can intercept it, then
propagate the exception to the framework.
  • Loading branch information
fwyzard committed Mar 20, 2024
1 parent d6052f1 commit 127bf32
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 11 deletions.
4 changes: 2 additions & 2 deletions HeterogeneousCore/AlpakaCore/src/alpaka/EDMetadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
}

// Enqueue on *queue_ a HostOnlyTask that notifies `holder` by calling its
// doneWaiting() method; `holder` is moved into the lambda's capture.
//
// NOTE(review): this hunk is rendered without +/- markers. The file header
// reports "2 additions & 2 deletions", so the two alpaka::enqueue lines and the
// two doneWaiting lines look like before/after pairs: the first of each pair
// takes no argument and always passes nullptr, the second receives a
// std::exception_ptr and forwards it to doneWaiting().
void EDMetadata::enqueueCallback(edm::WaitingTaskWithArenaHolder holder) {
alpaka::enqueue(*queue_, alpaka::HostOnlyTask([holder = std::move(holder)]() {
alpaka::enqueue(*queue_, alpaka::HostOnlyTask([holder = std::move(holder)](std::exception_ptr eptr) {
// The functor is required to be const, but the original waitingTaskHolder_
// needs to be notified...
// ... hence the const_cast on the captured holder before calling doneWaiting().
const_cast<edm::WaitingTaskWithArenaHolder&>(holder).doneWaiting(nullptr);
const_cast<edm::WaitingTaskWithArenaHolder&>(holder).doneWaiting(eptr);
}));
}

Expand Down
1 change: 1 addition & 0 deletions HeterogeneousCore/AlpakaInterface/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<use name="alpaka"/>
<use name="boost_header"/>
<use name="fmt"/>
<use name="FWCore/Utilities"/>
<export>
<lib name="1"/>
Expand Down
44 changes: 35 additions & 9 deletions HeterogeneousCore/AlpakaInterface/interface/HostOnlyTask.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#include <functional>
#include <memory>

#include <fmt/format.h>

#include <alpaka/alpaka.hpp>

namespace alpaka {
Expand All @@ -14,12 +16,12 @@ namespace alpaka {
//! dedicated host-side worker thread.
class HostOnlyTask {
public:
// NOTE(review): this hunk is rendered without +/- markers. Each pair of
// near-identical lines below looks like a before/after pair: the first uses
// the void() signature, the second the void(std::exception_ptr) signature.

//! Wrap a callable; it is moved into task_ and run by operator().
HostOnlyTask(std::function<void()> task) : task_(std::move(task)) {}
HostOnlyTask(std::function<void(std::exception_ptr)> task) : task_(std::move(task)) {}

//! Invoke the wrapped callable; the one-argument form forwards eptr to it.
void operator()() const { task_(); }
void operator()(std::exception_ptr eptr) const { task_(eptr); }

private:
//! The wrapped callable.
std::function<void()> task_;
std::function<void(std::exception_ptr)> task_;
};

namespace trait {
Expand All @@ -30,10 +32,22 @@ namespace alpaka {
struct Enqueue<QueueCudaRtNonBlocking, HostOnlyTask> {
using TApi = ApiCudaRt;

// Callback that runs the HostOnlyTask passed through `arg`.
//
// `arg` is cast back to HostOnlyTask* and wrapped in a std::unique_ptr, so the
// task object is deleted when this function returns.
// If `status` is cudaSuccess the task is invoked with a null exception_ptr.
// Otherwise a std::runtime_error describing the stream pointer, the error name
// and the error string is thrown and immediately caught, and the task is
// invoked with std::current_exception().
//
// NOTE(review): this hunk is rendered without +/- markers. The first signature
// (parameters commented out), the commented-out CHECK line and the bare
// `(*pTask)();` call look like the lines removed by the commit; the second
// signature and the if/else look like the lines added.
static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) {
//ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);
static void CUDART_CB callback(cudaStream_t queue, cudaError_t status, void* arg) {
// take ownership of the heap-allocated task handed over through the opaque pointer
std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));
(*pTask)();
if (status == cudaSuccess) {
// no error: run the task without an exception
(*pTask)(nullptr);
} else {
// wrap the exception in a try-catch block to let GDB "catch throw" break on it
// NOTE(review): std::runtime_error needs <stdexcept>, which is not among the
// includes visible in this hunk - confirm it is included.
try {
throw std::runtime_error(fmt::format("CUDA error: callback of stream {} received error {}: {}.",
fmt::ptr(queue),
cudaGetErrorName(status),
cudaGetErrorString(status)));
// NOTE(review): the exception is not modified here; catching by const reference would be more idiomatic.
} catch (std::exception&) {
// pass the exception to the task
(*pTask)(std::current_exception());
}
}
}

ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, HostOnlyTask task) -> void {
Expand All @@ -50,10 +64,22 @@ namespace alpaka {
struct Enqueue<QueueHipRtNonBlocking, HostOnlyTask> {
using TApi = ApiHipRt;

// Callback that runs the HostOnlyTask passed through `arg`.
//
// `arg` is cast back to HostOnlyTask* and wrapped in a std::unique_ptr, so the
// task object is deleted when this function returns.
// If `status` is hipSuccess the task is invoked with a null exception_ptr.
// Otherwise a std::runtime_error describing the stream pointer, the error name
// and the error string is thrown and immediately caught, and the task is
// invoked with std::current_exception().
//
// NOTE(review): this hunk is rendered without +/- markers. The first signature
// (parameters commented out), the commented-out CHECK line and the bare
// `(*pTask)();` call look like the lines removed by the commit; the second
// signature and the if/else look like the lines added.
static void callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) {
//ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);
static void callback(hipStream_t queue, hipError_t status, void* arg) {
// take ownership of the heap-allocated task handed over through the opaque pointer
std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));
(*pTask)();
if (status == hipSuccess) {
// no error: run the task without an exception
(*pTask)(nullptr);
} else {
// wrap the exception in a try-catch block to let GDB "catch throw" break on it
// NOTE(review): std::runtime_error needs <stdexcept>, which is not among the
// includes visible in this hunk - confirm it is included.
try {
throw std::runtime_error(fmt::format("HIP error: callback of stream {} received error {}: {}.",
fmt::ptr(queue),
hipGetErrorName(status),
hipGetErrorString(status)));
// NOTE(review): the exception is not modified here; catching by const reference would be more idiomatic.
} catch (std::exception&) {
// pass the exception to the task
(*pTask)(std::current_exception());
}
}
}

ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, HostOnlyTask task) -> void {
Expand Down

0 comments on commit 127bf32

Please sign in to comment.