Skip to content

Commit

Permalink
Improve error handling in GPU callbacks
Browse files Browse the repository at this point in the history
Update the callbacks used on the CUDA and HIP/ROCm backends to match the
original CUDA implementation: in case of asynchronous errors,
throw-catch an exception to let GDB intercept it, and propagate the
exception to the framework.
  • Loading branch information
fwyzard committed Mar 20, 2024
1 parent d6052f1 commit 8c3b153
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 11 deletions.
4 changes: 2 additions & 2 deletions HeterogeneousCore/AlpakaCore/src/alpaka/EDMetadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
}

// Enqueues a host-only task on *queue_; when that task is invoked it notifies
// `holder` through doneWaiting().
// NOTE(review): this span is a rendered diff without +/- markers, so each
// changed statement appears twice — presumably the removed form first and the
// added form second (the hunk header reports "2 additions & 2 deletions").
void EDMetadata::enqueueCallback(edm::WaitingTaskWithArenaHolder holder) {
// Earlier form: the lambda takes no argument.
alpaka::enqueue(*queue_, alpaka::HostOnlyTask([holder = std::move(holder)]() {
// Later form: the lambda receives a std::exception_ptr from the HostOnlyTask.
alpaka::enqueue(*queue_, alpaka::HostOnlyTask([holder = std::move(holder)](std::exception_ptr eptr) {
// The functor is required to be const, but the original waitingTaskHolder_
// needs to be notified...
// Earlier form: always reports completion with a null exception pointer.
const_cast<edm::WaitingTaskWithArenaHolder&>(holder).doneWaiting(nullptr);
// Later form: forwards the exception pointer the lambda was called with.
const_cast<edm::WaitingTaskWithArenaHolder&>(holder).doneWaiting(eptr);
}));
}

Expand Down
41 changes: 32 additions & 9 deletions HeterogeneousCore/AlpakaInterface/interface/HostOnlyTask.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <functional>
#include <memory>
#include <sstream>

#include <alpaka/alpaka.hpp>

Expand All @@ -14,12 +15,12 @@ namespace alpaka {
//! dedicated host-side worker thread.
class HostOnlyTask {
// Thin wrapper that stores a std::function and invokes it from operator().
// NOTE(review): this span is a rendered diff without +/- markers; each member
// appears twice — presumably the removed declaration first, then the added one.
// The added declarations change the stored callable from void() to
// void(std::exception_ptr).
public:
// Earlier form: wraps a callable taking no arguments.
HostOnlyTask(std::function<void()> task) : task_(std::move(task)) {}
// Later form: wraps a callable taking a std::exception_ptr; the argument is
// moved into task_.
HostOnlyTask(std::function<void(std::exception_ptr)> task) : task_(std::move(task)) {}

// Earlier form: invokes the stored callable with no arguments.
void operator()() const { task_(); }
// Later form: invokes the stored callable, passing eptr through unchanged.
void operator()(std::exception_ptr eptr) const { task_(eptr); }

private:
// The wrapped callable (earlier signature, then later signature).
std::function<void()> task_;
std::function<void(std::exception_ptr)> task_;
};

namespace trait {
Expand All @@ -30,10 +31,21 @@ namespace alpaka {
struct Enqueue<QueueCudaRtNonBlocking, HostOnlyTask> {
using TApi = ApiCudaRt;

// Stream callback for the CUDA backend: runs the HostOnlyTask passed in `arg`,
// handing it a null exception pointer on success or an exception describing
// the error otherwise.
// NOTE(review): this span is a rendered diff without +/- markers; the two
// signature lines and the two task invocations are presumably the removed and
// added forms of the same statements.
// Earlier form: queue and status are unnamed and ignored.
static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) {
//ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);
// Later form: queue and status are named and used below.
static void CUDART_CB callback(cudaStream_t queue, cudaError_t status, void* arg) {
// Take ownership of the task behind `arg`; the unique_ptr deletes it when the
// callback returns (presumably it was allocated by enqueue() — not visible here).
std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));
// Earlier form: unconditionally run the task.
(*pTask)();
if (status == cudaSuccess) {
// No error reported for the stream: run the task with a null exception pointer.
(*pTask)(nullptr);
} else {
// wrap the exception in a try-catch block to let GDB "catch throw" break on it
try {
// Build a message from the stream handle and the error's name and description.
std::ostringstream msg;
msg << "CUDA error: callback of stream " << queue << " received error " << cudaGetErrorName(status) << ": " << cudaGetErrorString(status) << ".";
// NOTE(review): std::runtime_error is declared in <stdexcept>, which is not
// among the includes visible in this hunk — confirm it is pulled in.
throw std::runtime_error(msg.str());
} catch (std::exception&) {
// pass the exception to the task
(*pTask)(std::current_exception());
}
}
}

ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, HostOnlyTask task) -> void {
Expand All @@ -50,10 +62,21 @@ namespace alpaka {
struct Enqueue<QueueHipRtNonBlocking, HostOnlyTask> {
using TApi = ApiHipRt;

// Stream callback for the HIP backend: runs the HostOnlyTask passed in `arg`,
// handing it a null exception pointer on success or an exception describing
// the error otherwise.
// NOTE(review): this span is a rendered diff without +/- markers; the two
// signature lines and the two task invocations are presumably the removed and
// added forms of the same statements.
// Earlier form: queue and status are unnamed and ignored.
static void callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) {
//ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);
// Later form: queue and status are named and used below.
static void callback(hipStream_t queue, hipError_t status, void* arg) {
// Take ownership of the task behind `arg`; the unique_ptr deletes it when the
// callback returns (presumably it was allocated by enqueue() — not visible here).
std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));
// Earlier form: unconditionally run the task.
(*pTask)();
if (status == hipSuccess) {
// No error reported for the stream: run the task with a null exception pointer.
(*pTask)(nullptr);
} else {
// wrap the exception in a try-catch block to let GDB "catch throw" break on it
try {
// Build a message from the stream handle and the error's name and description.
std::ostringstream msg;
msg << "HIP error: callback of stream " << queue << " received error " << hipGetErrorName(status) << ": " << hipGetErrorString(status) << ".";
// NOTE(review): std::runtime_error is declared in <stdexcept>, which is not
// among the includes visible in this hunk — confirm it is pulled in.
throw std::runtime_error(msg.str());
} catch (std::exception&) {
// pass the exception to the task
(*pTask)(std::current_exception());
}
}
}

ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, HostOnlyTask task) -> void {
Expand Down

0 comments on commit 8c3b153

Please sign in to comment.