Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve error handling in GPU callbacks [14.0.x] #44477

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions HeterogeneousCore/AlpakaCore/src/alpaka/EDMetadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
}

void EDMetadata::enqueueCallback(edm::WaitingTaskWithArenaHolder holder) {
- alpaka::enqueue(*queue_, alpaka::HostOnlyTask([holder = std::move(holder)]() {
+ alpaka::enqueue(*queue_, alpaka::HostOnlyTask([holder = std::move(holder)](std::exception_ptr eptr) {
// The functor is required to be const, but the original waitingTaskHolder_
// needs to be notified...
- const_cast<edm::WaitingTaskWithArenaHolder&>(holder).doneWaiting(nullptr);
+ const_cast<edm::WaitingTaskWithArenaHolder&>(holder).doneWaiting(eptr);
}));
}

Expand Down
1 change: 1 addition & 0 deletions HeterogeneousCore/AlpakaInterface/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<use name="alpaka"/>
<use name="boost_header"/>
+ <use name="fmt"/>
<use name="FWCore/Utilities"/>
<export>
<lib name="1"/>
Expand Down
44 changes: 35 additions & 9 deletions HeterogeneousCore/AlpakaInterface/interface/HostOnlyTask.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#include <functional>
#include <memory>

+ #include <fmt/format.h>
+
#include <alpaka/alpaka.hpp>

namespace alpaka {
Expand All @@ -14,12 +16,12 @@ namespace alpaka {
//! dedicated host-side worker thread.
class HostOnlyTask {
public:
- HostOnlyTask(std::function<void()> task) : task_(std::move(task)) {}
+ HostOnlyTask(std::function<void(std::exception_ptr)> task) : task_(std::move(task)) {}

- void operator()() const { task_(); }
+ void operator()(std::exception_ptr eptr) const { task_(eptr); }

private:
- std::function<void()> task_;
+ std::function<void(std::exception_ptr)> task_;
};

namespace trait {
Expand All @@ -30,10 +32,22 @@ namespace alpaka {
struct Enqueue<QueueCudaRtNonBlocking, HostOnlyTask> {
using TApi = ApiCudaRt;

- static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) {
- //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);
+ static void CUDART_CB callback(cudaStream_t queue, cudaError_t status, void* arg) {
std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));
- (*pTask)();
+ if (status == cudaSuccess) {
+ (*pTask)(nullptr);
+ } else {
+ // wrap the exception in a try-catch block to let GDB "catch throw" break on it
+ try {
+ throw std::runtime_error(fmt::format("CUDA error: callback of stream {} received error {}: {}.",
+ fmt::ptr(queue),
+ cudaGetErrorName(status),
+ cudaGetErrorString(status)));
+ } catch (std::exception&) {
+ // pass the exception to the task
+ (*pTask)(std::current_exception());
+ }
+ }
}

ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, HostOnlyTask task) -> void {
Expand All @@ -50,10 +64,22 @@ namespace alpaka {
struct Enqueue<QueueHipRtNonBlocking, HostOnlyTask> {
using TApi = ApiHipRt;

- static void callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) {
- //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);
+ static void callback(hipStream_t queue, hipError_t status, void* arg) {
std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));
- (*pTask)();
+ if (status == hipSuccess) {
+ (*pTask)(nullptr);
+ } else {
+ // wrap the exception in a try-catch block to let GDB "catch throw" break on it
+ try {
+ throw std::runtime_error(fmt::format("HIP error: callback of stream {} received error {}: {}.",
+ fmt::ptr(queue),
+ hipGetErrorName(status),
+ hipGetErrorString(status)));
+ } catch (std::exception&) {
+ // pass the exception to the task
+ (*pTask)(std::current_exception());
+ }
+ }
}

ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, HostOnlyTask task) -> void {
Expand Down