Skip to content

Commit

Permalink
Merge pull request cms-sw#44476 from fwyzard/Improve_GPU_callback_err…
Browse files Browse the repository at this point in the history
…or_handling_141x

Improve error handling in GPU callbacks
  • Loading branch information
cmsbuild authored Mar 20, 2024
2 parents ce15d4d + c435174 commit 86d37c3
Showing 3 changed files with 38 additions and 11 deletions.
4 changes: 2 additions & 2 deletions HeterogeneousCore/AlpakaCore/src/alpaka/EDMetadata.cc
Original file line number Diff line number Diff line change
@@ -23,10 +23,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
}

// Enqueue on *queue_ a HostOnlyTask whose body calls doneWaiting() on the
// WaitingTaskWithArenaHolder that was moved into the lambda capture.
// NOTE(review): this listing looks like a rendered diff without +/- markers;
// each pair of near-duplicate lines below is presumably the removed line
// followed by its replacement - confirm against the repository.
void EDMetadata::enqueueCallback(edm::WaitingTaskWithArenaHolder holder) {
// Variant whose lambda takes no arguments.
alpaka::enqueue(*queue_, alpaka::HostOnlyTask([holder = std::move(holder)]() {
// Variant whose lambda receives a std::exception_ptr from the caller of the task.
alpaka::enqueue(*queue_, alpaka::HostOnlyTask([holder = std::move(holder)](std::exception_ptr eptr) {
// The functor is required to be const, but the original waitingTaskHolder_
// needs to be notified...
// Variant that always passes nullptr to doneWaiting().
const_cast<edm::WaitingTaskWithArenaHolder&>(holder).doneWaiting(nullptr);
// Variant that forwards the received exception pointer to doneWaiting().
const_cast<edm::WaitingTaskWithArenaHolder&>(holder).doneWaiting(eptr);
}));
}

1 change: 1 addition & 0 deletions HeterogeneousCore/AlpakaInterface/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<use name="alpaka"/>
<use name="boost_header"/>
<use name="fmt"/>
<use name="FWCore/Utilities"/>
<export>
<lib name="1"/>
44 changes: 35 additions & 9 deletions HeterogeneousCore/AlpakaInterface/interface/HostOnlyTask.h
Original file line number Diff line number Diff line change
@@ -4,6 +4,8 @@
#include <functional>
#include <memory>

#include <fmt/format.h>

#include <alpaka/alpaka.hpp>

namespace alpaka {
@@ -14,12 +16,12 @@ namespace alpaka {
//! dedicated host-side worker thread.
// Thin wrapper that stores a callable in a std::function and invokes it
// through a const operator().
// NOTE(review): the constructor, the call operator and the task_ member each
// appear twice below; this looks like a rendered diff without +/- markers,
// with the void() form presumably being replaced by the
// void(std::exception_ptr) form - confirm against the repository.
class HostOnlyTask {
public:
// Store a task that takes no arguments.
HostOnlyTask(std::function<void()> task) : task_(std::move(task)) {}
// Store a task that takes a std::exception_ptr argument.
HostOnlyTask(std::function<void(std::exception_ptr)> task) : task_(std::move(task)) {}

// Invoke the stored task with no arguments.
void operator()() const { task_(); }
// Invoke the stored task, handing it the given exception pointer.
void operator()(std::exception_ptr eptr) const { task_(eptr); }

private:
// The wrapped callable (argument-less form).
std::function<void()> task_;
// The wrapped callable (std::exception_ptr form).
std::function<void(std::exception_ptr)> task_;
};

namespace trait {
@@ -30,10 +32,22 @@ namespace alpaka {
struct Enqueue<QueueCudaRtNonBlocking, HostOnlyTask> {
using TApi = ApiCudaRt;

// Stream callback for the CUDA runtime queue: runs the HostOnlyTask passed
// through `arg`.
// NOTE(review): this listing looks like a rendered diff without +/- markers;
// the two signatures and the bare `(*pTask)();` call are presumably the
// removed lines sitting next to their replacements - confirm against the
// repository.
// Variant that ignores the queue and status arguments.
static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) {
//ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);
// Variant that uses the queue and status arguments for error reporting.
static void CUDART_CB callback(cudaStream_t queue, cudaError_t status, void* arg) {
// Take ownership of the task object behind `arg`, so it is deleted when this
// function returns. Presumably it was heap-allocated by enqueue(); that code
// is not visible here.
std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));
// Variant that runs the task unconditionally, without looking at the status.
(*pTask)();
if (status == cudaSuccess) {
// No error reported: run the task with a null exception pointer.
(*pTask)(nullptr);
} else {
// wrap the exception in a try-catch block to let GDB "catch throw" break on it
try {
// Build an error message from the stream pointer and the error name and description.
throw std::runtime_error(fmt::format("CUDA error: callback of stream {} received error {}: {}.",
fmt::ptr(queue),
cudaGetErrorName(status),
cudaGetErrorString(status)));
} catch (std::exception&) {
// pass the exception to the task
(*pTask)(std::current_exception());
}
}
}

ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, HostOnlyTask task) -> void {
@@ -50,10 +64,22 @@ namespace alpaka {
struct Enqueue<QueueHipRtNonBlocking, HostOnlyTask> {
using TApi = ApiHipRt;

// Stream callback for the HIP runtime queue: runs the HostOnlyTask passed
// through `arg`.
// NOTE(review): this listing looks like a rendered diff without +/- markers;
// the two signatures and the bare `(*pTask)();` call are presumably the
// removed lines sitting next to their replacements - confirm against the
// repository.
// Variant that ignores the queue and status arguments.
static void callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) {
//ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);
// Variant that uses the queue and status arguments for error reporting.
static void callback(hipStream_t queue, hipError_t status, void* arg) {
// Take ownership of the task object behind `arg`, so it is deleted when this
// function returns. Presumably it was heap-allocated by enqueue(); that code
// is not visible here.
std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));
// Variant that runs the task unconditionally, without looking at the status.
(*pTask)();
if (status == hipSuccess) {
// No error reported: run the task with a null exception pointer.
(*pTask)(nullptr);
} else {
// wrap the exception in a try-catch block to let GDB "catch throw" break on it
try {
// Build an error message from the stream pointer and the error name and description.
throw std::runtime_error(fmt::format("HIP error: callback of stream {} received error {}: {}.",
fmt::ptr(queue),
hipGetErrorName(status),
hipGetErrorString(status)));
} catch (std::exception&) {
// pass the exception to the task
(*pTask)(std::current_exception());
}
}
}

ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, HostOnlyTask task) -> void {

0 comments on commit 86d37c3

Please sign in to comment.