Skip to content

Commit

Permalink
Ignore errors from alpaka::enqueue() in CachingAllocator::free()
Browse files Browse the repository at this point in the history
  • Loading branch information
makortel committed Apr 17, 2024
1 parent bdd404a commit ee87ccd
Show file tree
Hide file tree
Showing 3 changed files with 119 additions and 1 deletion.
23 changes: 22 additions & 1 deletion HeterogeneousCore/AlpakaInterface/interface/CachingAllocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,28 @@ namespace cms::alpakatools {

bool recache = (cachedBytes_.free + block.bytes <= maxCachedBytes_);
if (recache) {
alpaka::enqueue(*(block.queue), *(block.event));
// If enqueuing the event fails, very likely an error has
// occurred in the asynchronous processing. In that case the
// error will show up in all device API function calls, and
// the free() will be called by destructors during stack
// unwinding. In order to avoid terminate() being called
// because of multiple exceptions it is best to ignore these
// errors.
try {
alpaka::enqueue(*(block.queue), *(block.event));
} catch (std::exception& e) {
if (debug_) {
std::ostringstream out;
out << "CachingAllocator::free() error from alpaka::enqueue(): " << e.what() << "\n";
out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << block.bytes << " bytes at "
<< ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << ", event "
<< block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size()
<< " available blocks cached (" << cachedBytes_.free << " bytes), " << liveBlocks_.size()
<< " live blocks (" << cachedBytes_.live << " bytes) outstanding." << std::endl;
std::cout << out.str() << std::endl;
}
return;
}
cachedBytes_.free += block.bytes;
// after the call to insert(), cachedBlocks_ shares ownership of the buffer
// TODO use std::move ?
Expand Down
6 changes: 6 additions & 0 deletions HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@
<use name="HeterogeneousCore/AlpakaInterface"/>
</bin>

<!-- Test executable "alpakaTestBuffer", built from alpaka/testBuffer.dev.cc. -->
<bin name="alpakaTestBuffer" file="alpaka/testBuffer.dev.cc">
<use name="catch2"/>
<use name="HeterogeneousCore/AlpakaInterface"/>
<!-- NOTE(review): presumably asks the build system to build this test once per enabled Alpaka backend; confirm against the build rules. -->
<flags ALPAKA_BACKENDS="1"/>
</bin>

<bin name="alpakaTestAtomicPairCounter" file="alpaka/testAtomicPairCounter.dev.cc">
<use name="alpaka"/>
<use name="catch2"/>
Expand Down
91 changes: 91 additions & 0 deletions HeterogeneousCore/AlpakaInterface/test/alpaka/testBuffer.dev.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#include <alpaka/alpaka.hpp>

#define CATCH_CONFIG_MAIN
#include <catch.hpp>

#include "FWCore/Utilities/interface/stringize.h"
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"

// each test binary is built for a single Alpaka backend
using namespace ALPAKA_ACCELERATOR_NAMESPACE;

namespace {
  // Number of int elements in every buffer allocated by this test.
  constexpr size_t SIZE = 32;

  // Provokes an error while a host buffer and a device buffer are still alive,
  // so that both buffers get destroyed during the resulting stack unwinding.
  //
  // A queue is created for `device`, a host and a device buffer of SIZE ints
  // are allocated on it, the host buffer is zero-filled and then copied into
  // the device buffer. Afterwards:
  //  - if Device is alpaka::DevCpu, std::runtime_error is thrown directly;
  //  - otherwise a kernel asserting that every element is non-zero is launched
  //    on the (zero-filled) device buffer and the queue is waited on.
  // The caller expects a std::runtime_error to escape in both cases.
  void testDeviceSideError(Device const& device) {
    Queue workQueue(device);
    auto hostBuffer = cms::alpakatools::make_host_buffer<int[]>(workQueue, SIZE);
    auto deviceBuffer = cms::alpakatools::make_device_buffer<int[]>(workQueue, SIZE);
    alpaka::memset(workQueue, hostBuffer, 0);
    alpaka::memcpy(workQueue, deviceBuffer, hostBuffer);
    // No way is known to make a real device-side failure on the host
    // backend surface as an exception from the Alpaka API. Throwing
    // std::runtime_error by hand keeps the call site identical for
    // every backend, and still checks that the buffer destructors do
    // not throw while this exception unwinds the stack.
    if constexpr (std::is_same_v<Device, alpaka::DevCpu>) {
      throw std::runtime_error("assert");
    } else {
      // Work division built from the arguments (1, 1).
      auto workDiv = cms::alpakatools::make_workdiv<Acc1D>(1, 1);
      alpaka::exec<Acc1D>(
          workQueue,
          workDiv,
          [] ALPAKA_FN_ACC(Acc1D const& acc, int* values, size_t count) {
            // The buffer was zero-filled above, so this assertion is
            // meant to fail on the device.
            for (auto i : cms::alpakatools::uniform_elements(acc, count)) {
              assert(values[i] != 0);
            }
          },
          deviceBuffer.data(),
          SIZE);
      // Wait for the kernel to finish before returning.
      alpaka::wait(workQueue);
    }
  }
}  // namespace

TEST_CASE("Test alpaka buffers for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend",
          "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") {
  // Every section below iterates over all devices of the platform this
  // binary was built for.
  auto const& platformDevices = cms::alpakatools::devices<Platform>();
  if (platformDevices.empty()) {
    // NOTE(review): FAIL() reports a test failure although the message
    // talks about skipping - confirm which of the two is intended.
    FAIL("No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, "
         "the test will be skipped.");
  }

  // A device buffer alone: allocate, zero-fill through the queue, synchronise.
  SECTION("Single device buffer") {
    for (auto const& dev : platformDevices) {
      Queue workQueue(dev);
      auto deviceBuffer = cms::alpakatools::make_device_buffer<int[]>(workQueue, SIZE);
      alpaka::memset(workQueue, deviceBuffer, 0);
      alpaka::wait(workQueue);
    }
  }

  // A host buffer alone: allocate, write its first element from the host, synchronise.
  SECTION("Single host buffer") {
    for (auto const& dev : platformDevices) {
      Queue workQueue(dev);
      auto hostBuffer = cms::alpakatools::make_host_buffer<int[]>(workQueue, SIZE);
      hostBuffer[0] = 0;
      alpaka::wait(workQueue);
    }
  }

  // Both kinds of buffer: zero-fill the host buffer, copy it to the device, synchronise.
  SECTION("Host and device buffers") {
    for (auto const& dev : platformDevices) {
      Queue workQueue(dev);
      auto hostBuffer = cms::alpakatools::make_host_buffer<int[]>(workQueue, SIZE);
      auto deviceBuffer = cms::alpakatools::make_device_buffer<int[]>(workQueue, SIZE);
      alpaka::memset(workQueue, hostBuffer, 0);
      alpaka::memcpy(workQueue, deviceBuffer, hostBuffer);
      alpaka::wait(workQueue);
    }
  }

  // The helper must fail with std::runtime_error on every device; any other
  // outcome makes the assertion fail.
  SECTION("Buffer destruction after a device-side error") {
    for (auto const& dev : platformDevices) {
      REQUIRE_THROWS_AS(testDeviceSideError(dev), std::runtime_error);
    }
  }
}

0 comments on commit ee87ccd

Please sign in to comment.