From ee87ccd907445e01845444252a26171521df0ee8 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 5 Apr 2024 20:49:24 +0200 Subject: [PATCH] Ignore errors from alpaka::enqueue() in CachingAllocator::free() --- .../interface/CachingAllocator.h | 23 ++++- .../AlpakaInterface/test/BuildFile.xml | 6 ++ .../test/alpaka/testBuffer.dev.cc | 91 +++++++++++++++++++ 3 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testBuffer.dev.cc diff --git a/HeterogeneousCore/AlpakaInterface/interface/CachingAllocator.h b/HeterogeneousCore/AlpakaInterface/interface/CachingAllocator.h index 017b13a2c1341..2560361e796ef 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/CachingAllocator.h +++ b/HeterogeneousCore/AlpakaInterface/interface/CachingAllocator.h @@ -207,7 +207,28 @@ namespace cms::alpakatools { bool recache = (cachedBytes_.free + block.bytes <= maxCachedBytes_); if (recache) { - alpaka::enqueue(*(block.queue), *(block.event)); + // If enqueuing the event fails, very likely an error has + // occurred in the asynchronous processing. In that case the + // error will show up in all device API function calls, and + // the free() will be called by destructors during stack + // unwinding. In order to avoid terminate() being called + // because of multiple exceptions it is best to ignore these + // errors. + try { + alpaka::enqueue(*(block.queue), *(block.event)); + } catch (std::exception& e) { + if (debug_) { + std::ostringstream out; + out << "CachingAllocator::free() error from alpaka::enqueue(): " << e.what() << "\n"; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << block.bytes << " bytes at " + << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << ", event " + << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() + << " available blocks cached (" << cachedBytes_.free << " bytes), " << liveBlocks_.size() + << " live blocks (" << cachedBytes_.live << " bytes) outstanding." << std::endl; + std::cout << out.str() << std::endl; + } + return; + } cachedBytes_.free += block.bytes; // after the call to insert(), cachedBlocks_ shares ownership of the buffer // TODO use std::move ? diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml index 0198e36f9166f..426a750e3d0b9 100644 --- a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml +++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml @@ -24,6 +24,12 @@ + + + + + + diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testBuffer.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testBuffer.dev.cc new file mode 100644 index 0000000000000..77ca80f639ca4 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testBuffer.dev.cc @@ -0,0 +1,91 @@ +#include + +#define CATCH_CONFIG_MAIN +#include + +#include "FWCore/Utilities/interface/stringize.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +// each test binary is built for a single Alpaka backend +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +namespace { + constexpr size_t SIZE = 32; + + void testDeviceSideError(Device const& device) { + auto queue = Queue(device); + auto buf_h = cms::alpakatools::make_host_buffer(queue, SIZE); + auto buf_d = cms::alpakatools::make_device_buffer(queue, SIZE); + alpaka::memset(queue, buf_h, 0); + alpaka::memcpy(queue, buf_d, buf_h); + // On the host device I don't know how to fabricate a device-side + // error for which the Alpaka API calls would then throw an + // exception. Therefore I just throw the std::runtime_error to + // keep the caller side the same for all backends. At least the + // test ensures the buffer destructors won't throw exceptions + // during the stack unwinding of the thrown runtime_error. + if constexpr (std::is_same_v) { + throw std::runtime_error("assert"); + } else { + auto div = cms::alpakatools::make_workdiv(1, 1); + alpaka::exec( + queue, + div, + [] ALPAKA_FN_ACC(Acc1D const& acc, int* data, size_t size) { + for (auto index : cms::alpakatools::uniform_elements(acc, size)) { + assert(data[index] != 0); + } + }, + buf_d.data(), + SIZE); + alpaka::wait(queue); + } + } +} // namespace + +TEST_CASE("Test alpaka buffers for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend", + "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + FAIL("No devices available for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend, " + "the test will be skipped."); + } + + SECTION("Single device buffer") { + for (auto const& device : devices) { + auto queue = Queue(device); + auto buf = cms::alpakatools::make_device_buffer(queue, SIZE); + alpaka::memset(queue, buf, 0); + alpaka::wait(queue); + } + } + + SECTION("Single host buffer") { + for (auto const& device : devices) { + auto queue = Queue(device); + auto buf = cms::alpakatools::make_host_buffer(queue, SIZE); + buf[0] = 0; + alpaka::wait(queue); + } + } + + SECTION("Host and device buffers") { + for (auto const& device : devices) { + auto queue = Queue(device); + auto buf_h = cms::alpakatools::make_host_buffer(queue, SIZE); + auto buf_d = cms::alpakatools::make_device_buffer(queue, SIZE); + alpaka::memset(queue, buf_h, 0); + alpaka::memcpy(queue, buf_d, buf_h); + alpaka::wait(queue); + } + } + + SECTION("Buffer destruction after a device-side error") { + for (auto const& device : devices) { + REQUIRE_THROWS_AS(testDeviceSideError(device), std::runtime_error); + } + } +}