diff --git a/HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h b/HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h
index 53045b59c0a98..8e5001b525351 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h
@@ -18,12 +18,19 @@ namespace cudautils {
     CUDAEventCache();
 
     // Gets a (cached) CUDA event for the current device. The event
-    // will be returned to the cache by the shared_ptr destructor.
+    // will be returned to the cache by the shared_ptr destructor. The
+    // returned event is guaranteed to have "occurred", i.e.
+    // cudaEventQuery() == cudaSuccess.
+    //
     // This function is thread safe
     SharedEventPtr getCUDAEvent();
 
   private:
     friend class ::CUDAService;
+
+    // thread safe; the event may be recorded but not yet occurred
+    SharedEventPtr makeOrGet(int dev);
+
     // not thread safe, intended to be called only from CUDAService destructor
     void clear();
 
diff --git a/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc b/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc
index 119e79dc29149..bf79d0bb54568 100644
--- a/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc
+++ b/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc
@@ -19,8 +19,36 @@ namespace cudautils {
 
   SharedEventPtr CUDAEventCache::getCUDAEvent() {
     const auto dev = cudautils::currentDevice();
+    auto event = makeOrGet(dev);
+    auto ret = cudaEventQuery(event.get());
+    // event has occurred, return immediately
+    if (ret == cudaSuccess) {
+      return event;
+    }
+    // any return code other than "not ready" is an error, throw exception
+    if (ret != cudaErrorNotReady) {
+      cudaCheck(ret);
+    }
+
+    // Got a recorded, but not yet occurred event. Try until we get an
+    // occurred event. Need to keep all recorded events until an
+    // occurred event is found in order to avoid ping-pong with a
+    // recorded event.
+    std::vector<SharedEventPtr> ptrs{std::move(event)};
+    do {
+      event = makeOrGet(dev);
+      ret = cudaEventQuery(event.get());
+      if (ret == cudaErrorNotReady) {
+        ptrs.emplace_back(std::move(event));
+      } else if (ret != cudaSuccess) {
+        cudaCheck(ret);
+      }
+    } while (ret != cudaSuccess);
+    return event;
+  }
+
+  SharedEventPtr CUDAEventCache::makeOrGet(int dev) {
     return cache_[dev].makeOrGet([dev]() {
-      // TODO(?): We should not return a recorded, but not-yet-occurred event
       cudaEvent_t event;
       // it should be a bit faster to ignore timings
       cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));