From b14cff677f530c2758ae0ee1844a8adce3a40bd2 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Mon, 16 Dec 2019 23:23:41 +0100
Subject: [PATCH] Always record and query the CUDA event, to minimize need for
 error checking in CUDAScopedContextProduce destructor

---
 .../Common/interface/CUDAProduct.h            |  8 +++---
 .../Common/interface/CUDAProductBase.h        |  9 +++----
 CUDADataFormats/Common/src/CUDAProductBase.cc | 22 ++++++++--------
 .../CUDACore/interface/CUDAScopedContext.h    | 21 +++++-----------
 .../CUDACore/src/CUDAScopedContext.cc         | 25 +++----------------
 5 files changed, 29 insertions(+), 56 deletions(-)
diff --git a/CUDADataFormats/Common/interface/CUDAProduct.h b/CUDADataFormats/Common/interface/CUDAProduct.h
index 75c9c80e7f206..130ce1e5486c0 100644
--- a/CUDADataFormats/Common/interface/CUDAProduct.h
+++ b/CUDADataFormats/Common/interface/CUDAProduct.h
@@ -42,12 +42,12 @@ class CUDAProduct : public CUDAProductBase {
   friend class CUDAScopedContextProduce;
   friend class edm::Wrapper<CUDAProduct<T>>;
 
-  explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, T data)
-      : CUDAProductBase(device, std::move(stream)), data_(std::move(data)) {}
+  explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event, T data)
+    : CUDAProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {}
 
   template <typename... Args>
-  explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, Args&&... args)
-      : CUDAProductBase(device, std::move(stream)), data_(std::forward<Args>(args)...) {}
+  explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event, Args&&... args)
+    : CUDAProductBase(device, std::move(stream), std::move(event)), data_(std::forward<Args>(args)...) {}
 
   T data_;  //!
 };
diff --git a/CUDADataFormats/Common/interface/CUDAProductBase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h
index 219b7e619de7f..361d3523617b6 100644
--- a/CUDADataFormats/Common/interface/CUDAProductBase.h
+++ b/CUDADataFormats/Common/interface/CUDAProductBase.h
@@ -50,18 +50,17 @@ class CUDAProductBase {
   // mutable access is needed even if the CUDAScopedContext itself
   // would be const. Therefore it is ok to return a non-const
   // pointer from a const method here.
-  cudaEvent_t event() const { return event_ ? event_.get() : nullptr; }
+  cudaEvent_t event() const { return event_.get(); }
 
 protected:
-  explicit CUDAProductBase(int device, cudautils::SharedStreamPtr stream)
-      : stream_{std::move(stream)}, device_{device} {}
+  explicit CUDAProductBase(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event)
+    : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {}
 
 private:
   friend class impl::CUDAScopedContextBase;
   friend class CUDAScopedContextProduce;
 
-  // The following functions are intended to be used only from CUDAScopedContext
-  void setEvent(cudautils::SharedEventPtr event) { event_ = std::move(event); }
+  // The following function is intended to be used only from CUDAScopedContext
   const cudautils::SharedStreamPtr& streamPtr() const { return stream_; }
 
   bool mayReuseStream() const {
diff --git a/CUDADataFormats/Common/src/CUDAProductBase.cc b/CUDADataFormats/Common/src/CUDAProductBase.cc
index 72302d3165676..a5aa9e0221b56 100644
--- a/CUDADataFormats/Common/src/CUDAProductBase.cc
+++ b/CUDADataFormats/Common/src/CUDAProductBase.cc
@@ -2,10 +2,9 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h"
 
 bool CUDAProductBase::isAvailable() const {
-  // In absence of event, the product was available already at the end
-  // of produce() of the producer.
-  if (not event_) {
-    return true;
+  // if default-constructed, the product is not available
+  if(not event_) {
+    return false;
   }
   return cudautils::eventIsOccurred(event_.get());
 }
@@ -15,13 +14,14 @@ CUDAProductBase::~CUDAProductBase() {
   // complete before destructing the product. This is to make sure
   // that the EDM stream does not move to the next event before all
   // asynchronous processing of the current is complete.
-  if (event_) {
-    // TODO: a callback notifying a WaitingTaskHolder (or similar)
-    // would avoid blocking the CPU, but would also require more work.
-    //
-    // Intentionally not checking the return value to avoid throwing
-    // exceptions. If this call would fail, we should get failures
-    // elsewhere as well.
+
+  // TODO: a callback notifying a WaitingTaskHolder (or similar)
+  // would avoid blocking the CPU, but would also require more work.
+  //
+  // Intentionally not checking the return value to avoid throwing
+  // exceptions. If this call would fail, we should get failures
+  // elsewhere as well.
+  if(event_) {
     cudaEventSynchronize(event_.get());
   }
 }
diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h
index 70539006d2563..f5dc53b785a05 100644
--- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h
+++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h
@@ -10,6 +10,7 @@
 #include "FWCore/Utilities/interface/EDPutToken.h"
 #include "FWCore/Utilities/interface/StreamID.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
 
@@ -154,27 +155,18 @@ class CUDAScopedContextProduce : public impl::CUDAScopedContextGetterBase {
   explicit CUDAScopedContextProduce(CUDAContextState& state)
       : CUDAScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {}
 
+  /// Record the CUDA event, all asynchronous work must have been queued before the destructor
   ~CUDAScopedContextProduce();
 
   template <typename T>
   std::unique_ptr<CUDAProduct<T>> wrap(T data) {
     // make_unique doesn't work because of private constructor
-    //
-    // CUDAProduct<T> constructor records CUDA event to the CUDA
-    // stream. The event will become "occurred" after all work queued
-    // to the stream before this point has been finished.
-    std::unique_ptr<CUDAProduct<T>> ret(new CUDAProduct<T>(device(), streamPtr(), std::move(data)));
-    createEventIfStreamBusy();
-    ret->setEvent(event_);
-    return ret;
+    return std::unique_ptr<CUDAProduct<T>>(new CUDAProduct<T>(device(), streamPtr(), event_, std::move(data)));
   }
 
   template <typename T, typename... Args>
   auto emplace(edm::Event& iEvent, edm::EDPutTokenT<T> token, Args&&... args) {
-    auto ret = iEvent.emplace(token, device(), streamPtr(), std::forward<Args>(args)...);
-    createEventIfStreamBusy();
-    const_cast<T&>(*ret).setEvent(event_);
-    return ret;
+    return iEvent.emplace(token, device(), streamPtr(), event_, std::forward<Args>(args)...);
   }
 
 private:
@@ -184,9 +176,8 @@ class CUDAScopedContextProduce : public impl::CUDAScopedContextGetterBase {
   explicit CUDAScopedContextProduce(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event)
       : CUDAScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {}
 
-  void createEventIfStreamBusy();
-
-  cudautils::SharedEventPtr event_;
+  // create the CUDA Event upfront to catch possible errors from its creation
+  cudautils::SharedEventPtr event_ = cudautils::getCUDAEventCache().getCUDAEvent();
 };
 
 /**
diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc
index df56c318e22fa..2d2a155a5bc11 100644
--- a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc
+++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc
@@ -3,7 +3,6 @@
 #include "FWCore/MessageLogger/interface/MessageLogger.h"
 #include "FWCore/ServiceRegistry/interface/Service.h"
 #include "FWCore/Utilities/interface/Exception.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
@@ -107,26 +106,10 @@ void CUDAScopedContextAcquire::throwNoState() {
 ////////////////////
 
 CUDAScopedContextProduce::~CUDAScopedContextProduce() {
-  if (event_) {
-    cudaCheck(cudaEventRecord(event_.get(), stream()));
-  }
-}
-
-void CUDAScopedContextProduce::createEventIfStreamBusy() {
-  if (event_) {
-    return;
-  }
-  auto ret = cudaStreamQuery(stream());
-  if (ret == cudaSuccess) {
-    return;
-  }
-  if (ret != cudaErrorNotReady) {
-    // cudaErrorNotReady indicates that the stream is busy, and thus
-    // is not an error
-    cudaCheck(ret);
-  }
-
-  event_ = cudautils::getCUDAEventCache().getCUDAEvent();
+  // Intentionally not checking the return value to avoid throwing
+  // exceptions. If this call would fail, we should get failures
+  // elsewhere as well.
+  cudaEventRecord(event_.get(), stream());
 }
 
 ////////////////////