diff --git a/CUDADataFormats/Common/interface/CUDAProduct.h b/CUDADataFormats/Common/interface/CUDAProduct.h index 75c9c80e7f206..130ce1e5486c0 100644 --- a/CUDADataFormats/Common/interface/CUDAProduct.h +++ b/CUDADataFormats/Common/interface/CUDAProduct.h @@ -42,12 +42,12 @@ class CUDAProduct : public CUDAProductBase { friend class CUDAScopedContextProduce; friend class edm::Wrapper>; - explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, T data) - : CUDAProductBase(device, std::move(stream)), data_(std::move(data)) {} + explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event, T data) + : CUDAProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {} template - explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, Args&&... args) - : CUDAProductBase(device, std::move(stream)), data_(std::forward(args)...) {} + explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event, Args&&... args) + : CUDAProductBase(device, std::move(stream), std::move(event)), data_(std::forward(args)...) {} T data_; //! }; diff --git a/CUDADataFormats/Common/interface/CUDAProductBase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h index 219b7e619de7f..361d3523617b6 100644 --- a/CUDADataFormats/Common/interface/CUDAProductBase.h +++ b/CUDADataFormats/Common/interface/CUDAProductBase.h @@ -50,18 +50,17 @@ class CUDAProductBase { // mutable access is needed even if the CUDAScopedContext itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. - cudaEvent_t event() const { return event_ ? event_.get() : nullptr; } + cudaEvent_t event() const { return event_.get(); } protected: - explicit CUDAProductBase(int device, cudautils::SharedStreamPtr stream) - : stream_{std::move(stream)}, device_{device} {} + explicit CUDAProductBase(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event) + : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {} private: friend class impl::CUDAScopedContextBase; friend class CUDAScopedContextProduce; - // The following functions are intended to be used only from CUDAScopedContext - void setEvent(cudautils::SharedEventPtr event) { event_ = std::move(event); } + // The following function is intended to be used only from CUDAScopedContext const cudautils::SharedStreamPtr& streamPtr() const { return stream_; } bool mayReuseStream() const { diff --git a/CUDADataFormats/Common/src/CUDAProductBase.cc b/CUDADataFormats/Common/src/CUDAProductBase.cc index 72302d3165676..a5aa9e0221b56 100644 --- a/CUDADataFormats/Common/src/CUDAProductBase.cc +++ b/CUDADataFormats/Common/src/CUDAProductBase.cc @@ -2,10 +2,9 @@ #include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" bool CUDAProductBase::isAvailable() const { - // In absence of event, the product was available already at the end - // of produce() of the producer. - if (not event_) { - return true; + // if default-constructed, the product is not available + if(not event_) { + return false; } return cudautils::eventIsOccurred(event_.get()); } @@ -15,13 +14,14 @@ CUDAProductBase::~CUDAProductBase() { // complete before destructing the product. This is to make sure // that the EDM stream does not move to the next event before all // asynchronous processing of the current is complete. - if (event_) { - // TODO: a callback notifying a WaitingTaskHolder (or similar) - // would avoid blocking the CPU, but would also require more work. - // - // Intentionally not checking the return value to avoid throwing - // exceptions. If this call would fail, we should get failures - // elsewhere as well. + + // TODO: a callback notifying a WaitingTaskHolder (or similar) + // would avoid blocking the CPU, but would also require more work. + // + // Intentionally not checking the return value to avoid throwing + // exceptions. If this call would fail, we should get failures + // elsewhere as well. + if(event_) { cudaEventSynchronize(event_.get()); } } diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h index 70539006d2563..f5dc53b785a05 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -10,6 +10,7 @@ #include "FWCore/Utilities/interface/EDPutToken.h" #include "FWCore/Utilities/interface/StreamID.h" #include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h" #include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" @@ -154,27 +155,18 @@ class CUDAScopedContextProduce : public impl::CUDAScopedContextGetterBase { explicit CUDAScopedContextProduce(CUDAContextState& state) : CUDAScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {} + /// Record the CUDA event, all asynchronous work must have been queued before the destructor ~CUDAScopedContextProduce(); template std::unique_ptr> wrap(T data) { // make_unique doesn't work because of private constructor - // - // CUDAProduct constructor records CUDA event to the CUDA - // stream. The event will become "occurred" after all work queued - // to the stream before this point has been finished. - std::unique_ptr> ret(new CUDAProduct(device(), streamPtr(), std::move(data))); - createEventIfStreamBusy(); - ret->setEvent(event_); - return ret; + return std::unique_ptr>(new CUDAProduct(device(), streamPtr(), event_, std::move(data))); } template auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) { - auto ret = iEvent.emplace(token, device(), streamPtr(), std::forward(args)...); - createEventIfStreamBusy(); - const_cast(*ret).setEvent(event_); - return ret; + return iEvent.emplace(token, device(), streamPtr(), event_, std::forward(args)...); } private: @@ -184,9 +176,8 @@ class CUDAScopedContextProduce : public impl::CUDAScopedContextGetterBase { explicit CUDAScopedContextProduce(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event) : CUDAScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {} - void createEventIfStreamBusy(); - - cudautils::SharedEventPtr event_; + // create the CUDA Event upfront to catch possible errors from its creation + cudautils::SharedEventPtr event_ = cudautils::getCUDAEventCache().getCUDAEvent(); }; /** diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc index df56c318e22fa..2d2a155a5bc11 100644 --- a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc @@ -3,7 +3,6 @@ #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "FWCore/ServiceRegistry/interface/Service.h" #include "FWCore/Utilities/interface/Exception.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -107,26 +106,10 @@ void CUDAScopedContextAcquire::throwNoState() { //////////////////// CUDAScopedContextProduce::~CUDAScopedContextProduce() { - if (event_) { - cudaCheck(cudaEventRecord(event_.get(), stream())); - } -} - -void CUDAScopedContextProduce::createEventIfStreamBusy() { - if (event_) { - return; - } - auto ret = cudaStreamQuery(stream()); - if (ret == cudaSuccess) { - return; - } - if (ret != cudaErrorNotReady) { - // cudaErrorNotReady indicates that the stream is busy, and thus - // is not an error - cudaCheck(ret); - } - - event_ = cudautils::getCUDAEventCache().getCUDAEvent(); + // Intentionally not checking the return value to avoid throwing + // exceptions. If this call would fail, we should get failures + // elsewhere as well. + cudaEventRecord(event_.get(), stream()); } ////////////////////