From 575dc08f46ff3a35694c77ecfb36c91c932828db Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 15 Mar 2019 22:25:31 +0100 Subject: [PATCH 1/2] Create CUDA events only when needed Do not create event if the CUDA stream is idle, i.e. has already finished all work that was queued, at the point when data products are wrapped/emplaced for/to edm::Event. When creating an event, create only a single event per producer, i.e. all products of a producer share the same event. Also include a unit test checking the assumed behavior of CUDA events and streams. --- .../Common/interface/CUDAProduct.h | 4 +- .../Common/interface/CUDAProductBase.h | 7 +- CUDADataFormats/Common/src/CUDAProductBase.cc | 20 +-- .../Common/test/test_CUDAProduct.cc | 14 ++- .../CUDACore/interface/CUDAScopedContext.h | 18 +-- .../CUDACore/src/CUDAScopedContext.cc | 22 +++- HeterogeneousCore/CUDACore/test/BuildFile.xml | 6 +- .../CUDACore/test/testStreamEvent.cu | 117 ++++++++++++++++++ .../CUDACore/test/test_CUDAScopedContext.cc | 17 ++- .../plugins/SiPixelRecHitHeterogeneous.cc | 8 +- .../ClusterTPAssociationHeterogeneous.cc | 4 +- 11 files changed, 194 insertions(+), 43 deletions(-) create mode 100644 HeterogeneousCore/CUDACore/test/testStreamEvent.cu diff --git a/CUDADataFormats/Common/interface/CUDAProduct.h b/CUDADataFormats/Common/interface/CUDAProduct.h index ca07a344ba2d5..181024f068c7a 100644 --- a/CUDADataFormats/Common/interface/CUDAProduct.h +++ b/CUDADataFormats/Common/interface/CUDAProduct.h @@ -40,8 +40,8 @@ class CUDAProduct: public CUDAProductBase { friend class CUDAScopedContext; friend class edm::Wrapper>; - explicit CUDAProduct(int device, std::shared_ptr> stream, T data): - CUDAProductBase(device, std::move(stream)), + explicit CUDAProduct(int device, std::shared_ptr> stream, std::shared_ptr event, T data): + CUDAProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {} diff --git a/CUDADataFormats/Common/interface/CUDAProductBase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h index eb6fdae0e5abf..f54b1c0548ef4 100644 --- a/CUDADataFormats/Common/interface/CUDAProductBase.h +++ b/CUDADataFormats/Common/interface/CUDAProductBase.h @@ -14,6 +14,7 @@ class CUDAProductBase { CUDAProductBase() = default; // Needed only for ROOT dictionary generation bool isValid() const { return stream_.get() != nullptr; } + bool isAvailable() const; int device() const { return device_; } @@ -21,11 +22,11 @@ class CUDAProductBase { cuda::stream_t<>& stream() { return *stream_; } const std::shared_ptr>& streamPtr() const { return stream_; } - const cuda::event_t& event() const { return *event_; } - cuda::event_t& event() { return *event_; } + const cuda::event_t *event() const { return event_.get(); } + cuda::event_t *event() { return event_.get(); } protected: - explicit CUDAProductBase(int device, std::shared_ptr> stream); + explicit CUDAProductBase(int device, std::shared_ptr> stream, std::shared_ptr event); private: // The cuda::stream_t is really shared among edm::Event products, so diff --git a/CUDADataFormats/Common/src/CUDAProductBase.cc b/CUDADataFormats/Common/src/CUDAProductBase.cc index c034b4f7295f8..331c4514eb7f7 100644 --- a/CUDADataFormats/Common/src/CUDAProductBase.cc +++ b/CUDADataFormats/Common/src/CUDAProductBase.cc @@ -3,17 +3,17 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -CUDAProductBase::CUDAProductBase(int device, std::shared_ptr> stream): +CUDAProductBase::CUDAProductBase(int device, std::shared_ptr> stream, std::shared_ptr event): stream_(std::move(stream)), + event_(std::move(event)), device_(device) -{ - edm::Service cs; - event_ = cs->getCUDAEvent(); +{} - // Record CUDA event to the CUDA stream. The event will become - // "occurred" after all work queued to the stream before this - // point has been finished. - event_->record(stream_->id()); +bool CUDAProductBase::isAvailable() const { + // In absence of event, the product was available already at the end + // of produce() of the producer. + if(not event_) { + return true; + } + return event_->has_occurred(); } - - diff --git a/CUDADataFormats/Common/test/test_CUDAProduct.cc b/CUDADataFormats/Common/test/test_CUDAProduct.cc index bd5ddf7f512fe..308bfe27b29db 100644 --- a/CUDADataFormats/Common/test/test_CUDAProduct.cc +++ b/CUDADataFormats/Common/test/test_CUDAProduct.cc @@ -11,9 +11,15 @@ namespace cudatest { class TestCUDAScopedContext { public: static - CUDAScopedContext make(int dev) { + CUDAScopedContext make(int dev, bool createEvent) { auto device = cuda::device::get(dev); - return CUDAScopedContext(dev, std::make_unique>(device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream))); + std::unique_ptr event; + if(createEvent) { + event = std::make_unique(device.create_event()); + } + return CUDAScopedContext(dev, + std::make_unique>(device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream)), + std::move(event)); } }; } @@ -30,7 +36,7 @@ TEST_CASE("Use of CUDAProduct template", "[CUDACore]") { constexpr int defaultDevice = 0; { - auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice); + auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice, true); std::unique_ptr> dataPtr = ctx.wrap(10); auto& data = *dataPtr; @@ -38,7 +44,7 @@ TEST_CASE("Use of CUDAProduct template", "[CUDACore]") { REQUIRE(data.isValid()); REQUIRE(data.device() == defaultDevice); REQUIRE(data.stream().id() == ctx.stream().id()); - REQUIRE(&data.event() != nullptr); + REQUIRE(data.event() != nullptr); } SECTION("Move constructor") { diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h index ef87d017373f8..ca6af7ef40d0c 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -68,7 +68,7 @@ class CUDAScopedContext { template const T& get(const CUDAProduct& data) { - synchronizeStreams(data.device(), data.stream(), data.event()); + synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); return data.data_; } @@ -78,32 +78,36 @@ class CUDAScopedContext { } template - std::unique_ptr > wrap(T data) const { + std::unique_ptr > wrap(T data) { + createEventIfStreamBusy(); // make_unique doesn't work because of private constructor // // CUDAProduct constructor records CUDA event to the CUDA // stream. The event will become "occurred" after all work queued // to the stream before this point has been finished. - return std::unique_ptr >(new CUDAProduct(device(), streamPtr(), std::move(data))); + return std::unique_ptr >(new CUDAProduct(device(), streamPtr(), event_, std::move(data))); } template - auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) const { - return iEvent.emplace(token, device(), streamPtr(), std::forward(args)...); + auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) { + createEventIfStreamBusy(); + return iEvent.emplace(token, device(), streamPtr(), event_, std::forward(args)...); } private: friend class cudatest::TestCUDAScopedContext; // This construcor is only meant for testing - explicit CUDAScopedContext(int device, std::unique_ptr> stream); + explicit CUDAScopedContext(int device, std::unique_ptr> stream, std::unique_ptr event); - void synchronizeStreams(int dataDevice, const cuda::stream_t<>& dataStream, const cuda::event_t& dataEvent); + void createEventIfStreamBusy(); + void synchronizeStreams(int dataDevice, const cuda::stream_t<>& dataStream, bool available, const cuda::event_t *dataEvent); int currentDevice_; std::optional waitingTaskHolder_; cuda::device::current::scoped_override_t<> setDeviceForThisScope_; std::shared_ptr> stream_; + std::shared_ptr event_; }; #endif diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc index a29fbee36865f..f46ee660b448d 100644 --- a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc @@ -16,13 +16,17 @@ CUDAScopedContext::CUDAScopedContext(edm::StreamID streamID): stream_ = cs->getCUDAStream(); } -CUDAScopedContext::CUDAScopedContext(int device, std::unique_ptr> stream): +CUDAScopedContext::CUDAScopedContext(int device, std::unique_ptr> stream, std::unique_ptr event): currentDevice_(device), setDeviceForThisScope_(device), - stream_(std::move(stream)) + stream_(std::move(stream)), + event_(std::move(event)) {} CUDAScopedContext::~CUDAScopedContext() { + if(event_) { + event_->record(stream_->id()); + } if(waitingTaskHolder_.has_value()) { stream_->enqueue.callback([device=currentDevice_, waitingTaskHolder=*waitingTaskHolder_] @@ -45,7 +49,15 @@ CUDAScopedContext::~CUDAScopedContext() { } } -void CUDAScopedContext::synchronizeStreams(int dataDevice, const cuda::stream_t<>& dataStream, const cuda::event_t& dataEvent) { +void CUDAScopedContext::createEventIfStreamBusy() { + if(event_ or stream_->is_clear()) { + return; + } + edm::Service cs; + event_ = cs->getCUDAEvent(); +} + +void CUDAScopedContext::synchronizeStreams(int dataDevice, const cuda::stream_t<>& dataStream, bool available, const cuda::event_t *dataEvent) { if(dataDevice != currentDevice_) { // Eventually replace with prefetch to current device (assuming unified memory works) // If we won't go to unified memory, need to figure out something else... @@ -54,13 +66,13 @@ void CUDAScopedContext::synchronizeStreams(int dataDevice, const cuda::stream_t< if(dataStream.id() != stream_->id()) { // Different streams, need to synchronize - if(!dataEvent.has_occurred()) { + if(not available and not dataEvent->has_occurred()) { // Event not yet occurred, so need to add synchronization // here. Sychronization is done by making the CUDA stream to // wait for an event, so all subsequent work in the stream // will run only after the event has "occurred" (i.e. data // product became available). - auto ret = cudaStreamWaitEvent(stream_->id(), dataEvent.id(), 0); + auto ret = cudaStreamWaitEvent(stream_->id(), dataEvent->id(), 0); cuda::throw_if_error(ret, "Failed to make a stream to wait for an event"); } } diff --git a/HeterogeneousCore/CUDACore/test/BuildFile.xml b/HeterogeneousCore/CUDACore/test/BuildFile.xml index cd2c3b094243c..d4bcf721b4af5 100644 --- a/HeterogeneousCore/CUDACore/test/BuildFile.xml +++ b/HeterogeneousCore/CUDACore/test/BuildFile.xml @@ -1,6 +1,10 @@ - + + + + + diff --git a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu new file mode 100644 index 0000000000000..ba02dfef958a3 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu @@ -0,0 +1,117 @@ +/** + * The purpose of this test program is to ensure that the logic for + * CUDA event use in CUDAProduct and CUDAScopedContext + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" + +namespace { + constexpr int ARRAY_SIZE = 20000000; + constexpr int NLOOPS = 10; +} + +__global__ void kernel_looping(float* point, unsigned int num) { + unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; + + for(int iloop=0; iloop(j); + } + + cudaMemcpyAsync(dev_points1, host_points1, + ARRAY_SIZE * sizeof(float), + cudaMemcpyHostToDevice, stream1); + kernel_looping<<<1, 16, 0, stream1>>>(dev_points1, ARRAY_SIZE); + if(debug) std::cout << "Kernel launched on stream1" << std::endl; + + auto status = cudaStreamQuery(stream1); + if(debug) std::cout << "Stream1 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl; + cudaEventRecord(event1, stream1); + status = cudaEventQuery(event1); + if (debug) std::cout << "Event1 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) << std::endl; + assert(status == cudaErrorNotReady); + + status = cudaStreamQuery(stream2); + if(debug) std::cout << "Stream2 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl; + assert(status == cudaSuccess); + if(debug) { + cudaEventRecord(event2, stream2); + status = cudaEventQuery(event2); + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) << std::endl; + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + status = cudaEventQuery(event2); + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) << std::endl; + } + + cudaStreamWaitEvent(stream2, event1, 0); + if(debug) std::cout << "\nStream2 waiting for event1" << std::endl; + status = cudaStreamQuery(stream2); + if(debug) std::cout << "Stream2 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl; + assert(status == cudaErrorNotReady); + cudaEventRecord(event2, stream2); + status = cudaEventQuery(event2); + if(debug) std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) << std::endl; + assert(status == cudaErrorNotReady); + if(debug) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + status = cudaEventQuery(event2); + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) << std::endl; + } + + status = cudaStreamQuery(stream1); + if(debug) { + std::cout << "\nStream1 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl; + std::cout << "Synchronizing stream1" << std::endl; + } + assert(status == cudaErrorNotReady); + cudaStreamSynchronize(stream1); + if(debug) std::cout << "Synchronized stream1" << std::endl; + + status = cudaEventQuery(event1); + if(debug) std::cout << "Event1 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) << std::endl; + assert(status == cudaSuccess); + status = cudaEventQuery(event2); + if(debug) std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) << std::endl; + assert(status == cudaSuccess); + + cudaFree(dev_points1); + cudaFreeHost(host_points1); + cudaStreamDestroy(stream1); + cudaStreamDestroy(stream2); + cudaEventDestroy(event1); + cudaEventDestroy(event2); + + return 0; +} diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index eda2b94f5dfb4..6ac03b35ce5f1 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -11,16 +11,22 @@ namespace cudatest { class TestCUDAScopedContext { public: static - CUDAScopedContext make(int dev) { + CUDAScopedContext make(int dev, bool createEvent) { auto device = cuda::device::get(dev); - return CUDAScopedContext(dev, std::make_unique>(device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream))); + std::unique_ptr event; + if(createEvent) { + event = std::make_unique(device.create_event()); + } + return CUDAScopedContext(dev, + std::make_unique>(device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream)), + std::move(event)); } }; } namespace { std::unique_ptr > produce(int device, int *d, int *h) { - auto ctx = cudatest::TestCUDAScopedContext::make(device); + auto ctx = cudatest::TestCUDAScopedContext::make(device, true); cuda::memory::async::copy(d, h, sizeof(int), ctx.stream().id()); testCUDAScopedContextKernels_single(d, ctx.stream()); @@ -33,7 +39,7 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { constexpr int defaultDevice = 0; { - auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice); + auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice, true); SECTION("Construct from device ID") { REQUIRE(cuda::device::current::get().id() == defaultDevice); @@ -96,7 +102,8 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { auto d_a3 = cuda::memory::device::make_unique(current_device); testCUDAScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx.stream()); ctx.stream().synchronize(); - REQUIRE(wprod2->event().has_occurred()); + REQUIRE(wprod2->isAvailable()); + REQUIRE(wprod2->event()->has_occurred()); h_a1 = 0; h_a2 = 0; diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc index d8e07667f976b..1a2f718f6a645 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc @@ -187,11 +187,11 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i // synchronize explicitly (implementation is from // CUDAScopedContext). In practice these should not be needed // (because of synchronizations upstream), but let's play generic. - if(not hclusters->event().has_occurred()) { - cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hclusters->event().id(), 0)); + if(not hclusters->isAvailable() && hclusters->event()->has_occurred()) { + cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hclusters->event()->id(), 0)); } - if(not hdigis->event().has_occurred()) { - cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hclusters->event().id(), 0)); + if(not hdigis->isAvailable() && hdigis->event()->has_occurred()) { + cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hclusters->event()->id(), 0)); } edm::Handle bsHandle; diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc index e9e271e1e58cc..99e8280a6c394 100644 --- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc @@ -199,8 +199,8 @@ void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousE // synchronize explicitly (implementation is from // CUDAScopedContext). In practice these should not be needed // (because of synchronizations upstream), but let's play generic. - if(not gd->event().has_occurred()) { - cudaCheck(cudaStreamWaitEvent(cudaStream.id(), gd->event().id(), 0)); + if(not gd->isAvailable() and gd->event()->has_occurred()) { + cudaCheck(cudaStreamWaitEvent(cudaStream.id(), gd->event()->id(), 0)); } edm::Handle gh; From 915accc357967b94f19c152dcf17fa00bc73f621 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 18 Mar 2019 22:03:44 +0100 Subject: [PATCH 2/2] Fix intentions of the unit test --- .../CUDACore/test/test_CUDAScopedContext.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index 6ac03b35ce5f1..8b67c26490362 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -81,7 +81,7 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { cuda::device::current::scoped_override_t<> setDeviceForThisScope(defaultDevice); auto current_device = cuda::device::current::get(); - // Mimick a producer on the second CUDA stream + // Mimick a producer on the first CUDA stream int h_a1 = 1; auto d_a1 = cuda::memory::device::make_unique(current_device); auto wprod1 = produce(defaultDevice, d_a1.get(), &h_a1); @@ -96,12 +96,12 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { // Mimick a third producer "joining" the two streams CUDAScopedContext ctx2{*wprod1}; - auto prod1 = ctx.get(*wprod1); - auto prod2 = ctx.get(*wprod2); + auto prod1 = ctx2.get(*wprod1); + auto prod2 = ctx2.get(*wprod2); auto d_a3 = cuda::memory::device::make_unique(current_device); - testCUDAScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx.stream()); - ctx.stream().synchronize(); + testCUDAScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx2.stream()); + ctx2.stream().synchronize(); REQUIRE(wprod2->isAvailable()); REQUIRE(wprod2->event()->has_occurred());