From e7659d22c95d4a3a57c7175d628aae5dc1603b99 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 9 Apr 2019 18:09:46 +0200 Subject: [PATCH 1/6] Add perfect forwarding overload for CUDAProduct constructor --- CUDADataFormats/Common/interface/CUDAProduct.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CUDADataFormats/Common/interface/CUDAProduct.h b/CUDADataFormats/Common/interface/CUDAProduct.h index 181024f068c7a..fb90496f33661 100644 --- a/CUDADataFormats/Common/interface/CUDAProduct.h +++ b/CUDADataFormats/Common/interface/CUDAProduct.h @@ -45,6 +45,12 @@ class CUDAProduct: public CUDAProductBase { data_(std::move(data)) {} + template + explicit CUDAProduct(int device, std::shared_ptr> stream, std::shared_ptr event, Args&&... args): + CUDAProductBase(device, std::move(stream), std::move(event)), + data_(std::forward(args)...) + {} + T data_; //! }; From fc23822606fbdbb6b2d72d527897435a4ebf8253 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 5 Apr 2019 17:33:15 +0200 Subject: [PATCH 2/6] Move BeamSpot transfer to GPU to its own producer --- CUDADataFormats/BeamSpot/BuildFile.xml | 8 +++ .../BeamSpot/interface/BeamSpotCUDA.h | 33 ++++++++++ CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc | 12 ++++ CUDADataFormats/BeamSpot/src/classes.h | 8 +++ CUDADataFormats/BeamSpot/src/classes_def.xml | 4 ++ .../python/Reconstruction_cff.py | 4 +- .../SiPixelRecHits/plugins/BuildFile.xml | 1 + .../SiPixelRecHits/plugins/PixelRecHits.cu | 7 +- .../SiPixelRecHits/plugins/PixelRecHits.h | 3 +- .../plugins/SiPixelRecHitHeterogeneous.cc | 21 +++--- .../SiPixelRecHits/plugins/gpuPixelRecHits.h | 9 +-- .../siPixelRecHitsHeterogeneousProduct.h | 1 - .../plugins/BeamSpotToCUDA.cc | 65 +++++++++++++++++++ .../BeamSpotProducer/plugins/BuildFile.xml | 8 +++ .../BeamSpotProducer/python/BeamSpot_cff.py | 7 ++ 15 files changed, 167 insertions(+), 24 deletions(-) create mode 100644 CUDADataFormats/BeamSpot/BuildFile.xml create mode 100644 CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h create mode 100644 CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc create mode 100644 CUDADataFormats/BeamSpot/src/classes.h create mode 100644 CUDADataFormats/BeamSpot/src/classes_def.xml create mode 100644 RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc diff --git a/CUDADataFormats/BeamSpot/BuildFile.xml b/CUDADataFormats/BeamSpot/BuildFile.xml new file mode 100644 index 0000000000000..b4a05240b567d --- /dev/null +++ b/CUDADataFormats/BeamSpot/BuildFile.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h b/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h new file mode 100644 index 0000000000000..5dfc646fcad49 --- /dev/null +++ b/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h @@ -0,0 +1,33 @@ +#ifndef CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h +#define CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + +#include + +class BeamSpotCUDA { +public: + // alignas(128) doesn't really make sense as there is only one + // beamspot per event? + struct Data { + float x,y,z; // position + // TODO: add covariance matrix + + float sigmaZ; + float beamWidthX, beamWidthY; + float dxdz, dydz; + float emittanceX, emittanceY; + float betaStar; + }; + + BeamSpotCUDA() = default; + BeamSpotCUDA(cudautils::host::unique_ptr data_h, cuda::stream_t<>& stream); + + Data const* data() const { return data_d_.get(); } + +private: + cudautils::device::unique_ptr data_d_; +}; + +#endif diff --git a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc new file mode 100644 index 0000000000000..487f506297cd1 --- /dev/null +++ b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc @@ -0,0 +1,12 @@ +#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" + +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" + +BeamSpotCUDA::BeamSpotCUDA(cudautils::host::unique_ptr data_h, cuda::stream_t<>& stream) { + edm::Service cs; + + data_d_ = cs->make_device_unique(stream); + cudautils::copyAsync(data_d_, data_h, stream); +} diff --git a/CUDADataFormats/BeamSpot/src/classes.h b/CUDADataFormats/BeamSpot/src/classes.h new file mode 100644 index 0000000000000..62f990c0ba3b3 --- /dev/null +++ b/CUDADataFormats/BeamSpot/src/classes.h @@ -0,0 +1,8 @@ +#ifndef CUDADataFormats_BeamSpot_classes_h +#define CUDADataFormats_BeamSpot_classes_h + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif diff --git a/CUDADataFormats/BeamSpot/src/classes_def.xml b/CUDADataFormats/BeamSpot/src/classes_def.xml new file mode 100644 index 0000000000000..29a0eafa04005 --- /dev/null +++ b/CUDADataFormats/BeamSpot/src/classes_def.xml @@ -0,0 +1,4 @@ + + + + diff --git a/Configuration/StandardSequences/python/Reconstruction_cff.py b/Configuration/StandardSequences/python/Reconstruction_cff.py index 36ac26b2197c6..8bb2e5cb2afcb 100644 --- a/Configuration/StandardSequences/python/Reconstruction_cff.py +++ b/Configuration/StandardSequences/python/Reconstruction_cff.py @@ -198,9 +198,9 @@ reconstruction_trackingOnly = cms.Sequence(localreco*globalreco_tracking) reconstruction_pixelTrackingOnly = cms.Sequence( pixeltrackerlocalreco* - offlineBeamSpot* siPixelClusterShapeCachePreSplitting* - recopixelvertexing + recopixelvertexing, + offlineBeamSpotTask ) #need a fully expanded sequence copy diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml index a8af0c8a7c4f9..27ee3af86e102 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml +++ b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml @@ -1,3 +1,4 @@ + diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index 80be13dedd26b..6ac70fce95a88 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -36,7 +36,6 @@ namespace pixelgpudetails { constexpr auto MAX_HITS = siPixelRecHitsHeterogeneousProduct::maxHits(); - cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float))); cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t))); // Coalesce all 32bit and 16bit arrays to two big blobs @@ -111,7 +110,6 @@ namespace pixelgpudetails { #endif } PixelRecHitGPUKernel::~PixelRecHitGPUKernel() { - cudaCheck(cudaFree(gpu_.bs_d)); cudaCheck(cudaFree(gpu_.hitsLayerStart_d)); cudaCheck(cudaFree(gpu_.owner_32bit_)); cudaCheck(cudaFree(gpu_.owner_16bit_)); @@ -131,11 +129,10 @@ namespace pixelgpudetails { void PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, SiPixelClustersCUDA const& clusters_d, - float const * bs, + BeamSpotCUDA const& bs_d, pixelCPEforGPU::ParamsOnGPU const * cpeParams, bool transferToCPU, cuda::stream_t<>& stream) { - cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id())); gpu_.hitsModuleStart_d = clusters_d.clusModuleStart(); gpu_.cpeParams = cpeParams; // copy it for use in clients cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); @@ -148,7 +145,7 @@ namespace pixelgpudetails { #endif gpuPixelRecHits::getHits<<>>( cpeParams, - gpu_.bs_d, + bs_d.data(), digis_d.moduleInd(), digis_d.xx(), digis_d.yy(), digis_d.adc(), clusters_d.moduleStart(), diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h index 49164d24ab335..8e5599c239789 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h @@ -1,6 +1,7 @@ #ifndef RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h #define RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h +#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h" @@ -34,7 +35,7 @@ namespace pixelgpudetails { void makeHitsAsync(SiPixelDigisCUDA const& digis_d, SiPixelClustersCUDA const& clusters_d, - float const * bs, + BeamSpotCUDA const& bs_d, pixelCPEforGPU::ParamsOnGPU const * cpeParams, bool transferToCPU, cuda::stream_t<>& stream); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc index e26ec84ddf5b5..8786747ef7a50 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc @@ -1,9 +1,9 @@ #include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" #include "DataFormats/Common/interface/Handle.h" -#include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" #include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" #include "FWCore/Framework/interface/ESHandle.h" @@ -64,8 +64,8 @@ class SiPixelRecHitHeterogeneous: public HeterogeneousEDProducer& inputhandle, SiPixelRecHitCollectionNew &output, const pixelgpudetails::HitsOnCPU& hoc) const; - edm::EDGetTokenT tBeamSpot; // The mess with inputs will be cleaned up when migrating to the new framework + edm::EDGetTokenT> tBeamSpot; edm::EDGetTokenT> token_; edm::EDGetTokenT> tokenDigi_; edm::EDGetTokenT clusterToken_; @@ -82,7 +82,7 @@ class SiPixelRecHitHeterogeneous: public HeterogeneousEDProducer(iConfig.getParameter("beamSpot"))), + tBeamSpot(consumes>(iConfig.getParameter("beamSpot"))), token_(consumes>(iConfig.getParameter("heterogeneousSrc"))), tokenDigi_(consumes>(iConfig.getParameter("heterogeneousSrc"))), cpeName_(iConfig.getParameter("CPE")) @@ -100,7 +100,7 @@ SiPixelRecHitHeterogeneous::SiPixelRecHitHeterogeneous(const edm::ParameterSet& void SiPixelRecHitHeterogeneous::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; - desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); + desc.add("beamSpot", edm::InputTag("offlineBeamSpotCUDA")); desc.add("heterogeneousSrc", edm::InputTag("siPixelClustersCUDAPreSplitting")); desc.add("src", edm::InputTag("siPixelClustersPreSplitting")); desc.add("CPE", "PixelCPEFast"); @@ -183,6 +183,10 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i iEvent.getByToken(tokenDigi_, hdigis); auto const& digis = ctx.get(*hdigis); + edm::Handle> hbs; + iEvent.getByToken(tBeamSpot, hbs); + auto const& bs = ctx.get(*hbs); + // We're processing in a stream given by base class, so need to // synchronize explicitly (implementation is from // CUDAScopedContext). In practice these should not be needed @@ -193,13 +197,8 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i if(not hdigis->isAvailable() && hdigis->event()->has_occurred()) { cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hclusters->event()->id(), 0)); } - - edm::Handle bsHandle; - iEvent.getByToken( tBeamSpot, bsHandle); - float bs[3] = {0.f}; - if(bsHandle.isValid()) { - const auto & bsh = *bsHandle; - bs[0]=bsh.x0(); bs[1]=bsh.y0(); bs[2]=bsh.z0(); + if(not hbs->isAvailable() && hbs->event()->has_occurred()) { + cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hbs->event()->id(), 0)); } gpuAlgo_->makeHitsAsync(digis, clusters, bs, fcpe->getGPUProductAsync(cudaStream), enableTransfer_, cudaStream); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index 6864a046bf1dc..cbd354e71143e 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -5,6 +5,7 @@ #include #include +#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" @@ -15,7 +16,7 @@ namespace gpuPixelRecHits { __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const * __restrict__ cpeParams, - float const * __restrict__ bs, + BeamSpotCUDA::Data const * __restrict__ bs, uint16_t const * __restrict__ id, uint16_t const * __restrict__ x, uint16_t const * __restrict__ y, @@ -143,9 +144,9 @@ namespace gpuPixelRecHits { // to global and compute phi... cpeParams->detParams(me).frame.toGlobal(xl[h],yl[h], xg[h],yg[h],zg[h]); // here correct for the beamspot... - xg[h]-=bs[0]; - yg[h]-=bs[1]; - zg[h]-=bs[2]; + xg[h]-=bs->x; + yg[h]-=bs->y; + zg[h]-=bs->z; rg[h] = std::sqrt(xg[h]*xg[h]+yg[h]*yg[h]); iph[h] = unsafe_atan2s<7>(yg[h],xg[h]); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index ea6eaf8458dde..5bed073c01c3a 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -22,7 +22,6 @@ namespace siPixelRecHitsHeterogeneousProduct { struct HitsOnGPU{ pixelCPEforGPU::ParamsOnGPU const * cpeParams = nullptr; // forwarded from setup, NOT owned - float * bs_d; const uint32_t * hitsModuleStart_d; // forwarded from clusters uint32_t * hitsLayerStart_d; int32_t * charge_d; diff --git a/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc b/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc new file mode 100644 index 0000000000000..68c1a19eb3e71 --- /dev/null +++ b/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc @@ -0,0 +1,65 @@ +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" +#include "DataFormats/BeamSpot/interface/BeamSpot.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + + +class BeamSpotToCUDA: public edm::global::EDProducer<> { +public: + explicit BeamSpotToCUDA(const edm::ParameterSet& iConfig); + ~BeamSpotToCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + +private: + edm::EDGetTokenT bsGetToken_; + edm::EDPutTokenT> bsPutToken_; +}; + +BeamSpotToCUDA::BeamSpotToCUDA(const edm::ParameterSet& iConfig): + bsGetToken_{consumes(iConfig.getParameter("src"))}, + bsPutToken_{produces>()} +{} + +void BeamSpotToCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag("offlineBeamSpot")); + descriptions.addWithDefaultLabel(desc); +} + +void BeamSpotToCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + CUDAScopedContext ctx{streamID}; + + const reco::BeamSpot& bs = iEvent.get(bsGetToken_); + + edm::Service cs; + auto bsHost = cs->make_host_unique(ctx.stream()); + bsHost->x = bs.x0(); + bsHost->y = bs.y0(); + bsHost->z = bs.z0(); + + bsHost->sigmaZ = bs.sigmaZ(); + bsHost->beamWidthX = bs.BeamWidthX(); + bsHost->beamWidthY = bs.BeamWidthY(); + bsHost->dxdz = bs.dxdz(); + bsHost->dydz = bs.dydz(); + bsHost->emittanceX = bs.emittanceX(); + bsHost->emittanceY = bs.emittanceY(); + bsHost->betaStar = bs.betaStar(); + + ctx.emplace(iEvent, bsPutToken_, std::move(bsHost), ctx.stream()); +} + +DEFINE_FWK_MODULE(BeamSpotToCUDA); + diff --git a/RecoVertex/BeamSpotProducer/plugins/BuildFile.xml b/RecoVertex/BeamSpotProducer/plugins/BuildFile.xml index bb4502c33b332..e3ce85df9a81a 100644 --- a/RecoVertex/BeamSpotProducer/plugins/BuildFile.xml +++ b/RecoVertex/BeamSpotProducer/plugins/BuildFile.xml @@ -14,6 +14,14 @@ + + + + + + + + diff --git a/RecoVertex/BeamSpotProducer/python/BeamSpot_cff.py b/RecoVertex/BeamSpotProducer/python/BeamSpot_cff.py index deb62255199e5..9654f9ab410b8 100644 --- a/RecoVertex/BeamSpotProducer/python/BeamSpot_cff.py +++ b/RecoVertex/BeamSpotProducer/python/BeamSpot_cff.py @@ -1,4 +1,11 @@ import FWCore.ParameterSet.Config as cms from RecoVertex.BeamSpotProducer.BeamSpot_cfi import * +from RecoVertex.BeamSpotProducer.beamSpotToCUDA_cfi import beamSpotToCUDA as _beamSpotToCUDA +offlineBeamSpotCUDA = _beamSpotToCUDA.clone() + +offlineBeamSpotTask = cms.Task( + offlineBeamSpot, + offlineBeamSpotCUDA +) From 6deb682f0e19a9feab995b5d51c8a60a6a719204 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 15 Jan 2019 18:00:18 +0100 Subject: [PATCH 3/6] Add non-cached make_host_unique with the possibility to pass flags to cudaHostAlloc() --- HeterogeneousCore/CUDAUtilities/BuildFile.xml | 1 + .../interface/host_noncached_unique_ptr.h | 65 +++++++++++++++++++ .../CUDAUtilities/test/BuildFile.xml | 2 +- .../test/host_noncached_unique_ptr_t.cpp | 22 +++++++ 4 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h create mode 100644 HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp diff --git a/HeterogeneousCore/CUDAUtilities/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/BuildFile.xml index 289f0208fd5e7..153430997064d 100644 --- a/HeterogeneousCore/CUDAUtilities/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/BuildFile.xml @@ -1,5 +1,6 @@ + diff --git a/HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h new file mode 100644 index 0000000000000..c9f9aff89d975 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h @@ -0,0 +1,65 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_host_noncached_unique_ptr_h +#define HeterogeneousCore_CUDAUtilities_interface_host_noncached_unique_ptr_h + +#include + +#include +#include + +namespace cudautils { + namespace host { + namespace noncached { + namespace impl { + // Additional layer of types to distinguish from host::unique_ptr + class HostDeleter { + public: + void operator()(void *ptr) { + cuda::throw_if_error(cudaFreeHost(ptr)); + } + }; + } + + template + using unique_ptr = std::unique_ptr; + + namespace impl { + template + struct make_host_unique_selector { using non_array = cudautils::host::noncached::unique_ptr; }; + template + struct make_host_unique_selector { using unbounded_array = cudautils::host::noncached::unique_ptr; }; + template + struct make_host_unique_selector { struct bounded_array {}; }; + } + } + } + + /** + * The difference wrt. CUDAService::make_host_unique is that these + * do not cache, so they should not be called per-event. + */ + template + typename host::noncached::impl::make_host_unique_selector::non_array + make_host_noncached_unique(unsigned int flags = cudaHostAllocDefault) { + static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the pinned host memory is not supported"); + void *mem; + cuda::throw_if_error(cudaHostAlloc(&mem, sizeof(T), flags)); + return typename cudautils::host::noncached::impl::make_host_unique_selector::non_array(reinterpret_cast(mem)); + } + + template + typename host::noncached::impl::make_host_unique_selector::unbounded_array + make_host_noncached_unique(size_t n, unsigned int flags = cudaHostAllocDefault) { + using element_type = typename std::remove_extent::type; + static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the pinned host memory is not supported"); + void *mem; + cuda::throw_if_error(cudaHostAlloc(&mem, n*sizeof(element_type), flags)); + return typename cudautils::host::noncached::impl::make_host_unique_selector::unbounded_array(reinterpret_cast(mem)); + } + + template + typename cudautils::host::noncached::impl::make_host_unique_selector::bounded_array + make_host_noncached_unique(Args&&...) = delete; +} + +#endif + diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index 0f48f95a8e4ad..0d06484b72a5e 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -68,7 +68,7 @@ - + diff --git a/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp new file mode 100644 index 0000000000000..ae9e3c9a3849d --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp @@ -0,0 +1,22 @@ +#include "catch.hpp" + +#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" + +TEST_CASE("host_noncached_unique_ptr", "[cudaMemTools]") { + exitSansCUDADevices(); + + SECTION("Single element") { + auto ptr1 = cudautils::make_host_noncached_unique(); + REQUIRE(ptr1 != nullptr); + auto ptr2 = cudautils::make_host_noncached_unique(cudaHostAllocPortable | cudaHostAllocWriteCombined); + REQUIRE(ptr2 != nullptr); + } + + SECTION("Multiple elements") { + auto ptr1 = cudautils::make_host_noncached_unique(10); + REQUIRE(ptr1 != nullptr); + auto ptr2 = cudautils::make_host_noncached_unique(10, cudaHostAllocPortable | cudaHostAllocWriteCombined); + REQUIRE(ptr2 != nullptr); + } +} From 2864b42ffeb5684f7a032be4d96e39e7e79f5e07 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 10 Apr 2019 20:00:49 +0200 Subject: [PATCH 4/6] Use beginStream()-allocated write-combined memory for the BeamSpot transfer --- .../BeamSpot/interface/BeamSpotCUDA.h | 3 +- CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc | 5 ++- .../plugins/BeamSpotToCUDA.cc | 33 ++++++++++++++++--- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h b/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h index 5dfc646fcad49..70c3f1b3f9d11 100644 --- a/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h +++ b/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h @@ -2,7 +2,6 @@ #define CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include @@ -22,7 +21,7 @@ class BeamSpotCUDA { }; BeamSpotCUDA() = default; - BeamSpotCUDA(cudautils::host::unique_ptr data_h, cuda::stream_t<>& stream); + BeamSpotCUDA(Data const* data_h, cuda::stream_t<>& stream); Data const* data() const { return data_d_.get(); } diff --git a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc index 487f506297cd1..2714df51d2456 100644 --- a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc +++ b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc @@ -2,11 +2,10 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -BeamSpotCUDA::BeamSpotCUDA(cudautils::host::unique_ptr data_h, cuda::stream_t<>& stream) { +BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cuda::stream_t<>& stream) { edm::Service cs; data_d_ = cs->make_device_unique(stream); - cudautils::copyAsync(data_d_, data_h, stream); + cuda::memory::async::copy(data_d_.get(), data_h, sizeof(Data), stream.id()); } diff --git a/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc b/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc index 68c1a19eb3e71..9b15f08b8dfc2 100644 --- a/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc +++ b/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc @@ -10,16 +10,39 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h" +#include -class BeamSpotToCUDA: public edm::global::EDProducer<> { +namespace { + class BSHost { + public: + BSHost(): + bs{cudautils::make_host_noncached_unique(cudaHostAllocWriteCombined)} + {} + BeamSpotCUDA::Data *get() { return bs.get(); } + + private: + cudautils::host::noncached::unique_ptr bs; + }; +} + +class BeamSpotToCUDA: public edm::global::EDProducer > { public: explicit BeamSpotToCUDA(const edm::ParameterSet& iConfig); ~BeamSpotToCUDA() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + std::unique_ptr beginStream(edm::StreamID) const { + edm::Service cs; + if(cs->enabled()) { + return std::make_unique(); + } + else { + return nullptr; + } + } void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; private: @@ -43,8 +66,8 @@ void BeamSpotToCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const e const reco::BeamSpot& bs = iEvent.get(bsGetToken_); - edm::Service cs; - auto bsHost = cs->make_host_unique(ctx.stream()); + BeamSpotCUDA::Data *bsHost = streamCache(streamID)->get(); + bsHost->x = bs.x0(); bsHost->y = bs.y0(); bsHost->z = bs.z0(); @@ -58,7 +81,7 @@ void BeamSpotToCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const e bsHost->emittanceY = bs.emittanceY(); bsHost->betaStar = bs.betaStar(); - ctx.emplace(iEvent, bsPutToken_, std::move(bsHost), ctx.stream()); + ctx.emplace(iEvent, bsPutToken_, bsHost, ctx.stream()); } DEFINE_FWK_MODULE(BeamSpotToCUDA); From 0135dbf43632988e4beeda22d8403036c8ac1955 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 15 Jan 2019 23:25:29 +0100 Subject: [PATCH 5/6] Move raw data pinned host buffers to be allocated in constructor with cudaHostAllocWriteCombined --- .../plugins/SiPixelRawToClusterCUDA.cc | 13 ++++++++++--- .../plugins/SiPixelRawToClusterGPUKernel.cu | 7 +++---- .../plugins/SiPixelRawToClusterGPUKernel.h | 7 ++++--- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc index b23faad9e78d3..f2dacd5fbc415 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc @@ -24,7 +24,9 @@ #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ServiceRegistry/interface/Service.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h" #include "RecoTracker/Record/interface/CkfComponentsRecord.h" @@ -62,6 +64,7 @@ class SiPixelRawToClusterCUDA: public edm::stream::EDProducer std::unique_ptr regions_; pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_; + std::unique_ptr wordFedAppender_; PixelDataFormatter::Errors errors_; const bool includeErrors_; @@ -88,6 +91,11 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfi } if(usePilotBlade_) edm::LogInfo("SiPixelRawToCluster") << " Use pilot blade data (FED 40)"; + + edm::Service cs; + if(cs->enabled()) { + wordFedAppender_ = std::make_unique(); + } } void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -161,7 +169,6 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData() ErrorChecker errorcheck; - auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(ctx.stream()); for(int fedId: fedIds_) { if (!usePilotBlade_ && (fedId==40) ) continue; // skip pilot blade data if (regions_ && !regions_->mayUnpackFED(fedId)) continue; @@ -209,13 +216,13 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event const cms_uint32_t * ew = (const cms_uint32_t *)(trailer); assert(0 == (ew-bw)%2); - wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw)); + wordFedAppender_->initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw)); wordCounterGPU+=(ew-bw); } // end of for loop gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack, gpuGains, - wordFedAppender, + *wordFedAppender_, std::move(errors_), wordCounterGPU, fedCounter, useQuality_, includeErrors_, diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 3d4e377eb8221..8fdb2ed8c90d5 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -45,10 +45,9 @@ namespace pixelgpudetails { // number of words for all the FEDs constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; - SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(cuda::stream_t<>& cudaStream) { - edm::Service cs; - word_ = cs->make_host_unique(MAX_FED_WORDS, cudaStream); - fedId_ = cs->make_host_unique(MAX_FED_WORDS, cudaStream); + SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender() { + word_ = cudautils::make_host_noncached_unique(MAX_FED_WORDS, cudaHostAllocWriteCombined); + fedId_ = cudautils::make_host_noncached_unique(MAX_FED_WORDS, cudaHostAllocWriteCombined); } void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) { diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index a0f89dc241c64..0d2b6a8c7fc65 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -11,6 +11,7 @@ #include "FWCore/Utilities/interface/typedefs.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h" #include "DataFormats/SiPixelDigi/interface/PixelErrors.h" struct SiPixelFedCablingMapGPU; @@ -159,7 +160,7 @@ namespace pixelgpudetails { public: class WordFedAppender { public: - WordFedAppender(cuda::stream_t<>& cudaStream); + WordFedAppender(); ~WordFedAppender() = default; void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length); @@ -168,8 +169,8 @@ namespace pixelgpudetails { const unsigned char *fedId() const { return fedId_.get(); } private: - cudautils::host::unique_ptr word_; - cudautils::host::unique_ptr fedId_; + cudautils::host::noncached::unique_ptr word_; + cudautils::host::noncached::unique_ptr fedId_; }; SiPixelRawToClusterGPUKernel() = default; From c31e4cb074d1b7149ff7f7b176d8189c225b7e9e Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 11 Apr 2019 18:21:18 +0200 Subject: [PATCH 6/6] Fix the CUDA product availability logic in rechit --- .../SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc index 8786747ef7a50..be4ca76f23ad5 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc @@ -191,13 +191,13 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i // synchronize explicitly (implementation is from // CUDAScopedContext). In practice these should not be needed // (because of synchronizations upstream), but let's play generic. - if(not hclusters->isAvailable() && hclusters->event()->has_occurred()) { + if(not hclusters->isAvailable() and not hclusters->event()->has_occurred()) { cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hclusters->event()->id(), 0)); } - if(not hdigis->isAvailable() && hdigis->event()->has_occurred()) { + if(not hdigis->isAvailable() and not hdigis->event()->has_occurred()) { cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hclusters->event()->id(), 0)); } - if(not hbs->isAvailable() && hbs->event()->has_occurred()) { + if(not hbs->isAvailable() and not hbs->event()->has_occurred()) { cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hbs->event()->id(), 0)); }