From eae6b1c3559068f3e094095d869d55ed83b85723 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 23 Apr 2019 08:18:43 -0500 Subject: [PATCH] Move BeamSpot transfer to GPU to its own producer (cms-patatrack#318) Implement a non-caching host allocator, useful for host-to-device copy buffers: - not bound to any CUDA stream to allow use in EDM beginStream(); - with the possibility to pass flags to cudaHostAlloc(), e.g. cudaHostAllocWriteCombined. Add perfect forwarding overload for CUDAProduct constructor, enabling the use of CUDAScopedContext::emplace() in BeamSpotToCUDA::produce(). Move the BeamSpot host-to-device transfer to its own EDProducer, making use of beginStream()-allocated write-combined memory for the transfer. --- .../plugins/SiPixelRawToClusterCUDA.cc | 13 ++++++++++--- .../plugins/SiPixelRawToClusterGPUKernel.cu | 7 +++---- .../plugins/SiPixelRawToClusterGPUKernel.h | 7 ++++--- .../SiPixelRecHits/plugins/BuildFile.xml | 1 + .../SiPixelRecHits/plugins/PixelRecHits.cu | 7 ++----- .../SiPixelRecHits/plugins/PixelRecHits.h | 3 ++- .../SiPixelRecHits/plugins/gpuPixelRecHits.h | 9 +++++---- 7 files changed, 27 insertions(+), 20 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc index b23faad9e78d3..f2dacd5fbc415 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc @@ -24,7 +24,9 @@ #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ServiceRegistry/interface/Service.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h" #include "RecoTracker/Record/interface/CkfComponentsRecord.h" @@ -62,6 +64,7 @@ class SiPixelRawToClusterCUDA: public edm::stream::EDProducer std::unique_ptr regions_; pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_; + std::unique_ptr wordFedAppender_; PixelDataFormatter::Errors errors_; const bool includeErrors_; @@ -88,6 +91,11 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfi } if(usePilotBlade_) edm::LogInfo("SiPixelRawToCluster") << " Use pilot blade data (FED 40)"; + + edm::Service cs; + if(cs->enabled()) { + wordFedAppender_ = std::make_unique(); + } } void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -161,7 +169,6 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData() ErrorChecker errorcheck; - auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(ctx.stream()); for(int fedId: fedIds_) { if (!usePilotBlade_ && (fedId==40) ) continue; // skip pilot blade data if (regions_ && !regions_->mayUnpackFED(fedId)) continue; @@ -209,13 +216,13 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event const cms_uint32_t * ew = (const cms_uint32_t *)(trailer); assert(0 == (ew-bw)%2); - wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw)); + wordFedAppender_->initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw)); wordCounterGPU+=(ew-bw); } // end of for loop gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack, gpuGains, - wordFedAppender, + *wordFedAppender_, std::move(errors_), wordCounterGPU, fedCounter, useQuality_, includeErrors_, diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 3d4e377eb8221..8fdb2ed8c90d5 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -45,10 +45,9 @@ namespace pixelgpudetails { // number of words for all the FEDs constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; - SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(cuda::stream_t<>& cudaStream) { - edm::Service cs; - word_ = cs->make_host_unique(MAX_FED_WORDS, cudaStream); - fedId_ = cs->make_host_unique(MAX_FED_WORDS, cudaStream); + SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender() { + word_ = cudautils::make_host_noncached_unique(MAX_FED_WORDS, cudaHostAllocWriteCombined); + fedId_ = cudautils::make_host_noncached_unique(MAX_FED_WORDS, cudaHostAllocWriteCombined); } void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) { diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index a0f89dc241c64..0d2b6a8c7fc65 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -11,6 +11,7 @@ #include "FWCore/Utilities/interface/typedefs.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h" #include "DataFormats/SiPixelDigi/interface/PixelErrors.h" struct SiPixelFedCablingMapGPU; @@ -159,7 +160,7 @@ namespace pixelgpudetails { public: class WordFedAppender { public: - WordFedAppender(cuda::stream_t<>& cudaStream); + WordFedAppender(); ~WordFedAppender() = default; void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length); @@ -168,8 +169,8 @@ namespace pixelgpudetails { const unsigned char *fedId() const { return fedId_.get(); } private: - cudautils::host::unique_ptr word_; - cudautils::host::unique_ptr fedId_; + cudautils::host::noncached::unique_ptr word_; + cudautils::host::noncached::unique_ptr fedId_; }; SiPixelRawToClusterGPUKernel() = default; diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml index a8af0c8a7c4f9..27ee3af86e102 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml +++ b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml @@ -1,3 +1,4 @@ + diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index 80be13dedd26b..6ac70fce95a88 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -36,7 +36,6 @@ namespace pixelgpudetails { constexpr auto MAX_HITS = siPixelRecHitsHeterogeneousProduct::maxHits(); - cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float))); cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t))); // Coalesce all 32bit and 16bit arrays to two big blobs @@ -111,7 +110,6 @@ namespace pixelgpudetails { #endif } PixelRecHitGPUKernel::~PixelRecHitGPUKernel() { - cudaCheck(cudaFree(gpu_.bs_d)); cudaCheck(cudaFree(gpu_.hitsLayerStart_d)); cudaCheck(cudaFree(gpu_.owner_32bit_)); cudaCheck(cudaFree(gpu_.owner_16bit_)); @@ -131,11 +129,10 @@ namespace pixelgpudetails { void PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, SiPixelClustersCUDA const& clusters_d, - float const * bs, + BeamSpotCUDA const& bs_d, pixelCPEforGPU::ParamsOnGPU const * cpeParams, bool transferToCPU, cuda::stream_t<>& stream) { - cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id())); gpu_.hitsModuleStart_d = clusters_d.clusModuleStart(); gpu_.cpeParams = cpeParams; // copy it for use in clients cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); @@ -148,7 +145,7 @@ namespace pixelgpudetails { #endif gpuPixelRecHits::getHits<<>>( cpeParams, - gpu_.bs_d, + bs_d.data(), digis_d.moduleInd(), digis_d.xx(), digis_d.yy(), digis_d.adc(), clusters_d.moduleStart(), diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h index 49164d24ab335..8e5599c239789 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h @@ -1,6 +1,7 @@ #ifndef RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h #define RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h +#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h" @@ -34,7 +35,7 @@ namespace pixelgpudetails { void makeHitsAsync(SiPixelDigisCUDA const& digis_d, SiPixelClustersCUDA const& clusters_d, - float const * bs, + BeamSpotCUDA const& bs_d, pixelCPEforGPU::ParamsOnGPU const * cpeParams, bool transferToCPU, cuda::stream_t<>& stream); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index 6864a046bf1dc..cbd354e71143e 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -5,6 +5,7 @@ #include #include +#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" @@ -15,7 +16,7 @@ namespace gpuPixelRecHits { __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const * __restrict__ cpeParams, - float const * __restrict__ bs, + BeamSpotCUDA::Data const * __restrict__ bs, uint16_t const * __restrict__ id, uint16_t const * __restrict__ x, uint16_t const * __restrict__ y, @@ -143,9 +144,9 @@ namespace gpuPixelRecHits { // to global and compute phi... cpeParams->detParams(me).frame.toGlobal(xl[h],yl[h], xg[h],yg[h],zg[h]); // here correct for the beamspot... - xg[h]-=bs[0]; - yg[h]-=bs[1]; - zg[h]-=bs[2]; + xg[h]-=bs->x; + yg[h]-=bs->y; + zg[h]-=bs->z; rg[h] = std::sqrt(xg[h]*xg[h]+yg[h]*yg[h]); iph[h] = unsafe_atan2s<7>(yg[h],xg[h]);