From 028882679c5da3783c92da1a0b5dedd41d9f30b0 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 27 Nov 2018 09:26:57 -0600 Subject: [PATCH] Add infrastructure around cub CachingDeviceAllocator, and use it in SiPixelRawToCluster (cms-patatrack#172) Add infrastructure around cub CachingDeviceAllocator for device memory allocations, and CachingHostAllocator for pinned (or managed) host memory. CUDAService uses the CachingHostAllocator to allocate requested GPU->CPU/CPU->GPU buffers and data products. Configuration options can be used to request: - to print all memory (re)allocations and frees; - to preallocate device and host buffers. SiPixelRawToCluster uses the CachingDeviceAllocator for temporary buffers and data products. Fix a memory problem with SiPixelFedCablingMapGPUWrapper::ModulesToUnpack. --- CUDADataFormats/SiPixelCluster/BuildFile.xml | 8 + .../interface/SiPixelClustersCUDA.h | 73 +++++ .../SiPixelCluster/src/SiPixelClustersCUDA.cc | 24 ++ CUDADataFormats/SiPixelDigi/BuildFile.xml | 7 + .../SiPixelDigi/interface/SiPixelDigisCUDA.h | 65 ++++ .../SiPixelDigi/src/SiPixelDigisCUDA.cc | 24 ++ .../SiPixelClusterizer/plugins/BuildFile.xml | 2 + .../plugins/SiPixelRawToClusterGPUKernel.cu | 289 +++++++----------- .../plugins/SiPixelRawToClusterGPUKernel.h | 108 +++---- 9 files changed, 375 insertions(+), 225 deletions(-) create mode 100644 CUDADataFormats/SiPixelCluster/BuildFile.xml create mode 100644 CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h create mode 100644 CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc create mode 100644 CUDADataFormats/SiPixelDigi/BuildFile.xml create mode 100644 CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h create mode 100644 CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc diff --git a/CUDADataFormats/SiPixelCluster/BuildFile.xml b/CUDADataFormats/SiPixelCluster/BuildFile.xml new file mode 100644 index 0000000000000..21c527e7b2f0d --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/BuildFile.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h new file mode 100644 index 0000000000000..22d9ff9d103ba --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -0,0 +1,73 @@ +#ifndef CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h +#define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h + +#include "CUDADataFormats/Common/interface/device_unique_ptr.h" + +#include + +class SiPixelClustersCUDA { +public: + SiPixelClustersCUDA() = default; + explicit SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream); + ~SiPixelClustersCUDA() = default; + + SiPixelClustersCUDA(const SiPixelClustersCUDA&) = delete; + SiPixelClustersCUDA& operator=(const SiPixelClustersCUDA&) = delete; + SiPixelClustersCUDA(SiPixelClustersCUDA&&) = default; + SiPixelClustersCUDA& operator=(SiPixelClustersCUDA&&) = default; + + uint32_t *moduleStart() { return moduleStart_d.get(); } + int32_t *clus() { return clus_d.get(); } + uint32_t *clusInModule() { return clusInModule_d.get(); } + uint32_t *moduleId() { return moduleId_d.get(); } + uint32_t *clusModuleStart() { return clusModuleStart_d.get(); } + + uint32_t const *moduleStart() const { return moduleStart_d.get(); } + int32_t const *clus() const { return clus_d.get(); } + uint32_t const *clusInModule() const { return clusInModule_d.get(); } + uint32_t const *moduleId() const { return moduleId_d.get(); } + uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); } + + uint32_t const *c_moduleStart() const { return moduleStart_d.get(); } + int32_t const *c_clus() const { return clus_d.get(); } + uint32_t const *c_clusInModule() const { return clusInModule_d.get(); } + uint32_t const *c_moduleId() const { return moduleId_d.get(); } + uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); } + + class DeviceConstView { + public: + DeviceConstView() = default; + +#ifdef __CUDACC__ + __device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_+i); } + __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_+i); } + __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_+i); } + __device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_+i); } + __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_+i); } +#endif + + friend SiPixelClustersCUDA; + + private: + uint32_t const *moduleStart_ = nullptr; + int32_t const *clus_ = nullptr; + uint32_t const *clusInModule_ = nullptr; + uint32_t const *moduleId_ = nullptr; + uint32_t const *clusModuleStart_ = nullptr; + }; + + DeviceConstView *view() const { return view_d.get(); } + +private: + edm::cuda::device::unique_ptr moduleStart_d; // index of the first pixel of each module + edm::cuda::device::unique_ptr clus_d; // cluster id of each pixel + edm::cuda::device::unique_ptr clusInModule_d; // number of clusters found in each module + edm::cuda::device::unique_ptr moduleId_d; // module id of each module + + // originally from rechits + edm::cuda::device::unique_ptr clusModuleStart_d; + + edm::cuda::device::unique_ptr view_d; // "me" pointer +}; + +#endif diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc new file mode 100644 index 0000000000000..7363c2fd364af --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc @@ -0,0 +1,24 @@ +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" + +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +SiPixelClustersCUDA::SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream) { + edm::Service cs; + + moduleStart_d = cs->make_device_unique(nelements+1, stream); + clus_d = cs->make_device_unique< int32_t[]>(feds, stream); + clusInModule_d = cs->make_device_unique(nelements, stream); + moduleId_d = cs->make_device_unique(nelements, stream); + clusModuleStart_d = cs->make_device_unique(nelements+1, stream); + + auto view = cs->make_host_unique(stream); + view->moduleStart_ = moduleStart_d.get(); + view->clus_ = clus_d.get(); + view->clusInModule_ = clusInModule_d.get(); + view->moduleId_ = moduleId_d.get(); + view->clusModuleStart_ = clusModuleStart_d.get(); + + view_d = cs->make_device_unique(stream); + cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id()); +} diff --git a/CUDADataFormats/SiPixelDigi/BuildFile.xml b/CUDADataFormats/SiPixelDigi/BuildFile.xml new file mode 100644 index 0000000000000..259aa9f08d054 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/BuildFile.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h new file mode 100644 index 0000000000000..25e8b54a743c2 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h @@ -0,0 +1,65 @@ +#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h +#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h + +#include "CUDADataFormats/Common/interface/device_unique_ptr.h" +#include "FWCore/Utilities/interface/propagate_const.h" + +#include + +class SiPixelDigisCUDA { +public: + SiPixelDigisCUDA() = default; + explicit SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream); + ~SiPixelDigisCUDA() = default; + + SiPixelDigisCUDA(const SiPixelDigisCUDA&) = delete; + SiPixelDigisCUDA& operator=(const SiPixelDigisCUDA&) = delete; + SiPixelDigisCUDA(SiPixelDigisCUDA&&) = default; + SiPixelDigisCUDA& operator=(SiPixelDigisCUDA&&) = default; + + uint16_t * xx() { return xx_d.get(); } + uint16_t * yy() { return yy_d.get(); } + uint16_t * adc() { return adc_d.get(); } + uint16_t * moduleInd() { return moduleInd_d.get(); } + + uint16_t const *xx() const { return xx_d.get(); } + uint16_t const *yy() const { return yy_d.get(); } + uint16_t const *adc() const { return adc_d.get(); } + uint16_t const *moduleInd() const { return moduleInd_d.get(); } + + uint16_t const *c_xx() const { return xx_d.get(); } + uint16_t const *c_yy() const { return yy_d.get(); } + uint16_t const *c_adc() const { return adc_d.get(); } + uint16_t const *c_moduleInd() const { return moduleInd_d.get(); } + + class DeviceConstView { + public: + DeviceConstView() = default; + +#ifdef __CUDACC__ + __device__ __forceinline__ uint16_t xx(int i) const { return __ldg(xx_+i); } + __device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_+i); } + __device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_+i); } + __device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_+i); } +#endif + + friend class SiPixelDigisCUDA; + + private: + uint16_t const *xx_ = nullptr; + uint16_t const *yy_ = nullptr; + uint16_t const *adc_ = nullptr; + uint16_t const *moduleInd_ = nullptr; + }; + + const DeviceConstView *view() const { return view_d.get(); } + +private: + edm::cuda::device::unique_ptr xx_d; // local coordinates of each pixel + edm::cuda::device::unique_ptr yy_d; // + edm::cuda::device::unique_ptr adc_d; // ADC of each pixel + edm::cuda::device::unique_ptr moduleInd_d; // module id of each pixel + edm::cuda::device::unique_ptr view_d; // "me" pointer +}; + +#endif diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc new file mode 100644 index 0000000000000..5ba2e920e9b04 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc @@ -0,0 +1,24 @@ +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" + +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +#include + +SiPixelDigisCUDA::SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream) { + edm::Service cs; + + xx_d = cs->make_device_unique(nelements, stream); + yy_d = cs->make_device_unique(nelements, stream); + adc_d = cs->make_device_unique(nelements, stream); + moduleInd_d = cs->make_device_unique(nelements, stream); + + auto view = cs->make_host_unique(stream); + view->xx_ = xx_d.get(); + view->yy_ = yy_d.get(); + view->adc_ = adc_d.get(); + view->moduleInd_ = moduleInd_d.get(); + + view_d = cs->make_device_unique(stream); + cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id()); +} diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml index 9db4a46f367b3..40a489f763397 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml @@ -7,6 +7,8 @@ + + diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 7bd6eac473cc7..dc768ce8f643d 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -31,6 +31,8 @@ #include // CMSSW includes +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" @@ -48,114 +50,17 @@ namespace pixelgpudetails { // number of words for all the FEDs constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; - constexpr uint32_t MAX_WORD08_SIZE = MAX_FED_WORDS * sizeof(uint8_t); - constexpr uint32_t MAX_WORD32_SIZE = MAX_FED_WORDS * sizeof(uint32_t); - constexpr uint32_t MAX_WORD16_SIZE = MAX_FED_WORDS * sizeof(uint16_t); constexpr uint32_t MAX_ERROR_SIZE = MAX_FED_WORDS * esize; - SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) { - - cudaCheck(cudaMallocHost(&word, MAX_FED_WORDS * sizeof(unsigned int))); - cudaCheck(cudaMallocHost(&fedId_h, MAX_FED_WORDS * sizeof(unsigned char))); - - // to store the output of RawToDigi - cudaCheck(cudaMallocHost(&pdigi_h, MAX_FED_WORDS * sizeof(uint32_t))); - cudaCheck(cudaMallocHost(&rawIdArr_h, MAX_FED_WORDS * sizeof(uint32_t))); - - cudaCheck(cudaMallocHost(&adc_h, MAX_FED_WORDS * sizeof(uint16_t))); - cudaCheck(cudaMallocHost(&clus_h, MAX_FED_WORDS * sizeof(int32_t))); - - cudaCheck(cudaMallocHost(&error_h, vsize)); - cudaCheck(cudaMallocHost(&error_h_tmp, vsize)); - cudaCheck(cudaMallocHost(&data_h, MAX_ERROR_SIZE)); - - cudaCheck(cudaMalloc((void**) & word_d, MAX_WORD32_SIZE)); - cudaCheck(cudaMalloc((void**) & fedId_d, MAX_WORD08_SIZE)); - cudaCheck(cudaMalloc((void**) & pdigi_d, MAX_WORD32_SIZE)); // to store thepacked digi - cudaCheck(cudaMalloc((void**) & xx_d, MAX_WORD16_SIZE)); // to store the x and y coordinate - cudaCheck(cudaMalloc((void**) & yy_d, MAX_WORD16_SIZE)); - cudaCheck(cudaMalloc((void**) & adc_d, MAX_WORD16_SIZE)); - - cudaCheck(cudaMalloc((void**) & moduleInd_d, MAX_WORD16_SIZE)); - cudaCheck(cudaMalloc((void**) & rawIdArr_d, MAX_WORD32_SIZE)); - cudaCheck(cudaMalloc((void**) & error_d, vsize)); - cudaCheck(cudaMalloc((void**) & data_d, MAX_ERROR_SIZE)); - cudaCheck(cudaMemset(data_d, 0x00, MAX_ERROR_SIZE)); - - // for the clusterizer - cudaCheck(cudaMalloc((void**) & clus_d, MAX_WORD32_SIZE)); // cluser index in module - - using namespace gpuClustering; - cudaCheck(cudaMalloc((void**) & moduleStart_d, (MaxNumModules+1)*sizeof(uint32_t) )); - cudaCheck(cudaMalloc((void**) & clusInModule_d,(MaxNumModules)*sizeof(uint32_t) )); - cudaCheck(cudaMalloc((void**) & moduleId_d, (MaxNumModules)*sizeof(uint32_t) )); - - new (error_h) GPU::SimpleVector(MAX_FED_WORDS, data_h); - new (error_h_tmp) GPU::SimpleVector(MAX_FED_WORDS, data_d); - assert(error_h->size() == 0); - assert(error_h->capacity() == static_cast(MAX_FED_WORDS)); - assert(error_h_tmp->size() == 0); - assert(error_h_tmp->capacity() == static_cast(MAX_FED_WORDS)); - - // Need these in pinned memory to be truly asynchronous - cudaCheck(cudaMallocHost(&nModulesActive, sizeof(uint32_t))); - cudaCheck(cudaMallocHost(&nClusters, sizeof(uint32_t))); - - cudaCheck(cudaMalloc((void**) & gpuProduct_d, sizeof(GPUProduct))); - gpuProduct = getProduct(); - assert(xx_d==gpuProduct.xx_d); - - cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id())); - - // originally from rechits - cudaCheck(cudaMalloc((void**) & clusModuleStart_d, (MaxNumModules+1)*sizeof(uint32_t) )); - uint32_t *tmp = nullptr; - cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules)); - cudaCheck(cudaMalloc(&tempScanStorage_d, tempScanStorageSize)); + SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(cuda::stream_t<>& cudaStream) { + edm::Service cs; + word_ = cs->make_host_unique(MAX_FED_WORDS, cudaStream); + fedId_ = cs->make_host_unique(MAX_FED_WORDS, cudaStream); } - SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() { - // free the host memory - cudaCheck(cudaFreeHost(word)); - cudaCheck(cudaFreeHost(fedId_h)); - cudaCheck(cudaFreeHost(pdigi_h)); - cudaCheck(cudaFreeHost(rawIdArr_h)); - cudaCheck(cudaFreeHost(adc_h)); - cudaCheck(cudaFreeHost(clus_h)); - cudaCheck(cudaFreeHost(error_h)); - cudaCheck(cudaFreeHost(error_h_tmp)); - cudaCheck(cudaFreeHost(data_h)); - cudaCheck(cudaFreeHost(nModulesActive)); - cudaCheck(cudaFreeHost(nClusters)); - - // free device memory used for RawToDigi on GPU - // free the GPU memory - cudaCheck(cudaFree(word_d)); - cudaCheck(cudaFree(fedId_d)); - cudaCheck(cudaFree(pdigi_d)); - cudaCheck(cudaFree(xx_d)); - cudaCheck(cudaFree(yy_d)); - cudaCheck(cudaFree(adc_d)); - cudaCheck(cudaFree(moduleInd_d)); - cudaCheck(cudaFree(rawIdArr_d)); - cudaCheck(cudaFree(error_d)); - cudaCheck(cudaFree(data_d)); - - // these are for the clusterizer - cudaCheck(cudaFree(moduleStart_d)); - cudaCheck(cudaFree(clus_d)); - cudaCheck(cudaFree(clusInModule_d)); - cudaCheck(cudaFree(moduleId_d)); - cudaCheck(cudaFree(gpuProduct_d)); - - // originally from rechits - cudaCheck(cudaFree(tempScanStorage_d)); - cudaCheck(cudaFree(clusModuleStart_d)); - } - - void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) { - std::memcpy(word+wordCounterGPU, src, sizeof(cms_uint32_t)*length); - std::memset(fedId_h+wordCounterGPU/2, fedId - 1200, length/2); + void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) { + std::memcpy(word_.get()+wordCounterGPU, src, sizeof(cms_uint32_t)*length); + std::memset(fedId_.get()+wordCounterGPU/2, fedId - 1200, length/2); } //////////////////// @@ -613,6 +518,7 @@ namespace pixelgpudetails { const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, + const WordFedAppender& wordFed, const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, bool useQualityInfo, bool includeErrors, bool transferToCPU, bool debug, @@ -620,52 +526,82 @@ namespace pixelgpudetails { { nDigis = wordCounter; - const int threadsPerBlock = 512; - const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all - - assert(0 == wordCounter%2); - // wordCounter is the total no of words in each event to be trasfered on device - cudaCheck(cudaMemcpyAsync(&word_d[0], &word[0], wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(&fedId_d[0], &fedId_h[0], wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(error_d, error_h_tmp, vsize, cudaMemcpyDefault, stream.id())); - - // Launch rawToDigi kernel - RawToDigi_kernel<<>>( - cablingMap, - modToUnp, - wordCounter, - word_d, - fedId_d, - xx_d, yy_d, adc_d, - pdigi_d, - rawIdArr_d, - moduleInd_d, - error_d, - useQualityInfo, - includeErrors, - debug); - cudaCheck(cudaGetLastError()); - - // copy data to host variable - if(transferToCPU) { - cudaCheck(cudaMemcpyAsync(pdigi_h, pdigi_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(rawIdArr_h, rawIdArr_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - - if (includeErrors) { - cudaCheck(cudaMemcpyAsync(error_h, error_d, vsize, cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(data_h, data_d, MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id())); - // If we want to transfer only the minimal amount of data, we - // need a synchronization point. A single ExternalWork (of - // SiPixelRawToClusterHeterogeneous) does not help because it is - // already used to synchronize the data movement. So we'd need - // two ExternalWorks (or explicit use of TBB tasks). The - // prototype of #100 would allow this easily (as there would be - // two ExternalWorks). - // - //error_h->set_data(data_h); - //cudaCheck(cudaStreamSynchronize(stream.id())); - //int size = error_h->size(); - //cudaCheck(cudaMemcpyAsync(data_h, data_d, size*esize, cudaMemcpyDefault, stream.id())); + constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; + digis_d = SiPixelDigisCUDA(MAX_FED_WORDS, stream); + clusters_d = SiPixelClustersCUDA(MAX_FED_WORDS, gpuClustering::MaxNumModules, stream); + + edm::Service cs; + digis_clusters_h.nModules_Clusters = cs->make_host_unique(2, stream); + + { + const int threadsPerBlock = 512; + const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all + + assert(0 == wordCounter%2); + // wordCounter is the total no of words in each event to be trasfered on device + auto word_d = cs->make_device_unique(wordCounter, stream); + auto fedId_d = cs->make_device_unique(wordCounter, stream); + + auto error_d = cs->make_device_unique>(stream); + auto data_d = cs->make_device_unique(MAX_FED_WORDS, stream); + cudaCheck(cudaMemsetAsync(data_d.get(), 0x00, MAX_ERROR_SIZE, stream.id())); + auto error_h_tmp = cs->make_host_unique>(stream); + new (error_h_tmp.get()) GPU::SimpleVector(MAX_FED_WORDS, data_d.get()); // should make_host_unique() call the constructor as well? note that even if std::make_unique does that, we can't do that in make_device_unique + assert(error_h_tmp->size() == 0); + assert(error_h_tmp->capacity() == static_cast(MAX_FED_WORDS)); + + cudaCheck(cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(fedId_d.get(), wordFed.fedId(), wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(error_d.get(), error_h_tmp.get(), vsize, cudaMemcpyDefault, stream.id())); + + auto pdigi_d = cs->make_device_unique(wordCounter, stream); + auto rawIdArr_d = cs->make_device_unique(wordCounter, stream); + + // Launch rawToDigi kernel + RawToDigi_kernel<<>>( + cablingMap, + modToUnp, + wordCounter, + word_d.get(), + fedId_d.get(), + digis_d.xx(), digis_d.yy(), digis_d.adc(), + pdigi_d.get(), + rawIdArr_d.get(), + digis_d.moduleInd(), + error_d.get(), + useQualityInfo, + includeErrors, + debug); + cudaCheck(cudaGetLastError()); + + // copy data to host variable + if(transferToCPU) { + digis_clusters_h.pdigi = cs->make_host_unique(MAX_FED_WORDS, stream); + digis_clusters_h.rawIdArr = cs->make_host_unique(MAX_FED_WORDS, stream); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.pdigi.get(), pdigi_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.rawIdArr.get(), rawIdArr_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + + if (includeErrors) { + digis_clusters_h.data = cs->make_host_unique(MAX_FED_WORDS, stream); + digis_clusters_h.error = cs->make_host_unique>(stream); + new (digis_clusters_h.error.get()) GPU::SimpleVector(MAX_FED_WORDS, digis_clusters_h.data.get()); + assert(digis_clusters_h.error->size() == 0); + assert(digis_clusters_h.error->capacity() == static_cast(MAX_FED_WORDS)); + + cudaCheck(cudaMemcpyAsync(digis_clusters_h.error.get(), error_d.get(), vsize, cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.data.get(), data_d.get(), MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id())); + // If we want to transfer only the minimal amount of data, we + // need a synchronization point. A single ExternalWork (of + // SiPixelRawToClusterHeterogeneous) does not help because it is + // already used to synchronize the data movement. So we'd need + // two ExternalWorks (or explicit use of TBB tasks). The + // prototype of #100 would allow this easily (as there would be + // two ExternalWorks). + // + //cudaCheck(cudaStreamSynchronize(stream.id())); + //int size = digis_clusters_h.error->size(); + //cudaCheck(cudaMemcpyAsync(digis_clusters_h.data.get(), data_d.get(), size*esize, cudaMemcpyDefault, stream.id())); + } } } // End of Raw2Digi and passing data for cluserisation @@ -677,15 +613,16 @@ namespace pixelgpudetails { int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock; gpuCalibPixel::calibDigis<<>>( - moduleInd_d, - xx_d, yy_d, adc_d, + digis_d.moduleInd(), + digis_d.c_xx(), digis_d.c_yy(), digis_d.adc(), gains, wordCounter); cudaCheck(cudaGetLastError()); // calibrated adc if(transferToCPU) { - cudaCheck(cudaMemcpyAsync(adc_h, adc_d, wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); + digis_clusters_h.adc = cs->make_host_unique(MAX_FED_WORDS, stream); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.adc.get(), digis_d.adc(), wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); } #ifdef GPU_DEBUG @@ -694,13 +631,13 @@ namespace pixelgpudetails { << " blocks of " << threadsPerBlock << " threads\n"; #endif - cudaCheck(cudaMemsetAsync(moduleStart_d, 0x00, sizeof(uint32_t), stream.id())); + cudaCheck(cudaMemsetAsync(clusters_d.moduleStart(), 0x00, sizeof(uint32_t), stream.id())); - countModules<<>>(moduleInd_d, moduleStart_d, clus_d, wordCounter); + countModules<<>>(digis_d.c_moduleInd(), clusters_d.moduleStart(), clusters_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); // read the number of modules into a data member, used by getProduct()) - cudaCheck(cudaMemcpyAsync(nModulesActive, moduleStart_d, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(&(digis_clusters_h.nModules_Clusters[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream.id())); threadsPerBlock = 256; blocks = MaxNumModules; @@ -708,23 +645,23 @@ namespace pixelgpudetails { std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - cudaCheck(cudaMemsetAsync(clusInModule_d, 0, (MaxNumModules)*sizeof(uint32_t), stream.id())); + cudaCheck(cudaMemsetAsync(clusters_d.clusInModule(), 0, (MaxNumModules)*sizeof(uint32_t), stream.id())); findClus<<>>( - moduleInd_d, - xx_d, yy_d, - moduleStart_d, - clusInModule_d, moduleId_d, - clus_d, + digis_d.c_moduleInd(), + digis_d.c_xx(), digis_d.c_yy(), + clusters_d.c_moduleStart(), + clusters_d.clusInModule(), clusters_d.moduleId(), + clusters_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); // apply charge cut clusterChargeCut<<>>( - moduleInd_d, - adc_d, - moduleStart_d, - clusInModule_d, moduleId_d, - clus_d, + digis_d.moduleInd(), + digis_d.c_adc(), + clusters_d.c_moduleStart(), + clusters_d.clusInModule(), clusters_d.c_moduleId(), + clusters_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -735,19 +672,27 @@ namespace pixelgpudetails { // available in the rechit producer without additional points of // synchronization/ExternalWork // + // Temporary storage + size_t tempScanStorageSize = 0; + { + uint32_t *tmp = nullptr; + cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules)); + } + auto tempScanStorage_d = cs->make_device_unique(tempScanStorageSize, stream); // Set first the first element to 0 - cudaCheck(cudaMemsetAsync(clusModuleStart_d, 0, sizeof(uint32_t), stream.id())); + cudaCheck(cudaMemsetAsync(clusters_d.clusModuleStart(), 0, sizeof(uint32_t), stream.id())); // Then use inclusive_scan to get the partial sum to the rest - cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d, tempScanStorageSize, - clusInModule_d, &clusModuleStart_d[1], gpuClustering::MaxNumModules, + cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d.get(), tempScanStorageSize, + clusters_d.c_clusInModule(), &clusters_d.clusModuleStart()[1], gpuClustering::MaxNumModules, stream.id())); // last element holds the number of all clusters - cudaCheck(cudaMemcpyAsync(nClusters, clusModuleStart_d+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(&(digis_clusters_h.nModules_Clusters[1]), clusters_d.clusModuleStart()+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); // clusters if(transferToCPU) { - cudaCheck(cudaMemcpyAsync(clus_h, clus_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + digis_clusters_h.clus = cs->make_host_unique(MAX_FED_WORDS, stream); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.clus.get(), clusters_d.clus(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); } } // end clusterizer scope } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index ca8bd73106c2c..a2d9cdda92573 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -5,6 +5,7 @@ #include #include "cuda/api_wrappers.h" +#include "CUDADataFormats/Common/interface/host_unique_ptr.h" #include "FWCore/Utilities/interface/typedefs.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include "siPixelRawToClusterHeterogeneousProduct.h" @@ -159,8 +160,43 @@ namespace pixelgpudetails { using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct; - SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream); - ~SiPixelRawToClusterGPUKernel(); + struct CPUData { + CPUData() = default; + ~CPUData() = default; + + CPUData(const CPUData&) = delete; + CPUData& operator=(const CPUData&) = delete; + CPUData(CPUData&&) = default; + CPUData& operator=(CPUData&&) = default; + + edm::cuda::host::unique_ptr nModules_Clusters; // These should really be part of the GPU product + + edm::cuda::host::unique_ptr data; + edm::cuda::host::unique_ptr> error; + + edm::cuda::host::unique_ptr pdigi; + edm::cuda::host::unique_ptr rawIdArr; + edm::cuda::host::unique_ptr adc; + edm::cuda::host::unique_ptr clus; + }; + + class WordFedAppender { + public: + WordFedAppender(cuda::stream_t<>& cudaStream); + ~WordFedAppender() = default; + + void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length); + + const unsigned int *word() const { return word_.get(); } + const unsigned char *fedId() const { return fedId_.get(); } + + private: + edm::cuda::host::unique_ptr word_; + edm::cuda::host::unique_ptr fedId_; + }; + + SiPixelRawToClusterGPUKernel() = default; + ~SiPixelRawToClusterGPUKernel() = default; SiPixelRawToClusterGPUKernel(const SiPixelRawToClusterGPUKernel&) = delete; @@ -168,69 +204,35 @@ namespace pixelgpudetails { SiPixelRawToClusterGPUKernel& operator=(const SiPixelRawToClusterGPUKernel&) = delete; SiPixelRawToClusterGPUKernel& operator=(SiPixelRawToClusterGPUKernel&&) = delete; - void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length); - void makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, + const WordFedAppender& wordFed, const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, bool useQualityInfo, bool includeErrors, bool transferToCPU_, bool debug, cuda::stream_t<>& stream); - auto getProduct() { - error_h->set_data(data_h); - return siPixelRawToClusterHeterogeneousProduct::GPUProduct{ - pdigi_h, rawIdArr_h, clus_h, adc_h, error_h, - gpuProduct_d, - xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d, - clusModuleStart_d, - nDigis, *nModulesActive, *nClusters - }; + siPixelRawToClusterHeterogeneousProduct::GPUProduct getProduct() { + return siPixelRawToClusterHeterogeneousProduct::GPUProduct( + std::move(digis_d), std::move(clusters_d), + nDigis, + digis_clusters_h.nModules_Clusters[0], + digis_clusters_h.nModules_Clusters[1] + ); } - private: - // input - unsigned int *word = nullptr; // to hold input for rawtodigi - unsigned char *fedId_h = nullptr; // to hold fed index for each word - - // output - GPUProduct gpuProduct; - GPUProduct * gpuProduct_d; - - // FIXME cleanup all these are in the gpuProduct above... - - uint32_t *pdigi_h = nullptr, *rawIdArr_h = nullptr; // host copy of output - uint16_t *adc_h = nullptr; int32_t *clus_h = nullptr; // host copy of calib&clus output - pixelgpudetails::error_obj *data_h = nullptr; - GPU::SimpleVector *error_h = nullptr; - GPU::SimpleVector *error_h_tmp = nullptr; + CPUData&& getCPUData() { + return std::move(digis_clusters_h); + } + private: uint32_t nDigis = 0; - uint32_t *nModulesActive = nullptr; - uint32_t *nClusters = nullptr; - - // scratch memory buffers - uint32_t * word_d; - uint8_t * fedId_d; - uint32_t * pdigi_d; - uint16_t * xx_d; - uint16_t * yy_d; - uint16_t * adc_d; - uint16_t * moduleInd_d; - uint32_t * rawIdArr_d; - GPU::SimpleVector * error_d; - error_obj * data_d; - - // these are for the clusterizer (to be moved) - uint32_t * moduleStart_d; - int32_t * clus_d; - uint32_t * clusInModule_d; - uint32_t * moduleId_d; + // CPU data + CPUData digis_clusters_h; - // originally in rechit, moved here - uint32_t *clusModuleStart_d = nullptr; - void *tempScanStorage_d = nullptr; - size_t tempScanStorageSize = 0; + // Data to be put in the event + SiPixelDigisCUDA digis_d; + SiPixelClustersCUDA clusters_d; }; // configuration and memory buffers alocated on the GPU