From 090b1b2221af1e34436c576e2e0333ca9b8ec74d Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 10 Jul 2020 17:28:01 +0200 Subject: [PATCH 1/8] reimplement dyn memory for cells --- .../CUDAUtilities/interface/cudaCompat.h | 7 ++++ .../plugins/CAHitNtupletGeneratorKernels.cc | 21 ++++++---- .../plugins/CAHitNtupletGeneratorKernels.cu | 20 ++++++---- .../plugins/CAHitNtupletGeneratorKernels.h | 4 +- .../CAHitNtupletGeneratorKernelsAlloc.h | 8 +--- .../PixelTriplets/plugins/GPUCACell.h | 38 +++++++++++++++---- .../PixelTriplets/plugins/gpuPixelDoublets.h | 9 +++++ 7 files changed, 75 insertions(+), 32 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index 593821fe805ed..f9b4b2f8a4c16 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -27,6 +27,13 @@ namespace cms { extern thread_local dim3 blockIdx; extern thread_local dim3 gridDim; + template + T1 atomicCAS(T1* address, T1 compare, T2 val) { + T1 old = *address; + *address = old == compare ? val : old; + return old; + } + template T1 atomicInc(T1* a, T2 b) { auto ret = *a; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 4eafb6dccd31c..8e75303555d8f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -24,11 +24,16 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr device_isOuterHitOfCell_.reset( (GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell))); assert(device_isOuterHitOfCell_.get()); + device_theCellNeighborsContainer_.reset( + (GPUCACell::CellNeighbors *)malloc(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors))); + device_theCellTracksContainer_.reset( + (GPUCACell::CellTracks *)malloc(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks))); + gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), nhits, - device_theCellNeighbors_, + device_theCellNeighbors_.get(), device_theCellNeighborsContainer_.get(), - device_theCellTracks_, + device_theCellTracks_.get(), device_theCellTracksContainer_.get()); // device_theCells_ = Traits:: template make_unique(cs, m_params.maxNumberOfDoublets_, stream); @@ -47,8 +52,8 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr assert(nActualPairs <= gpuPixelDoublets::nPairs); gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), hh.view(), device_isOuterHitOfCell_.get(), nActualPairs, @@ -84,7 +89,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * hh.view(), device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, + device_theCellNeighbors_.get(), device_isOuterHitOfCell_.get(), m_params.hardCurvCut_, m_params.ptmin_, @@ -101,7 +106,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_find_ntuplets(hh.view(), device_theCells_.get(), device_nCells_, - device_theCellTracks_, + device_theCellTracks_.get(), tuples_d, device_hitTuple_apc_, quality_d, @@ -129,8 +134,8 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_hitTuple_apc_, device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), device_isOuterHitOfCell_.get(), nhits, m_params.maxNumberOfDoublets_, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 541ab5ed905f5..479078f80f613 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -51,7 +51,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * hh.view(), device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, + device_theCellNeighbors_.get(), device_isOuterHitOfCell_.get(), m_params.hardCurvCut_, m_params.ptmin_, @@ -78,7 +78,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_find_ntuplets<<>>(hh.view(), device_theCells_.get(), device_nCells_, - device_theCellTracks_, + device_theCellTracks_.get(), tuples_d, device_hitTuple_apc_, quality_d, @@ -132,8 +132,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_hitTuple_apc_, device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), device_isOuterHitOfCell_.get(), nhits, m_params.maxNumberOfDoublets_, @@ -162,15 +162,19 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr // in principle we can use "nhits" to heuristically dimension the workspace... device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1U, nhits), stream); assert(device_isOuterHitOfCell_.get()); + device_theCellNeighborsContainer_ = + cms::cuda::make_device_unique(CAConstants::maxNumOfActiveDoublets(), stream); + device_theCellTracksContainer_ = + cms::cuda::make_device_unique(CAConstants::maxNumOfActiveDoublets(), stream); { int threadsPerBlock = 128; // at least one block! int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), nhits, - device_theCellNeighbors_, + device_theCellNeighbors_.get(), device_theCellNeighborsContainer_.get(), - device_theCellTracks_, + device_theCellTracks_.get(), device_theCellTracksContainer_.get()); cudaCheck(cudaGetLastError()); } @@ -201,8 +205,8 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr dim3 thrs(stride, threadsPerBlock, 1); gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), hh.view(), device_isOuterHitOfCell_.get(), nActualPairs, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index e112e9d17adeb..7a0b196d8ae89 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -179,9 +179,9 @@ class CAHitNtupletGeneratorKernels { private: // workspace - CAConstants::CellNeighborsVector* device_theCellNeighbors_ = nullptr; + unique_ptr device_theCellNeighbors_; unique_ptr device_theCellNeighborsContainer_; - CAConstants::CellTracksVector* device_theCellTracks_ = nullptr; + unique_ptr device_theCellTracks_; unique_ptr device_theCellTracksContainer_; unique_ptr device_theCells_; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h index 05bf4f09f7f93..fb750267f5c37 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -12,12 +12,8 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) ////////////////////////////////////////////////////////// - /* not used at the moment - cudaCheck(cudaMalloc(&device_theCellNeighbors_, sizeof(CAConstants::CellNeighborsVector))); - cudaCheck(cudaMemset(device_theCellNeighbors_, 0, sizeof(CAConstants::CellNeighborsVector))); - cudaCheck(cudaMalloc(&device_theCellTracks_, sizeof(CAConstants::CellTracksVector))); - cudaCheck(cudaMemset(device_theCellTracks_, 0, sizeof(CAConstants::CellTracksVector))); - */ + device_theCellNeighbors_ = Traits::template make_unique(stream); + device_theCellTracks_ = Traits::template make_unique(stream); device_hitToTuple_ = Traits::template make_unique(stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 6e1c2a587e212..cb4dedd4dd45a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -56,24 +56,46 @@ class GPUCACell { theInnerZ = hh.zGlobal(innerHitId); theInnerR = hh.rGlobal(innerHitId); - outerNeighbors().reset(); - tracks().reset(); + // link to default empty + theOuterNeighbors = &cellNeighbors[0]; + theTracks = &cellTracks[0]; assert(outerNeighbors().empty()); assert(tracks().empty()); } __device__ __forceinline__ int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector& cellNeighbors) { + // use smart cache + if (outerNeighbors().empty()) { + auto i = cellNeighbors.extend(); // maybe waisted.... + if (i > 0) { + cellNeighbors[i].reset(); + auto zero = (ptrAsInt)(&cellNeighbors[0]); + atomicCAS((ptrAsInt*)(&theOuterNeighbors), + zero, + (ptrAsInt)(&cellNeighbors[i])); // if fails we cannot give "i" back... + } else + return -1; + } return outerNeighbors().push_back(t); } __device__ __forceinline__ int addTrack(CellTracks::value_t t, CellTracksVector& cellTracks) { + if (tracks().empty()) { + auto i = cellTracks.extend(); // maybe waisted.... + if (i > 0) { + cellTracks[i].reset(); + auto zero = (ptrAsInt)(&cellTracks[0]); + atomicCAS((ptrAsInt*)(&theTracks), zero, (ptrAsInt)(&cellTracks[i])); // if fails we cannot give "i" back... + } else + return -1; + } return tracks().push_back(t); } - __device__ __forceinline__ CellTracks& tracks() { return theTracks; } - __device__ __forceinline__ CellTracks const& tracks() const { return theTracks; } - __device__ __forceinline__ CellNeighbors& outerNeighbors() { return theOuterNeighbors; } - __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return theOuterNeighbors; } + __device__ __forceinline__ CellTracks& tracks() { return *theTracks; } + __device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; } + __device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; } + __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; } __device__ __forceinline__ float get_inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); } __device__ __forceinline__ float get_outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); } __device__ __forceinline__ float get_inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } @@ -297,8 +319,8 @@ class GPUCACell { } private: - CellNeighbors theOuterNeighbors; - CellTracks theTracks; + CellNeighbors* theOuterNeighbors; + CellTracks* theTracks; public: int32_t theDoubletId; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 8e0b05dcb6c8a..25633fa7b7283 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -73,6 +73,15 @@ namespace gpuPixelDoublets { int first = blockIdx.x * blockDim.x + threadIdx.x; for (int i = first; i < nHits; i += gridDim.x * blockDim.x) isOuterHitOfCell[i].reset(); + + cellNeighbors->construct(CAConstants::maxNumOfActiveDoublets(), cellNeighborsContainer); + cellTracks->construct(CAConstants::maxNumOfActiveDoublets(), cellTracksContainer); + auto i = cellNeighbors->extend(); + assert(0 == i); + (*cellNeighbors)[0].reset(); + i = cellTracks->extend(); + assert(0 == i); + (*cellTracks)[0].reset(); } constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y From e4b82bcdcfeb0e0ea379251175b6ad15b2a5d501 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 11 Jul 2020 18:56:47 +0200 Subject: [PATCH 2/8] reduce memory buffer, adjust size --- .../PixelTriplets/plugins/CAConstants.h | 6 +++--- .../plugins/CAHitNtupletGeneratorKernels.cc | 15 +++++++++------ .../plugins/CAHitNtupletGeneratorKernels.cu | 16 +++++++++++----- .../plugins/CAHitNtupletGeneratorKernels.h | 5 +++-- .../plugins/CAHitNtupletGeneratorKernelsImpl.h | 4 ++++ 5 files changed, 30 insertions(+), 16 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index fce0c23596137..0ebbdf3ed3705 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -27,7 +27,7 @@ namespace CAConstants { constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } #ifndef ONLY_PHICUT #ifndef GPU_SMALL_EVENTS - constexpr uint32_t maxNumberOfDoublets() { return 448 * 1024; } + constexpr uint32_t maxNumberOfDoublets() { return 512 * 1024; } constexpr uint32_t maxCellsPerHit() { return 128; } #else constexpr uint32_t maxNumberOfDoublets() { return 128 * 1024; } @@ -37,7 +37,7 @@ namespace CAConstants { constexpr uint32_t maxNumberOfDoublets() { return 2 * 1024 * 1024; } constexpr uint32_t maxCellsPerHit() { return 8 * 128; } #endif - constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 4; } + constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 8; } constexpr uint32_t maxNumberOfLayerPairs() { return 20; } constexpr uint32_t maxNumberOfLayers() { return 10; } @@ -49,7 +49,7 @@ namespace CAConstants { #ifndef ONLY_PHICUT using CellNeighbors = cms::cuda::VecArray; - using CellTracks = cms::cuda::VecArray; + using CellTracks = cms::cuda::VecArray; #else using CellNeighbors = cms::cuda::VecArray; using CellTracks = cms::cuda::VecArray; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 8e75303555d8f..4d4791b87ad3b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -24,17 +24,20 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr device_isOuterHitOfCell_.reset( (GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell))); assert(device_isOuterHitOfCell_.get()); - device_theCellNeighborsContainer_.reset( - (GPUCACell::CellNeighbors *)malloc(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors))); - device_theCellTracksContainer_.reset( - (GPUCACell::CellTracks *)malloc(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks))); + + cellStorage_.reset((unsigned char *)malloc(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks))); + device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); + device_theCellTracksContainer_ = + (GPUCACell::CellTracks *)(cellStorage_.get() + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors)); gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), nhits, device_theCellNeighbors_.get(), - device_theCellNeighborsContainer_.get(), + device_theCellNeighborsContainer_, device_theCellTracks_.get(), - device_theCellTracksContainer_.get()); + device_theCellTracksContainer_); // device_theCells_ = Traits:: template make_unique(cs, m_params.maxNumberOfDoublets_, stream); device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * m_params.maxNumberOfDoublets_)); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 479078f80f613..c3980b5881af2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -162,10 +162,16 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr // in principle we can use "nhits" to heuristically dimension the workspace... device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1U, nhits), stream); assert(device_isOuterHitOfCell_.get()); - device_theCellNeighborsContainer_ = - cms::cuda::make_device_unique(CAConstants::maxNumOfActiveDoublets(), stream); + + cellStorage_ = cms::cuda::make_device_unique( + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks), + stream); + device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); device_theCellTracksContainer_ = - cms::cuda::make_device_unique(CAConstants::maxNumOfActiveDoublets(), stream); + (GPUCACell::CellTracks *)(cellStorage_.get() + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors)); + { int threadsPerBlock = 128; // at least one block! @@ -173,9 +179,9 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), nhits, device_theCellNeighbors_.get(), - device_theCellNeighborsContainer_.get(), + device_theCellNeighborsContainer_, device_theCellTracks_.get(), - device_theCellTracksContainer_.get()); + device_theCellTracksContainer_); cudaCheck(cudaGetLastError()); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 7a0b196d8ae89..7ab3ed010927e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -179,10 +179,11 @@ class CAHitNtupletGeneratorKernels { private: // workspace + unique_ptr cellStorage_; unique_ptr device_theCellNeighbors_; - unique_ptr device_theCellNeighborsContainer_; + CAConstants::CellNeighbors* device_theCellNeighborsContainer_; unique_ptr device_theCellTracks_; - unique_ptr device_theCellTracksContainer_; + CAConstants::CellTracks* device_theCellTracksContainer_; unique_ptr device_theCells_; unique_ptr device_isOuterHitOfCell_; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 654b37c076f99..691395887dddb 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -79,6 +79,10 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, printf("Tuples overflow\n"); if (*nCells >= maxNumberOfDoublets) printf("Cells overflow\n"); + if (cellNeighbors && cellNeighbors->full()) + printf("cellNeighbors overflow\n"); + if (cellTracks && cellTracks->full()) + printf("cellTracks overflow\n"); } for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { From fc0bdd27157e423139601e21ec466a0aa43c447f Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 11 Jul 2020 19:46:14 +0200 Subject: [PATCH 3/8] add a fence --- RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index cb4dedd4dd45a..4ec1711524eac 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -76,6 +76,7 @@ class GPUCACell { } else return -1; } + __threadfence(); return outerNeighbors().push_back(t); } @@ -89,6 +90,7 @@ class GPUCACell { } else return -1; } + __threadfence(); return tracks().push_back(t); } From 00287f868e5981da5bced4c5f0e485e39ea117b9 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 12 Jul 2020 10:55:30 +0200 Subject: [PATCH 4/8] attempt to delete early, memory increases! --- .../plugins/CAHitNtupletGeneratorKernels.cu | 5 +++ .../plugins/CAHitNtupletGeneratorOnGPU.cc | 5 +-- .../PixelTriplets/test/BuildFile.xml | 5 +++ .../PixelTriplets/test/CAsizes_t.cpp | 32 +++++++++++++++++++ 4 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index c3980b5881af2..947bd5790bb59 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -144,6 +144,10 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif + + // free space asap + // device_isOuterHitOfCell_.reset(); + } template <> @@ -229,6 +233,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr #endif } + template <> void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 4a8240706efc2..3e16728a002dd 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -168,14 +168,15 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH CAHitNtupletGeneratorKernelsGPU kernels(m_params); kernels.counters_ = m_counters; - HelixFitOnGPU fitter(bfield, m_params.fit5as4_); kernels.allocateOnGPU(stream); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); kernels.buildDoublets(hits_d, stream); kernels.launchKernels(hits_d, soa, stream); kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" + + HelixFitOnGPU fitter(bfield, m_params.fit5as4_); + fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); } else { diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml index 9b8b315e93937..92fa4370faa70 100644 --- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml @@ -27,3 +27,8 @@ + + + + + diff --git a/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp b/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp new file mode 100644 index 0000000000000..0b623f2eccbbe --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp @@ -0,0 +1,32 @@ +#include "RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h" + +#include +#include + +template +void print() { + + std::cout << "size of " << typeid(T).name() << ' ' << sizeof(T) << std::endl; + + +} + + + +int main() { + + using namespace CAConstants; + + print(); + print(); + print(); + print(); + print(); + print(); + print(); + + print(); + + return 0; + +} From 29caebfb4448395ef633340dcb1abcf8e9a35db0 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 12 Jul 2020 15:20:09 +0200 Subject: [PATCH 5/8] fix data race? --- .../CUDAUtilities/interface/prefixScan.h | 13 ++++++++----- .../PixelTriplets/plugins/gpuPixelDoublets.h | 18 ++++++++++-------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h index 6d4d5f4e7cc5e..7693b2b3e8508 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -41,9 +41,9 @@ namespace cms { namespace cuda { // limited to 32*32 elements.... - template - __host__ __device__ __forceinline__ void blockPrefixScan(T const* __restrict__ ci, - T* __restrict__ co, + template + __host__ __device__ __forceinline__ void blockPrefixScan(VT const* ci, + VT* co, uint32_t size, T* ws #ifndef __CUDA_ARCH__ @@ -138,8 +138,10 @@ namespace cms { // in principle not limited.... template - __global__ void multiBlockPrefixScan(T const* ci, T* co, int32_t size, int32_t* pc) { - __shared__ T ws[32]; + __global__ void multiBlockPrefixScan(T const* ici, T* ico, int32_t size, int32_t* pc) { + volatile T const * ci = ici; + volatile T * co = ico; + __shared__ T ws[32]; #ifdef __CUDA_ARCH__ assert(sizeof(T) * gridDim.x <= dynamic_smem_size()); // size of psum below #endif @@ -152,6 +154,7 @@ namespace cms { // count blocks that finished __shared__ bool isLastBlockDone; if (0 == threadIdx.x) { + __threadfence(); auto value = atomicAdd(pc, 1); // block counter isLastBlockDone = (value == (int(gridDim.x) - 1)); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 25633fa7b7283..76cf1bd5e11b3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -74,14 +74,16 @@ namespace gpuPixelDoublets { for (int i = first; i < nHits; i += gridDim.x * blockDim.x) isOuterHitOfCell[i].reset(); - cellNeighbors->construct(CAConstants::maxNumOfActiveDoublets(), cellNeighborsContainer); - cellTracks->construct(CAConstants::maxNumOfActiveDoublets(), cellTracksContainer); - auto i = cellNeighbors->extend(); - assert(0 == i); - (*cellNeighbors)[0].reset(); - i = cellTracks->extend(); - assert(0 == i); - (*cellTracks)[0].reset(); + if (0==first) { + cellNeighbors->construct(CAConstants::maxNumOfActiveDoublets(), cellNeighborsContainer); + cellTracks->construct(CAConstants::maxNumOfActiveDoublets(), cellTracksContainer); + auto i = cellNeighbors->extend(); + assert(0 == i); + (*cellNeighbors)[0].reset(); + i = cellTracks->extend(); + assert(0 == i); + (*cellTracks)[0].reset(); + } } constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y From abcf9aa4a32efba46fefd5ce149986abac44eb1d Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 12 Jul 2020 19:38:08 +0200 Subject: [PATCH 6/8] solve type punning issue on cpu --- RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 4ec1711524eac..e913b77fe0953 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -69,10 +69,14 @@ class GPUCACell { auto i = cellNeighbors.extend(); // maybe waisted.... if (i > 0) { cellNeighbors[i].reset(); +#ifdef __CUDACC__ auto zero = (ptrAsInt)(&cellNeighbors[0]); atomicCAS((ptrAsInt*)(&theOuterNeighbors), zero, (ptrAsInt)(&cellNeighbors[i])); // if fails we cannot give "i" back... +#else + theOuterNeighbors = &cellNeighbors[i]; +#endif } else return -1; } @@ -85,8 +89,12 @@ class GPUCACell { auto i = cellTracks.extend(); // maybe waisted.... if (i > 0) { cellTracks[i].reset(); +#ifdef __CUDACC__ auto zero = (ptrAsInt)(&cellTracks[0]); atomicCAS((ptrAsInt*)(&theTracks), zero, (ptrAsInt)(&cellTracks[i])); // if fails we cannot give "i" back... +#else + theTracks = &cellTracks[i]; +#endif } else return -1; } From 715f4f07d603e0b1e9585b593a62c5ce627c730f Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Mon, 13 Jul 2020 15:44:49 +0200 Subject: [PATCH 7/8] use growing factor 2 in allocator --- .../CUDAUtilities/src/getCachingDeviceAllocator.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h index 8158f414b07d4..3770dbac574d9 100644 --- a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h +++ b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h @@ -13,11 +13,11 @@ namespace cms::cuda::allocator { // Use caching or not constexpr bool useCaching = true; // Growth factor (bin_growth in cub::CachingDeviceAllocator - constexpr unsigned int binGrowth = 8; + constexpr unsigned int binGrowth = 2; // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator - constexpr unsigned int minBin = 1; + constexpr unsigned int minBin = 8; // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail. - constexpr unsigned int maxBin = 10; + constexpr unsigned int maxBin = 30; // Total storage for the allocator. 0 means no limit. constexpr size_t maxCachedBytes = 0; // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken. From 0b021bf3958de2c85f0dbdbb546223c1589c2f7d Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Mon, 13 Jul 2020 15:47:11 +0200 Subject: [PATCH 8/8] code format --- HeterogeneousCore/CUDAUtilities/interface/prefixScan.h | 6 +++--- .../plugins/CAHitNtupletGeneratorKernels.cu | 2 -- .../PixelTriplets/plugins/gpuPixelDoublets.h | 2 +- RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp | 9 +-------- 4 files changed, 5 insertions(+), 14 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h index 7693b2b3e8508..33dc6a18ffa2a 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -139,9 +139,9 @@ namespace cms { // in principle not limited.... template __global__ void multiBlockPrefixScan(T const* ici, T* ico, int32_t size, int32_t* pc) { - volatile T const * ci = ici; - volatile T * co = ico; - __shared__ T ws[32]; + volatile T const* ci = ici; + volatile T* co = ico; + __shared__ T ws[32]; #ifdef __CUDA_ARCH__ assert(sizeof(T) * gridDim.x <= dynamic_smem_size()); // size of psum below #endif diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 947bd5790bb59..8a213eee2f579 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -147,7 +147,6 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * // free space asap // device_isOuterHitOfCell_.reset(); - } template <> @@ -233,7 +232,6 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr #endif } - template <> void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 76cf1bd5e11b3..5b0d3e8833a52 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -74,7 +74,7 @@ namespace gpuPixelDoublets { for (int i = first; i < nHits; i += gridDim.x * blockDim.x) isOuterHitOfCell[i].reset(); - if (0==first) { + if (0 == first) { cellNeighbors->construct(CAConstants::maxNumOfActiveDoublets(), cellNeighborsContainer); cellTracks->construct(CAConstants::maxNumOfActiveDoublets(), cellTracksContainer); auto i = cellNeighbors->extend(); diff --git a/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp b/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp index 0b623f2eccbbe..5c57eb7005691 100644 --- a/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp +++ b/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp @@ -3,18 +3,12 @@ #include #include -template +template void print() { - std::cout << "size of " << typeid(T).name() << ' ' << sizeof(T) << std::endl; - - } - - int main() { - using namespace CAConstants; print(); @@ -28,5 +22,4 @@ int main() { print(); return 0; - }