From 00e8cf422c138fd3ccff17711d4a8ddeb862dcb9 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 24 Jan 2019 14:21:32 +0100 Subject: [PATCH] Speed up the doublet finder (#260) Introduce the inner loop parallelization in the doublet finder using the stride pattern already used in the "fishbone", and make use of a 2D grid instead of a hand-made stride. --- .../CAHitQuadrupletGeneratorKernels.cu | 52 ++++++++++++------- .../PixelTriplets/plugins/gpuFishbone.h | 19 ++++--- .../PixelTriplets/plugins/gpuPixelDoublets.h | 12 +++-- 3 files changed, 51 insertions(+), 32 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu index 85dc10ee04587..ec9c4960062ef 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu @@ -144,9 +144,11 @@ kernel_connect(AtomicPairCounter * apc1, AtomicPairCounter * apc2, // just to z constexpr auto hardCurvCut = 1.f/(0.35f * 87.f); // FIXME VI tune constexpr auto ptmin = 0.9f; // FIXME original "tune" - auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; + auto cellIndex = threadIdx.y + blockIdx.y * blockDim.y; + auto first = threadIdx.x; + auto stride = blockDim.x; - if (0==cellIndex) { (*apc1)=0; (*apc2)=0; }// ready for next kernel + if (0==(cellIndex+first)) { (*apc1)=0; (*apc2)=0; }// ready for next kernel if (cellIndex >= (*nCells) ) return; auto const & thisCell = cells[cellIndex]; @@ -154,7 +156,7 @@ kernel_connect(AtomicPairCounter * apc1, AtomicPairCounter * apc2, // just to z auto innerHitId = thisCell.get_inner_hit_id(); auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); auto vi = isOuterHitOfCell[innerHitId].data(); - for (auto j = 0; j < numberOfPossibleNeighbors; ++j) { + for (auto j = first; j < numberOfPossibleNeighbors; j+=stride) { auto otherCell = __ldg(vi+j); if (cells[otherCell].theDoubletId<0) continue; if (thisCell.check_alignment(hh, @@ -172,6 +174,8 @@ void kernel_find_ntuplets( unsigned int minHitsPerNtuplet) { + // recursive: not obvious to widen + auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; if (cellIndex >= (*nCells) ) return; auto &thisCell = cells[cellIndex]; @@ -246,23 +250,29 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... assert(nhits <= PixelGPUConstants::maxNumberOfHits); if (earlyFishbone_) { - auto blockSize = 128; + auto nthTot = 64; auto stride = 4; + auto blockSize = nthTot/stride; auto numberOfBlocks = (nhits + blockSize - 1)/blockSize; - numberOfBlocks *=stride; - - fishbone<<>>( + dim3 blks(1,numberOfBlocks,1); + dim3 thrs(stride,blockSize,1); + fishbone<<>>( hh.gpu_d, device_theCells_, device_nCells_, device_isOuterHitOfCell_, - nhits, stride, false + nhits, false ); cudaCheck(cudaGetLastError()); } - auto blockSize = 64; + auto nthTot = 64; + auto stride = 4; + auto blockSize = nthTot/stride; auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1)/blockSize; - kernel_connect<<>>( + dim3 blks(1,numberOfBlocks,1); + dim3 thrs(stride,blockSize,1); + + kernel_connect<<>>( gpu_.apc_d, device_hitToTuple_apc_, // needed only to be reset, ready for next kernel hh.gpu_d, device_theCells_, device_nCells_, @@ -282,14 +292,17 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... cudautils::finalizeBulk<<>>(gpu_.apc_d,gpu_.tuples_d); if (lateFishbone_) { - auto stride=4; - numberOfBlocks = (nhits + blockSize - 1)/blockSize; - numberOfBlocks *=stride; - fishbone<<>>( + auto nthTot = 128; + auto stride = 16; + auto blockSize = nthTot/stride; + auto numberOfBlocks = (nhits + blockSize - 1)/blockSize; + dim3 blks(1,numberOfBlocks,1); + dim3 thrs(stride,blockSize,1); + fishbone<<>>( hh.gpu_d, device_theCells_, device_nCells_, device_isOuterHitOfCell_, - nhits, stride, true + nhits, true ); cudaCheck(cudaGetLastError()); } @@ -312,9 +325,13 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... void CAHitQuadrupletGeneratorKernels::buildDoublets(HitsOnCPU const & hh, cudaStream_t stream) { auto nhits = hh.nHits; - int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize; + int stride=1; + int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize/stride; int blocks = (3 * nhits + threadsPerBlock - 1) / threadsPerBlock; - gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_); + dim3 blks(1,blocks,1); + dim3 thrs(stride,threadsPerBlock,1); + gpuPixelDoublets::getDoubletsFromHisto<<>>( + device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_); cudaCheck(cudaGetLastError()); } @@ -330,4 +347,3 @@ void CAHitQuadrupletGeneratorKernels::classifyTuples(HitsOnCPU const & hh, Tuple kernel_fastDuplicateRemover<<>>(device_theCells_, device_nCells_,tuples.tuples_d,tuples.helix_fit_results_d, tuples.quality_d); } - diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index 717cbf777fcdb..796241eaf50ff 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -26,7 +26,7 @@ namespace gpuPixelDoublets { GPUCACell * cells, uint32_t const * __restrict__ nCells, GPUCACell::OuterHitOfCell const * __restrict__ isOuterHitOfCell, uint32_t nHits, - uint32_t stride, bool checkTrack) { + bool checkTrack) { constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit; @@ -35,13 +35,12 @@ namespace gpuPixelDoublets { uint8_t const * __restrict__ layerp = hh.phase1TopologyLayer_d; auto layer = [&](uint16_t id) { return __ldg(layerp+id/phase1PixelTopology::maxModuleStride);}; - auto ldx = threadIdx.x + blockIdx.x * blockDim.x; - auto idx = ldx/stride; - auto first = ldx - idx*stride; - assert(first=nHits) return; - auto const & vc = isOuterHitOfCell[idx]; + if (idy>=nHits) return; + auto const & vc = isOuterHitOfCell[idy]; auto s = vc.size(); if (s<2) return; // if alligned kill one of the two. @@ -66,8 +65,8 @@ namespace gpuPixelDoublets { ++sg; } if (sg<2) return; - // here we parallelize - for (uint32_t ic=first; ic= innerLayerCumulativeSize[pairLayerId++]); @@ -115,7 +118,8 @@ namespace gpuPixelDoublets { nmin += hist.size(kk+hoff); auto const * __restrict__ p = hist.begin(kk+hoff); auto const * __restrict__ e = hist.end(kk+hoff); - for (;p < e; ++p) { + p+=first; + for (;p < e; p+=stride) { auto oi=__ldg(p); assert(oi>=offsets[outer]); assert(oi