From 8293c5ea6ed4bb64575d449c7aa8bf98a1eae024 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 13 Sep 2018 14:49:03 +0200 Subject: [PATCH] Optimise gpuPixelDoublets::doubletsFromHisto() kernel (#167) Pre-compute few constants that could not be declared constexpr. Reduce temporary buffer size. Reduce the block size of the calls to gpuPixelDoublets::getDoubletsFromHisto() from 256 to 64, to make better usage of the GPU processors. --- .../plugins/CAHitQuadrupletGeneratorGPU.cu | 22 +- .../PixelTriplets/plugins/gpuPixelDoublets.h | 274 +++++++++--------- 2 files changed, 155 insertions(+), 141 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu index efa3793ff62cc..4b87896bc6f3b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu @@ -24,7 +24,7 @@ kernel_checkOverflows(GPU::SimpleVector *foundNtuplets, if (idx < (*nCells) ) { auto &thisCell = cells[idx]; if (thisCell.theOuterNeighbors.full()) //++tooManyNeighbors[thisCell.theLayerPairId]; - printf("OuterNeighbors overflow %d in %d\n",idx,thisCell.theLayerPairId); + printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId); } if (idx < nHits) { if (isOuterHitOfCell[idx].full()) // ++tooManyOuterHitOfCell; @@ -43,8 +43,8 @@ kernel_connect(GPU::SimpleVector *foundNtuplets, const float phiCut, const float hardPtCut, unsigned int maxNumberOfDoublets_, unsigned int maxNumberOfHits_) { - float region_origin_x =0.; - float region_origin_y =0.; + float region_origin_x = 0.; + float region_origin_y = 0.; auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; @@ -86,7 +86,7 @@ __global__ void kernel_find_ntuplets( __global__ void kernel_print_found_ntuplets(GPU::SimpleVector *foundNtuplets, int maxPrint) { - for (int i = 0; i < std::min(maxPrint,foundNtuplets->size()); ++i) { + for (int i = 0; i < std::min(maxPrint, foundNtuplets->size()); ++i) { printf("\nquadruplet %d: %d %d %d %d\n", i, (*foundNtuplets)[i].hitId[0], (*foundNtuplets)[i].hitId[1], @@ -177,7 +177,7 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, cudaCheck(cudaGetLastError()); - numberOfBlocks = (std::max(int(nhits),maxNumberOfDoublets_) + 512 - 1)/512; + numberOfBlocks = (std::max(int(nhits), maxNumberOfDoublets_) + 512 - 1)/512; kernel_checkOverflows<<>>( d_foundNtupletsVec_[regionIndex], device_theCells_, device_nCells_, @@ -185,7 +185,7 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, ); - // kernel_print_found_ntuplets<<<1,1,0, cudaStream>>>(d_foundNtupletsVec_[regionIndex],10); + // kernel_print_found_ntuplets<<<1, 1, 0, cudaStream>>>(d_foundNtupletsVec_[regionIndex], 10); if(transferToCPU) { cudaCheck(cudaMemcpyAsync(h_foundNtupletsVec_[regionIndex], d_foundNtupletsVec_[regionIndex], @@ -203,7 +203,7 @@ void CAHitQuadrupletGeneratorGPU::cleanup(cudaStream_t cudaStream) { cudaCheck(cudaMemsetAsync(device_isOuterHitOfCell_, 0, maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(GPU::VecArray), cudaStream)); - cudaCheck(cudaMemsetAsync(device_nCells_,0,sizeof(uint32_t),cudaStream)); + cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), cudaStream)); } std::vector> @@ -220,10 +220,10 @@ CAHitQuadrupletGeneratorGPU::fetchKernelResult(int regionIndex) } void CAHitQuadrupletGeneratorGPU::buildDoublets(HitsOnCPU const & hh, cudaStream_t stream) { - auto nhits = hh.nHits; + auto nhits = hh.nHits; - int threadsPerBlock = 256; - int blocks = (3*nhits + threadsPerBlock - 1) / threadsPerBlock; - gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_,device_nCells_,hh.gpu_d, device_isOuterHitOfCell_); + int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize; + int blocks = (3 * nhits + threadsPerBlock - 1) / threadsPerBlock; + gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_); cudaCheck(cudaGetLastError()); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index e46627bf2c322..31844f39f9727 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -20,157 +20,171 @@ namespace gpuPixelDoublets { template __device__ - void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, uint32_t nPairs, GPUCACell * cells, uint32_t * nCells, - int16_t const * __restrict__ iphi, Hist const * __restrict__ hist, uint32_t const * __restrict__ offsets, + void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, + uint32_t nPairs, + GPUCACell * cells, + uint32_t * nCells, + int16_t const * __restrict__ iphi, + Hist const * __restrict__ hist, + uint32_t const * __restrict__ offsets, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh, - GPU::VecArray< unsigned int, 256> * isOuterHitOfCell, - int16_t const * phicuts, float const * minz, float const * maxz, float const * maxr) { - + GPU::VecArray< unsigned int, 256> * isOuterHitOfCell, + int16_t const * __restrict__ phicuts, + float const * __restrict__ minz, + float const * __restrict__ maxz, + float const * __restrict__ maxr) + { auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; }; - // to be optimized later - uint32_t innerLayerCumulativeSize[64]; - assert(nPairs<=64); + // nPairsMax to be optimized later (originally was 64). + // If it should be much bigger, consider using a block-wide parallel prefix scan, + // e.g. see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html + const int nPairsMax = 16; + assert(nPairs <= nPairsMax); + uint32_t innerLayerCumulativeSize[nPairsMax]; innerLayerCumulativeSize[0] = layerSize(layerPairs[0]); - for (uint32_t i=1; i=innerLayerCumulativeSize[pairLayerId++]); --pairLayerId; // move to lower_bound ?? - - assert(pairLayerId=innerLayerCumulativeSize[pairLayerId-1]); - - uint8_t inner = layerPairs[2*pairLayerId]; - uint8_t outer = layerPairs[2*pairLayerId+1]; - assert(outer>inner); - - auto i = (0==pairLayerId) ? j : j-innerLayerCumulativeSize[pairLayerId-1]; - i += offsets[inner]; - - // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j); - - assert(i>=offsets[inner]); - assert(i maxz[pairLayerId] || - abs(hh.zg_d[j]-mez) < minz[pairLayerId] || - hh.rg_d[j]-mer > maxr[pairLayerId]; - }; - - constexpr float z0cut = 12.f; - constexpr float hardPtCut = 0.5f; - constexpr float minRadius = hardPtCut * 87.f; - constexpr float minRadius2T4 = 4.f*minRadius*minRadius; - auto ptcut = [&](int j) { - auto r2t4 = minRadius2T4; - auto ri = mer; - auto ro = hh.rg_d[j]; - auto dphi = short2phi( min( abs(int16_t(mep-iphi[j])),abs(int16_t(iphi[j]-mep)) ) ); - return dphi*dphi*(r2t4 -ri*ro) > (ro-ri)*(ro-ri); - }; - auto z0cutoff = [&](int j) { - auto zo = hh.zg_d[j]; - auto ro = hh.rg_d[j]; - auto dr = ro-mer; - return dr > maxr[pairLayerId] || - dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; - }; - - auto iphicut = phicuts[pairLayerId]; - - auto kl = hist[outer].bin(int16_t(mep-iphicut)); - auto kh = hist[outer].bin(int16_t(mep+iphicut)); - auto incr = [](auto & k) { return k = (k+1)%Hist::nbins();}; - int tot = 0; - int nmin = 0; - auto khh = kh; - incr(khh); - - int tooMany=0; - for (auto kk=kl; kk!=khh; incr(kk)) { - if (kk!=kl && kk!=kh) nmin+=hist[outer].size(kk); - for (auto p=hist[outer].begin(kk); p=offsets[outer]); - assert(oi iphicut) - continue; - if (z0cutoff(oi) || ptcut(oi)) continue; - auto ind = atomicInc(nCells,MaxNumOfDoublets); - // int layerPairId, int doubletId, int innerHitId,int outerHitId) - cells[ind].init(hh,pairLayerId,ind,i,oi); - isOuterHitOfCell[oi].push_back(ind); - if (isOuterHitOfCell[oi].full()) ++tooMany; - ++tot; + auto idx = blockIdx.x * blockDim.x + threadIdx.x; + for (auto j = idx; j < ntot; j += blockDim.x * gridDim.x) { + + uint32_t pairLayerId=0; + while (j >= innerLayerCumulativeSize[pairLayerId++]); + --pairLayerId; // move to lower_bound ?? + + assert(pairLayerId < nPairs); + assert(j < innerLayerCumulativeSize[pairLayerId]); + assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId-1]); + + uint8_t inner = layerPairs[2*pairLayerId]; + uint8_t outer = layerPairs[2*pairLayerId+1]; + assert(outer > inner); + + auto i = (0 == pairLayerId) ? j : j-innerLayerCumulativeSize[pairLayerId-1]; + i += offsets[inner]; + + // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j); + + assert(i >= offsets[inner]); + assert(i < offsets[inner+1]); + + // found hit corresponding to our cuda thread, now do the job + auto mep = iphi[i]; + auto mez = hh.zg_d[i]; + auto mer = hh.rg_d[i]; + + constexpr float z0cut = 12.f; // cm + constexpr float hardPtCut = 0.5f; // GeV + constexpr float minRadius = hardPtCut * 87.78f; // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field) + constexpr float minRadius2T4 = 4.f*minRadius*minRadius; + auto ptcut = [&](int j) { + auto r2t4 = minRadius2T4; + auto ri = mer; + auto ro = hh.rg_d[j]; + auto dphi = short2phi( min( abs(int16_t(mep-iphi[j])), abs(int16_t(iphi[j]-mep)) ) ); + return dphi*dphi * (r2t4 - ri*ro) > (ro-ri)*(ro-ri); + }; + auto z0cutoff = [&](int j) { + auto zo = hh.zg_d[j]; + auto ro = hh.rg_d[j]; + auto dr = ro-mer; + return dr > maxr[pairLayerId] || + dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; + }; + + auto iphicut = phicuts[pairLayerId]; + + auto kl = hist[outer].bin(int16_t(mep-iphicut)); + auto kh = hist[outer].bin(int16_t(mep+iphicut)); + auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();}; + int tot = 0; + int nmin = 0; + auto khh = kh; + incr(khh); + + int tooMany=0; + for (auto kk = kl; kk != khh; incr(kk)) { + if (kk != kl && kk != kh) + nmin += hist[outer].size(kk); + for (auto p = hist[outer].begin(kk); p < hist[outer].end(kk); ++p) { + auto oi=*p; + assert(oi>=offsets[outer]); + assert(oi iphicut) + continue; + if (z0cutoff(oi) || ptcut(oi)) continue; + auto ind = atomicInc(nCells, MaxNumOfDoublets); + // int layerPairId, int doubletId, int innerHitId, int outerHitId) + cells[ind].init(hh, pairLayerId, ind, i, oi); + isOuterHitOfCell[oi].push_back(ind); + if (isOuterHitOfCell[oi].full()) ++tooMany; + ++tot; + } } - } - if (tooMany>0) printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner,outer, kl,kh,nmin,tot); + if (tooMany > 0) + printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner, outer, kl, kh, nmin, tot); - if (hist[outer].nspills>0) - printf("spill bin to be checked in %d %d\n",outer,hist[outer].nspills); + if (hist[outer].nspills > 0) + printf("spill bin to be checked in %d %d\n", outer, hist[outer].nspills); - // if (0==hist[outer].nspills) assert(tot>=nmin); - // look in spill bin as well.... + // if (0==hist[outer].nspills) assert(tot>=nmin); + // look in spill bin as well.... - - } // loop in block... + } // loop in block... } - __global__ - void getDoubletsFromHisto(GPUCACell * cells, uint32_t * nCells, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp, - GPU::VecArray< unsigned int, 256> *isOuterHitOfCell) { - - uint8_t const layerPairs[2*13] = {0,1 ,1,2 ,2,3 - // ,0,4 ,1,4 ,2,4 ,4,5 ,5,6 - ,0,7 ,1,7 ,2,7 ,7,8 ,8,9 - ,0,4 ,1,4 ,2,4 ,4,5 ,5,6 - }; + constexpr auto getDoubletsFromHistoMaxBlockSize = 64; - const int16_t phi0p05 = phi2short(0.05); - const int16_t phi0p06 = phi2short(0.06); - const int16_t phi0p07 = phi2short(0.07); + __global__ + __launch_bounds__(getDoubletsFromHistoMaxBlockSize) + void getDoubletsFromHisto(GPUCACell * cells, + uint32_t * nCells, + siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp, + GPU::VecArray * isOuterHitOfCell) + { + constexpr int nPairs = 13; + constexpr const uint8_t layerPairs[2*nPairs] = { + 0, 1, 1, 2, 2, 3, + // 0, 4, 1, 4, 2, 4, 4, 5, 5, 6, + 0, 7, 1, 7, 2, 7, 7, 8, 8, 9, + 0, 4, 1, 4, 2, 4, 4, 5, 5, 6 + }; - int16_t const phicuts[13] { phi0p05, phi0p05, phi0p06 - ,phi0p07, phi0p06, phi0p06, phi0p05, phi0p05 - ,phi0p07, phi0p06, phi0p06, phi0p05, phi0p05 - }; + constexpr int16_t phi0p05 = 522; // round(521.52189...) = phi2short(0.05); + constexpr int16_t phi0p06 = 626; // round(625.82270...) = phi2short(0.06); + constexpr int16_t phi0p07 = 730; // round(730.12648...) = phi2short(0.07); - float const minz[13] = { 0., 0., 0. - ,0., 0., 0., 0., 0. - ,0., 0., 0., 0., 0. - }; + constexpr const int16_t phicuts[nPairs] { + phi0p05, phi0p05, phi0p06, + phi0p07, phi0p06, phi0p06, phi0p05, phi0p05, + phi0p07, phi0p06, phi0p06, phi0p05, phi0p05 + }; - float const maxz[13] = { 20.,15.,12. - ,30.,20.,20., 50., 50. - ,30.,20.,20., 50., 50. - }; + float const minz[nPairs] = { + 0., 0., 0., + 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0. + }; - float const maxr[13] = { 20., 20., 20. - ,9., 7., 6., 5., 5. - ,9., 7., 6., 5., 5. - }; + float const maxz[nPairs] = { + 20., 15., 12., + 30., 20., 20., 50., 50., + 30., 20., 20., 50., 50. + }; + float const maxr[nPairs] = { + 20., 20., 20., + 9., 7., 6., 5., 5., + 9., 7., 6., 5., 5. + }; auto const & __restrict__ hh = *hhp; - doubletsFromHisto(layerPairs, 13, cells, nCells, - hh.iphi_d,hh.hist_d,hh.hitsLayerStart_d, + doubletsFromHisto(layerPairs, nPairs, cells, nCells, + hh.iphi_d, hh.hist_d, hh.hitsLayerStart_d, hh, isOuterHitOfCell, phicuts, minz, maxz, maxr); }