From 61a11b8acca01379de0e19749883df7e0842574d Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 24 Jan 2019 14:21:32 +0100 Subject: [PATCH] Speed up the doublet finder (cms-patatrack#260) Introduce the inner loop parallelization in the doublet finder using the stride pattern already used in the "fishbone", and make use of a 2D grid instead of a hand-made stride. --- .../PixelTriplets/plugins/gpuFishbone.h | 19 +++++++++---------- .../PixelTriplets/plugins/gpuPixelDoublets.h | 12 ++++++++---- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index 717cbf777fcdb..796241eaf50ff 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -26,7 +26,7 @@ namespace gpuPixelDoublets { GPUCACell * cells, uint32_t const * __restrict__ nCells, GPUCACell::OuterHitOfCell const * __restrict__ isOuterHitOfCell, uint32_t nHits, - uint32_t stride, bool checkTrack) { + bool checkTrack) { constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit; @@ -35,13 +35,12 @@ namespace gpuPixelDoublets { uint8_t const * __restrict__ layerp = hh.phase1TopologyLayer_d; auto layer = [&](uint16_t id) { return __ldg(layerp+id/phase1PixelTopology::maxModuleStride);}; - auto ldx = threadIdx.x + blockIdx.x * blockDim.x; - auto idx = ldx/stride; - auto first = ldx - idx*stride; - assert(first=nHits) return; - auto const & vc = isOuterHitOfCell[idx]; + if (idy>=nHits) return; + auto const & vc = isOuterHitOfCell[idy]; auto s = vc.size(); if (s<2) return; // if alligned kill one of the two. @@ -66,8 +65,8 @@ namespace gpuPixelDoublets { ++sg; } if (sg<2) return; - // here we parallelize - for (uint32_t ic=first; ic= innerLayerCumulativeSize[pairLayerId++]); @@ -115,7 +118,8 @@ namespace gpuPixelDoublets { nmin += hist.size(kk+hoff); auto const * __restrict__ p = hist.begin(kk+hoff); auto const * __restrict__ e = hist.end(kk+hoff); - for (;p < e; ++p) { + p+=first; + for (;p < e; p+=stride) { auto oi=__ldg(p); assert(oi>=offsets[outer]); assert(oi