Skip to content

Commit

Permalink
Speed up the doublet finder (#260)
Browse files Browse the repository at this point in the history
Introduce the inner loop parallelization in the doublet finder using the
stride pattern already used in the "fishbone", and make use of a 2D grid
instead of a hand-made stride.
  • Loading branch information
cmsbuild authored and fwyzard committed Dec 25, 2020
1 parent 795ca8e commit 665d7da
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 14 deletions.
19 changes: 9 additions & 10 deletions RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ namespace gpuPixelDoublets {
GPUCACell * cells, uint32_t const * __restrict__ nCells,
GPUCACell::OuterHitOfCell const * __restrict__ isOuterHitOfCell,
uint32_t nHits,
uint32_t stride, bool checkTrack) {
bool checkTrack) {

constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit;

Expand All @@ -35,13 +35,12 @@ namespace gpuPixelDoublets {
uint8_t const * __restrict__ layerp = hh.phase1TopologyLayer_d;
auto layer = [&](uint16_t id) { return __ldg(layerp+id/phase1PixelTopology::maxModuleStride);};

auto ldx = threadIdx.x + blockIdx.x * blockDim.x;
auto idx = ldx/stride;
auto first = ldx - idx*stride;
assert(first<stride);
// x run faster...
auto idy = threadIdx.y + blockIdx.y * blockDim.y;
auto first = threadIdx.x;

if (idx>=nHits) return;
auto const & vc = isOuterHitOfCell[idx];
if (idy>=nHits) return;
auto const & vc = isOuterHitOfCell[idy];
auto s = vc.size();
if (s<2) return;
// if alligned kill one of the two.
Expand All @@ -66,8 +65,8 @@ namespace gpuPixelDoublets {
++sg;
}
if (sg<2) return;
// here we parallelize
for (uint32_t ic=first; ic<sg-1; ic+=stride) {
// here we parallelize
for (uint32_t ic=first; ic<sg-1; ic+=blockDim.x) {
auto & ci = cells[cc[ic]];
for (auto jc=ic+1; jc<sg; ++jc) {
auto & cj = cells[cc[jc]];
Expand All @@ -90,4 +89,4 @@ namespace gpuPixelDoublets {

}

#endif
#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h
12 changes: 8 additions & 4 deletions RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,11 @@ namespace gpuPixelDoublets {
}
auto ntot = innerLayerCumulativeSize[nPairs-1];

auto idx = blockIdx.x * blockDim.x + threadIdx.x;
for (auto j = idx; j < ntot; j += blockDim.x * gridDim.x) {
// x runs faster
auto idy = blockIdx.y * blockDim.y + threadIdx.y;
auto first = threadIdx.x;
auto stride = blockDim.x;
for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y ) {

uint32_t pairLayerId=0;
while (j >= innerLayerCumulativeSize[pairLayerId++]);
Expand Down Expand Up @@ -115,7 +118,8 @@ namespace gpuPixelDoublets {
nmin += hist.size(kk+hoff);
auto const * __restrict__ p = hist.begin(kk+hoff);
auto const * __restrict__ e = hist.end(kk+hoff);
for (;p < e; ++p) {
p+=first;
for (;p < e; p+=stride) {
auto oi=__ldg(p);
assert(oi>=offsets[outer]);
assert(oi<offsets[outer+1]);
Expand All @@ -139,7 +143,7 @@ namespace gpuPixelDoublets {
} // loop in block...
}

constexpr auto getDoubletsFromHistoMaxBlockSize = 64;
constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y
constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16;

__global__
Expand Down

0 comments on commit 665d7da

Please sign in to comment.