From 00e8cf422c138fd3ccff17711d4a8ddeb862dcb9 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Thu, 24 Jan 2019 14:21:32 +0100
Subject: [PATCH] Speed up the doublet finder (#260)

Introduce the inner loop parallelization in the doublet finder using the
stride pattern already used in the "fishbone", and make use of a 2D grid
instead of a hand-made stride.
---
 .../CAHitQuadrupletGeneratorKernels.cu        | 52 ++++++++++++-------
 .../PixelTriplets/plugins/gpuFishbone.h       | 19 ++++---
 .../PixelTriplets/plugins/gpuPixelDoublets.h  | 12 +++--
 3 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu
index 85dc10ee04587..ec9c4960062ef 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu
@@ -144,9 +144,11 @@ kernel_connect(AtomicPairCounter * apc1, AtomicPairCounter * apc2,  // just to z
   constexpr auto hardCurvCut = 1.f/(0.35f * 87.f); // FIXME VI tune
   constexpr auto ptmin = 0.9f; // FIXME original "tune"
 
-  auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
+  auto cellIndex = threadIdx.y + blockIdx.y * blockDim.y;
+  auto first = threadIdx.x;
+  auto stride = blockDim.x;
 
-  if (0==cellIndex) { (*apc1)=0; (*apc2)=0; }// ready for next kernel
+  if (0==(cellIndex+first)) { (*apc1)=0; (*apc2)=0; }// ready for next kernel
 
   if (cellIndex >= (*nCells) ) return;
   auto const & thisCell = cells[cellIndex];
@@ -154,7 +156,7 @@ kernel_connect(AtomicPairCounter * apc1, AtomicPairCounter * apc2,  // just to z
   auto innerHitId = thisCell.get_inner_hit_id();
   auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size();
   auto vi = isOuterHitOfCell[innerHitId].data();
-  for (auto j = 0; j < numberOfPossibleNeighbors; ++j) {
+  for (auto j = first; j < numberOfPossibleNeighbors; j+=stride) {
      auto otherCell = __ldg(vi+j);
      if (cells[otherCell].theDoubletId<0) continue;
      if (thisCell.check_alignment(hh,
@@ -172,6 +174,8 @@ void kernel_find_ntuplets(
     unsigned int minHitsPerNtuplet)
 {
 
+  // recursive: not obvious to widen
+
   auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
   if (cellIndex >= (*nCells) ) return;
   auto &thisCell = cells[cellIndex];
@@ -246,23 +250,29 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms....
   assert(nhits <= PixelGPUConstants::maxNumberOfHits);
   
   if (earlyFishbone_) {
-    auto blockSize = 128;
+    auto nthTot = 64;
     auto stride = 4;
+    auto blockSize = nthTot/stride;
     auto numberOfBlocks = (nhits + blockSize - 1)/blockSize;
-    numberOfBlocks *=stride;
-  
-    fishbone<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+    dim3 blks(1,numberOfBlocks,1);
+    dim3 thrs(stride,blockSize,1);
+    fishbone<<<blks,thrs, 0, cudaStream>>>(
       hh.gpu_d,
       device_theCells_, device_nCells_,
       device_isOuterHitOfCell_,
-      nhits, stride, false
+      nhits, false
     );
     cudaCheck(cudaGetLastError());
   }
 
-  auto blockSize = 64;
+  auto nthTot = 64;
+  auto stride = 4;
+  auto blockSize = nthTot/stride;
   auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1)/blockSize;
-  kernel_connect<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+  dim3 blks(1,numberOfBlocks,1);
+  dim3 thrs(stride,blockSize,1);
+
+  kernel_connect<<<blks, thrs, 0, cudaStream>>>(
       gpu_.apc_d, device_hitToTuple_apc_,  // needed only to be reset, ready for next kernel
       hh.gpu_d,
       device_theCells_, device_nCells_,
@@ -282,14 +292,17 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms....
   cudautils::finalizeBulk<<<numberOfBlocks, blockSize, 0, cudaStream>>>(gpu_.apc_d,gpu_.tuples_d);
 
   if (lateFishbone_) {
-    auto stride=4;
-    numberOfBlocks = (nhits + blockSize - 1)/blockSize;
-    numberOfBlocks *=stride;
-    fishbone<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+    auto nthTot = 128;
+    auto stride = 16;
+    auto blockSize = nthTot/stride;
+    auto numberOfBlocks = (nhits + blockSize - 1)/blockSize;
+    dim3 blks(1,numberOfBlocks,1);
+    dim3 thrs(stride,blockSize,1);
+    fishbone<<<blks,thrs, 0, cudaStream>>>(
       hh.gpu_d,
       device_theCells_, device_nCells_,
       device_isOuterHitOfCell_,
-      nhits, stride, true
+      nhits, true
     );
     cudaCheck(cudaGetLastError());
   }
@@ -312,9 +325,13 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms....
 void CAHitQuadrupletGeneratorKernels::buildDoublets(HitsOnCPU const & hh, cudaStream_t stream) {
   auto nhits = hh.nHits;
 
-  int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize;
+  int stride=1;
+  int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize/stride;
   int blocks = (3 * nhits + threadsPerBlock - 1) / threadsPerBlock;
-  gpuPixelDoublets::getDoubletsFromHisto<<<blocks, threadsPerBlock, 0, stream>>>(device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_);
+  dim3 blks(1,blocks,1);
+  dim3 thrs(stride,threadsPerBlock,1);
+  gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream>>>(
+            device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_);
   cudaCheck(cudaGetLastError());
 }
 
@@ -330,4 +347,3 @@ void CAHitQuadrupletGeneratorKernels::classifyTuples(HitsOnCPU const & hh, Tuple
     kernel_fastDuplicateRemover<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_theCells_, device_nCells_,tuples.tuples_d,tuples.helix_fit_results_d, tuples.quality_d);
 
 }
-
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
index 717cbf777fcdb..796241eaf50ff 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
@@ -26,7 +26,7 @@ namespace gpuPixelDoublets {
                GPUCACell * cells, uint32_t const * __restrict__ nCells,
                GPUCACell::OuterHitOfCell const * __restrict__ isOuterHitOfCell,
                uint32_t nHits,
-               uint32_t stride, bool checkTrack) {
+               bool checkTrack) {
 
     constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit;
 
@@ -35,13 +35,12 @@ namespace gpuPixelDoublets {
     uint8_t const * __restrict__ layerp =  hh.phase1TopologyLayer_d;
     auto layer = [&](uint16_t id) { return __ldg(layerp+id/phase1PixelTopology::maxModuleStride);};
 
-    auto ldx = threadIdx.x + blockIdx.x * blockDim.x;
-    auto idx = ldx/stride;
-    auto first = ldx - idx*stride;
-    assert(first<stride);
+    // x run faster...
+    auto idy = threadIdx.y + blockIdx.y * blockDim.y;
+    auto first = threadIdx.x;
 
-    if (idx>=nHits) return;
-    auto const & vc = isOuterHitOfCell[idx];
+    if (idy>=nHits) return;
+    auto const & vc = isOuterHitOfCell[idy];
     auto s = vc.size();
     if (s<2) return;
     // if alligned kill one of the two.
@@ -66,8 +65,8 @@ namespace gpuPixelDoublets {
       ++sg;
     }
     if (sg<2) return;   
-    // here we parallelize
-    for (uint32_t ic=first; ic<sg-1;  ic+=stride) {
+    // here we parallelize 
+    for (uint32_t ic=first; ic<sg-1;  ic+=blockDim.x) {
       auto & ci = cells[cc[ic]];
       for    (auto jc=ic+1; jc<sg; ++jc) {
         auto & cj = cells[cc[jc]];
@@ -90,4 +89,4 @@ namespace gpuPixelDoublets {
 
 }
 
-#endif
+#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
index 02a175fcc2903..192efd0d8919f 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
@@ -50,8 +50,11 @@ namespace gpuPixelDoublets {
     }
     auto ntot = innerLayerCumulativeSize[nPairs-1];
 
-    auto idx = blockIdx.x * blockDim.x + threadIdx.x;
-    for (auto j = idx; j < ntot; j += blockDim.x * gridDim.x) {
+    // x runs faster
+    auto idy = blockIdx.y * blockDim.y + threadIdx.y;
+    auto first = threadIdx.x;
+    auto stride = blockDim.x;
+    for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y ) {
 
       uint32_t pairLayerId=0;
       while (j >= innerLayerCumulativeSize[pairLayerId++]);
@@ -115,7 +118,8 @@ namespace gpuPixelDoublets {
           nmin += hist.size(kk+hoff);
         auto const * __restrict__ p = hist.begin(kk+hoff);
         auto const * __restrict__ e = hist.end(kk+hoff);
-        for (;p < e; ++p) {
+        p+=first;
+        for (;p < e; p+=stride) {
           auto oi=__ldg(p);
           assert(oi>=offsets[outer]);
           assert(oi<offsets[outer+1]);
@@ -139,7 +143,7 @@ namespace gpuPixelDoublets {
     }  // loop in block...
   }
 
-  constexpr auto getDoubletsFromHistoMaxBlockSize = 64;
+  constexpr auto getDoubletsFromHistoMaxBlockSize = 64;  // for both x and y
   constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16;
 
   __global__