From 8293c5ea6ed4bb64575d449c7aa8bf98a1eae024 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 13 Sep 2018 14:49:03 +0200
Subject: [PATCH] Optimise gpuPixelDoublets::doubletsFromHisto() kernel (#167)

Pre-compute few constants that could not be declared constexpr.
Reduce temporary buffer size.
Reduce the block size of the calls to gpuPixelDoublets::getDoubletsFromHisto() from 256 to 64, to make better usage of the GPU processors.
---
 .../plugins/CAHitQuadrupletGeneratorGPU.cu    |  22 +-
 .../PixelTriplets/plugins/gpuPixelDoublets.h  | 274 +++++++++---------
 2 files changed, 155 insertions(+), 141 deletions(-)
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu
index efa3793ff62cc..4b87896bc6f3b 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu
@@ -24,7 +24,7 @@ kernel_checkOverflows(GPU::SimpleVector<Quadruplet> *foundNtuplets,
  if (idx < (*nCells) ) {
    auto &thisCell = cells[idx];
    if (thisCell.theOuterNeighbors.full()) //++tooManyNeighbors[thisCell.theLayerPairId];
-     printf("OuterNeighbors overflow %d in %d\n",idx,thisCell.theLayerPairId);
+     printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId);
  }
  if (idx < nHits) {
    if (isOuterHitOfCell[idx].full()) // ++tooManyOuterHitOfCell;
@@ -43,8 +43,8 @@ kernel_connect(GPU::SimpleVector<Quadruplet> *foundNtuplets,
                const float phiCut, const float hardPtCut,
                unsigned int maxNumberOfDoublets_, unsigned int maxNumberOfHits_) {
 
-  float region_origin_x =0.;
-  float region_origin_y =0.;
+  float region_origin_x = 0.;
+  float region_origin_y = 0.;
 
   auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
 
@@ -86,7 +86,7 @@ __global__ void kernel_find_ntuplets(
 
 __global__ void
 kernel_print_found_ntuplets(GPU::SimpleVector<Quadruplet> *foundNtuplets, int maxPrint) {
-  for (int i = 0; i < std::min(maxPrint,foundNtuplets->size()); ++i) {
+  for (int i = 0; i < std::min(maxPrint, foundNtuplets->size()); ++i) {
     printf("\nquadruplet %d: %d %d %d %d\n", i,
            (*foundNtuplets)[i].hitId[0],
            (*foundNtuplets)[i].hitId[1],
@@ -177,7 +177,7 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion &region,
   cudaCheck(cudaGetLastError());
 
 
-  numberOfBlocks = (std::max(int(nhits),maxNumberOfDoublets_) + 512 - 1)/512;
+  numberOfBlocks = (std::max(int(nhits), maxNumberOfDoublets_) + 512 - 1)/512;
   kernel_checkOverflows<<<numberOfBlocks, 512, 0, cudaStream>>>(
                         d_foundNtupletsVec_[regionIndex],
                         device_theCells_, device_nCells_,
@@ -185,7 +185,7 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion &region,
                        );
 
 
-  // kernel_print_found_ntuplets<<<1,1,0, cudaStream>>>(d_foundNtupletsVec_[regionIndex],10);
+  // kernel_print_found_ntuplets<<<1, 1, 0, cudaStream>>>(d_foundNtupletsVec_[regionIndex], 10);
 
   if(transferToCPU) {
     cudaCheck(cudaMemcpyAsync(h_foundNtupletsVec_[regionIndex], d_foundNtupletsVec_[regionIndex],
@@ -203,7 +203,7 @@ void CAHitQuadrupletGeneratorGPU::cleanup(cudaStream_t cudaStream) {
   cudaCheck(cudaMemsetAsync(device_isOuterHitOfCell_, 0,
                             maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(GPU::VecArray<unsigned int, maxCellsPerHit_>),
                             cudaStream));
-  cudaCheck(cudaMemsetAsync(device_nCells_,0,sizeof(uint32_t),cudaStream));
+  cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), cudaStream));
 }
 
 std::vector<std::array<int, 4>>
@@ -220,10 +220,10 @@ CAHitQuadrupletGeneratorGPU::fetchKernelResult(int regionIndex)
 }
 
 void CAHitQuadrupletGeneratorGPU::buildDoublets(HitsOnCPU const & hh, cudaStream_t stream) {
-   auto nhits = hh.nHits;
+  auto nhits = hh.nHits;
 
-  int threadsPerBlock = 256;
-  int blocks = (3*nhits + threadsPerBlock - 1) / threadsPerBlock;
-  gpuPixelDoublets::getDoubletsFromHisto<<<blocks, threadsPerBlock, 0, stream>>>(device_theCells_,device_nCells_,hh.gpu_d, device_isOuterHitOfCell_);
+  int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize;
+  int blocks = (3 * nhits + threadsPerBlock - 1) / threadsPerBlock;
+  gpuPixelDoublets::getDoubletsFromHisto<<<blocks, threadsPerBlock, 0, stream>>>(device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_);
   cudaCheck(cudaGetLastError());
 }
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
index e46627bf2c322..31844f39f9727 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
@@ -20,157 +20,171 @@ namespace gpuPixelDoublets {
 
   template<typename Hist>
   __device__
-  void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, uint32_t nPairs, GPUCACell * cells, uint32_t * nCells,
-                         int16_t const * __restrict__ iphi, Hist const * __restrict__ hist, uint32_t const * __restrict__ offsets,
+  void doubletsFromHisto(uint8_t const * __restrict__ layerPairs,
+                         uint32_t nPairs,
+                         GPUCACell * cells,
+                         uint32_t * nCells,
+                         int16_t const * __restrict__ iphi,
+                         Hist const * __restrict__ hist,
+                         uint32_t const * __restrict__ offsets,
                          siPixelRecHitsHeterogeneousProduct::HitsOnGPU const &  __restrict__ hh,
-                         GPU::VecArray< unsigned int, 256>  * isOuterHitOfCell,
-                        int16_t const * phicuts, float const * minz, float const * maxz, float const * maxr) {
-
+                         GPU::VecArray< unsigned int, 256> * isOuterHitOfCell,
+                         int16_t const * __restrict__ phicuts,
+                         float const * __restrict__ minz,
+                         float const * __restrict__ maxz,
+                         float const * __restrict__ maxr)
+  {
     auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; };
 
-    // to be optimized later
-    uint32_t innerLayerCumulativeSize[64];
-    assert(nPairs<=64);
+    // nPairsMax to be optimized later (originally was 64).
+    // If it should be much bigger, consider using a block-wide parallel prefix scan,
+    // e.g. see  https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html
+    const int nPairsMax = 16;
+    assert(nPairs <= nPairsMax);
+    uint32_t innerLayerCumulativeSize[nPairsMax];
     innerLayerCumulativeSize[0] = layerSize(layerPairs[0]);
-    for (uint32_t i=1; i<nPairs; ++i) {
-       innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i-1] + layerSize(layerPairs[2*i]);
+    for (uint32_t i = 1; i < nPairs; ++i) {
+      innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i-1] + layerSize(layerPairs[2*i]);
     }
-
     auto ntot = innerLayerCumulativeSize[nPairs-1];
 
-
-    auto idx = blockIdx.x*blockDim.x + threadIdx.x;
-  for(auto j=idx;j<ntot;j+=blockDim.x*gridDim.x) {
-
-    uint32_t pairLayerId=0;
-    while(j>=innerLayerCumulativeSize[pairLayerId++]);  --pairLayerId; // move to lower_bound ??
-
-    assert(pairLayerId<nPairs);
-    assert(j<innerLayerCumulativeSize[pairLayerId]);
-    assert(0==pairLayerId || j>=innerLayerCumulativeSize[pairLayerId-1]);
-
-    uint8_t inner = layerPairs[2*pairLayerId];
-    uint8_t outer = layerPairs[2*pairLayerId+1];
-    assert(outer>inner);
-
-    auto i = (0==pairLayerId) ? j :  j-innerLayerCumulativeSize[pairLayerId-1];
-    i += offsets[inner];
-
-    // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j);
-
-    assert(i>=offsets[inner]);
-    assert(i<offsets[inner+1]);
-
-    // found hit corresponding to our cuda thread!!!!!
-    // do the job
-
-    auto mep = iphi[i];
-    auto mez = hh.zg_d[i];
-    auto mer = hh.rg_d[i];
-    auto cutoff = [&](int j) { return
-        abs(hh.zg_d[j]-mez) > maxz[pairLayerId] ||
-      	abs(hh.zg_d[j]-mez) < minz[pairLayerId] ||
-        hh.rg_d[j]-mer > maxr[pairLayerId];
-    };
-
-    constexpr float z0cut = 12.f;
-    constexpr float hardPtCut = 0.5f;
-    constexpr float minRadius = hardPtCut * 87.f;
-    constexpr float minRadius2T4 = 4.f*minRadius*minRadius;
-    auto ptcut = [&](int j) {
-      auto r2t4 = minRadius2T4;
-      auto ri = mer;
-      auto ro = hh.rg_d[j];
-      auto dphi = short2phi( min( abs(int16_t(mep-iphi[j])),abs(int16_t(iphi[j]-mep)) ) );
-      return dphi*dphi*(r2t4 -ri*ro) > (ro-ri)*(ro-ri);
-    };
-    auto z0cutoff = [&](int j) {
-      auto zo =	hh.zg_d[j];
-      auto ro = hh.rg_d[j];
-      auto dr = ro-mer;
-      return dr > maxr[pairLayerId] ||
-             dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr;
-    };
-
-    auto iphicut = phicuts[pairLayerId];
-
-    auto kl = hist[outer].bin(int16_t(mep-iphicut));
-    auto kh = hist[outer].bin(int16_t(mep+iphicut));
-    auto incr = [](auto & k) { return k = (k+1)%Hist::nbins();};
-    int tot  = 0;
-    int nmin = 0;
-    auto khh = kh;
-    incr(khh);
-
-    int tooMany=0;
-    for (auto kk=kl; kk!=khh; incr(kk)) {
-      if (kk!=kl && kk!=kh) nmin+=hist[outer].size(kk);
-      for (auto p=hist[outer].begin(kk); p<hist[outer].end(kk); ++p) {
-        auto oi=*p;
-        assert(oi>=offsets[outer]);
-        assert(oi<offsets[outer+1]);
-
-        if (std::min(std::abs(int16_t(iphi[oi]-mep)), std::abs(int16_t(mep-iphi[oi]))) > iphicut)
-          continue;
-        if (z0cutoff(oi) || ptcut(oi)) continue;
-        auto ind = atomicInc(nCells,MaxNumOfDoublets);
-        // int layerPairId, int doubletId, int innerHitId,int outerHitId)
-        cells[ind].init(hh,pairLayerId,ind,i,oi);
-        isOuterHitOfCell[oi].push_back(ind);
-        if (isOuterHitOfCell[oi].full()) ++tooMany;
-        ++tot;
+    auto idx = blockIdx.x * blockDim.x + threadIdx.x;
+    for (auto j = idx; j < ntot; j += blockDim.x * gridDim.x) {
+
+      uint32_t pairLayerId=0;
+      while (j >= innerLayerCumulativeSize[pairLayerId++]);
+      --pairLayerId; // move to lower_bound ??
+
+      assert(pairLayerId < nPairs);
+      assert(j < innerLayerCumulativeSize[pairLayerId]);
+      assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId-1]);
+
+      uint8_t inner = layerPairs[2*pairLayerId];
+      uint8_t outer = layerPairs[2*pairLayerId+1];
+      assert(outer > inner);
+
+      auto i = (0 == pairLayerId) ? j : j-innerLayerCumulativeSize[pairLayerId-1];
+      i += offsets[inner];
+
+      // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j);
+
+      assert(i >= offsets[inner]);
+      assert(i < offsets[inner+1]);
+
+      // found hit corresponding to our cuda thread, now do the job
+      auto mep = iphi[i];
+      auto mez = hh.zg_d[i];
+      auto mer = hh.rg_d[i];
+
+      constexpr float z0cut = 12.f;                     // cm
+      constexpr float hardPtCut = 0.5f;                 // GeV
+      constexpr float minRadius = hardPtCut * 87.78f;   // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field)
+      constexpr float minRadius2T4 = 4.f*minRadius*minRadius;
+      auto ptcut = [&](int j) {
+        auto r2t4 = minRadius2T4;
+        auto ri = mer;
+        auto ro = hh.rg_d[j];
+        auto dphi = short2phi( min( abs(int16_t(mep-iphi[j])), abs(int16_t(iphi[j]-mep)) ) );
+        return dphi*dphi * (r2t4 - ri*ro) > (ro-ri)*(ro-ri);
+      };
+      auto z0cutoff = [&](int j) {
+        auto zo = hh.zg_d[j];
+        auto ro = hh.rg_d[j];
+        auto dr = ro-mer;
+        return dr > maxr[pairLayerId] ||
+          dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr;
+      };
+
+      auto iphicut = phicuts[pairLayerId];
+
+      auto kl = hist[outer].bin(int16_t(mep-iphicut));
+      auto kh = hist[outer].bin(int16_t(mep+iphicut));
+      auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();};
+      int  tot  = 0;
+      int  nmin = 0;
+      auto khh = kh;
+      incr(khh);
+
+      int tooMany=0;
+      for (auto kk = kl; kk != khh; incr(kk)) {
+        if (kk != kl && kk != kh)
+          nmin += hist[outer].size(kk);
+        for (auto p = hist[outer].begin(kk); p < hist[outer].end(kk); ++p) {
+          auto oi=*p;
+          assert(oi>=offsets[outer]);
+          assert(oi<offsets[outer+1]);
+
+          if (std::min(std::abs(int16_t(iphi[oi]-mep)), std::abs(int16_t(mep-iphi[oi]))) > iphicut)
+            continue;
+          if (z0cutoff(oi) || ptcut(oi)) continue;
+          auto ind = atomicInc(nCells, MaxNumOfDoublets);
+          // int layerPairId, int doubletId, int innerHitId, int outerHitId)
+          cells[ind].init(hh, pairLayerId, ind, i, oi);
+          isOuterHitOfCell[oi].push_back(ind);
+          if (isOuterHitOfCell[oi].full()) ++tooMany;
+          ++tot;
+        }
       }
-    }
-    if (tooMany>0) printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d   %d,%d\n", i, inner,outer, kl,kh,nmin,tot);
+      if (tooMany > 0)
+        printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d   %d,%d\n", i, inner, outer, kl, kh, nmin, tot);
 
-    if (hist[outer].nspills>0)
-      printf("spill bin to be checked in %d %d\n",outer,hist[outer].nspills);
+      if (hist[outer].nspills > 0)
+        printf("spill bin to be checked in %d %d\n", outer, hist[outer].nspills);
 
-    // if (0==hist[outer].nspills) assert(tot>=nmin);
-    // look in spill bin as well....
+      // if (0==hist[outer].nspills) assert(tot>=nmin);
+      // look in spill bin as well....
 
-
-  }  // loop in block...
+    }  // loop in block...
   }
 
-  __global__
-  void getDoubletsFromHisto(GPUCACell * cells, uint32_t * nCells, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const *  __restrict__ hhp,
-                            GPU::VecArray< unsigned int, 256> *isOuterHitOfCell) {
-
-    uint8_t const layerPairs[2*13] = {0,1 ,1,2 ,2,3
-                                     // ,0,4 ,1,4 ,2,4 ,4,5 ,5,6
-                                     ,0,7 ,1,7 ,2,7 ,7,8 ,8,9
-                                     ,0,4 ,1,4 ,2,4 ,4,5 ,5,6
-                                     };
+  constexpr auto getDoubletsFromHistoMaxBlockSize = 64;
 
-    const int16_t phi0p05 = phi2short(0.05);
-    const int16_t phi0p06 = phi2short(0.06);
-    const int16_t phi0p07 = phi2short(0.07);
+  __global__
+  __launch_bounds__(getDoubletsFromHistoMaxBlockSize)
+  void getDoubletsFromHisto(GPUCACell * cells,
+                            uint32_t * nCells,
+                            siPixelRecHitsHeterogeneousProduct::HitsOnGPU const *  __restrict__ hhp,
+                            GPU::VecArray<unsigned int, 256> * isOuterHitOfCell)
+  {
+    constexpr int nPairs = 13;
+    constexpr const uint8_t layerPairs[2*nPairs] = {
+      0, 1,  1, 2,  2, 3,
+   // 0, 4,  1, 4,  2, 4,  4, 5,  5, 6,
+      0, 7,  1, 7,  2, 7,  7, 8,  8, 9,
+      0, 4,  1, 4,  2, 4,  4, 5,  5, 6
+    };
 
-    int16_t const phicuts[13] { phi0p05, phi0p05, phi0p06
-                               ,phi0p07, phi0p06, phi0p06, phi0p05, phi0p05
-                               ,phi0p07, phi0p06, phi0p06, phi0p05, phi0p05
-                              };
+    constexpr int16_t phi0p05 = 522;    // round(521.52189...) = phi2short(0.05);
+    constexpr int16_t phi0p06 = 626;    // round(625.82270...) = phi2short(0.06);
+    constexpr int16_t phi0p07 = 730;    // round(730.12648...) = phi2short(0.07);
 
-    float const minz[13] = { 0., 0., 0.
-                            ,0., 0., 0., 0., 0.
-      	       	       	    ,0., 0., 0., 0., 0.
-                           };
+    constexpr const int16_t phicuts[nPairs] {
+      phi0p05, phi0p05, phi0p06,
+      phi0p07, phi0p06, phi0p06, phi0p05, phi0p05,
+      phi0p07, phi0p06, phi0p06, phi0p05, phi0p05
+    };
 
-    float const	maxz[13] = { 20.,15.,12.
-                            ,30.,20.,20., 50., 50.
-       	       	       	    ,30.,20.,20., 50., 50.
-                           };
+    float const minz[nPairs] = {
+      0., 0., 0.,
+      0., 0., 0., 0., 0.,
+      0., 0., 0., 0., 0.
+    };
 
-    float const maxr[13] = { 20., 20., 20.
-                            ,9., 7., 6., 5., 5.
-      	       	       	    ,9., 7., 6., 5., 5.
-                           };
+    float const maxz[nPairs] = {
+      20., 15., 12.,
+      30., 20., 20., 50., 50.,
+      30., 20., 20., 50., 50.
+    };
 
+    float const maxr[nPairs] = {
+      20., 20., 20.,
+       9.,  7.,  6.,  5.,  5.,
+       9.,  7.,  6.,  5.,  5.
+    };
 
     auto const &  __restrict__ hh = *hhp;
-    doubletsFromHisto(layerPairs, 13, cells, nCells,
-                      hh.iphi_d,hh.hist_d,hh.hitsLayerStart_d,
+    doubletsFromHisto(layerPairs, nPairs, cells, nCells,
+                      hh.iphi_d, hh.hist_d, hh.hitsLayerStart_d,
                       hh, isOuterHitOfCell,
                       phicuts, minz, maxz, maxr);
   }