[alpaka] Port Clusterizer to Alpaka #172

Merged
merged 24 commits into from
Apr 1, 2021
Changes from all commits

Commits (24)
6c3315a
[alpaka] Add clusterizer. Compiles, but still issues to fix at run time.
ghugo83 Feb 11, 2021
8e829ef
Define GPU_DEBUG true
ghugo83 Feb 11, 2021
10c30fd
Merge branch 'workdiv_helper' into clusterizer
ghugo83 Feb 15, 2021
9a5bf29
Change clusterizer code to use new helper functions. There were issue…
ghugo83 Feb 15, 2021
7fa84a1
Cannot use break and continue in lambda, hence replace by equivalent …
ghugo83 Feb 16, 2021
4157e45
Fix issues seen at run time in the CUDA case, spotted during debugging.
ghugo83 Feb 17, 2021
baa89ea
Fix break behavior, which surprisingly does not seem to have an effect on …
ghugo83 Feb 17, 2021
fc7820f
Fixes run time issues in serial and TBB cases.
ghugo83 Feb 17, 2021
2bf4318
Merge remote-tracking branch 'origin/master' into clusterizer_none
ghugo83 Feb 17, 2021
5e90ffd
Merge branch 'clusterizer_none' into clusterizer
ghugo83 Feb 17, 2021
7f82936
remove debug printouts
ghugo83 Feb 17, 2021
56aa5f4
Add handling of a constexpr threadDimension. Issue with TBB at run time?
ghugo83 Feb 17, 2021
58ba6ad
Clean up intricate conditional statements.
ghugo83 Feb 17, 2021
8baaa52
Remove GPU_DEBUG true
ghugo83 Feb 17, 2021
e92a8ed
Forgot to remove commented debug line
ghugo83 Feb 17, 2021
9513add
Cannot break out of a lambda. Need the presence of a for loop.…
ghugo83 Feb 19, 2021
8dfc6d5
clang-format
ghugo83 Feb 19, 2021
6bab054
threadDimension set to 1 for CPU case, to match what is done in legac…
ghugo83 Feb 19, 2021
1dc7e39
Set iter to same value as for legacy. Not nice to have size of array …
ghugo83 Feb 19, 2021
f64f181
Place std::min outside of inner loop
ghugo83 Mar 2, 2021
6b16234
Rename gpuClustering_t.cc into clustering.cc
ghugo83 Mar 2, 2021
7e52389
Merge remote-tracking branch 'origin/master' into clusterizer_pr
ghugo83 Mar 30, 2021
153235a
small corrections
ghugo83 Mar 31, 2021
08cfa6c
small corrections
ghugo83 Apr 1, 2021
31 changes: 29 additions & 2 deletions src/alpaka/AlpakaCore/alpakaWorkDivHelper.h
@@ -232,7 +232,10 @@ namespace cms {
threadIdx < maxNumberOfElements;
threadIdx += blockDimension, endElementIdx += blockDimension) {
// (CPU) Loop on all elements.
- for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) {
+ if (endElementIdx > maxNumberOfElements) {
+ endElementIdx = maxNumberOfElements;
+ }
Collaborator:
I would have kept endElementIdx = std::min(endElementIdx, maxNumberOfElements), but it probably doesn't make much difference.

+ for (uint32_t i = threadIdx; i < endElementIdx; ++i) {
func(i);
}
}
@@ -274,7 +277,10 @@ namespace cms {
threadIdx < maxNumberOfElements;
threadIdx += gridDimension, endElementIdx += gridDimension) {
// (CPU) Loop on all elements.
- for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) {
+ if (endElementIdx > maxNumberOfElements) {
+ endElementIdx = maxNumberOfElements;
+ }
+ for (uint32_t i = threadIdx; i < endElementIdx; ++i) {
func(i);
}
}
@@ -291,6 +297,27 @@
cms::alpakatools::for_each_element_1D_grid_stride(acc, maxNumberOfElements, elementIdxShift, func);
}

/*
* Case where the input index has reached the end of threadDimension: strides the input index.
* Otherwise: do nothing.
*/
ALPAKA_FN_ACC bool get_next_element_1D_index_stride(uint32_t& i,
uint32_t& firstElementIdx,
uint32_t& endElementIdx,
const uint32_t stride,
const uint32_t maxNumberOfElements) {
bool isNextStrideElementValid = true;
if (i == endElementIdx) {
firstElementIdx += stride;
endElementIdx += stride;
i = firstElementIdx;
if (i >= maxNumberOfElements) {
isNextStrideElementValid = false;
}
}
return isNextStrideElementValid;
}

} // namespace alpakatools
} // namespace cms

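As a reference for how the new helper is meant to be used, here is a minimal host-side sketch of the call pattern (plain C++, no Alpaka dependencies; everything except the helper's logic — the stand-in function and the processStrided wrapper — is made up for illustration). It shows the single flat loop in which break and continue keep their legacy meaning:

#include <cstdint>

// Host-side stand-in with the same logic as
// cms::alpakatools::get_next_element_1D_index_stride (illustrative only).
bool get_next_element_index(uint32_t& i,
                            uint32_t& firstElementIdx,
                            uint32_t& endElementIdx,
                            const uint32_t stride,
                            const uint32_t maxNumberOfElements) {
  if (i == endElementIdx) {        // current chunk of per-thread elements exhausted
    firstElementIdx += stride;     // jump to the next strided chunk
    endElementIdx += stride;
    i = firstElementIdx;
    if (i >= maxNumberOfElements)
      return false;                // walked past the end: caller should stop
  }
  return true;
}

void processStrided(uint32_t firstElementIdx,
                    uint32_t endElementIdx,
                    const uint32_t stride,
                    const uint32_t numElements) {
  // One flat loop instead of "outer stride loop + inner element loop":
  // a break here really ends the whole strided walk, as in the legacy kernels.
  for (uint32_t i = firstElementIdx; i < numElements; ++i) {
    if (!get_next_element_index(i, firstElementIdx, endElementIdx, stride, numElements))
      break;
    // ... per-element work; may use break / continue exactly as before ...
  }
}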
174 changes: 174 additions & 0 deletions src/alpaka/Geometry/phase1PixelTopology.h
@@ -0,0 +1,174 @@
#ifndef Geometry_TrackerGeometryBuilder_phase1PixelTopology_h
#define Geometry_TrackerGeometryBuilder_phase1PixelTopology_h

#include <cstdint>
#include <array>

namespace phase1PixelTopology {

constexpr uint16_t numRowsInRoc = 80;
constexpr uint16_t numColsInRoc = 52;
constexpr uint16_t lastRowInRoc = numRowsInRoc - 1;
constexpr uint16_t lastColInRoc = numColsInRoc - 1;

constexpr uint16_t numRowsInModule = 2 * numRowsInRoc;
constexpr uint16_t numColsInModule = 8 * numColsInRoc;
constexpr uint16_t lastRowInModule = numRowsInModule - 1;
constexpr uint16_t lastColInModule = numColsInModule - 1;

constexpr int16_t xOffset = -81;
constexpr int16_t yOffset = -54 * 4;

constexpr uint32_t numPixsInModule = uint32_t(numRowsInModule) * uint32_t(numColsInModule);

constexpr uint32_t numberOfModules = 1856;
constexpr uint32_t numberOfLayers = 10;
constexpr uint32_t layerStart[numberOfLayers + 1] = {0,
96,
320,
672, // barrel
1184,
1296,
1408, // positive endcap
1520,
1632,
1744, // negative endcap
numberOfModules};
constexpr char const* layerName[numberOfLayers] = {
"BL1",
"BL2",
"BL3",
"BL4", // barrel
"E+1",
"E+2",
"E+3", // positive endcap
"E-1",
"E-2",
"E-3" // negative endcap
};

constexpr uint32_t numberOfModulesInBarrel = 1184;
constexpr uint32_t numberOfLaddersInBarrel = numberOfModulesInBarrel / 8;

template <class Function, std::size_t... Indices>
constexpr auto map_to_array_helper(Function f, std::index_sequence<Indices...>)
-> std::array<typename std::result_of<Function(std::size_t)>::type, sizeof...(Indices)> {
return {{f(Indices)...}};
}

template <int N, class Function>
constexpr auto map_to_array(Function f) -> std::array<typename std::result_of<Function(std::size_t)>::type, N> {
return map_to_array_helper(f, std::make_index_sequence<N>{});
}

constexpr uint32_t findMaxModuleStride() {
bool go = true;
int n = 2;
while (go) {
for (uint8_t i = 1; i < 11; ++i) {
if (layerStart[i] % n != 0) {
go = false;
break;
}
}
if (!go)
break;
n *= 2;
}
return n / 2;
}

constexpr uint32_t maxModuleStride = findMaxModuleStride();

constexpr uint8_t findLayer(uint32_t detId) {
for (uint8_t i = 0; i < 11; ++i)
if (detId < layerStart[i + 1])
return i;
return 11;
}

constexpr uint8_t findLayerFromCompact(uint32_t detId) {
detId *= maxModuleStride;
for (uint8_t i = 0; i < 11; ++i)
if (detId < layerStart[i + 1])
return i;
return 11;
}

constexpr uint32_t layerIndexSize = numberOfModules / maxModuleStride;
constexpr std::array<uint8_t, layerIndexSize> layer = map_to_array<layerIndexSize>(findLayerFromCompact);

constexpr bool validateLayerIndex() {
bool res = true;
for (auto i = 0U; i < numberOfModules; ++i) {
auto j = i / maxModuleStride;
res &= (layer[j] < 10);
res &= (i >= layerStart[layer[j]]);
res &= (i < layerStart[layer[j] + 1]);
}
return res;
}

static_assert(validateLayerIndex(), "layer from detIndex algo is buggy");

// this is for the ROC n<512 (upgrade 1024)
constexpr inline uint16_t divu52(uint16_t n) {
n = n >> 2;
uint16_t q = (n >> 1) + (n >> 4);
q = q + (q >> 4) + (q >> 5);
q = q >> 3;
uint16_t r = n - q * 13;
return q + ((r + 3) >> 4);
}

constexpr inline bool isEdgeX(uint16_t px) { return (px == 0) | (px == lastRowInModule); }

constexpr inline bool isEdgeY(uint16_t py) { return (py == 0) | (py == lastColInModule); }

constexpr inline uint16_t toRocX(uint16_t px) { return (px < numRowsInRoc) ? px : px - numRowsInRoc; }

constexpr inline uint16_t toRocY(uint16_t py) {
auto roc = divu52(py);
return py - 52 * roc;
}

constexpr inline bool isBigPixX(uint16_t px) { return (px == 79) | (px == 80); }

constexpr inline bool isBigPixY(uint16_t py) {
auto ly = toRocY(py);
return (ly == 0) | (ly == lastColInRoc);
}

constexpr inline uint16_t localX(uint16_t px) {
auto shift = 0;
if (px > lastRowInRoc)
shift += 1;
if (px > numRowsInRoc)
shift += 1;
return px + shift;
}

constexpr inline uint16_t localY(uint16_t py) {
auto roc = divu52(py);
auto shift = 2 * roc;
auto yInRoc = py - 52 * roc;
if (yInRoc > 0)
shift += 1;
return py + shift;
}

//FIXME move it elsewhere?
struct AverageGeometry {
static constexpr auto numberOfLaddersInBarrel = phase1PixelTopology::numberOfLaddersInBarrel;
float ladderZ[numberOfLaddersInBarrel];
float ladderX[numberOfLaddersInBarrel];
float ladderY[numberOfLaddersInBarrel];
float ladderR[numberOfLaddersInBarrel];
float ladderMinZ[numberOfLaddersInBarrel];
float ladderMaxZ[numberOfLaddersInBarrel];
float endCapZ[2]; // just for pos and neg Layer1
};

} // namespace phase1PixelTopology

#endif // Geometry_TrackerGeometryBuilder_phase1PixelTopology_h
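A small compile-time check, not part of the PR, makes the divu52 bit trick easier to trust: it compares the helper against plain integer division over the column range the clusterizer actually uses (all values below 512, where the comment in the header states the trick is valid). The include path is assumed from this repository's layout:

#include "Geometry/phase1PixelTopology.h"  // assumed include path

#include <cstdint>

// Illustrative check that divu52(n) == n / 52 for every local-y value in a module.
constexpr bool validateDivu52() {
  for (uint16_t py = 0; py < phase1PixelTopology::numColsInModule; ++py) {
    if (phase1PixelTopology::divu52(py) != py / 52)
      return false;
  }
  return true;
}
static_assert(validateDivu52(), "divu52 must agree with n / 52 for n < 512");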
Empty file.
147 changes: 147 additions & 0 deletions src/alpaka/plugin-SiPixelClusterizer/gpuClusterChargeCut.h
@@ -0,0 +1,147 @@
#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h
#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h

#include <cstdint>
#include <cstdio>

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/alpakaWorkDivHelper.h"
#include "AlpakaCore/prefixScan.h"
#include "AlpakaDataFormats/gpuClusteringConstants.h"

namespace gpuClustering {

struct clusterChargeCut {
template <typename T_Acc>
ALPAKA_FN_ACC void operator()(
const T_Acc& acc,
uint16_t* __restrict__ id, // module id of each pixel (modified if bad cluster)
uint16_t const* __restrict__ adc, // charge of each pixel
uint32_t const* __restrict__ moduleStart, // index of the first pixel of each module
uint32_t* __restrict__ nClustersInModule, // modified: number of clusters found in each module
uint32_t const* __restrict__ moduleId, // module id of each module
int32_t* __restrict__ clusterId, // modified: cluster id of each pixel
const uint32_t numElements) const {
const uint32_t blockIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
if (blockIdx >= moduleStart[0])
return;

auto firstPixel = moduleStart[1 + blockIdx];
auto thisModuleId = id[firstPixel];
assert(thisModuleId < MaxNumModules);
assert(thisModuleId == moduleId[blockIdx]);

auto nclus = nClustersInModule[thisModuleId];
if (nclus == 0)
return;

const uint32_t threadIdxLocal(alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
if (threadIdxLocal == 0 && nclus > MaxNumClustersPerModules)
printf("Warning too many clusters in module %d in block %d: %d > %d\n",
thisModuleId,
blockIdx,
nclus,
MaxNumClustersPerModules);

// Stride = block size.
const uint32_t blockDimension(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);

// Get thread / CPU element indices in block.
const auto& [firstElementIdxNoStride, endElementIdxNoStride] =
cms::alpakatools::element_index_range_in_block(acc, Vec1::all(firstPixel));

if (nclus > MaxNumClustersPerModules) {
uint32_t firstElementIdx = firstElementIdxNoStride[0u];
uint32_t endElementIdx = endElementIdxNoStride[0u];
// remove excess FIXME find a way to cut charge first....
for (uint32_t i = firstElementIdx; i < numElements; ++i) {
if (!cms::alpakatools::get_next_element_1D_index_stride(
i, firstElementIdx, endElementIdx, blockDimension, numElements))
break;
if (id[i] == InvId)
continue; // not valid
if (id[i] != thisModuleId)
break; // end of module
if (clusterId[i] >= MaxNumClustersPerModules) {
id[i] = InvId;
clusterId[i] = InvId;
}
}
nclus = MaxNumClustersPerModules;
}

#ifdef GPU_DEBUG
if (thisModuleId % 100 == 1)
if (threadIdxLocal == 0)
printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx);
#endif

auto&& charge = alpaka::block::shared::st::allocVar<int32_t[MaxNumClustersPerModules], __COUNTER__>(acc);
auto&& ok = alpaka::block::shared::st::allocVar<uint8_t[MaxNumClustersPerModules], __COUNTER__>(acc);
auto&& newclusId = alpaka::block::shared::st::allocVar<uint16_t[MaxNumClustersPerModules], __COUNTER__>(acc);

assert(nclus <= MaxNumClustersPerModules);
cms::alpakatools::for_each_element_1D_block_stride(acc, nclus, [&](uint32_t i) { charge[i] = 0; });
alpaka::block::sync::syncBlockThreads(acc);

uint32_t firstElementIdx = firstElementIdxNoStride[0u];
uint32_t endElementIdx = endElementIdxNoStride[0u];
for (uint32_t i = firstElementIdx; i < numElements; ++i) {
if (!cms::alpakatools::get_next_element_1D_index_stride(
Collaborator:
Is the main reason for this function (instead of for_each_element_...) the fact that the loop body does break?

Contributor Author (ghugo83, Mar 31, 2021):
Yes. The issue here is that two loops are introduced with Alpaka: one for the strided access, and an inner one over the CPU elements. Whether one uses a helper function and a lambda, or does the looping directly at the call site, the two-loop situation remains.
This was a problem wherever the legacy code had break statements, since only the innermost loop was exited. The trivial workaround I used at first was a boolean flag to exit the innermost loop when needed.
But that boolean gymnastics had to be repeated at every such call site, hence code duplication.

This helper function gives the same logic as the two loops, but with a single loop and no extra storage, which makes porting the legacy code easier: nothing has to change about the break statements at the call site, and they can be kept 1:1.

i, firstElementIdx, endElementIdx, blockDimension, numElements))
break;
if (id[i] == InvId)
continue; // not valid
if (id[i] != thisModuleId)
break; // end of module
alpaka::atomic::atomicOp<alpaka::atomic::op::Add>(acc, &charge[clusterId[i]], static_cast<int32_t>(adc[i]));
}
alpaka::block::sync::syncBlockThreads(acc);

auto chargeCut = thisModuleId < 96 ? 2000 : 4000; // move in constants (calib?)
cms::alpakatools::for_each_element_1D_block_stride(
acc, nclus, [&](uint32_t i) { newclusId[i] = ok[i] = charge[i] > chargeCut ? 1 : 0; });
alpaka::block::sync::syncBlockThreads(acc);

// renumber
auto&& ws = alpaka::block::shared::st::allocVar<uint16_t[32], __COUNTER__>(acc);
Collaborator:
The 32 here is related to the warp size, right? We should probably look later on how to make that dependent on acc.

Contributor Author (ghugo83):
Yes totally.

cms::alpakatools::blockPrefixScan(acc, newclusId, nclus, ws);

assert(nclus >= newclusId[nclus - 1]);

if (nclus == newclusId[nclus - 1])
return;

nClustersInModule[thisModuleId] = newclusId[nclus - 1];
alpaka::block::sync::syncBlockThreads(acc);

// mark bad cluster again
cms::alpakatools::for_each_element_1D_block_stride(acc, nclus, [&](uint32_t i) {
if (0 == ok[i])
newclusId[i] = InvId + 1;
});
alpaka::block::sync::syncBlockThreads(acc);

// reassign id
firstElementIdx = firstElementIdxNoStride[0u];
endElementIdx = endElementIdxNoStride[0u];
for (uint32_t i = firstElementIdx; i < numElements; ++i) {
if (!cms::alpakatools::get_next_element_1D_index_stride(
i, firstElementIdx, endElementIdx, blockDimension, numElements))
break;
if (id[i] == InvId)
continue; // not valid
if (id[i] != thisModuleId)
break; // end of module
clusterId[i] = newclusId[clusterId[i]] - 1;
if (clusterId[i] == InvId)
id[i] = InvId;
}

//done
}
};

} // namespace gpuClustering

#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h
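To summarise what the kernel above does per module, here is a serial, single-module sketch of the same charge-cut and renumbering logic (plain C++, illustrative only: the container types are made up, InvId is assumed to be the 9999 sentinel from gpuClusteringConstants, and a simple loop stands in for cms::alpakatools::blockPrefixScan; the excess-cluster removal step is omitted):

#include <cstdint>
#include <cstddef>
#include <vector>

constexpr uint16_t InvId = 9999;  // assumption: same sentinel as in gpuClusteringConstants.h

// Serial illustration of the per-module charge cut (not the PR code).
void clusterChargeCutSerial(std::vector<uint16_t>& id,         // module id of each pixel
                            const std::vector<uint16_t>& adc,  // charge of each pixel
                            std::vector<int32_t>& clusterId,   // cluster id of each pixel
                            uint32_t nclus,                    // clusters found in this module
                            uint16_t thisModuleId) {
  // 1) accumulate the total charge of each cluster
  std::vector<int32_t> charge(nclus, 0);
  for (std::size_t i = 0; i < id.size(); ++i) {
    if (id[i] == InvId || id[i] != thisModuleId)
      continue;
    charge[clusterId[i]] += adc[i];
  }

  // 2) keep only clusters above the cut (same thresholds as the kernel)
  const int32_t chargeCut = thisModuleId < 96 ? 2000 : 4000;
  std::vector<uint8_t> ok(nclus);
  std::vector<uint32_t> newclusId(nclus);
  for (uint32_t c = 0; c < nclus; ++c)
    newclusId[c] = ok[c] = charge[c] > chargeCut ? 1 : 0;

  // 3) inclusive prefix scan: newclusId[c] = number of good clusters with index <= c
  for (uint32_t c = 1; c < nclus; ++c)
    newclusId[c] += newclusId[c - 1];

  // 4) bad clusters get a sentinel so that their pixels end up invalidated below
  for (uint32_t c = 0; c < nclus; ++c)
    if (!ok[c])
      newclusId[c] = InvId + 1;

  // 5) renumber surviving clusters and invalidate the pixels of removed ones
  for (std::size_t i = 0; i < id.size(); ++i) {
    if (id[i] == InvId || id[i] != thisModuleId)
      continue;
    clusterId[i] = static_cast<int32_t>(newclusId[clusterId[i]]) - 1;
    if (clusterId[i] == InvId)
      id[i] = InvId;
  }
}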