From 28bcb5515121641fadd3a74a03fa45a89ec591f8 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 2 Feb 2024 22:27:11 +0100 Subject: [PATCH 01/25] Skip invalid or corrupted ROCs Make the Alpaka implementation of the pixel unpacker skip spurious ROCs, similar to the legacy and CUDA versions of the unpacker, and store the invalid ROC number error (errorType=36). Disable printf statements at compile time. Use named constants instead of magic numbers. --- .../plugins/alpaka/CalibPixel.h | 77 ++--- .../alpaka/SiPixelRawToClusterKernel.dev.cc | 274 ++++++++++-------- .../alpaka/SiPixelRawToClusterKernel.h | 3 +- 3 files changed, 191 insertions(+), 163 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h index d1f5509052468..2808255782bc9 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h @@ -23,9 +23,7 @@ namespace calibPixel { using namespace cms::alpakatools; - constexpr uint16_t InvId = std::numeric_limits::max() - 1; - // must be > MaxNumModules - + template struct CalibDigis { template ALPAKA_FN_ACC void operator()(const TAcc& acc, @@ -41,52 +39,57 @@ namespace calibPixel { // zero for next kernels... 
if (cms::alpakatools::once_per_grid(acc)) { - clus_view[0].clusModuleStart() = clus_view[0].moduleStart() = 0; + clus_view[0].clusModuleStart() = 0; + clus_view[0].moduleStart() = 0; + } + for (auto i : cms::alpakatools::elements_with_stride(acc, phase1PixelTopology::numberOfModules)) { + clus_view[i].clusInModule() = 0; } - cms::alpakatools::for_each_element_in_grid_strided( - acc, phase1PixelTopology::numberOfModules, [&](uint32_t i) { clus_view[i].clusInModule() = 0; }); - cms::alpakatools::for_each_element_in_grid_strided(acc, numElements, [&](uint32_t i) { + for (auto i : cms::alpakatools::elements_with_stride(acc, numElements)) { auto dvgi = view[i]; - if (dvgi.moduleId() != InvId) { - bool isDeadColumn = false, isNoisyColumn = false; - int row = dvgi.xx(); - int col = dvgi.yy(); - auto ret = SiPixelGainUtilities::getPedAndGain(gains, dvgi.moduleId(), col, row, isDeadColumn, isNoisyColumn); - float pedestal = ret.first; - float gain = ret.second; - if (isDeadColumn | isNoisyColumn) { - dvgi.moduleId() = InvId; - dvgi.adc() = 0; + if (dvgi.moduleId() == ::pixelClustering::invalidModuleId) + continue; + + bool isDeadColumn = false, isNoisyColumn = false; + int row = dvgi.xx(); + int col = dvgi.yy(); + auto ret = SiPixelGainUtilities::getPedAndGain(gains, dvgi.moduleId(), col, row, isDeadColumn, isNoisyColumn); + float pedestal = ret.first; + float gain = ret.second; + if (isDeadColumn | isNoisyColumn) { + if constexpr (debug) printf("bad pixel at %d in %d\n", i, dvgi.moduleId()); - } else { - float vcal = dvgi.adc() * gain - pedestal * gain; + dvgi.moduleId() = ::pixelClustering::invalidModuleId; + dvgi.adc() = 0; + } else { + float vcal = dvgi.adc() * gain - pedestal * gain; - float conversionFactor = dvgi.moduleId() < 96 ? VCaltoElectronGain_L1 : VCaltoElectronGain; - float offset = dvgi.moduleId() < 96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset; + float conversionFactor = dvgi.moduleId() < 96 ? 
VCaltoElectronGain_L1 : VCaltoElectronGain; + float offset = dvgi.moduleId() < 96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset; #ifdef GPU_DEBUG - auto old_adc = dvgi.adc(); + auto old_adc = dvgi.adc(); #endif - dvgi.adc() = std::max(100, int(vcal * conversionFactor + offset)); + dvgi.adc() = std::max(100, int(vcal * conversionFactor + offset)); #ifdef GPU_DEBUG - if (cms::alpakatools::once_per_grid(acc)) { - printf( - "module %d pixel %d -> old_adc = %d; vcal = %.2f; conversionFactor = %.2f; offset = %.2f; new_adc = " - "%d \n", - dvgi.moduleId(), - i, - old_adc, - vcal, - conversionFactor, - offset, - dvgi.adc()); - } -#endif + if (cms::alpakatools::once_per_grid(acc)) { + printf( + "module %d pixel %d -> old_adc = %d; vcal = %.2f; conversionFactor = %.2f; offset = %.2f; new_adc = " + "%d \n", + dvgi.moduleId(), + i, + old_adc, + vcal, + conversionFactor, + offset, + dvgi.adc()); } +#endif } - }); + } } }; + struct CalibDigisPhase2 { template ALPAKA_FN_ACC void operator()(const TAcc& acc, diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc index 88ad79c6af609..cd9509d2be46b 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc @@ -12,15 +12,16 @@ #include // CMSSW includes -#include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h" #include "HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h" #include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "HeterogeneousCore/AlpakaInterface/interface/memory.h" -#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h" #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" - #include 
"CondFormats/SiPixelObjects/interface/SiPixelGainCalibrationForHLTLayout.h" #include "CondFormats/SiPixelObjects/interface/SiPixelMappingLayout.h" +#include "DataFormats/DetId/interface/DetId.h" +#include "DataFormats/SiPixelClusterSoA/interface/ClusteringConstants.h" +#include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" #include "DataFormats/SiPixelDigi/interface/SiPixelDigiConstants.h" // local includes @@ -34,30 +35,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { namespace pixelDetails { - //////////////////// - - ALPAKA_FN_ACC uint32_t getLink(uint32_t ww) { - return ((ww >> ::sipixelconstants::LINK_shift) & ::sipixelconstants::LINK_mask); - } - - ALPAKA_FN_ACC uint32_t getRoc(uint32_t ww) { - return ((ww >> ::sipixelconstants::ROC_shift) & ::sipixelconstants::ROC_mask); + ALPAKA_FN_ACC bool isBarrel(uint32_t rawId) { + return (PixelSubdetector::PixelBarrel == ((rawId >> DetId::kSubdetOffset) & DetId::kSubdetMask)); } - ALPAKA_FN_ACC uint32_t getADC(uint32_t ww) { - return ((ww >> ::sipixelconstants::ADC_shift) & ::sipixelconstants::ADC_mask); - } - - ALPAKA_FN_ACC bool isBarrel(uint32_t rawId) { return (1 == ((rawId >> 25) & 0x7)); } - ALPAKA_FN_ACC ::pixelDetails::DetIdGPU getRawId(const SiPixelMappingSoAConstView &cablingMap, uint8_t fed, uint32_t link, uint32_t roc) { using namespace ::pixelDetails; uint32_t index = fed * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + roc; - ::pixelDetails::DetIdGPU detId = { - cablingMap.rawId()[index], cablingMap.rocInDet()[index], cablingMap.moduleId()[index]}; + DetIdGPU detId = {cablingMap.rawId()[index], cablingMap.rocInDet()[index], cablingMap.moduleId()[index]}; return detId; } @@ -131,36 +119,37 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { return global; } - ALPAKA_FN_ACC uint8_t conversionError(uint8_t fedId, uint8_t status, bool debug = false) { + template + ALPAKA_FN_ACC uint8_t conversionError(uint8_t fedId, uint8_t status) { uint8_t errorType = 0; switch (status) { case 1: { - if (debug) + if constexpr 
(debug) printf("Error in Fed: %i, invalid channel Id (errorType = 35\n)", fedId); errorType = 35; break; } case 2: { - if (debug) + if constexpr (debug) printf("Error in Fed: %i, invalid ROC Id (errorType = 36)\n", fedId); errorType = 36; break; } case 3: { - if (debug) + if constexpr (debug) printf("Error in Fed: %i, invalid dcol/pixel value (errorType = 37)\n", fedId); errorType = 37; break; } case 4: { - if (debug) + if constexpr (debug) printf("Error in Fed: %i, dcol/pixel read out of order (errorType = 38)\n", fedId); errorType = 38; break; } default: - if (debug) + if constexpr (debug) printf("Cabling check returned unexpected result, status = %i\n", status); }; @@ -171,17 +160,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { uint32_t numRowsInRoc = 80; uint32_t numColsInRoc = 52; - /// row and collumn in ROC representation + /// row and column in ROC representation return ((rocRow < numRowsInRoc) & (rocCol < numColsInRoc)); } ALPAKA_FN_ACC bool dcolIsValid(uint32_t dcol, uint32_t pxid) { return ((dcol < 26) & (2 <= pxid) & (pxid < 162)); } - ALPAKA_FN_ACC uint8_t checkROC(uint32_t errorWord, - uint8_t fedId, - uint32_t link, - const SiPixelMappingSoAConstView &cablingMap, - bool debug = false) { + template + ALPAKA_FN_ACC uint8_t + checkROC(uint32_t errorWord, uint8_t fedId, uint32_t link, const SiPixelMappingSoAConstView &cablingMap) { uint8_t errorType = (errorWord >> ::pixelDetails::ROC_shift) & ::pixelDetails::ERROR_mask; if (errorType < 25) return 0; @@ -201,42 +188,42 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { break; } case (26): { - if (debug) + if constexpr (debug) printf("Gap word found (errorType = 26)\n"); errorFound = true; break; } case (27): { - if (debug) + if constexpr (debug) printf("Dummy word found (errorType = 27)\n"); errorFound = true; break; } case (28): { - if (debug) + if constexpr (debug) printf("Error fifo nearly full (errorType = 28)\n"); errorFound = true; break; } case (29): { - if (debug) + if constexpr (debug) printf("Timeout 
on a channel (errorType = 29)\n"); if ((errorWord >> ::pixelDetails::OMIT_ERR_shift) & ::pixelDetails::OMIT_ERR_mask) { - if (debug) + if constexpr (debug) printf("...first errorType=29 error, this gets masked out\n"); } errorFound = true; break; } case (30): { - if (debug) + if constexpr (debug) printf("TBM error trailer (errorType = 30)\n"); int StateMatch_bits = 4; int StateMatch_shift = 8; uint32_t StateMatch_mask = ~(~uint32_t(0) << StateMatch_bits); int StateMatch = (errorWord >> StateMatch_shift) & StateMatch_mask; if (StateMatch != 1 && StateMatch != 8) { - if (debug) + if constexpr (debug) printf("FED error 30 with unexpected State Bits (errorType = 30)\n"); } if (StateMatch == 1) @@ -245,7 +232,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { break; } case (31): { - if (debug) + if constexpr (debug) printf("Event number error (errorType = 31)\n"); errorFound = true; break; @@ -257,11 +244,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { return errorFound ? errorType : 0; } - ALPAKA_FN_ACC uint32_t getErrRawID(uint8_t fedId, - uint32_t errWord, - uint32_t errorType, - const SiPixelMappingSoAConstView &cablingMap, - bool debug = false) { + template + ALPAKA_FN_ACC uint32_t + getErrRawID(uint8_t fedId, uint32_t errWord, uint32_t errorType, const SiPixelMappingSoAConstView &cablingMap) { uint32_t rID = 0xffffffff; switch (errorType) { @@ -272,7 +257,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { case 40: { uint32_t roc = 1; uint32_t link = (errWord >> ::pixelDetails::LINK_shift) & ::pixelDetails::LINK_mask; - uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).RawId; + uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId; if (rID_temp != 9999) rID = rID_temp; break; @@ -305,7 +290,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { uint32_t roc = 1; uint32_t link = chanNmbr; - uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).RawId; + uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId; if (rID_temp != 9999) rID = rID_temp; break; 
@@ -314,7 +299,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { case 38: { uint32_t roc = (errWord >> ::pixelDetails::ROC_shift) & ::pixelDetails::ROC_mask; uint32_t link = (errWord >> ::pixelDetails::LINK_shift) & ::pixelDetails::LINK_mask; - uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).RawId; + uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId; if (rID_temp != 9999) rID = rID_temp; break; @@ -327,6 +312,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // Kernel to perform Raw to Digi conversion + template struct RawToDigi_kernel { template ALPAKA_FN_ACC void operator()(const TAcc &acc, @@ -338,19 +324,18 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { SiPixelDigisSoAView digisView, SiPixelDigiErrorsSoAView err, bool useQualityInfo, - bool includeErrors, - bool debug) const { - cms::alpakatools::for_each_element_in_grid_strided(acc, wordCounter, [&](uint32_t iloop) { - auto gIndex = iloop; + bool includeErrors) const { + // FIXME there is no guarantee that this is initialised to 0 before any of the atomicInc happens + if (cms::alpakatools::once_per_grid(acc)) + err.size() = 0; + + for (auto gIndex : cms::alpakatools::elements_with_stride(acc, wordCounter)) { auto dvgi = digisView[gIndex]; dvgi.xx() = 0; dvgi.yy() = 0; dvgi.adc() = 0; - bool skipROC = false; - - if (gIndex == 0) - err[gIndex].size() = 0; + // initialise the errors err[gIndex].pixelErrors() = SiPixelErrorCompact{0, 0, 0, 0}; uint8_t fedId = fedIds[gIndex / 2]; // +1200; @@ -358,45 +343,59 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // initialize (too many coninue below) dvgi.pdigi() = 0; dvgi.rawIdArr() = 0; - constexpr uint16_t invalidModuleId = std::numeric_limits::max() - 1; - dvgi.moduleId() = invalidModuleId; + dvgi.moduleId() = ::pixelClustering::invalidModuleId; uint32_t ww = word[gIndex]; // Array containing 32 bit raw data if (ww == 0) { // 0 is an indicator of a noise/dead channel, skip these pixels during clusterization - return; + continue; } - uint32_t link = 
getLink(ww); // Extract link - uint32_t roc = getRoc(ww); // Extract Roc in link - ::pixelDetails::DetIdGPU detId = getRawId(cablingMap, fedId, link, roc); + uint32_t link = sipixelconstants::getLink(ww); // Extract link + uint32_t roc = sipixelconstants::getROC(ww); // Extract ROC in link - uint8_t errorType = checkROC(ww, fedId, link, cablingMap, debug); - skipROC = (roc < ::pixelDetails::maxROCIndex) ? false : (errorType != 0); + uint8_t errorType = checkROC(ww, fedId, link, cablingMap); + bool skipROC = (roc < ::pixelDetails::maxROCIndex) ? false : (errorType != 0); if (includeErrors and skipROC) { - uint32_t rID = getErrRawID(fedId, ww, errorType, cablingMap, debug); - err[gIndex].pixelErrors() = SiPixelErrorCompact{rID, ww, errorType, fedId}; - alpaka::atomicInc(acc, &err.size(), 0xffffffff, alpaka::hierarchy::Threads{}); - return; + uint32_t rawId = getErrRawID(fedId, ww, errorType, cablingMap); + if (rawId != 0xffffffff) // Store errors only for valid DetIds + { + err[gIndex].pixelErrors() = SiPixelErrorCompact{rawId, ww, errorType, fedId}; + alpaka::atomicInc(acc, &err.size(), 0xffffffff, alpaka::hierarchy::Blocks{}); + } + continue; } - uint32_t rawId = detId.RawId; - uint32_t rocIdInDetUnit = detId.rocInDet; - bool barrel = isBarrel(rawId); + // Check for spurious channels + if (roc > ::pixelDetails::MAX_ROC or link > ::pixelDetails::MAX_LINK) { + uint32_t rawId = getRawId(cablingMap, fedId, link, 1).rawId; + if constexpr (debug) { + printf("spurious roc %d found on link %d, detector %d (index %d)\n", roc, link, rawId, gIndex); + } + if (roc > ::pixelDetails::MAX_ROC and roc < 25) { + uint8_t error = conversionError(fedId, 2); + err[gIndex].pixelErrors() = SiPixelErrorCompact{rawId, ww, error, fedId}; + alpaka::atomicInc(acc, &err.size(), 0xffffffff, alpaka::hierarchy::Blocks{}); + } + continue; + } uint32_t index = fedId * ::pixelDetails::MAX_LINK * ::pixelDetails::MAX_ROC + (link - 1) * ::pixelDetails::MAX_ROC + roc; if (useQualityInfo) { skipROC = 
cablingMap.badRocs()[index]; if (skipROC) - return; + continue; } skipROC = modToUnp[index]; if (skipROC) - return; + continue; - uint32_t layer = 0; //, ladder =0; - int side = 0, panel = 0, module = 0; //disk = 0, blade = 0 + ::pixelDetails::DetIdGPU detId = getRawId(cablingMap, fedId, link, roc); + uint32_t rawId = detId.rawId; + uint32_t layer = 0; + int side = 0, panel = 0, module = 0; + bool barrel = isBarrel(rawId); if (barrel) { layer = (rawId >> ::pixelDetails::layerStartBit) & ::pixelDetails::layerMask; @@ -406,54 +405,50 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // endcap ids layer = 0; panel = (rawId >> ::pixelDetails::panelStartBit) & ::pixelDetails::panelMask; - //disk = (rawId >> diskStartBit_) & diskMask_; side = (panel == 1) ? -1 : 1; - //blade = (rawId >> bladeStartBit_) & bladeMask_; } - // ***special case of layer to 1 be handled here ::pixelDetails::Pixel localPix; if (layer == 1) { - uint32_t col = (ww >> ::pixelDetails::COL_shift) & ::pixelDetails::COL_mask; - uint32_t row = (ww >> ::pixelDetails::ROW_shift) & ::pixelDetails::ROW_mask; + // Special case of barrel layer 1 + uint32_t col = sipixelconstants::getCol(ww); + uint32_t row = sipixelconstants::getRow(ww); localPix.row = row; localPix.col = col; - if (includeErrors) { - if (not rocRowColIsValid(row, col)) { - uint8_t error = conversionError(fedId, 3, debug); //use the device function and fill the arrays - err[gIndex].pixelErrors() = SiPixelErrorCompact{rawId, ww, error, fedId}; - alpaka::atomicInc(acc, &err.size(), 0xffffffff, alpaka::hierarchy::Threads{}); - if (debug) - printf("BPIX1 Error status: %i\n", error); - return; - } + if (includeErrors and not rocRowColIsValid(row, col)) { + uint8_t error = conversionError(fedId, 3); + err[gIndex].pixelErrors() = SiPixelErrorCompact{rawId, ww, error, fedId}; + alpaka::atomicInc(acc, &err.size(), 0xffffffff, alpaka::hierarchy::Blocks{}); + if constexpr (debug) + printf("BPIX1 Error status: %i\n", error); + continue; } } else { - // 
***conversion rules for dcol and pxid - uint32_t dcol = (ww >> ::pixelDetails::DCOL_shift) & ::pixelDetails::DCOL_mask; - uint32_t pxid = (ww >> ::pixelDetails::PXID_shift) & ::pixelDetails::PXID_mask; + // Other layers with double columns + uint32_t dcol = sipixelconstants::getDCol(ww); + uint32_t pxid = sipixelconstants::getPxId(ww); uint32_t row = ::pixelDetails::numRowsInRoc - pxid / 2; uint32_t col = dcol * 2 + pxid % 2; localPix.row = row; localPix.col = col; if (includeErrors and not dcolIsValid(dcol, pxid)) { - uint8_t error = conversionError(fedId, 3, debug); + uint8_t error = conversionError(fedId, 3); err[gIndex].pixelErrors() = SiPixelErrorCompact{rawId, ww, error, fedId}; - alpaka::atomicInc(acc, &err.size(), 0xffffffff, alpaka::hierarchy::Threads{}); - if (debug) + alpaka::atomicInc(acc, &err.size(), 0xffffffff, alpaka::hierarchy::Blocks{}); + if constexpr (debug) printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc); - return; + continue; } } - ::pixelDetails::Pixel globalPix = frameConversion(barrel, side, layer, rocIdInDetUnit, localPix); + ::pixelDetails::Pixel globalPix = frameConversion(barrel, side, layer, detId.rocInDet, localPix); dvgi.xx() = globalPix.row; // origin shifting by 1 0-159 dvgi.yy() = globalPix.col; // origin shifting by 1 0-415 - dvgi.adc() = getADC(ww); + dvgi.adc() = sipixelconstants::getADC(ww); dvgi.pdigi() = ::pixelDetails::pack(globalPix.row, globalPix.col, dvgi.adc()); dvgi.moduleId() = detId.moduleId; dvgi.rawIdArr() = rawId; - }); // end of stride on grid + } // end of stride on grid } // end of Raw to Digi kernel operator() }; // end of Raw to Digi struct @@ -464,7 +459,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_FN_ACC void operator()(const TAcc &acc, SiPixelClustersSoAView clus_view) const { ALPAKA_ASSERT_OFFLOAD(TrackerTraits::numberOfModules < 2048); // easy to extend at least till 32*1024 - constexpr int nMaxModules = TrackerTraits::numberOfModules; + constexpr int numberOfModules = 
TrackerTraits::numberOfModules; constexpr uint32_t maxHitsInModule = TrackerTraits::maxHitsInModule; #ifndef NDEBUG @@ -475,12 +470,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #endif // limit to maxHitsInModule; - cms::alpakatools::for_each_element_in_block_strided(acc, nMaxModules, [&](uint32_t i) { + cms::alpakatools::for_each_element_in_block_strided(acc, numberOfModules, [&](uint32_t i) { clus_view[i + 1].clusModuleStart() = std::min(maxHitsInModule, clus_view[i].clusInModule()); }); constexpr bool isPhase2 = std::is_base_of::value; - constexpr auto leftModules = isPhase2 ? 1024 : nMaxModules - 1024; + constexpr auto leftModules = isPhase2 ? 1024 : numberOfModules - 1024; auto &&ws = alpaka::declareSharedVar(acc); @@ -496,11 +491,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { cms::alpakatools::blockPrefixScan(acc, clus_view.clusModuleStart() + 3072 + 1, clus_view.clusModuleStart() + 3072 + 1, - nMaxModules - 3072, + numberOfModules - 3072, ws); } - constexpr auto lastModule = isPhase2 ? 2049u : nMaxModules + 1; + constexpr auto lastModule = isPhase2 ? 
2049u : numberOfModules + 1; cms::alpakatools::for_each_element_in_block_strided(acc, lastModule, 1025u, [&](uint32_t i) { clus_view[i].clusModuleStart() += clus_view[1024].clusModuleStart(); }); @@ -512,7 +507,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { }); alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block_strided(acc, nMaxModules + 1, 3073u, [&](uint32_t i) { + cms::alpakatools::for_each_element_in_block_strided(acc, numberOfModules + 1, 3073u, [&](uint32_t i) { clus_view[i].clusModuleStart() += clus_view[3072].clusModuleStart(); }); alpaka::syncBlockThreads(acc); @@ -523,9 +518,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(c0 == clus_view[1].moduleStart()); ALPAKA_ASSERT_OFFLOAD(clus_view[1024].moduleStart() >= clus_view[1023].moduleStart()); ALPAKA_ASSERT_OFFLOAD(clus_view[1025].moduleStart() >= clus_view[1024].moduleStart()); - ALPAKA_ASSERT_OFFLOAD(clus_view[nMaxModules].moduleStart() >= clus_view[1025].moduleStart()); + ALPAKA_ASSERT_OFFLOAD(clus_view[numberOfModules].moduleStart() >= clus_view[1025].moduleStart()); - cms::alpakatools::for_each_element_in_block_strided(acc, nMaxModules + 1, [&](uint32_t i) { + cms::alpakatools::for_each_element_in_block_strided(acc, numberOfModules + 1, [&](uint32_t i) { if (0 != i) ALPAKA_ASSERT_OFFLOAD(clus_view[i].moduleStart() >= clus_view[i - i].moduleStart()); // Check BPX2 (1), FP1 (4) @@ -537,7 +532,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #endif // avoid overflow constexpr auto MAX_HITS = TrackerTraits::maxNumberOfHits; - cms::alpakatools::for_each_element_in_block_strided(acc, nMaxModules + 1, [&](uint32_t i) { + cms::alpakatools::for_each_element_in_block_strided(acc, numberOfModules + 1, [&](uint32_t i) { if (clus_view[i].clusModuleStart() > MAX_HITS) clus_view[i].clusModuleStart() = MAX_HITS; }); @@ -587,19 +582,33 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::memcpy(queue, word_d, wordFed.word(), wordCounter); alpaka::memcpy(queue, fedId_d, 
wordFed.fedId(), wordCounter / 2); // Launch rawToDigi kernel - alpaka::exec(queue, - workDiv, - RawToDigi_kernel{}, - cablingMap, - modToUnp, - wordCounter, - word_d.data(), - fedId_d.data(), - digis_d->view(), - digiErrors_d->view(), - useQualityInfo, - includeErrors, - debug); + if (debug) { + alpaka::exec(queue, + workDiv, + RawToDigi_kernel{}, + cablingMap, + modToUnp, + wordCounter, + word_d.data(), + fedId_d.data(), + digis_d->view(), + digiErrors_d->view(), + useQualityInfo, + includeErrors); + } else { + alpaka::exec(queue, + workDiv, + RawToDigi_kernel{}, + cablingMap, + modToUnp, + wordCounter, + word_d.data(), + fedId_d.data(), + digis_d->view(), + digiErrors_d->view(), + useQualityInfo, + includeErrors); + } #ifdef GPU_DEBUG alpaka::wait(queue); @@ -625,9 +634,25 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { threadsPerBlockOrElementsPerThread); const auto workDiv = cms::alpakatools::make_workdiv(blocks, threadsPerBlockOrElementsPerThread); - alpaka::exec( - queue, workDiv, CalibDigis{}, clusterThresholds, digis_d->view(), clusters_d->view(), gains, wordCounter); - + if (debug) { + alpaka::exec(queue, + workDiv, + CalibDigis{}, + clusterThresholds, + digis_d->view(), + clusters_d->view(), + gains, + wordCounter); + } else { + alpaka::exec(queue, + workDiv, + CalibDigis{}, + clusterThresholds, + digis_d->view(), + clusters_d->view(), + gains, + wordCounter); + } #ifdef GPU_DEBUG alpaka::wait(queue); std::cout << "CountModules kernel launch with " << blocks << " blocks of " << threadsPerBlockOrElementsPerThread @@ -642,8 +667,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::memcpy(queue, nModules_Clusters_h, moduleStartFirstElement); // TODO - // - we are fixing this here since it needs to be needed - // at compile time also in the kernel (for_each_element_in_block_strided) + // - we are fixing this here since it is used at compile time also in the kernel // - put maxIter in the Geometry traits constexpr auto threadsOrElementsFindClus = 256; diff --git 
a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.h index b7b9071506652..c5d4cdcb13533 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.h @@ -29,6 +29,7 @@ namespace pixelDetails { constexpr auto MAX_LINK = pixelgpudetails::MAX_LINK; constexpr auto MAX_SIZE = pixelgpudetails::MAX_SIZE; constexpr auto MAX_ROC = pixelgpudetails::MAX_ROC; + // Phase 1 geometry constants constexpr uint32_t layerStartBit = 20; constexpr uint32_t ladderStartBit = 12; @@ -85,7 +86,7 @@ namespace pixelDetails { constexpr uint32_t OMIT_ERR_mask = ~(~uint32_t(0) << OMIT_ERR_bits); struct DetIdGPU { - uint32_t RawId; + uint32_t rawId; uint32_t rocInDet; uint32_t moduleId; }; From e142ccc69c8477d90ccde4342fed26ad8018b60e Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 2 Feb 2024 22:30:19 +0100 Subject: [PATCH 02/25] Add check on the ROC range --- .../plugins/MeasurementTrackerEventProducer.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/RecoTracker/MeasurementDet/plugins/MeasurementTrackerEventProducer.cc b/RecoTracker/MeasurementDet/plugins/MeasurementTrackerEventProducer.cc index 68e43c2d5b970..11282ac034908 100644 --- a/RecoTracker/MeasurementDet/plugins/MeasurementTrackerEventProducer.cc +++ b/RecoTracker/MeasurementDet/plugins/MeasurementTrackerEventProducer.cc @@ -178,7 +178,7 @@ void MeasurementTrackerEventProducer::updatePixels(const edm::Event& event, edm::Handle pixelFEDChannelCollectionHandle; for (const edm::EDGetTokenT& tk : theBadPixelFEDChannelsLabels) { - if (!event.getByToken(tk, pixelFEDChannelCollectionHandle)) + if (not event.getByToken(tk, pixelFEDChannelCollectionHandle)) continue; int i = 0; for (const auto& disabledChannels : *pixelFEDChannelCollectionHandle) { @@ -189,7 +189,8 @@ void 
MeasurementTrackerEventProducer::updatePixels(const edm::Event& event, // PixelFEDChannelCollection addresses the ROCs by their 'idInDetUnit' (from 0 to 15), ROCs also know their on 'idInDetUnit', // however the cabling map uses a numbering [1,numberOfROCs], see sipixelobjects::PixelFEDLink::roc(unsigned int id), not necessarily sorted in the same direction. // PixelFEDChannelCollection MUST be filled such that ch.roc_first (ch.roc_last) correspond to the lowest (highest) 'idInDetUnit' in the channel - for (path.roc = 1; path.roc <= (ch.roc_last - ch.roc_first) + 1; path.roc++) { + assert(ch.roc_last >= ch.roc_first); + for (path.roc = 1; path.roc <= (ch.roc_last - ch.roc_first) + 1; ++path.roc) { const sipixelobjects::PixelROC* roc = cablingMap.findItem(path); if (roc == nullptr) continue; @@ -217,7 +218,7 @@ void MeasurementTrackerEventProducer::updatePixels(const edm::Event& event, LocalPoint ur(std::max(lp1.x(), lp2.x()), std::max(lp1.y(), lp2.y()), std::max(lp1.z(), lp2.z())); positions.push_back(std::make_pair(ll, ur)); } // loop on channels - if (!positions.empty()) { + if (not positions.empty()) { i = thePxDets.find(disabledChannels.detId(), i); assert(i != thePxDets.size() && thePxDets.id(i) == disabledChannels.detId()); thePxDets.addBadFEDChannelPositions(i, positions); From a2c7a2708ec0416d007db9f1e983ca4a9be06075 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sat, 3 Feb 2024 01:22:26 +0100 Subject: [PATCH 03/25] Clean up includes and dependencies in pixel CondFormats --- CondFormats/SiPixelObjects/BuildFile.xml | 20 +++++++++++-------- .../interface/SiPixelMappingHost.h | 9 ++++----- .../interface/alpaka/SiPixelMappingDevice.h | 5 ++--- CondFormats/SiPixelObjects/test/BuildFile.xml | 4 ++++ 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/CondFormats/SiPixelObjects/BuildFile.xml b/CondFormats/SiPixelObjects/BuildFile.xml index ddd87c956d217..9f8fe0199a9f6 100644 --- a/CondFormats/SiPixelObjects/BuildFile.xml +++ 
b/CondFormats/SiPixelObjects/BuildFile.xml @@ -1,20 +1,24 @@ - - - - - + - - + + + + + + + + + - + + diff --git a/CondFormats/SiPixelObjects/interface/SiPixelMappingHost.h b/CondFormats/SiPixelObjects/interface/SiPixelMappingHost.h index 772a7a97e267b..ad8a726048358 100644 --- a/CondFormats/SiPixelObjects/interface/SiPixelMappingHost.h +++ b/CondFormats/SiPixelObjects/interface/SiPixelMappingHost.h @@ -1,10 +1,9 @@ -#ifndef CondFormats_SiPixelObjects_SiPixelMappingHost_h -#define CondFormats_SiPixelObjects_SiPixelMappingHost_h +#ifndef CondFormats_SiPixelObjects_interface_SiPixelMappingHost_h +#define CondFormats_SiPixelObjects_interface_SiPixelMappingHost_h -#include -#include "DataFormats/Portable/interface/PortableHostCollection.h" #include "CondFormats/SiPixelObjects/interface/SiPixelMappingLayout.h" +#include "DataFormats/Portable/interface/PortableHostCollection.h" using SiPixelMappingHost = PortableHostCollection; -#endif // CondFormats_SiPixelObjects_SiPixelMappingHost_h +#endif // CondFormats_SiPixelObjects_interface_SiPixelMappingHost_h diff --git a/CondFormats/SiPixelObjects/interface/alpaka/SiPixelMappingDevice.h b/CondFormats/SiPixelObjects/interface/alpaka/SiPixelMappingDevice.h index 8a16caa0d7368..41831706c9164 100644 --- a/CondFormats/SiPixelObjects/interface/alpaka/SiPixelMappingDevice.h +++ b/CondFormats/SiPixelObjects/interface/alpaka/SiPixelMappingDevice.h @@ -2,11 +2,10 @@ #define CondFormats_SiPixelObjects_interface_alpaka_SiPixelMappingDevice_h #include -#include + #include "DataFormats/Portable/interface/alpaka/PortableCollection.h" #include "CondFormats/SiPixelObjects/interface/SiPixelMappingLayout.h" -#include "HeterogeneousCore/AlpakaCore/interface/alpaka/ESProducer.h" -#include "DataFormats/Portable/interface/PortableHostCollection.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { diff --git a/CondFormats/SiPixelObjects/test/BuildFile.xml 
b/CondFormats/SiPixelObjects/test/BuildFile.xml index ed2c98ca1d5e1..a0a0b02735603 100644 --- a/CondFormats/SiPixelObjects/test/BuildFile.xml +++ b/CondFormats/SiPixelObjects/test/BuildFile.xml @@ -1,6 +1,10 @@ + + + + From 2b5e6f6d9a4c8faa88497babce393fc90d716111 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sat, 3 Feb 2024 09:27:08 +0100 Subject: [PATCH 04/25] Add overflow checks to CountModules::operator() --- .../plugins/alpaka/PixelClustering.h | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h index 7da68c7b2f5da..4f05264b95404 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h @@ -98,30 +98,35 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { SiPixelDigisSoAView digi_view, SiPixelClustersSoAView clus_view, const unsigned int numElements) const { - [[maybe_unused]] constexpr int nMaxModules = TrackerTraits::numberOfModules; + // Make sure the atomicInc below does not overflow + static_assert(TrackerTraits::numberOfModules < ::pixelClustering::maxNumModules); #ifdef GPU_DEBUG if (cms::alpakatools::once_per_grid(acc)) { printf("Starting to count modules to set module starts:"); } #endif - cms::alpakatools::for_each_element_in_grid_strided(acc, numElements, [&](uint32_t i) { + for (int32_t i : cms::alpakatools::elements_with_stride(acc, numElements)) { digi_view[i].clus() = i; - if (::pixelClustering::invalidModuleId != digi_view[i].moduleId()) { - int j = i - 1; - while (j >= 0 and digi_view[j].moduleId() == ::pixelClustering::invalidModuleId) - --j; - if (j < 0 or digi_view[j].moduleId() != digi_view[i].moduleId()) { - // boundary... 
- auto loc = alpaka::atomicInc( - acc, clus_view.moduleStart(), std::decay_t(nMaxModules), alpaka::hierarchy::Blocks{}); + if (::pixelClustering::invalidModuleId == digi_view[i].moduleId()) + continue; + + int32_t j = i - 1; + while (j >= 0 and digi_view[j].moduleId() == ::pixelClustering::invalidModuleId) + --j; + if (j < 0 or digi_view[j].moduleId() != digi_view[i].moduleId()) { + // Found a module boundary: count the number of modules in clus_view[0].moduleStart() + auto loc = alpaka::atomicInc(acc, + &clus_view[0].moduleStart(), + static_cast(::pixelClustering::maxNumModules), + alpaka::hierarchy::Blocks{}); + ALPAKA_ASSERT_OFFLOAD(loc < TrackerTraits::numberOfModules); #ifdef GPU_DEBUG - printf("> New module (no. %d) found at digi %d \n", loc, i); + printf("> New module (no. %d) found at digi %d \n", loc, i); #endif - clus_view[loc + 1].moduleStart() = i; - } + clus_view[loc + 1].moduleStart() = i; } - }); + } } }; From 937f0daa9e953ba1464062246568356ac6b34564 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sun, 4 Feb 2024 12:18:20 +0100 Subject: [PATCH 05/25] Rewrite the pixel clustering code Rewrite loops using cms::alpakatools::independent_groups(acc, ...) and independent_group_elements(acc, ...). Rename variables and improve comments to help readability. 
--- .../plugins/alpaka/PixelClustering.h | 772 +++++++++--------- 1 file changed, 392 insertions(+), 380 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h index 4f05264b95404..37afda9847a99 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h @@ -16,445 +16,457 @@ //#define GPU_DEBUG -namespace ALPAKA_ACCELERATOR_NAMESPACE { - - namespace pixelClustering { +namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { #ifdef GPU_DEBUG - template >> - ALPAKA_STATIC_ACC_MEM_GLOBAL uint32_t gMaxHit = 0; + template >> + ALPAKA_STATIC_ACC_MEM_GLOBAL uint32_t gMaxHit = 0; #endif - namespace pixelStatus { - // Phase-1 pixel modules - constexpr uint32_t pixelSizeX = pixelTopology::Phase1::numRowsInModule; - constexpr uint32_t pixelSizeY = pixelTopology::Phase1::numColsInModule; - - // Use 0x00, 0x01, 0x03 so each can be OR'ed on top of the previous ones - enum Status : uint32_t { kEmpty = 0x00, kFound = 0x01, kDuplicate = 0x03 }; - - constexpr uint32_t bits = 2; - constexpr uint32_t mask = (0x01 << bits) - 1; - constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits; - constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord; - - ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getIndex(uint16_t x, uint16_t y) { - return (pixelSizeX * y + x) / valuesPerWord; - } - - ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getShift(uint16_t x, uint16_t y) { - return (x % valuesPerWord) * 2; - } - - ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr Status getStatus(uint32_t const* __restrict__ status, - uint16_t x, - uint16_t y) { - uint32_t index = getIndex(x, y); - uint32_t shift = getShift(x, y); - return Status{(status[index] >> shift) & mask}; - } - - ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr bool isDuplicate(uint32_t const* __restrict__ status, - uint16_t 
x, - uint16_t y) { - return getStatus(status, x, y) == kDuplicate; - } - - /* FIXME + namespace pixelStatus { + // Phase-1 pixel modules + constexpr uint32_t pixelSizeX = pixelTopology::Phase1::numRowsInModule; + constexpr uint32_t pixelSizeY = pixelTopology::Phase1::numColsInModule; + + // Use 0x00, 0x01, 0x03 so each can be OR'ed on top of the previous ones + enum Status : uint32_t { kEmpty = 0x00, kFound = 0x01, kDuplicate = 0x03 }; + + constexpr uint32_t bits = 2; + constexpr uint32_t mask = (0x01 << bits) - 1; + constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits; + constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getIndex(uint16_t x, uint16_t y) { + return (pixelSizeX * y + x) / valuesPerWord; + } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getShift(uint16_t x, uint16_t y) { + return (x % valuesPerWord) * 2; + } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr Status getStatus(uint32_t const* __restrict__ status, + uint16_t x, + uint16_t y) { + uint32_t index = getIndex(x, y); + uint32_t shift = getShift(x, y); + return Status{(status[index] >> shift) & mask}; + } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr bool isDuplicate(uint32_t const* __restrict__ status, + uint16_t x, + uint16_t y) { + return getStatus(status, x, y) == kDuplicate; + } + + /* FIXME * In the more general case (e.g. a multithreaded CPU backend) there is a potential race condition * between the read of status[index] at line NNN and the atomicCas at line NNN. 
* We should investigate: * - if `status` should be read through a `volatile` pointer (CUDA/ROCm) * - if `status` should be read with an atomic load (CPU) */ - template >> - ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr void promote(TAcc const& acc, - uint32_t* __restrict__ status, - const uint16_t x, - const uint16_t y) { - uint32_t index = getIndex(x, y); - uint32_t shift = getShift(x, y); - uint32_t old_word = status[index]; - uint32_t expected = old_word; - do { - expected = old_word; - Status old_status{(old_word >> shift) & mask}; - if (kDuplicate == old_status) { - // nothing to do - return; - } - Status new_status = (kEmpty == old_status) ? kFound : kDuplicate; - uint32_t new_word = old_word | (static_cast(new_status) << shift); - old_word = alpaka::atomicCas(acc, &status[index], expected, new_word, alpaka::hierarchy::Blocks{}); - } while (expected != old_word); - } - - } // namespace pixelStatus - - template - struct CountModules { - template - ALPAKA_FN_ACC void operator()(const TAcc& acc, - SiPixelDigisSoAView digi_view, - SiPixelClustersSoAView clus_view, - const unsigned int numElements) const { - // Make sure the atomicInc below does not overflow - static_assert(TrackerTraits::numberOfModules < ::pixelClustering::maxNumModules); + template >> + ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr void promote(TAcc const& acc, + uint32_t* __restrict__ status, + const uint16_t x, + const uint16_t y) { + uint32_t index = getIndex(x, y); + uint32_t shift = getShift(x, y); + uint32_t old_word = status[index]; + uint32_t expected = old_word; + do { + expected = old_word; + Status old_status{(old_word >> shift) & mask}; + if (kDuplicate == old_status) { + // nothing to do + return; + } + Status new_status = (kEmpty == old_status) ? 
kFound : kDuplicate; + uint32_t new_word = old_word | (static_cast(new_status) << shift); + old_word = alpaka::atomicCas(acc, &status[index], expected, new_word, alpaka::hierarchy::Blocks{}); + } while (expected != old_word); + } + + } // namespace pixelStatus + + template + struct CountModules { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + SiPixelDigisSoAView digi_view, + SiPixelClustersSoAView clus_view, + const unsigned int numElements) const { + // Make sure the atomicInc below does not overflow + static_assert(TrackerTraits::numberOfModules < ::pixelClustering::maxNumModules); #ifdef GPU_DEBUG - if (cms::alpakatools::once_per_grid(acc)) { - printf("Starting to count modules to set module starts:"); - } + if (cms::alpakatools::once_per_grid(acc)) { + printf("Starting to count modules to set module starts:"); + } #endif - for (int32_t i : cms::alpakatools::elements_with_stride(acc, numElements)) { - digi_view[i].clus() = i; - if (::pixelClustering::invalidModuleId == digi_view[i].moduleId()) - continue; - - int32_t j = i - 1; - while (j >= 0 and digi_view[j].moduleId() == ::pixelClustering::invalidModuleId) - --j; - if (j < 0 or digi_view[j].moduleId() != digi_view[i].moduleId()) { - // Found a module boundary: count the number of modules in clus_view[0].moduleStart() - auto loc = alpaka::atomicInc(acc, - &clus_view[0].moduleStart(), - static_cast(::pixelClustering::maxNumModules), - alpaka::hierarchy::Blocks{}); - ALPAKA_ASSERT_OFFLOAD(loc < TrackerTraits::numberOfModules); + for (int32_t i : cms::alpakatools::elements_with_stride(acc, numElements)) { + digi_view[i].clus() = i; + if (::pixelClustering::invalidModuleId == digi_view[i].moduleId()) + continue; + + int32_t j = i - 1; + while (j >= 0 and digi_view[j].moduleId() == ::pixelClustering::invalidModuleId) + --j; + if (j < 0 or digi_view[j].moduleId() != digi_view[i].moduleId()) { + // Found a module boundary: count the number of modules in clus_view[0].moduleStart() + auto loc = 
alpaka::atomicInc(acc, + &clus_view[0].moduleStart(), + static_cast(::pixelClustering::maxNumModules), + alpaka::hierarchy::Blocks{}); + ALPAKA_ASSERT_OFFLOAD(loc < TrackerTraits::numberOfModules); #ifdef GPU_DEBUG - printf("> New module (no. %d) found at digi %d \n", loc, i); + printf("> New module (no. %d) found at digi %d \n", loc, i); #endif - clus_view[loc + 1].moduleStart() = i; - } + clus_view[loc + 1].moduleStart() = i; } } - }; - - template - struct FindClus { - template - ALPAKA_FN_ACC void operator()(const TAcc& acc, - SiPixelDigisSoAView digi_view, - SiPixelClustersSoAView clus_view, - const unsigned int numElements) const { - constexpr bool isPhase2 = std::is_base_of::value; - constexpr const uint32_t pixelStatusSize = isPhase2 ? 1 : pixelStatus::size; + } + }; + + template + struct FindClus { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + SiPixelDigisSoAView digi_view, + SiPixelClustersSoAView clus_view, + const unsigned int numElements) const { + static_assert(TrackerTraits::numberOfModules < ::pixelClustering::maxNumModules); + + auto& lastPixel = alpaka::declareSharedVar(acc); + + const uint32_t lastModule = clus_view[0].moduleStart(); + for (uint32_t module : cms::alpakatools::independent_groups(acc, lastModule)) { + auto firstPixel = clus_view[1 + module].moduleStart(); + uint32_t thisModuleId = digi_view[firstPixel].moduleId(); + ALPAKA_ASSERT_OFFLOAD(thisModuleId < TrackerTraits::numberOfModules); - // packed words array used to store the pixelStatus of each pixel - auto& status = alpaka::declareSharedVar(acc); +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (cms::alpakatools::once_per_block(acc)) + printf("start clusterizer for module %4d in block %4d\n", + thisModuleId, + alpaka::getIdx(acc)[0u]); +#endif // find the index of the first pixel not belonging to this module (or invalid) - auto& msize = alpaka::declareSharedVar(acc); - - const uint32_t blockIdx = alpaka::getIdx(acc)[0u]; - if (blockIdx >= 
clus_view[0].moduleStart()) - return; - - auto firstModule = blockIdx; - auto endModule = clus_view[0].moduleStart(); + lastPixel = numElements; + alpaka::syncBlockThreads(acc); + + // skip threads not associated to an existing pixel + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, numElements)) { + auto id = digi_view[i].moduleId(); + // skip invalid pixels + if (id == ::pixelClustering::invalidModuleId) + continue; + // find the first pixel in a different module + if (id != thisModuleId) { + alpaka::atomicMin(acc, &lastPixel, i, alpaka::hierarchy::Threads{}); + break; + } + } - const uint32_t gridDimension(alpaka::getWorkDiv(acc)[0u]); + using Hist = cms::alpakatools::HistoContainer; + auto& hist = alpaka::declareSharedVar(acc); + auto& ws = alpaka::declareSharedVar(acc); + for (uint32_t j : cms::alpakatools::independent_group_elements(acc, Hist::totbins())) { + hist.off[j] = 0; + } + alpaka::syncBlockThreads(acc); + + ALPAKA_ASSERT_OFFLOAD((lastPixel == numElements) or + ((lastPixel < numElements) and (digi_view[lastPixel].moduleId() != thisModuleId))); + // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer) + if (cms::alpakatools::once_per_block(acc)) { + if (lastPixel - firstPixel > TrackerTraits::maxPixInModule) { + printf("too many pixels in module %u: %u > %u\n", + thisModuleId, + lastPixel - firstPixel, + TrackerTraits::maxPixInModule); + lastPixel = TrackerTraits::maxPixInModule + firstPixel; + } + } + alpaka::syncBlockThreads(acc); + ALPAKA_ASSERT_OFFLOAD(lastPixel - firstPixel <= TrackerTraits::maxPixInModule); - for (auto module = firstModule; module < endModule; module += gridDimension) { - auto firstPixel = clus_view[1 + module].moduleStart(); - auto thisModuleId = digi_view[firstPixel].moduleId(); - ALPAKA_ASSERT_OFFLOAD(thisModuleId < TrackerTraits::numberOfModules); #ifdef GPU_DEBUG - if (thisModuleId % 100 == 1) - if 
(cms::alpakatools::once_per_block(acc)) - printf("start clusterizer for module %d in block %d\n", thisModuleId, module); + auto& totGood = alpaka::declareSharedVar(acc); + totGood = 0; + alpaka::syncBlockThreads(acc); #endif - msize = numElements; - alpaka::syncBlockThreads(acc); + // remove duplicate pixels + constexpr bool isPhase2 = std::is_base_of::value; + if constexpr (not isPhase2) { + // packed words array used to store the pixelStatus of each pixel + auto& status = alpaka::declareSharedVar(acc); - // Stride = block size. - const uint32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); - - // Get thread / CPU element indices in block. - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_index_range_in_block(acc, firstPixel); - uint32_t firstElementIdx = firstElementIdxNoStride; - uint32_t endElementIdx = endElementIdxNoStride; - - // skip threads not associated to an existing pixel - for (uint32_t i = firstElementIdx; i < numElements; ++i) { - if (not cms::alpakatools::next_valid_element_index_strided( - i, firstElementIdx, endElementIdx, blockDimension, numElements)) - break; - auto id = digi_view[i].moduleId(); - if (id == ::pixelClustering::invalidModuleId) // skip invalid pixels - continue; - if (id != thisModuleId) { // find the first pixel in a different module - alpaka::atomicMin(acc, &msize, i, alpaka::hierarchy::Threads{}); - break; + if (lastPixel > 1) { + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, pixelStatus::size)) { + status[i] = 0; } - } - //init hist (ymax=416 < 512 : 9bits) - constexpr uint32_t maxPixInModule = TrackerTraits::maxPixInModule; - constexpr auto nbins = TrackerTraits::clusterBinning; - constexpr auto nbits = TrackerTraits::clusterBits; - using Hist = cms::alpakatools::HistoContainer; - auto& hist = alpaka::declareSharedVar(acc); - auto& ws = alpaka::declareSharedVar(acc); - cms::alpakatools::for_each_element_in_block_strided( - acc, Hist::totbins(), [&](uint32_t j) { 
hist.off[j] = 0; }); - alpaka::syncBlockThreads(acc); - ALPAKA_ASSERT_OFFLOAD((msize == numElements) or - ((msize < numElements) and (digi_view[msize].moduleId() != thisModuleId))); - // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer) - if (cms::alpakatools::once_per_grid(acc)) { - if (msize - firstPixel > maxPixInModule) { - printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule); - msize = maxPixInModule + firstPixel; + alpaka::syncBlockThreads(acc); + + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) { + // skip invalid pixels + if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId) + continue; + pixelStatus::promote(acc, status, digi_view[i].xx(), digi_view[i].yy()); } + alpaka::syncBlockThreads(acc); + + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) { + // skip invalid pixels + if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId) + continue; + if (pixelStatus::isDuplicate(status, digi_view[i].xx(), digi_view[i].yy())) { + digi_view[i].moduleId() = ::pixelClustering::invalidModuleId; + digi_view[i].rawIdArr() = 0; + } + } + alpaka::syncBlockThreads(acc); } - alpaka::syncBlockThreads(acc); - ALPAKA_ASSERT_OFFLOAD(msize - firstPixel <= maxPixInModule); + } + // fill histo + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) { + // skip invalid pixels + if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) { + hist.count(acc, digi_view[i].yy()); #ifdef GPU_DEBUG - auto& totGood = alpaka::declareSharedVar(acc); - totGood = 0; - alpaka::syncBlockThreads(acc); + alpaka::atomicAdd(acc, &totGood, 1u, alpaka::hierarchy::Blocks{}); #endif - // remove duplicate pixels - if constexpr (not isPhase2) { //FIXME remove THIS - if (msize > 1) { - 
cms::alpakatools::for_each_element_in_block_strided( - acc, pixelStatus::size, [&](uint32_t i) { status[i] = 0; }); - alpaka::syncBlockThreads(acc); - - cms::alpakatools::for_each_element_in_block_strided(acc, msize - 1, firstElementIdx, [&](uint32_t i) { - // skip invalid pixels - if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId) - return; - pixelStatus::promote(acc, status, digi_view[i].xx(), digi_view[i].yy()); - }); - alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block_strided(acc, msize - 1, firstElementIdx, [&](uint32_t i) { - // skip invalid pixels - if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId) - return; - if (pixelStatus::isDuplicate(status, digi_view[i].xx(), digi_view[i].yy())) { - digi_view[i].moduleId() = ::pixelClustering::invalidModuleId; - digi_view[i].rawIdArr() = 0; - } - }); - alpaka::syncBlockThreads(acc); - } } - // fill histo - cms::alpakatools::for_each_element_in_block_strided(acc, msize, firstPixel, [&](uint32_t i) { - if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) { // skip invalid pixels - hist.count(acc, digi_view[i].yy()); + } + alpaka::syncBlockThreads(acc); // FIXME this can be removed + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, 32u)) { + ws[i] = 0; // used by prefix scan... + } + alpaka::syncBlockThreads(acc); + hist.finalize(acc, ws); + alpaka::syncBlockThreads(acc); #ifdef GPU_DEBUG - alpaka::atomicAdd(acc, &totGood, 1u, alpaka::hierarchy::Blocks{}); + ALPAKA_ASSERT_OFFLOAD(hist.size() == totGood); + if (thisModuleId % 100 == 1) + if (cms::alpakatools::once_per_block(acc)) + printf("histo size %d\n", hist.size()); #endif - } - }); - alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block(acc, 32u, [&](uint32_t i) { - ws[i] = 0; // used by prefix scan... 
- }); - alpaka::syncBlockThreads(acc); - hist.finalize(acc, ws); - alpaka::syncBlockThreads(acc); + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) { + // skip invalid pixels + if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) { + hist.fill(acc, digi_view[i].yy(), i - firstPixel); + } + } + #ifdef GPU_DEBUG - ALPAKA_ASSERT_OFFLOAD(hist.size() == totGood); - if (thisModuleId % 100 == 1) - if (cms::alpakatools::once_per_block(acc)) - printf("histo size %d\n", hist.size()); -#endif - cms::alpakatools::for_each_element_in_block_strided(acc, msize, firstPixel, [&](uint32_t i) { - if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) { // skip invalid pixels - hist.fill(acc, digi_view[i].yy(), i - firstPixel); - } - }); - // Assume that we can cover the whole module with up to 16 blockDimension-wide iterations - // This maxiter value was tuned for GPU, with 256 or 512 threads per block. - // Hence, also works for CPU case, with 256 or 512 elements per thread. - // Real constrainst is maxiter = hist.size() / blockDimension, - // with blockDimension = threadPerBlock * elementsPerThread. - // Hence, maxiter can be tuned accordingly to the workdiv. - constexpr unsigned int maxiter = 16; - ALPAKA_ASSERT_OFFLOAD((hist.size() / blockDimension) <= maxiter); - - // NB: can be tuned. - constexpr uint32_t threadDimension = cms::alpakatools::requires_single_thread_per_block_v ? 
256 : 1; - -#ifndef NDEBUG - [[maybe_unused]] const uint32_t runTimeThreadDimension = - alpaka::getWorkDiv(acc)[0u]; - ALPAKA_ASSERT_OFFLOAD(runTimeThreadDimension <= threadDimension); + // look for anomalous high occupancy + auto& n40 = alpaka::declareSharedVar(acc); + auto& n60 = alpaka::declareSharedVar(acc); + if (cms::alpakatools::once_per_block(acc)) { + n40 = 0; + n60 = 0; + } + alpaka::syncBlockThreads(acc); + for (uint32_t j : cms::alpakatools::independent_group_elements(acc, Hist::nbins())) { + if (hist.size(j) > 60) + alpaka::atomicAdd(acc, &n60, 1u, alpaka::hierarchy::Blocks{}); + if (hist.size(j) > 40) + alpaka::atomicAdd(acc, &n40, 1u, alpaka::hierarchy::Blocks{}); + } + alpaka::syncBlockThreads(acc); + if (cms::alpakatools::once_per_block(acc)) { + if (n60 > 0) + printf("columns with more than 60 px %d in %d\n", n60, thisModuleId); + else if (n40 > 0) + printf("columns with more than 40 px %d in %d\n", n40, thisModuleId); + } + alpaka::syncBlockThreads(acc); #endif - // nearest neighbour - // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event - constexpr int maxNeighbours = 10; - uint16_t nn[maxiter][threadDimension][maxNeighbours]; - uint8_t nnn[maxiter][threadDimension]; // number of nn - for (uint32_t elementIdx = 0; elementIdx < threadDimension; ++elementIdx) { - for (uint32_t k = 0; k < maxiter; ++k) { - nnn[k][elementIdx] = 0; + [[maybe_unused]] const uint32_t blockDimension = alpaka::getWorkDiv(acc)[0u]; + // Assume that we can cover the whole module with up to 16 blockDimension-wide iterations + // This maxIter value was tuned for GPU, with 256 or 512 threads per block. + // Hence, also works for CPU case, with 256 or 512 elements per thread. + // Real constrainst is maxIter = hist.size() / blockDimension, + // with blockDimension = threadPerBlock * elementsPerThread. + // Hence, maxIter can be tuned accordingly to the workdiv. 
+ constexpr unsigned int maxIterGPU = 16; + ALPAKA_ASSERT_OFFLOAD((hist.size() / blockDimension) < maxIterGPU); + + // NB: can be tuned. + constexpr uint32_t maxElements = cms::alpakatools::requires_single_thread_per_block_v ? 256 : 1; + ALPAKA_ASSERT_OFFLOAD((alpaka::getWorkDiv(acc)[0u] <= maxElements)); + + constexpr unsigned int maxIter = maxIterGPU * maxElements; + + // nearest neighbours (nn) + // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event + constexpr int maxNeighbours = 10; + uint16_t nn[maxIter][maxNeighbours]; + uint8_t nnn[maxIter]; // number of nn + for (uint32_t k = 0; k < maxIter; ++k) { + nnn[k] = 0; + } + + alpaka::syncBlockThreads(acc); // for hit filling! + + // fill the nearest neighbours + uint32_t k = 0; + for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) { + ALPAKA_ASSERT_OFFLOAD(k < maxIter); + auto p = hist.begin() + j; + auto i = *p + firstPixel; + ALPAKA_ASSERT_OFFLOAD(digi_view[i].moduleId() != ::pixelClustering::invalidModuleId); + ALPAKA_ASSERT_OFFLOAD(digi_view[i].moduleId() == thisModuleId); // same module + auto bin = Hist::bin(digi_view[i].yy() + 1); + auto end = hist.end(bin); + ++p; + ALPAKA_ASSERT_OFFLOAD(0 == nnn[k]); + for (; p < end; ++p) { + auto m = *p + firstPixel; + ALPAKA_ASSERT_OFFLOAD(m != i); + ALPAKA_ASSERT_OFFLOAD(int(digi_view[m].yy()) - int(digi_view[i].yy()) >= 0); + ALPAKA_ASSERT_OFFLOAD(int(digi_view[m].yy()) - int(digi_view[i].yy()) <= 1); + if (std::abs(int(digi_view[m].xx()) - int(digi_view[i].xx())) <= 1) { + auto l = nnn[k]++; + ALPAKA_ASSERT_OFFLOAD(l < maxNeighbours); + nn[k][l] = *p; } } + ++k; + } - alpaka::syncBlockThreads(acc); // for hit filling! 
- -#ifdef GPU_DEBUG - // look for anomalous high occupancy - auto& n40 = alpaka::declareSharedVar(acc); - auto& n60 = alpaka::declareSharedVar(acc); - n40 = n60 = 0; - alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block_strided(acc, Hist::nbins(), [&](uint32_t j) { - if (hist.size(j) > 60) - alpaka::atomicAdd(acc, &n60, 1u, alpaka::hierarchy::Blocks{}); - if (hist.size(j) > 40) - alpaka::atomicAdd(acc, &n40, 1u, alpaka::hierarchy::Blocks{}); - }); - alpaka::syncBlockThreads(acc); - if (cms::alpakatools::once_per_block(acc)) { - if (n60 > 0) - printf("columns with more than 60 px %d in %d\n", n60, thisModuleId); - else if (n40 > 0) - printf("columns with more than 40 px %d in %d\n", n40, thisModuleId); - } - alpaka::syncBlockThreads(acc); -#endif - // fill NN - uint32_t k = 0u; - cms::alpakatools::for_each_element_in_block_strided(acc, hist.size(), [&](uint32_t j) { - const uint32_t jEquivalentClass = j % threadDimension; - k = j / blockDimension; - ALPAKA_ASSERT_OFFLOAD(k < maxiter); + // for each pixel, look at all the pixels until the end of the module; + // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum; + // after the loop, all the pixel in each cluster should have the id equeal to the lowest + // pixel in the cluster ( clus[i] == i ). 
+ bool more = true; + /* + int nloops = 0; + */ + while (alpaka::syncBlockThreadsPredicate(acc, more)) { + /* + if (nloops % 2 == 0) { + // even iterations of the outer loop + */ + more = false; + uint32_t k = 0; + for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) { + ALPAKA_ASSERT_OFFLOAD(k < maxIter); auto p = hist.begin() + j; auto i = *p + firstPixel; - ALPAKA_ASSERT_OFFLOAD(digi_view[i].moduleId() != ::pixelClustering::invalidModuleId); - ALPAKA_ASSERT_OFFLOAD(digi_view[i].moduleId() == thisModuleId); // same module - int be = Hist::bin(digi_view[i].yy() + 1); - auto e = hist.end(be); - ++p; - ALPAKA_ASSERT_OFFLOAD(0 == nnn[k][jEquivalentClass]); - for (; p < e; ++p) { - auto m = (*p) + firstPixel; + for (int kk = 0; kk < nnn[k]; ++kk) { + auto l = nn[k][kk]; + auto m = l + firstPixel; ALPAKA_ASSERT_OFFLOAD(m != i); - ALPAKA_ASSERT_OFFLOAD(int(digi_view[m].yy()) - int(digi_view[i].yy()) >= 0); - ALPAKA_ASSERT_OFFLOAD(int(digi_view[m].yy()) - int(digi_view[i].yy()) <= 1); - if (std::abs(int(digi_view[m].xx()) - int(digi_view[i].xx())) <= 1) { - auto l = nnn[k][jEquivalentClass]++; - ALPAKA_ASSERT_OFFLOAD(l < maxNeighbours); - nn[k][jEquivalentClass][l] = *p; + // FIXME ::Threads ? + auto old = alpaka::atomicMin(acc, &digi_view[m].clus(), digi_view[i].clus(), alpaka::hierarchy::Blocks{}); + if (old != digi_view[i].clus()) { + // end the loop only if no changes were applied + more = true; } - } - }); - // for each pixel, look at all the pixels until the end of the module; - // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum; - // after the loop, all the pixel in each cluster should have the id equeal to the lowest - // pixel in the cluster ( clus[i] == i ). 
- bool more = true; - int nloops = 0; - while (alpaka::syncBlockThreadsPredicate(acc, more)) { - if (1 == nloops % 2) { - cms::alpakatools::for_each_element_in_block_strided(acc, hist.size(), [&](uint32_t j) { - auto p = hist.begin() + j; - auto i = *p + firstPixel; - auto m = digi_view[i].clus(); - while (m != digi_view[m].clus()) - m = digi_view[m].clus(); - digi_view[i].clus() = m; - }); + // FIXME ::Threads ? + alpaka::atomicMin(acc, &digi_view[i].clus(), old, alpaka::hierarchy::Blocks{}); + } // neighbours loop + ++k; + } // pixel loop + /* + // use the outer loop to force a synchronisation } else { - more = false; - uint32_t k = 0u; - cms::alpakatools::for_each_element_in_block_strided(acc, hist.size(), [&](uint32_t j) { - k = j / blockDimension; - const uint32_t jEquivalentClass = j % threadDimension; - auto p = hist.begin() + j; - auto i = *p + firstPixel; - for (int kk = 0; kk < nnn[k][jEquivalentClass]; ++kk) { - auto l = nn[k][jEquivalentClass][kk]; - auto m = l + firstPixel; - ALPAKA_ASSERT_OFFLOAD(m != i); - auto old = - alpaka::atomicMin(acc, &digi_view[m].clus(), digi_view[i].clus(), alpaka::hierarchy::Blocks{}); - if (old != digi_view[i].clus()) { - // end the loop only if no changes were applied - more = true; - } - alpaka::atomicMin(acc, &digi_view[i].clus(), old, alpaka::hierarchy::Blocks{}); - } // nnloop - }); // pixel loop + // odd iterations of the outer loop + */ + alpaka::syncBlockThreads(acc); + for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) { + auto p = hist.begin() + j; + auto i = *p + firstPixel; + auto m = digi_view[i].clus(); + while (m != digi_view[m].clus()) + m = digi_view[m].clus(); + digi_view[i].clus() = m; + } + /* } ++nloops; - } // end while -#ifdef GPU_DEBUG - { + */ + } // end while + + /* + // check that all threads in the block have executed the same number of iterations auto& n0 = alpaka::declareSharedVar(acc); if (cms::alpakatools::once_per_block(acc)) n0 = nloops; 
alpaka::syncBlockThreads(acc); -#ifndef NDEBUG - [[maybe_unused]] auto ok = n0 == nloops; - ALPAKA_ASSERT_OFFLOAD(alpaka::syncBlockThreadsPredicate(acc, ok)); -#endif + ALPAKA_ASSERT_OFFLOAD(alpaka::syncBlockThreadsPredicate(acc, nloops == n0)); if (thisModuleId % 100 == 1) if (cms::alpakatools::once_per_block(acc)) printf("# loops %d\n", nloops); - } -#endif - auto& foundClusters = alpaka::declareSharedVar(acc); - foundClusters = 0; - alpaka::syncBlockThreads(acc); + */ - // find the number of different clusters, identified by a pixels with clus[i] == i; - // mark these pixels with a negative id. - cms::alpakatools::for_each_element_in_block_strided(acc, msize, firstPixel, [&](uint32_t i) { - if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) { // skip invalid pixels - if (digi_view[i].clus() == static_cast(i)) { - auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{}); - digi_view[i].clus() = -(old + 1); - } - } - }); - alpaka::syncBlockThreads(acc); + auto& foundClusters = alpaka::declareSharedVar(acc); + foundClusters = 0; + alpaka::syncBlockThreads(acc); - // propagate the negative id to all the pixels in the cluster. - cms::alpakatools::for_each_element_in_block_strided(acc, msize, firstPixel, [&](uint32_t i) { - if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) { // skip invalid pixels - if (digi_view[i].clus() >= 0) { - // mark each pixel in a cluster with the same id as the first one - digi_view[i].clus() = digi_view[digi_view[i].clus()].clus(); - } - } - }); - alpaka::syncBlockThreads(acc); + // find the number of different clusters, identified by a pixels with clus[i] == i; + // mark these pixels with a negative id. 
+ for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) { + // skip invalid pixels + if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId) + continue; + if (digi_view[i].clus() == static_cast(i)) { + auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{}); + digi_view[i].clus() = -(old + 1); + } + } + alpaka::syncBlockThreads(acc); - // adjust the cluster id to be a positive value starting from 0 - cms::alpakatools::for_each_element_in_block_strided(acc, msize, firstPixel, [&](uint32_t i) { - if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId) { // skip invalid pixels - digi_view[i].clus() = ::pixelClustering::invalidClusterId; - } else { - digi_view[i].clus() = -digi_view[i].clus() - 1; - } - }); - alpaka::syncBlockThreads(acc); - if (cms::alpakatools::once_per_block(acc)) { - clus_view[thisModuleId].clusInModule() = foundClusters; - clus_view[module].moduleId() = thisModuleId; + // propagate the negative id to all the pixels in the cluster. 
+ for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) { + // skip invalid pixels + if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId) + continue; + if (digi_view[i].clus() >= 0) { + // mark each pixel in a cluster with the same id as the first one + digi_view[i].clus() = digi_view[digi_view[i].clus()].clus(); + } + } + alpaka::syncBlockThreads(acc); + + // adjust the cluster id to be a positive value starting from 0 + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) { + if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId) { + // mark invalid pixels with an invalid cluster index + digi_view[i].clus() = ::pixelClustering::invalidClusterId; + } else { + digi_view[i].clus() = -digi_view[i].clus() - 1; + } + } + alpaka::syncBlockThreads(acc); + + if (cms::alpakatools::once_per_block(acc)) { + clus_view[thisModuleId].clusInModule() = foundClusters; + clus_view[module].moduleId() = thisModuleId; #ifdef GPU_DEBUG - if (foundClusters > gMaxHit) { - gMaxHit = foundClusters; - if (foundClusters > 8) - printf("max hit %d in %d\n", foundClusters, thisModuleId); - } - // if (thisModuleId % 100 == 1) + if (foundClusters > gMaxHit) { + gMaxHit = foundClusters; + if (foundClusters > 8) + printf("max hit %d in %d\n", foundClusters, thisModuleId); + } + if (thisModuleId % 100 == 1) printf("%d clusters in module %d\n", foundClusters, thisModuleId); #endif - } - } // module loop - } - }; - } // namespace pixelClustering -} // namespace ALPAKA_ACCELERATOR_NAMESPACE + } + } // module loop + } + }; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering + #endif // plugin_SiPixelClusterizer_alpaka_PixelClustering.h From 571c91f07f36a13e0d120172488ac5990c02f612 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 5 Feb 2024 10:49:57 +0100 Subject: [PATCH 06/25] Rewrite the pixel charge cut code Rewrite loops using cms::alpakatools::independent_groups(acc, ...) 
and independent_group_elements(acc, ...). General clean up. --- .../plugins/alpaka/ClusterChargeCut.h | 148 ++++++------------ 1 file changed, 52 insertions(+), 96 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/ClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/ClusterChargeCut.h index 4056090517aee..d50995cf8d6e5 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/ClusterChargeCut.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/ClusterChargeCut.h @@ -1,9 +1,11 @@ -#ifndef RecoLocalTracker_SiPixelClusterizer_alpaka_ClusterChargeCut_h -#define RecoLocalTracker_SiPixelClusterizer_alpaka_ClusterChargeCut_h +#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_alpaka_ClusterChargeCut_h +#define RecoLocalTracker_SiPixelClusterizer_plugins_alpaka_ClusterChargeCut_h #include #include +#include + #include "DataFormats/SiPixelClusterSoA/interface/ClusteringConstants.h" #include "DataFormats/SiPixelClusterSoA/interface/SiPixelClustersSoA.h" #include "DataFormats/SiPixelDigiSoA/interface/SiPixelDigisSoA.h" @@ -17,62 +19,56 @@ namespace pixelClustering { template struct ClusterChargeCut { template - ALPAKA_FN_ACC void operator()( - const TAcc& acc, - SiPixelDigisSoAView digi_view, - SiPixelClustersSoAView clus_view, - SiPixelClusterThresholds - clusterThresholds, // charge cut on cluster in electrons (for layer 1 and for other layers) - const uint32_t numElements) const { - constexpr int startBPIX2 = TrackerTraits::layerStart[1]; + ALPAKA_FN_ACC void operator()(TAcc const& acc, + SiPixelDigisSoAView digi_view, + SiPixelClustersSoAView clus_view, + // charge cut on cluster in electrons (for layer 1 and for other layers) + SiPixelClusterThresholds clusterThresholds, + const uint32_t numElements) const { constexpr int32_t maxNumClustersPerModules = TrackerTraits::maxNumClustersPerModules; - [[maybe_unused]] constexpr int nMaxModules = TrackerTraits::numberOfModules; - - const uint32_t 
blockIdx(alpaka::getIdx(acc)[0u]); - auto firstModule = blockIdx; - auto endModule = clus_view[0].moduleStart(); - if (blockIdx >= endModule) - return; auto& charge = alpaka::declareSharedVar(acc); auto& ok = alpaka::declareSharedVar(acc); auto& newclusId = alpaka::declareSharedVar(acc); - const uint32_t gridDimension(alpaka::getWorkDiv(acc)[0u]); + constexpr int startBPIX2 = TrackerTraits::layerStart[1]; - for (auto module = firstModule; module < endModule; module += gridDimension) { + ALPAKA_ASSERT_OFFLOAD(TrackerTraits::numberOfModules < maxNumModules); + ALPAKA_ASSERT_OFFLOAD(startBPIX2 < TrackerTraits::numberOfModules); + + auto endModule = clus_view[0].moduleStart(); + for (auto module : cms::alpakatools::independent_groups(acc, endModule)) { auto firstPixel = clus_view[1 + module].moduleStart(); auto thisModuleId = digi_view[firstPixel].moduleId(); - - ALPAKA_ASSERT_OFFLOAD(nMaxModules < maxNumModules); - ALPAKA_ASSERT_OFFLOAD(startBPIX2 < nMaxModules); + while (thisModuleId == invalidModuleId and firstPixel < numElements) { + // skip invalid or duplicate pixels + ++firstPixel; + thisModuleId = digi_view[firstPixel].moduleId(); + } + if (firstPixel >= numElements) { + // reached the end of the input while skipping the invalid pixels, nothing left to do + break; + } + if (thisModuleId != clus_view[module].moduleId()) { + // reached the end of the module while skipping the invalid pixels, skip this module + continue; + } + ALPAKA_ASSERT_OFFLOAD(thisModuleId < TrackerTraits::numberOfModules); uint32_t nclus = clus_view[thisModuleId].clusInModule(); if (nclus == 0) return; if (cms::alpakatools::once_per_block(acc) && nclus > maxNumClustersPerModules) - printf("Warning too many clusters in module %d in block %d: %d > %d\n", + printf("Warning: too many clusters in module %u in block %u: %u > %d\n", thisModuleId, module, nclus, maxNumClustersPerModules); - // Stride = block size. 
- const uint32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); - - // Get thread / CPU element indices in block. - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_index_range_in_block(acc, firstPixel); - if (nclus > maxNumClustersPerModules) { - uint32_t firstElementIdx = firstElementIdxNoStride; - uint32_t endElementIdx = endElementIdxNoStride; // remove excess FIXME find a way to cut charge first.... - for (uint32_t i = firstElementIdx; i < numElements; ++i) { - if (not cms::alpakatools::next_valid_element_index_strided( - i, firstElementIdx, endElementIdx, blockDimension, numElements)) - break; + for (auto i : cms::alpakatools::independent_group_elements(acc, firstPixel, numElements)) { if (digi_view[i].moduleId() == invalidModuleId) continue; // not valid if (digi_view[i].moduleId() != thisModuleId) @@ -92,15 +88,12 @@ namespace pixelClustering { #endif ALPAKA_ASSERT_OFFLOAD(nclus <= maxNumClustersPerModules); - cms::alpakatools::for_each_element_in_block_strided(acc, nclus, [&](uint32_t i) { charge[i] = 0; }); + for (auto i : cms::alpakatools::independent_group_elements(acc, nclus)) { + charge[i] = 0; + } alpaka::syncBlockThreads(acc); - uint32_t firstElementIdx = firstElementIdxNoStride; - uint32_t endElementIdx = endElementIdxNoStride; - for (uint32_t i = firstElementIdx; i < numElements; ++i) { - if (not cms::alpakatools::next_valid_element_index_strided( - i, firstElementIdx, endElementIdx, blockDimension, numElements)) - break; + for (auto i : cms::alpakatools::independent_group_elements(acc, firstPixel, numElements)) { if (digi_view[i].moduleId() == invalidModuleId) continue; // not valid if (digi_view[i].moduleId() != thisModuleId) @@ -113,75 +106,42 @@ namespace pixelClustering { alpaka::syncBlockThreads(acc); auto chargeCut = clusterThresholds.getThresholdForLayerOnCondition(thisModuleId < startBPIX2); - bool allGood = true; - - cms::alpakatools::for_each_element_in_block_strided(acc, nclus, [&](uint32_t i) { 
- newclusId[i] = ok[i] = (charge[i] > chargeCut) ? 1 : 0; - if (ok[i] == 0) - allGood = allGood && false; - // #ifdef GPU_DEBUG - // printf("module %d -> chargeCut = %d; cluster %d; charge = %d; ok = %s\n",thisModuleId, chargeCut,i,charge[i],ok[i] > 0 ? " -> good" : "-> cut"); - // #endif - }); - alpaka::syncBlockThreads(acc); + bool good = true; + for (auto i : cms::alpakatools::independent_group_elements(acc, nclus)) { + newclusId[i] = ok[i] = (charge[i] >= chargeCut) ? 1 : 0; + if (0 == ok[i]) + good = false; + } - // if all clusters above threshold do nothing - // if (allGood) - // continue; + // if all clusters are above threshold, do nothing + if (alpaka::syncBlockThreadsPredicate(acc, good)) + continue; // renumber auto& ws = alpaka::declareSharedVar(acc); + // FIXME this value should come from cms::alpakatools::blockPrefixScan itself constexpr uint32_t maxThreads = 1024; auto minClust = std::min(nclus, maxThreads); cms::alpakatools::blockPrefixScan(acc, newclusId, minClust, ws); - if constexpr (maxNumClustersPerModules > maxThreads) //only if needed { for (uint32_t offset = maxThreads; offset < nclus; offset += maxThreads) { cms::alpakatools::blockPrefixScan(acc, newclusId + offset, nclus - offset, ws); - - cms::alpakatools::for_each_element_in_block_strided(acc, nclus - offset, [&](uint32_t i) { - uint32_t prevBlockEnd = ((i + offset / maxThreads) * maxThreads) - 1; + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, offset, nclus)) { + uint32_t prevBlockEnd = (i / maxThreads) * maxThreads - 1; newclusId[i] += newclusId[prevBlockEnd]; - }); + } alpaka::syncBlockThreads(acc); } } - ALPAKA_ASSERT_OFFLOAD(nclus >= newclusId[nclus - 1]); - if (nclus == newclusId[nclus - 1]) - return; - clus_view[thisModuleId].clusInModule() = newclusId[nclus - 1]; - alpaka::syncBlockThreads(acc); - -#ifdef GPU_DEBUG - if (thisModuleId % 100 == 1) - if (cms::alpakatools::once_per_block(acc)) - printf("module %d -> chargeCut = %d; nclus (pre cut) = %d; nclus 
(after cut) = %d\n", - thisModuleId, - chargeCut, - nclus, - clus_view[thisModuleId].clusInModule()); -#endif - // mark bad cluster again - cms::alpakatools::for_each_element_in_block_strided(acc, nclus, [&](uint32_t i) { - if (0 == ok[i]) - newclusId[i] = invalidModuleId + 1; - }); - - alpaka::syncBlockThreads(acc); // reassign id - firstElementIdx = firstElementIdxNoStride; - endElementIdx = endElementIdxNoStride; - for (uint32_t i = firstElementIdx; i < numElements; ++i) { - if (not cms::alpakatools::next_valid_element_index_strided( - i, firstElementIdx, endElementIdx, blockDimension, numElements)) - break; + for (auto i : cms::alpakatools::independent_group_elements(acc, firstPixel, numElements)) { if (digi_view[i].moduleId() == invalidModuleId) continue; // not valid if (digi_view[i].moduleId() != thisModuleId) @@ -190,18 +150,14 @@ namespace pixelClustering { digi_view[i].moduleId() = digi_view[i].clus() = invalidModuleId; else digi_view[i].clus() = newclusId[digi_view[i].clus()] - 1; - // digi_view[i].clus() = newclusId[digi_view[i].clus()] - 1; - // if (digi_view[i].clus() == invalidModuleId) - // digi_view[i].moduleId() = invalidModuleId; } + // done alpaka::syncBlockThreads(acc); - - //done } } }; } // namespace pixelClustering -#endif // +#endif // RecoLocalTracker_SiPixelClusterizer_plugins_alpaka_ClusterChargeCut_h From 357cfba38bd233458a5dfa2653e1bf8620562744 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sun, 4 Feb 2024 00:00:49 +0100 Subject: [PATCH 07/25] Update comments to pixel topologies --- .../interface/SimplePixelTopology.h | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/Geometry/CommonTopologies/interface/SimplePixelTopology.h b/Geometry/CommonTopologies/interface/SimplePixelTopology.h index 95ab7db6aa274..faf511ee87f39 100644 --- a/Geometry/CommonTopologies/interface/SimplePixelTopology.h +++ b/Geometry/CommonTopologies/interface/SimplePixelTopology.h @@ -214,26 +214,26 @@ namespace 
phase2PixelTopology { HOST_DEVICE_CONSTANT uint8_t layerPairs[2 * nPairs] = { - 0, 1, 0, 4, 0, 16, //BPIX1 (3) - 1, 2, 1, 4, 1, 16, //BPIX2 (6) - 2, 3, 2, 4, 2, 16, //BPIX3 & Forward (9) + 0, 1, 0, 4, 0, 16, // BPIX1 (3) + 1, 2, 1, 4, 1, 16, // BPIX2 (6) + 2, 3, 2, 4, 2, 16, // BPIX3 & Forward (9) - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, //POS (16) - 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, //NEG (23) + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, // POS (16) + 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, // NEG (23) 0, 2, 0, 5, 0, 17, 0, 6, 0, 18, // BPIX1 Jump (28) 1, 3, 1, 5, 1, 17, 1, 6, 1, 18, // BPIX2 Jump (33) - 11, 12, 12, 13, 13, 14, 14, 15, //Late POS (37) - 23, 24, 24, 25, 25, 26, 26, 27, //Late NEG (41) + 11, 12, 12, 13, 13, 14, 14, 15, // Late POS (37) + 23, 24, 24, 25, 25, 26, 26, 27, // Late NEG (41) - 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, //POS Jump (48) - 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, //NEG Jump (55) + 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, // POS Jump (48) + 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, // NEG Jump (55) }; HOST_DEVICE_CONSTANT uint32_t layerStart[numberOfLayers + 1] = {0, 108, 324, - 504, //Barrel + 504, // Barrel 756, 864, 972, @@ -245,7 +245,7 @@ namespace phase2PixelTopology { 1620, 1796, 1972, - 2148, //Fp + 2148, // Fp 2324, 2432, 2540, @@ -257,7 +257,7 @@ namespace phase2PixelTopology { 3188, 3364, 3540, - 3716, //Np + 3716, // Np numberOfModules}; HOST_DEVICE_CONSTANT int16_t phicuts[nPairs]{ @@ -332,7 +332,7 @@ namespace pixelTopology { static constexpr uint32_t maxCellsPerHit = 256; static constexpr uint32_t avgTracksPerHit = 10; static constexpr uint32_t maxNumberOfTuples = 256 * 1024; - //this is well above thanks to maxNumberOfTuples + // this is well above thanks to maxNumberOfTuples static constexpr uint32_t maxHitsForContainers = avgHitsPerTrack * maxNumberOfTuples; static constexpr uint32_t maxNumberOfDoublets = 5 * 512 * 1024; 
static constexpr uint32_t maxNumOfActiveDoublets = maxNumberOfDoublets / 8; @@ -381,6 +381,7 @@ namespace pixelTopology { static constexpr uint16_t numberOfModules = 3892; + // 1024 bins, 10 bits static constexpr uint16_t clusterBinning = 1024; static constexpr uint16_t clusterBits = 10; @@ -391,7 +392,7 @@ namespace pixelTopology { static constexpr uint16_t firstEndcapPos = 4; static constexpr uint16_t firstEndcapNeg = 16; - static constexpr int16_t xOffset = -1e4; //not used actually, to suppress static analyzer warnings + static constexpr int16_t xOffset = -1e4; // not used actually, to suppress static analyzer warnings static constexpr char const *nameModifier = "Phase2"; @@ -483,6 +484,7 @@ namespace pixelTopology { static constexpr uint16_t lastRowInModule = numRowsInModule - 1; static constexpr uint16_t lastColInModule = numColsInModule - 1; + // 418 bins < 512, 9 bits are enough static constexpr uint16_t clusterBinning = numColsInModule + 2; static constexpr uint16_t clusterBits = 9; @@ -557,7 +559,7 @@ namespace pixelTopology { static constexpr uint32_t maxPixInModule = 10000; static constexpr uint32_t maxNumOfActiveDoublets = - maxNumberOfDoublets / 4; //TODO need to think a better way to avoid this duplication + maxNumberOfDoublets / 4; // TODO need to think a better way to avoid this duplication static constexpr uint32_t maxCellsPerHit = 256; static constexpr uint32_t maxNumClustersPerModules = phase1HIonPixelTopology::maxNumClustersPerModules; From 8f6ecaeb044983a8af0b2bf98facc550543c05c6 Mon Sep 17 00:00:00 2001 From: Adriano Di Florio Date: Mon, 5 Feb 2024 15:38:35 +0100 Subject: [PATCH 08/25] Fixed decoding of the pixel timeout error --- .../alpaka/SiPixelRawToClusterKernel.dev.cc | 39 ++----------------- 1 file changed, 4 insertions(+), 35 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc index 
cd9509d2be46b..abf56c4c85037 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc @@ -208,9 +208,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { case (29): { if constexpr (debug) printf("Timeout on a channel (errorType = 29)\n"); - if ((errorWord >> ::pixelDetails::OMIT_ERR_shift) & ::pixelDetails::OMIT_ERR_mask) { + if (!((errorWord >> sipixelconstants::OMIT_ERR_shift) & sipixelconstants::OMIT_ERR_mask)) { if constexpr (debug) - printf("...first errorType=29 error, this gets masked out\n"); + printf("...2nd errorType=29 error, skip\n"); + break; } errorFound = true; break; @@ -251,6 +252,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { switch (errorType) { case 25: + case 29: case 30: case 31: case 36: @@ -262,39 +264,6 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { rID = rID_temp; break; } - case 29: { - int chanNmbr = 0; - const int DB0_shift = 0; - const int DB1_shift = DB0_shift + 1; - const int DB2_shift = DB1_shift + 1; - const int DB3_shift = DB2_shift + 1; - const int DB4_shift = DB3_shift + 1; - const uint32_t DataBit_mask = ~(~uint32_t(0) << 1); - - int CH1 = (errWord >> DB0_shift) & DataBit_mask; - int CH2 = (errWord >> DB1_shift) & DataBit_mask; - int CH3 = (errWord >> DB2_shift) & DataBit_mask; - int CH4 = (errWord >> DB3_shift) & DataBit_mask; - int CH5 = (errWord >> DB4_shift) & DataBit_mask; - int BLOCK_bits = 3; - int BLOCK_shift = 8; - uint32_t BLOCK_mask = ~(~uint32_t(0) << BLOCK_bits); - int BLOCK = (errWord >> BLOCK_shift) & BLOCK_mask; - int localCH = 1 * CH1 + 2 * CH2 + 3 * CH3 + 4 * CH4 + 5 * CH5; - if (BLOCK % 2 == 0) - chanNmbr = (BLOCK / 2) * 9 + localCH; - else - chanNmbr = ((BLOCK - 1) / 2) * 9 + 4 + localCH; - if ((chanNmbr < 1) || (chanNmbr > 36)) - break; // signifies unexpected result - - uint32_t roc = 1; - uint32_t link = chanNmbr; - uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId; - if 
(rID_temp != 9999) - rID = rID_temp; - break; - } case 37: case 38: { uint32_t roc = (errWord >> ::pixelDetails::ROC_shift) & ::pixelDetails::ROC_mask; From c0732699b8ade53326d76aa04e5a0ccd9fad343a Mon Sep 17 00:00:00 2001 From: Adriano Di Florio Date: Mon, 5 Feb 2024 17:19:44 +0100 Subject: [PATCH 09/25] Fix check for invalid pixel digis --- .../SiPixelDigisClustersFromSoAAlpaka.cc | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoAAlpaka.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoAAlpaka.cc index 423951f4cb74f..982239924463f 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoAAlpaka.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoAAlpaka.cc @@ -160,22 +160,26 @@ void SiPixelDigisClustersFromSoAAlpaka::produce(edm::StreamID, std::cout << "Dumping all digis. nDigis = " << nDigis << std::endl; #endif for (uint32_t i = 0; i < nDigis; i++) { -#ifdef GPU_DEBUG - PixelDigi dig2{digisView[i].pdigi()}; - std::cout << i << ";" << digisView[i].rawIdArr() << ";" << digisView[i].clus() << ";" << digisView[i].pdigi() << ";" - << digisView[i].adc() << ";" << dig2.row() << ";" << dig2.column() << std::endl; -#endif - // check for uninitialized digis if (digisView[i].rawIdArr() == 0) continue; // check for noisy/dead pixels (electrons set to 0) if (digisView[i].adc() == 0) continue; - if (digisView[i].clus() >= -pixelClustering::invalidClusterId) - continue; // not in cluster; TODO add an assert for the size + // not in cluster; TODO add an assert for the size + if (digisView[i].clus() == pixelClustering::invalidClusterId) { + continue; + } + // unexpected invalid value + if (digisView[i].clus() < pixelClustering::invalidClusterId) { + edm::LogError("SiPixelDigisClustersFromSoAAlpaka") + << "Skipping pixel digi with unexpected invalid cluster id " << digisView[i].clus(); + 
continue; + } + // from clusters killed by charge cut if (digisView[i].clus() == pixelClustering::invalidModuleId) - continue; // from clusters killed by charge cut + continue; + #ifdef EDM_ML_DEBUG assert(digisView[i].rawIdArr() > 109999); #endif @@ -200,6 +204,10 @@ void SiPixelDigisClustersFromSoAAlpaka::produce(edm::StreamID, } } PixelDigi dig{digisView[i].pdigi()}; +#ifdef GPU_DEBUG + std::cout << i << ";" << digisView[i].rawIdArr() << ";" << digisView[i].clus() << ";" << digisView[i].pdigi() << ";" + << digisView[i].adc() << ";" << dig.row() << ";" << dig.column() << std::endl; +#endif if (storeDigis_) (*detDigis).data.emplace_back(dig); From 1f9176577417a5d34f19145f41feb4136969cb4b Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 5 Feb 2024 17:39:32 +0100 Subject: [PATCH 10/25] Fix the constants used in the pixel clustering --- .../plugins/alpaka/SiPixelRawToClusterKernel.dev.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc index abf56c4c85037..11ae14f2fa6a5 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc @@ -260,7 +260,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { uint32_t roc = 1; uint32_t link = (errWord >> ::pixelDetails::LINK_shift) & ::pixelDetails::LINK_mask; uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId; - if (rID_temp != 9999) + if (rID_temp != ::pixelClustering::invalidModuleId) rID = rID_temp; break; } @@ -269,7 +269,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { uint32_t roc = (errWord >> ::pixelDetails::ROC_shift) & ::pixelDetails::ROC_mask; uint32_t link = (errWord >> ::pixelDetails::LINK_shift) & ::pixelDetails::LINK_mask; uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId; - if 
(rID_temp != 9999) + if (rID_temp != ::pixelClustering::invalidModuleId) rID = rID_temp; break; } From c116e106b894f23553a3391eaf1517e75939b194 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 5 Feb 2024 17:42:35 +0100 Subject: [PATCH 11/25] Minor clean up of legacy CUDA code --- .../plugins/SiPixelRawToClusterGPUKernel.cu | 6 +++--- .../SiPixelClusterizer/plugins/gpuClusterChargeCut.h | 11 ++++------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 452b0e2097071..1e2e3ad235b79 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -607,8 +607,8 @@ namespace pixelgpudetails { digis_d->moduleId(), clusters_d->moduleStart(), digis_d->clus(), wordCounter); cudaCheck(cudaGetLastError()); - threadsPerBlock = ((TrackerTraits::maxPixInModule / 16 + 128 - 1) / 128) * - 128; /// should be larger than maxPixInModule/16 aka (maxPixInModule/maxiter in the kernel) + // should be larger than maxPixInModule/16 aka (maxPixInModule/maxiter in the kernel) + threadsPerBlock = ((TrackerTraits::maxPixInModule / 16 + 128 - 1) / 128) * 128; blocks = TrackerTraits::numberOfModules; #ifdef GPU_DEBUG std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; @@ -752,13 +752,13 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); auto nModules_Clusters_d = cms::cuda::make_device_unique(3, stream); - // MUST be ONE block #ifdef GPU_DEBUG cudaCheck(cudaStreamSynchronize(stream)); std::cout << "CUDA fillHitsModuleStart kernel launch \n"; #endif + // MUST be ONE block fillHitsModuleStart<<<1, 1024, 0, stream>>>(clusters_d->clusInModule(), clusters_d->clusModuleStart(), clusters_d->moduleStart(), diff --git 
a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h index 1ff62ed1c6c57..f8554e341ff9c 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h @@ -30,10 +30,9 @@ namespace gpuClustering { __shared__ uint16_t newclusId[maxNumClustersPerModules]; constexpr int startBPIX2 = TrackerTraits::layerStart[1]; - [[maybe_unused]] constexpr int nMaxModules = TrackerTraits::numberOfModules; - assert(nMaxModules < maxNumModules); - assert(startBPIX2 < nMaxModules); + assert(TrackerTraits::numberOfModules < maxNumModules); + assert(startBPIX2 < TrackerTraits::numberOfModules); auto firstModule = blockIdx.x; auto endModule = moduleStart[0]; @@ -53,7 +52,7 @@ namespace gpuClustering { // reached the end of the module while skipping the invalid pixels, skip this module continue; } - assert(thisModuleId < nMaxModules); + assert(thisModuleId < TrackerTraits::numberOfModules); auto nclus = nClustersInModule[thisModuleId]; if (nclus == 0) @@ -127,12 +126,10 @@ namespace gpuClustering { { for (uint32_t offset = maxThreads; offset < nclus; offset += maxThreads) { cms::cuda::blockPrefixScan(newclusId + offset, newclusId + offset, nclus - offset, ws); - for (uint32_t i = threadIdx.x + offset; i < nclus; i += blockDim.x) { uint32_t prevBlockEnd = ((i / maxThreads) * maxThreads) - 1; newclusId[i] += newclusId[prevBlockEnd]; } - __syncthreads(); } } @@ -152,7 +149,7 @@ namespace gpuClustering { clusterId[i] = newclusId[clusterId[i]] - 1; } - //done + // done __syncthreads(); } // loop on modules } From f3592edfabea1b5fbf99a5049acde97d7790d4a7 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 6 Feb 2024 18:41:58 +0100 Subject: [PATCH 12/25] Renumber the ECAL-only alpaka workflow to .412 Renumber the ECAL-only alpaka workflow from ##.411 to ##.412, for consistency with the old gpu workflows (##.512) and the other 
alpaka workflows. --- Configuration/PyReleaseValidation/README.md | 2 +- .../PyReleaseValidation/python/upgradeWorkflowComponents.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Configuration/PyReleaseValidation/README.md b/Configuration/PyReleaseValidation/README.md index 817a20e2c86a5..ca3d8af1a6f85 100644 --- a/Configuration/PyReleaseValidation/README.md +++ b/Configuration/PyReleaseValidation/README.md @@ -30,7 +30,7 @@ The offsets currently in use are: * 0.2: Tracking Run-2 era, `Run2_2017_trackingRun2` * 0.3: 0.1 + 0.2 * 0.4: LowPU tracking era, `Run2_2017_trackingLowPU` -* 0.411: Patatrack, ECAL only, Alpaka +* 0.412: Patatrack, ECAL only, Alpaka * 0.5: Pixel tracking only + 0.1 * 0.501: Patatrack, pixel only quadruplets, on CPU * 0.502: Patatrack, pixel only quadruplets, with automatic offload to GPU if available diff --git a/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py b/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py index c6fd188a22c89..9a9084b0d4978 100644 --- a/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py +++ b/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py @@ -1073,7 +1073,7 @@ def setup_(self, step, stepName, stepDict, k, properties): '-s': 'HARVESTING:@ecalOnlyValidation+@ecal' }, suffix = 'Patatrack_ECALOnlyAlpaka', - offset = 0.411, + offset = 0.412, ) # ECAL-only workflow running on CPU From e16a1a8e7d7bdb14dbb5a0e8f4392074bd7c2835 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 6 Feb 2024 18:48:43 +0100 Subject: [PATCH 13/25] Add pixel-only alpaka workflows to the README --- Configuration/PyReleaseValidation/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Configuration/PyReleaseValidation/README.md b/Configuration/PyReleaseValidation/README.md index ca3d8af1a6f85..041fefe015376 100644 --- a/Configuration/PyReleaseValidation/README.md +++ b/Configuration/PyReleaseValidation/README.md @@ 
-30,7 +30,10 @@ The offsets currently in use are: * 0.2: Tracking Run-2 era, `Run2_2017_trackingRun2` * 0.3: 0.1 + 0.2 * 0.4: LowPU tracking era, `Run2_2017_trackingLowPU` -* 0.412: Patatrack, ECAL only, Alpaka +* 0.402: Alpaka, pixel only quadruplets, portable +* 0.403: Alpaka, pixel only quadruplets, portable vs. CPU validation +* 0.404: Alpaka, pixel only quadruplets, portable profiling +* 0.412: Alpaka, ECAL only, portable * 0.5: Pixel tracking only + 0.1 * 0.501: Patatrack, pixel only quadruplets, on CPU * 0.502: Patatrack, pixel only quadruplets, with automatic offload to GPU if available From 0851cc3dd29017d42780882ccb5d98d3026bc198 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 7 Feb 2024 11:01:08 +0100 Subject: [PATCH 14/25] Add alpaka workflows to the GPU relvals Add pixel-only and ECAL-only alpaka workflows to the set of GPU relvals. --- .../PyReleaseValidation/python/relval_gpu.py | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/Configuration/PyReleaseValidation/python/relval_gpu.py b/Configuration/PyReleaseValidation/python/relval_gpu.py index 3eeabf9d3c43a..f7d9afdede813 100644 --- a/Configuration/PyReleaseValidation/python/relval_gpu.py +++ b/Configuration/PyReleaseValidation/python/relval_gpu.py @@ -13,7 +13,11 @@ # mc WFs to run in IB: -# mc 2023 Patatrack pixel-only quadruplets: ZMM - on GPU (optional), GPU-vs-CPU validation, profiling +# mc 2023 Alpaka pixel-only quadruplets: ZMM: any backend, any backend vs cpu validation, profiling +# Alpaka pixel-only quadruplets: TTbar: any backend, any backend vs cpu validation, profiling +# Alpaka ECAL-only: TTbar: any backend +# Alpaka pixel-only quadruplets: Single Nu E10: any backend +# Patatrack pixel-only quadruplets: ZMM - on GPU (optional), GPU-vs-CPU validation, profiling # Patatrack pixel-only triplets: ZMM - on GPU (optional), GPU-vs-CPU validation, profiling # Patatrack pixel-only quadruplets: TTbar - on GPU (optional), GPU-vs-CPU validation, 
profiling # Patatrack pixel-only triplets: TTbar - on GPU (optional), GPU-vs-CPU validation, profiling @@ -23,10 +27,17 @@ # Patatrack pixel-only triplets, ECAL, HCAL: TTbar - on GPU (optional), GPU-vs-CPU validation, profiling (to be implemented) # full reco with Patatrack pixel-only quadruplets: TTbar - on GPU (optional), GPU-vs-CPU validation # full reco with Patatrack pixel-only triplets: TTbar - on GPU (optional), GPU-vs-CPU validation -# Patatrack Single Nu E10 on GPU (optional) -# mc 2026 Patatrack Single Nu E10 on GPU (optional) +# Patatrack pixel-only quadruplets: Single Nu E10 on GPU (optional) +# mc 2026 Patatrack pixel-only quadruplets: Single Nu E10 on GPU (optional) numWFIB = [ - # 2023 + # 2023, Alpaka-based + 12450.402, 12450.403, 12450.404, + #12450.406, 12450.407, 12450.408, + 12434.402, 12434.403, 12434.404, + #12434.406, 12434.407, 12434.408, + 12434.412, #12434.413, 12434.414, + 12461.402, + # 2023, CUDA-based 12450.502, 12450.503, 12450.504, 12450.506, 12450.507, 12450.508, 12434.502, 12434.503, 12434.504, @@ -37,7 +48,8 @@ 12434.586, 12434.587, # 12434.588, 12434.592, 12434.593, 12434.596, 12434.597, - 12461.502, + 12461.502, + # 2026, CUDA-based 24861.502 ] From 9369dbc260a50aa3571aabf86e4332d574d155ba Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 7 Feb 2024 10:46:44 +0100 Subject: [PATCH 15/25] Add a protection for quasi-empty events Do not call the fishbone for quasi-empty events, with pixel hits only in the innermost layer. Clean up the implementation of the fishbone. Extend TrackingRecHits collections for testing and their unit tests. 
Co-authored-by: Adriano Di Florio --- .../interface/TrackingRecHitsDevice.h | 26 +- .../interface/TrackingRecHitsHost.h | 17 +- .../alpaka/TrackingRecHitsSoACollection.h | 19 +- .../test/alpaka/Hits_test.cc | 37 ++- .../test/alpaka/Hits_test.dev.cc | 27 +- .../plugins/alpaka/PixelRecHitKernels.dev.cc | 2 +- .../PixelSeeding/plugins/alpaka/CAFishbone.h | 261 +++++++++--------- .../plugins/alpaka/CAHitNtupletGenerator.cc | 6 +- .../CAHitNtupletGeneratorKernels.dev.cc | 42 +-- .../alpaka/CAHitNtupletGeneratorKernels.h | 6 +- .../plugins/alpaka/CAPixelDoublets.h | 2 +- 11 files changed, 247 insertions(+), 198 deletions(-) diff --git a/DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsDevice.h b/DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsDevice.h index c0fc252729df7..7a0104dc899cb 100644 --- a/DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsDevice.h +++ b/DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsDevice.h @@ -8,23 +8,23 @@ #include "DataFormats/Portable/interface/PortableDeviceCollection.h" #include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsHost.h" #include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" -#include "HeterogeneousCore/AlpakaInterface/interface/config.h" template class TrackingRecHitDevice : public PortableDeviceCollection, TDev> { public: using hitSoA = TrackingRecHitSoA; - //Need to decorate the class with the inherited portable accessors being now a template + + // Need to decorate the class with the inherited portable accessors being now a template using PortableDeviceCollection, TDev>::view; using PortableDeviceCollection, TDev>::const_view; using PortableDeviceCollection, TDev>::buffer; TrackingRecHitDevice() = default; - // Constructor which specifies the SoA size + // Constructor which specifies the SoA size, number of BPIX1 hits, and the modules entry points template - explicit TrackingRecHitDevice(uint32_t nHits, int32_t offsetBPIX2, uint32_t const* hitsModuleStart, TQueue queue) - 
: PortableDeviceCollection, TDev>(nHits, queue) { + explicit TrackingRecHitDevice(TQueue queue, uint32_t nHits, int32_t offsetBPIX2, uint32_t const* hitsModuleStart) + : PortableDeviceCollection, TDev>(nHits, queue), offsetBPIX2_{offsetBPIX2} { const auto device = alpaka::getDev(queue); auto start_h = cms::alpakatools::make_host_view(hitsModuleStart, TrackerTraits::numberOfModules + 1); @@ -39,6 +39,22 @@ class TrackingRecHitDevice : public PortableDeviceCollection + void updateFromDevice(TQueue queue) { + auto off_h = cms::alpakatools::make_host_view(offsetBPIX2_); + auto off_d = cms::alpakatools::make_device_view(alpaka::getDev(queue), view().offsetBPIX2()); + alpaka::memcpy(queue, off_h, off_d); + } + +private: + // offsetBPIX2 is used on host functions so is useful to have it also stored in the class and not only in the layout + int32_t offsetBPIX2_ = 0; }; + #endif // DataFormats_RecHits_interface_TrackingRecHitSoADevice_h diff --git a/DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsHost.h b/DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsHost.h index ce3f57232ac93..e7212ce9a6252 100644 --- a/DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsHost.h +++ b/DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsHost.h @@ -13,27 +13,36 @@ template class TrackingRecHitHost : public PortableHostCollection> { public: using hitSoA = TrackingRecHitSoA; - //Need to decorate the class with the inherited portable accessors being now a template + + // Need to decorate the class with the inherited portable accessors being now a template using PortableHostCollection>::view; using PortableHostCollection>::const_view; using PortableHostCollection>::buffer; TrackingRecHitHost() = default; + // Constructor which specifies only the SoA size, to be used when copying the results from the device to the host template - explicit TrackingRecHitHost(uint32_t nHits, TQueue queue) + explicit TrackingRecHitHost(TQueue queue, uint32_t nHits) : 
PortableHostCollection>(nHits, queue) {} - // Constructor which specifies the SoA size + // Constructor which specifies the SoA size, number of BPIX1 hits, and the modules entry points template - explicit TrackingRecHitHost(uint32_t nHits, int32_t offsetBPIX2, uint32_t const* hitsModuleStart, TQueue queue) + explicit TrackingRecHitHost(TQueue queue, uint32_t nHits, int32_t offsetBPIX2, uint32_t const* hitsModuleStart) : PortableHostCollection>(nHits, queue) { std::copy(hitsModuleStart, hitsModuleStart + TrackerTraits::numberOfModules + 1, view().hitsModuleStart().data()); view().offsetBPIX2() = offsetBPIX2; } uint32_t nHits() const { return view().metadata().size(); } + + int32_t offsetBPIX2() const { return view().offsetBPIX2(); } + uint32_t const* hitsModuleStart() const { return view().hitsModuleStart().data(); } + + // do nothing for a host collection + template + void updateFromDevice(TQueue) {} }; using TrackingRecHitHostPhase1 = TrackingRecHitHost; diff --git a/DataFormats/TrackingRecHitSoA/interface/alpaka/TrackingRecHitsSoACollection.h b/DataFormats/TrackingRecHitSoA/interface/alpaka/TrackingRecHitsSoACollection.h index 0e0e848afcfd9..2a7439c34c513 100644 --- a/DataFormats/TrackingRecHitSoA/interface/alpaka/TrackingRecHitsSoACollection.h +++ b/DataFormats/TrackingRecHitSoA/interface/alpaka/TrackingRecHitsSoACollection.h @@ -1,12 +1,14 @@ -#ifndef DataFormats_RecHits_interface_alpakaTrackingRecHitsSoACollection -#define DataFormats_RecHits_interface_alpakaTrackingRecHitsSoACollection +#ifndef DataFormats_TrackingRecHitSoA_interface_alpaka_TrackingRecHitsSoACollection_h +#define DataFormats_TrackingRecHitSoA_interface_alpaka_TrackingRecHitsSoACollection_h #include + #include + #include "DataFormats/Portable/interface/alpaka/PortableCollection.h" -#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" -#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsHost.h" #include 
"DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsDevice.h" +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsHost.h" +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "HeterogeneousCore/AlpakaInterface/interface/CopyToHost.h" @@ -17,7 +19,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { TrackingRecHitHost, TrackingRecHitDevice>; - //Classes definition for Phase1/Phase2, to make the classes_def lighter. Not actually used in the code. + // Classes definition for Phase1/Phase2, to make the classes_def lighter. Not actually used in the code. using TrackingRecHitSoAPhase1 = TrackingRecHitsSoACollection; using TrackingRecHitSoAPhase2 = TrackingRecHitsSoACollection; using TrackingRecHitSoAHIonPhase1 = TrackingRecHitsSoACollection; @@ -29,10 +31,13 @@ namespace cms::alpakatools { struct CopyToHost> { template static auto copyAsync(TQueue& queue, TrackingRecHitDevice const& deviceData) { - TrackingRecHitHost hostData(deviceData.view().metadata().size(), queue); + TrackingRecHitHost hostData(queue, deviceData.view().metadata().size()); alpaka::memcpy(queue, hostData.buffer(), deviceData.buffer()); #ifdef GPU_DEBUG printf("TrackingRecHitsSoACollection: I'm copying to host.\n"); + alpaka::wait(queue); + assert(deviceData.nHits() == hostData.nHits()); + assert(deviceData.offsetBPIX2() == hostData.offsetBPIX2()); #endif return hostData; } @@ -43,4 +48,4 @@ ASSERT_DEVICE_MATCHES_HOST_COLLECTION(TrackingRecHitSoAPhase1, TrackingRecHitHos ASSERT_DEVICE_MATCHES_HOST_COLLECTION(TrackingRecHitSoAPhase2, TrackingRecHitHostPhase2); ASSERT_DEVICE_MATCHES_HOST_COLLECTION(TrackingRecHitSoAHIonPhase1, TrackingRecHitHostHIonPhase1); -#endif // DataFormats_RecHits_interface_alpakaTrackingRecHitsSoACollection \ No newline at end of file +#endif // DataFormats_TrackingRecHitSoA_interface_alpaka_TrackingRecHitsSoACollection_h diff --git 
a/DataFormats/TrackingRecHitSoA/test/alpaka/Hits_test.cc b/DataFormats/TrackingRecHitSoA/test/alpaka/Hits_test.cc index 378bb95db7b30..fa8c63bb627b3 100644 --- a/DataFormats/TrackingRecHitSoA/test/alpaka/Hits_test.cc +++ b/DataFormats/TrackingRecHitSoA/test/alpaka/Hits_test.cc @@ -1,18 +1,16 @@ -#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsHost.h" +#include + +#include + #include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsDevice.h" +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsHost.h" #include "DataFormats/TrackingRecHitSoA/interface/alpaka/TrackingRecHitsSoACollection.h" - +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "HeterogeneousCore/AlpakaInterface/interface/devices.h" -#include "HeterogeneousCore/AlpakaInterface/interface/host.h" #include "HeterogeneousCore/AlpakaInterface/interface/memory.h" -#include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" - -#include -#include - using namespace ALPAKA_ACCELERATOR_NAMESPACE; namespace ALPAKA_ACCELERATOR_NAMESPACE { @@ -25,7 +23,6 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // namespace ALPAKA_ACCELERATOR_NAMESPACE int main() { - const auto host = cms::alpakatools::host(); const auto device = cms::alpakatools::devices()[0]; Queue queue(device); @@ -35,13 +32,29 @@ int main() { int32_t offset = 100; uint32_t moduleStart[pixelTopology::Phase1::numberOfModules + 1]; - for (size_t i = 0; i < pixelTopology::Phase1::numberOfModules + 1; i++) { + for (size_t i = 0; i < pixelTopology::Phase1::numberOfModules + 1; ++i) { moduleStart[i] = i * 2; } - TrackingRecHitsSoACollection tkhit(nHits, offset, &moduleStart[0], queue); + TrackingRecHitsSoACollection tkhit(queue, nHits, offset, moduleStart); 
testTrackingRecHitSoA::runKernels(tkhit.view(), queue); + tkhit.updateFromDevice(queue); + +#if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED or defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED + // requires c++23 to make cms::alpakatools::CopyToHost compile using if constexpr + // see https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2593r0.html + TrackingRecHitHost const& host_collection = tkhit; +#else + TrackingRecHitHost host_collection = + cms::alpakatools::CopyToHost >::copyAsync(queue, tkhit); +#endif + // wait for the kernel and the potential copy to complete alpaka::wait(queue); + assert(tkhit.nHits() == nHits); + assert(tkhit.offsetBPIX2() == 22); // set in the kernel + assert(tkhit.nHits() == host_collection.nHits()); + assert(tkhit.offsetBPIX2() == host_collection.offsetBPIX2()); } + return 0; } diff --git a/DataFormats/TrackingRecHitSoA/test/alpaka/Hits_test.dev.cc b/DataFormats/TrackingRecHitSoA/test/alpaka/Hits_test.dev.cc index 79d8bd69cbc3a..1ea67ad822536 100644 --- a/DataFormats/TrackingRecHitSoA/test/alpaka/Hits_test.dev.cc +++ b/DataFormats/TrackingRecHitSoA/test/alpaka/Hits_test.dev.cc @@ -12,14 +12,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { namespace testTrackingRecHitSoA { template - class TestFillKernel { - public: + struct TestFillKernel { template >> ALPAKA_FN_ACC void operator()(TAcc const& acc, TrackingRecHitSoAView soa) const { const uint32_t i(alpaka::getIdx(acc)[0u]); const uint32_t j(alpaka::getIdx(acc)[0u]); - if (i == 0 and j == 0) { + if (cms::alpakatools::once_per_grid(acc)) { soa.offsetBPIX2() = 22; soa[10].xLocal() = 1.11; } @@ -30,22 +29,20 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { }; template - class ShowKernel { - public: + struct ShowKernel { template >> ALPAKA_FN_ACC void operator()(TAcc const& acc, TrackingRecHitSoAConstView soa) const { - const uint32_t i(alpaka::getIdx(acc)[0u]); - const uint32_t j(alpaka::getIdx(acc)[0u]); - - if (i == 0 and j == 0) { - printf("nbins = %d \n", soa.phiBinner().nbins()); - 
printf("offsetBPIX %d ->%d \n", i, soa.offsetBPIX2()); - printf("nHits %d ->%d \n", i, soa.metadata().size()); - //printf("hitsModuleStart %d ->%d \n", i, soa.hitsModuleStart().at(28)); + if (cms::alpakatools::once_per_grid(acc)) { + printf("nbins = %d\n", soa.phiBinner().nbins()); + printf("offsetBPIX = %d\n", soa.offsetBPIX2()); + printf("nHits = %d\n", soa.metadata().size()); + //printf("hitsModuleStart[28] = %d\n", soa[28].hitsModuleStart()); } - if (i < 10) // can be increased to soa.nHits() for debugging - printf("iPhi %d ->%d \n", i, soa[i].iphi()); + // can be increased to soa.nHits() for debugging + for (uint32_t i : cms::alpakatools::elements_with_stride(acc, 10)) { + printf("iPhi %d -> %d\n", i, soa[i].iphi()); + } } }; diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHitKernels.dev.cc b/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHitKernels.dev.cc index f0d61a646c0ce..95592e1d5b3a2 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHitKernels.dev.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHitKernels.dev.cc @@ -56,7 +56,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto nHits = clusters_d.nClusters(); auto offsetBPIX2 = clusters_d.offsetBPIX2(); - TrackingRecHitsSoACollection hits_d(nHits, offsetBPIX2, clusters_d->clusModuleStart(), queue); + TrackingRecHitsSoACollection hits_d(queue, nHits, offsetBPIX2, clusters_d->clusModuleStart()); int activeModulesWithDigis = digis_d.nModules(); diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h index 343e0cf9ad005..6e89adf483ff7 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h @@ -1,5 +1,5 @@ -#ifndef RecoPixelVertexing_PixelTriplets_alpaka_CAFishbone_h -#define RecoPixelVertexing_PixelTriplets_alpaka_CAFishbone_h +#ifndef RecoTracker_PixelSeeding_plugins_alpaka_CAFishbone_h +#define 
RecoTracker_PixelSeeding_plugins_alpaka_CAFishbone_h #include #include @@ -8,141 +8,146 @@ #include #include -#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" -#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" -#include "HeterogeneousCore/AlpakaInterface/interface/VecArray.h" + #include "DataFormats/Math/interface/approx_atan2.h" +#include "HeterogeneousCore/AlpakaInterface/interface/VecArray.h" +#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "CACell.h" #include "CAStructures.h" -namespace ALPAKA_ACCELERATOR_NAMESPACE { - namespace caPixelDoublets { - - template - using CellNeighbors = caStructures::CellNeighborsT; - template - using CellTracks = caStructures::CellTracksT; - template - using CellNeighborsVector = caStructures::CellNeighborsVectorT; - template - using CellTracksVector = caStructures::CellTracksVectorT; - template - using OuterHitOfCell = caStructures::OuterHitOfCellT; - template - using HitsConstView = typename CACellT::HitsConstView; - - template - class CAFishbone { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const& acc, - HitsConstView hh, - CACellT* cells, - uint32_t const* __restrict__ nCells, - OuterHitOfCell const* isOuterHitOfCellWrap, - int32_t nHits, - bool checkTrack) const { - if (nHits <= isOuterHitOfCellWrap->offset) - return; - constexpr auto maxCellsPerHit = CACellT::maxCellsPerHit; - - auto const isOuterHitOfCell = isOuterHitOfCellWrap->container; - - // x runs faster... 
- - float x[maxCellsPerHit], y[maxCellsPerHit], z[maxCellsPerHit], n[maxCellsPerHit]; - uint16_t d[maxCellsPerHit]; - uint32_t cc[maxCellsPerHit]; - uint8_t l[maxCellsPerHit]; - const uint32_t dimIndexY = 0u; - const uint32_t dimIndexX = 1u; - const uint32_t blockDimensionX(alpaka::getWorkDiv(acc)[dimIndexX]); - const auto& [firstElementIdxNoStrideX, endElementIdxNoStrideX] = - cms::alpakatools::element_index_range_in_block(acc, 0u, dimIndexX); - - // Outermost loop on Y - const uint32_t gridDimensionY(alpaka::getWorkDiv(acc)[dimIndexY]); - const auto& [firstElementIdxNoStrideY, endElementIdxNoStrideY] = - cms::alpakatools::element_index_range_in_grid(acc, 0u, dimIndexY); - uint32_t firstElementIdxY = firstElementIdxNoStrideY; - uint32_t endElementIdxY = endElementIdxNoStrideY; - - for (uint32_t idy = firstElementIdxY, nt = nHits; idy < nt; ++idy) { +namespace ALPAKA_ACCELERATOR_NAMESPACE::caPixelDoublets { + + template + using CellNeighbors = caStructures::CellNeighborsT; + template + using CellTracks = caStructures::CellTracksT; + template + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + template + using CellTracksVector = caStructures::CellTracksVectorT; + template + using OuterHitOfCell = caStructures::OuterHitOfCellT; + template + using HitsConstView = typename CACellT::HitsConstView; + + template + class CAFishbone { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, + HitsConstView hh, + CACellT* cells, + uint32_t const* __restrict__ nCells, + OuterHitOfCell const* isOuterHitOfCellWrap, + int32_t nHits, + bool checkTrack) const { + constexpr auto maxCellsPerHit = CACellT::maxCellsPerHit; + + int32_t layer2Offset = isOuterHitOfCellWrap->offset; + // if there are no hits outside of the BPIX1, there is nothing to do + if (nHits <= layer2Offset) + return; + + auto const isOuterHitOfCell = isOuterHitOfCellWrap->container; + + float x[maxCellsPerHit], y[maxCellsPerHit], z[maxCellsPerHit], n[maxCellsPerHit]; + uint32_t 
cc[maxCellsPerHit]; + uint16_t d[maxCellsPerHit]; + uint8_t l[maxCellsPerHit]; + + // index Dim::value - 1 runs faster... + constexpr uint32_t dimIndexX = 1u; + const uint32_t blockDimensionX(alpaka::getWorkDiv(acc)[dimIndexX]); + const auto [firstElementIdxNoStrideX, endElementIdxNoStrideX] = + cms::alpakatools::element_index_range_in_block(acc, 0u, dimIndexX); + + // Outermost loop on Y + constexpr uint32_t dimIndexY = 0u; + const uint32_t gridDimensionY(alpaka::getWorkDiv(acc)[dimIndexY]); + const auto [firstElementIdxNoStrideY, endElementIdxNoStrideY] = + cms::alpakatools::element_index_range_in_grid(acc, 0u, dimIndexY); + uint32_t firstElementIdxY = firstElementIdxNoStrideY; + uint32_t endElementIdxY = endElementIdxNoStrideY; + + for (uint32_t idy = firstElementIdxY, nt = nHits - layer2Offset; idy < nt; ++idy) { + if (not cms::alpakatools::next_valid_element_index_strided( + idy, firstElementIdxY, endElementIdxY, gridDimensionY, nt)) + break; + + auto const& vc = isOuterHitOfCell[idy]; + auto size = vc.size(); + if (size < 2) + continue; + // if alligned kill one of the two. + // in principle one could try to relax the cut (only in r-z?) 
for jumping-doublets + auto const& c0 = cells[vc[0]]; + auto xo = c0.outer_x(hh); + auto yo = c0.outer_y(hh); + auto zo = c0.outer_z(hh); + auto sg = 0; + for (int32_t ic = 0; ic < size; ++ic) { + auto& ci = cells[vc[ic]]; + if (ci.unused()) + continue; // for triplets equivalent to next + if (checkTrack && ci.tracks().empty()) + continue; + cc[sg] = vc[ic]; + l[sg] = ci.layerPairId(); + d[sg] = ci.inner_detIndex(hh); + x[sg] = ci.inner_x(hh) - xo; + y[sg] = ci.inner_y(hh) - yo; + z[sg] = ci.inner_z(hh) - zo; + n[sg] = x[sg] * x[sg] + y[sg] * y[sg] + z[sg] * z[sg]; + ++sg; + } + if (sg < 2) + continue; + + // here we parallelize in X + uint32_t firstElementIdxX = firstElementIdxNoStrideX; + uint32_t endElementIdxX = endElementIdxNoStrideX; + for (uint32_t ic = firstElementIdxX; (int)ic < sg - 1; ++ic) { if (not cms::alpakatools::next_valid_element_index_strided( - idy, firstElementIdxY, endElementIdxY, gridDimensionY, nt)) + ic, firstElementIdxX, endElementIdxX, blockDimensionX, sg - 1)) break; - auto const& vc = isOuterHitOfCell[idy]; - auto s = vc.size(); - if (s < 2) - continue; + auto& ci = cells[cc[ic]]; + for (auto jc = ic + 1; (int)jc < sg; ++jc) { + auto& cj = cells[cc[jc]]; + // must be different detectors (in the same layer) + // if (d[ic]==d[jc]) continue; + auto cos12 = x[ic] * x[jc] + y[ic] * y[jc] + z[ic] * z[jc]; - auto const& c0 = cells[vc[0]]; - auto xo = c0.outer_x(hh); - auto yo = c0.outer_y(hh); - auto zo = c0.outer_z(hh); - auto sg = 0; - for (int32_t ic = 0; ic < s; ++ic) { - auto& ci = cells[vc[ic]]; - if (ci.unused()) - continue; // for triplets equivalent to next - if (checkTrack && ci.tracks().empty()) - continue; - cc[sg] = vc[ic]; - d[sg] = ci.inner_detIndex(hh); - l[sg] = ci.layerPairId(); - x[sg] = ci.inner_x(hh) - xo; - y[sg] = ci.inner_y(hh) - yo; - z[sg] = ci.inner_z(hh) - zo; - n[sg] = x[sg] * x[sg] + y[sg] * y[sg] + z[sg] * z[sg]; - ++sg; - } - if (sg < 2) - continue; - // here we parallelize in X - uint32_t firstElementIdxX = 
firstElementIdxNoStrideX; - uint32_t endElementIdxX = endElementIdxNoStrideX; - for (uint32_t ic = firstElementIdxX; (int)ic < sg - 1; ++ic) { - if (not cms::alpakatools::next_valid_element_index_strided( - ic, firstElementIdxX, endElementIdxX, blockDimensionX, sg - 1)) - break; - - auto& ci = cells[cc[ic]]; - for (auto jc = ic + 1; (int)jc < sg; ++jc) { - auto& cj = cells[cc[jc]]; - // must be different detectors (in the same layer) - // if (d[ic]==d[jc]) continue; - // || l[ic]!=l[jc]) continue; - auto cos12 = x[ic] * x[jc] + y[ic] * y[jc] + z[ic] * z[jc]; - - if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * (n[ic] * n[jc])) { - // alligned: kill farthest (prefer consecutive layers) - // if same layer prefer farthest (longer level arm) and make space for intermediate hit - bool sameLayer = l[ic] == l[jc]; - if (n[ic] > n[jc]) { - if (sameLayer) { - cj.kill(); // closest - ci.setFishbone(acc, cj.inner_hit_id(), cj.inner_z(hh), hh); - } else { - ci.kill(); // farthest - // break; // removed to improve reproducibility. keep it for reference and tests - } + if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * (n[ic] * n[jc])) { + // alligned: kill farthest (prefer consecutive layers) + // if same layer prefer farthest (longer level arm) and make space for intermediate hit + bool sameLayer = l[ic] == l[jc]; + if (n[ic] > n[jc]) { + if (sameLayer) { + cj.kill(); // closest + ci.setFishbone(acc, cj.inner_hit_id(), cj.inner_z(hh), hh); + } else { + ci.kill(); // farthest + // break; // removed to improve reproducibility, keep it for reference and tests + } + } else { + if (!sameLayer) { + cj.kill(); // farthest } else { - if (!sameLayer) { - cj.kill(); // farthest - } else { - ci.kill(); // closest - cj.setFishbone(acc, ci.inner_hit_id(), ci.inner_z(hh), hh); - // break; // removed to improve reproducibility. 
keep it for reference and tests - } + ci.kill(); // closest + cj.setFishbone(acc, ci.inner_hit_id(), ci.inner_z(hh), hh); + // break; // removed to improve reproducibility, keep it for reference and tests } } - } //cj - } // ci - } // hits - } - }; - } // namespace caPixelDoublets -} // namespace ALPAKA_ACCELERATOR_NAMESPACE -#endif // RecoPixelVertexing_PixelTriplets_alpaka_CAFishbone_h + } + } // cj + } // ci + } // hits + } + }; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::caPixelDoublets + +#endif // RecoTracker_PixelSeeding_plugins_alpaka_CAFishbone_h diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.cc b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.cc index 8f898872a66f4..1a62b0d26f904 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.cc +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.cc @@ -300,10 +300,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { TrackSoA tracks(queue); - GPUKernels kernels(m_params, hits_d.view().metadata().size(), queue); + GPUKernels kernels(m_params, hits_d.view().metadata().size(), hits_d.offsetBPIX2(), queue); - kernels.buildDoublets(hits_d.view(), queue); - kernels.launchKernels(hits_d.view(), tracks.view(), queue); + kernels.buildDoublets(hits_d.view(), hits_d.offsetBPIX2(), queue); + kernels.launchKernels(hits_d.view(), hits_d.offsetBPIX2(), tracks.view(), queue); HelixFit fitter(bfield, m_params.fitNas4_); fitter.allocate(kernels.tupleMultiplicity(), tracks.view()); diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.dev.cc b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.dev.cc index 44e3295bdb606..d5067476063b8 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.dev.cc +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.dev.cc @@ -19,6 +19,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { template 
CAHitNtupletGeneratorKernels::CAHitNtupletGeneratorKernels(Params const ¶ms, uint32_t nhits, + uint32_t offsetBPIX2, Queue &queue) : m_params(params), ////////////////////////////////////////////////////////// @@ -35,7 +36,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { cms::alpakatools::make_device_buffer(queue, m_params.caParams_.maxNumberOfDoublets_)}, // in principle we can use "nhits" to heuristically dimension the workspace... device_isOuterHitOfCell_{ - cms::alpakatools::make_device_buffer(queue, std::max(1u, nhits))}, + cms::alpakatools::make_device_buffer(queue, std::max(1u, nhits - offsetBPIX2))}, isOuterHitOfCell_{cms::alpakatools::make_device_buffer(queue)}, device_theCellNeighbors_{cms::alpakatools::make_device_buffer(queue)}, @@ -77,6 +78,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { template void CAHitNtupletGeneratorKernels::launchKernels(const HitsConstView &hh, + uint32_t offsetBPIX2, TkSoAView &tracks_view, Queue &queue) { using namespace caPixelDoublets; @@ -85,7 +87,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // zero tuples HitContainer::template launchZero(&(tracks_view.hitIndices()), queue); - int32_t nhits = hh.metadata().size(); + uint32_t nhits = hh.metadata().size(); #ifdef NTUPLE_DEBUG std::cout << "start tuple building. 
N hits " << nhits << std::endl; @@ -94,7 +96,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #endif // - // applying conbinatoric cleaning such as fishbone at this stage is too expensive + // applying combinatoric cleaning such as fishbone at this stage is too expensive // const auto nthTot = 64; @@ -123,11 +125,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { this->m_params.caParams_); // do not run the fishbone if there are hits only in BPIX1 - if (this->m_params.earlyFishbone_) { + if (this->m_params.earlyFishbone_ and nhits > offsetBPIX2) { const auto nthTot = 128; const auto stride = 16; const auto blockSize = nthTot / stride; - const auto numberOfBlocks = cms::alpakatools::divide_up_by(nhits, blockSize); + const auto numberOfBlocks = cms::alpakatools::divide_up_by(nhits - offsetBPIX2, blockSize); const Vec2D blks{numberOfBlocks, 1u}; const Vec2D thrs{blockSize, stride}; const auto fishboneWorkDiv = cms::alpakatools::make_workdiv(blks, thrs); @@ -224,11 +226,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::wait(queue); #endif // do not run the fishbone if there are hits only in BPIX1 - if (this->m_params.lateFishbone_) { + if (this->m_params.lateFishbone_ and nhits > offsetBPIX2) { const auto nthTot = 128; const auto stride = 16; const auto blockSize = nthTot / stride; - const auto numberOfBlocks = cms::alpakatools::divide_up_by(nhits, blockSize); + const auto numberOfBlocks = cms::alpakatools::divide_up_by(nhits - offsetBPIX2, blockSize); const Vec2D blks{numberOfBlocks, 1u}; const Vec2D thrs{blockSize, stride}; const auto workDiv2D = cms::alpakatools::make_workdiv(blks, thrs); @@ -250,17 +252,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } template - void CAHitNtupletGeneratorKernels::buildDoublets(const HitsConstView &hh, Queue &queue) { - auto nhits = hh.metadata().size(); - + void CAHitNtupletGeneratorKernels::buildDoublets(const HitsConstView &hh, + uint32_t offsetBPIX2, + Queue &queue) { using namespace caPixelDoublets; - using CACell = CACellT; 
using OuterHitOfCell = typename CACell::OuterHitOfCell; using CellNeighbors = typename CACell::CellNeighbors; using CellTracks = typename CACell::CellTracks; using OuterHitOfCellContainer = typename CACell::OuterHitOfCellContainer; + auto nhits = hh.metadata().size(); #ifdef NTUPLE_DEBUG std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; #endif @@ -290,7 +292,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { { int threadsPerBlock = 128; // at least one block! - int blocks = std::max(1u, cms::alpakatools::divide_up_by(nhits, threadsPerBlock)); + int blocks = std::max(1u, cms::alpakatools::divide_up_by(nhits - offsetBPIX2, threadsPerBlock)); const auto workDiv1D = cms::alpakatools::make_workdiv(blocks, threadsPerBlock); alpaka::exec(queue, @@ -523,14 +525,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } #endif } - // This will make sense when we will be able to run this once per job in Alpaka - /* -template -void CAHitNtupletGeneratorKernels::printCounters() { + + /* This will make sense when we will be able to run this once per job in Alpaka + + template + void CAHitNtupletGeneratorKernels::printCounters() { auto workDiv1D = cms::alpakatools::make_workdiv(1,1); - alpaka::exec(queue_,workDiv1D,Kernel_printCounters{},this->counters_.data()); -} -*/ + alpaka::exec(queue_, workDiv1D, Kernel_printCounters{}, this->counters_.data()); + } + */ + template class CAHitNtupletGeneratorKernels; template class CAHitNtupletGeneratorKernels; template class CAHitNtupletGeneratorKernels; diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.h index d55be09e6e497..c52768cca745e 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.h +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.h @@ -233,16 +233,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { using Quality = ::pixelTrack::Quality; using HitContainer = typename 
reco::TrackSoA::HitContainer; - CAHitNtupletGeneratorKernels(Params const& params, uint32_t nhits, Queue& queue); + CAHitNtupletGeneratorKernels(Params const& params, uint32_t nhits, uint32_t offsetBPIX2, Queue& queue); ~CAHitNtupletGeneratorKernels() = default; TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.data(); } - void launchKernels(const HitsConstView& hh, TkSoAView& track_view, Queue& queue); + void launchKernels(const HitsConstView& hh, uint32_t offsetBPIX2, TkSoAView& track_view, Queue& queue); void classifyTuples(const HitsConstView& hh, TkSoAView& track_view, Queue& queue); - void buildDoublets(const HitsConstView& hh, Queue& queue); + void buildDoublets(const HitsConstView& hh, uint32_t offsetBPIX2, Queue& queue); static void printCounters(); diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h index 0b5ab0a985163..2bbdcdb7b084c 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h @@ -25,7 +25,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { CellTracks* cellTracksContainer) const { ALPAKA_ASSERT_OFFLOAD((*isOuterHitOfCell).container); - for (auto i : cms::alpakatools::elements_with_stride(acc, nHits)) + for (auto i : cms::alpakatools::elements_with_stride(acc, nHits - isOuterHitOfCell->offset)) (*isOuterHitOfCell).container[i].reset(); if (cms::alpakatools::once_per_grid(acc)) { From 63902cb243e7d3be176ab8aba7588a235171695a Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 9 Feb 2024 08:25:56 +0100 Subject: [PATCH 16/25] Fix include guards, clean up namespaces and includes --- .../SiPixelGainCalibrationForHLTDevice.h | 5 +- .../alpaka/SiPixelMappingUtilities.h | 11 +- .../interface/alpaka/TrackUtilities.h | 15 +- .../interface/alpaka/TracksSoACollection.h | 18 +- .../TrackSoA/src/alpaka/classes_cuda.h | 2 +- .../TrackSoA/src/alpaka/classes_rocm.h | 2 +- 
.../alpaka/TrackSoAHeterogeneous_test.dev.cc | 9 +- .../alpaka/TrackingRecHitsSoACollection.h | 2 + .../test/alpaka/Hits_test.dev.cc | 9 +- .../alpaka/SiPixelPhase2DigiToCluster.cc | 2 + .../alpaka/SiPixelRawToClusterKernel.dev.cc | 2 + .../plugins/alpaka/BrokenLineFit.dev.cc | 10 +- .../PixelSeeding/plugins/alpaka/CACell.h | 25 +- .../PixelSeeding/plugins/alpaka/CAFishbone.h | 4 +- .../plugins/alpaka/CAHitNtupletGenerator.cc | 9 +- .../plugins/alpaka/CAHitNtupletGenerator.h | 9 +- .../CAHitNtupletGeneratorKernels.dev.cc | 18 +- .../alpaka/CAHitNtupletGeneratorKernels.h | 7 +- .../alpaka/CAHitNtupletGeneratorKernelsImpl.h | 1903 +++++++++-------- .../plugins/alpaka/CAPixelDoublets.h | 15 +- .../plugins/alpaka/CAPixelDoubletsAlgos.h | 581 ++--- .../plugins/alpaka/CAStructures.h | 6 +- .../PixelSeeding/plugins/alpaka/HelixFit.h | 13 +- .../plugins/alpaka/RiemannFit.dev.cc | 11 +- .../PixelSeeding/test/alpaka/CAsizes_t.cpp | 12 +- .../PixelTrackFitting/interface/BrokenLine.h | 1 + .../PixelTrackFitting/interface/FitResult.h | 1 + .../PixelTrackFitting/interface/RiemannFit.h | 3 + .../interface/alpaka/BrokenLine.h | 879 ++++---- .../interface/alpaka/FitResult.h | 9 +- .../interface/alpaka/FitUtils.h | 13 +- .../interface/alpaka/RiemannFit.h | 1297 +++++------ .../plugins/alpaka/clusterTracksByDensity.h | 429 ++-- .../plugins/alpaka/clusterTracksDBSCAN.h | 434 ++-- .../plugins/alpaka/clusterTracksIterative.h | 5 +- .../plugins/alpaka/fitVertices.h | 211 +- .../plugins/alpaka/sortByPt2.h | 130 +- .../plugins/alpaka/splitVertices.h | 278 +-- .../plugins/alpaka/vertexFinder.dev.cc | 6 +- .../plugins/alpaka/vertexFinder.h | 115 +- .../test/alpaka/VertexFinder_t.dev.cc | 9 +- 41 files changed, 3311 insertions(+), 3209 deletions(-) diff --git a/CondFormats/SiPixelObjects/interface/alpaka/SiPixelGainCalibrationForHLTDevice.h b/CondFormats/SiPixelObjects/interface/alpaka/SiPixelGainCalibrationForHLTDevice.h index 3c5e7094654c6..765ca65de4609 100644 --- 
a/CondFormats/SiPixelObjects/interface/alpaka/SiPixelGainCalibrationForHLTDevice.h +++ b/CondFormats/SiPixelObjects/interface/alpaka/SiPixelGainCalibrationForHLTDevice.h @@ -1,13 +1,14 @@ #ifndef CondFormats_SiPixelObjects_interface_alpaka_SiPixelGainCalibrationForHLTDevice_h #define CondFormats_SiPixelObjects_interface_alpaka_SiPixelGainCalibrationForHLTDevice_h -#include -#include "DataFormats/Portable/interface/alpaka/PortableCollection.h" #include "CondFormats/SiPixelObjects/interface/SiPixelGainCalibrationForHLTLayout.h" +#include "DataFormats/Portable/interface/alpaka/PortableCollection.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { using SiPixelGainCalibrationForHLTDevice = PortableCollection; } // namespace ALPAKA_ACCELERATOR_NAMESPACE + #endif // CondFormats_SiPixelObjects_interface_alpaka_SiPixelGainCalibrationForHLTDevice_h diff --git a/CondFormats/SiPixelObjects/interface/alpaka/SiPixelMappingUtilities.h b/CondFormats/SiPixelObjects/interface/alpaka/SiPixelMappingUtilities.h index 800cf0ac671cd..4c1eba454871e 100644 --- a/CondFormats/SiPixelObjects/interface/alpaka/SiPixelMappingUtilities.h +++ b/CondFormats/SiPixelObjects/interface/alpaka/SiPixelMappingUtilities.h @@ -1,11 +1,18 @@ #ifndef CondFormats_SiPixelObjects_interface_alpaka_SiPixelMappingUtilities_h #define CondFormats_SiPixelObjects_interface_alpaka_SiPixelMappingUtilities_h -#include +#include +#include + #include -#include "CondFormats/SiPixelObjects/interface/SiPixelMappingLayout.h" + +#include "CondFormats/SiPixelObjects/interface/CablingPathToDetUnit.h" +#include "CondFormats/SiPixelObjects/interface/PixelROC.h" #include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingMap.h" #include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingTree.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelMappingLayout.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelROCsStatusAndMapping.h" +#include 
"HeterogeneousCore/AlpakaInterface/interface/config.h" #include "HeterogeneousCore/AlpakaInterface/interface/memory.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { diff --git a/DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h b/DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h index 6b95d2843653f..f50756f3ddbca 100644 --- a/DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h +++ b/DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h @@ -1,9 +1,14 @@ -#ifndef DataFormats_Track_interface_alpaka_TrackUtilities_h -#define DataFormats_Track_interface_alpaka_TrackUtilities_h +#ifndef DataFormats_TrackSoA_interface_alpaka_TrackUtilities_h +#define DataFormats_TrackSoA_interface_alpaka_TrackUtilities_h + +#include +#include +#include + +#include -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include "DataFormats/TrackSoA/interface/TrackDefinitions.h" #include "DataFormats/TrackSoA/interface/TracksSoA.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" // Methods that operate on View and ConstView of the TrackSoA, and cannot be class methods. 
template @@ -170,4 +175,4 @@ namespace pixelTrack { template struct TracksUtilities; template struct TracksUtilities; -#endif +#endif // DataFormats_TrackSoA_interface_alpaka_TrackUtilities_h diff --git a/DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h b/DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h index 62e9f69e34636..c9294d693d4c4 100644 --- a/DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h +++ b/DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h @@ -1,15 +1,17 @@ -#ifndef DataFormats_Track_interface_alpaka_TracksSoACollection_h -#define DataFormats_Track_interface_alpaka_TracksSoACollection_h +#ifndef DataFormats_TrackSoA_interface_alpaka_TracksSoACollection_h +#define DataFormats_TrackSoA_interface_alpaka_TracksSoACollection_h + +#include -#include #include -#include "HeterogeneousCore/AlpakaInterface/interface/config.h" + #include "DataFormats/Portable/interface/alpaka/PortableCollection.h" -#include "DataFormats/TrackSoA/interface/TracksSoA.h" -#include "DataFormats/TrackSoA/interface/TrackDefinitions.h" -#include "DataFormats/TrackSoA/interface/TracksHost.h" #include "DataFormats/TrackSoA/interface/TracksDevice.h" +#include "DataFormats/TrackSoA/interface/TracksHost.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/AlpakaInterface/interface/AssertDeviceMatchesHostCollection.h" #include "HeterogeneousCore/AlpakaInterface/interface/CopyToHost.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" // TODO: The class is created via inheritance of the PortableCollection. // This is generally discouraged, and should be done via composition. 
@@ -49,4 +51,4 @@ ASSERT_DEVICE_MATCHES_HOST_COLLECTION(pixelTrack::TracksSoACollectionPhase1, pix ASSERT_DEVICE_MATCHES_HOST_COLLECTION(pixelTrack::TracksSoACollectionPhase2, pixelTrack::TracksHostPhase2); ASSERT_DEVICE_MATCHES_HOST_COLLECTION(pixelTrack::TracksSoACollectionHIonPhase1, pixelTrack::TracksHostHIonPhase1); -#endif // DataFormats_Track_interface_alpaka_TracksSoACollection_h +#endif // DataFormats_TrackSoA_interface_alpaka_TracksSoACollection_h diff --git a/DataFormats/TrackSoA/src/alpaka/classes_cuda.h b/DataFormats/TrackSoA/src/alpaka/classes_cuda.h index 17f3b64498711..a33b836f8a747 100644 --- a/DataFormats/TrackSoA/src/alpaka/classes_cuda.h +++ b/DataFormats/TrackSoA/src/alpaka/classes_cuda.h @@ -3,9 +3,9 @@ #include "DataFormats/Common/interface/DeviceProduct.h" #include "DataFormats/Common/interface/Wrapper.h" +#include "DataFormats/TrackSoA/interface/TracksDevice.h" #include "DataFormats/TrackSoA/interface/TracksSoA.h" #include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h" -#include "DataFormats/TrackSoA/interface/TracksDevice.h" #include "HeterogeneousCore/AlpakaInterface/interface/config.h" using namespace pixelTopology; diff --git a/DataFormats/TrackSoA/src/alpaka/classes_rocm.h b/DataFormats/TrackSoA/src/alpaka/classes_rocm.h index 0267ddeb213d5..0271f8ef0dbcd 100644 --- a/DataFormats/TrackSoA/src/alpaka/classes_rocm.h +++ b/DataFormats/TrackSoA/src/alpaka/classes_rocm.h @@ -3,9 +3,9 @@ #include "DataFormats/Common/interface/DeviceProduct.h" #include "DataFormats/Common/interface/Wrapper.h" +#include "DataFormats/TrackSoA/interface/TracksDevice.h" #include "DataFormats/TrackSoA/interface/TracksSoA.h" #include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h" -#include "DataFormats/TrackSoA/interface/TracksDevice.h" #include "HeterogeneousCore/AlpakaInterface/interface/config.h" using namespace pixelTopology; diff --git a/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.dev.cc 
b/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.dev.cc index 2b9807e3db054..accf175bccfe6 100644 --- a/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.dev.cc +++ b/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.dev.cc @@ -1,8 +1,13 @@ -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include + +#include + #include "DataFormats/TrackSoA/interface/TrackDefinitions.h" -#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h" #include "DataFormats/TrackSoA/interface/TracksDevice.h" #include "DataFormats/TrackSoA/interface/TracksHost.h" +#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" using namespace reco; diff --git a/DataFormats/TrackingRecHitSoA/interface/alpaka/TrackingRecHitsSoACollection.h b/DataFormats/TrackingRecHitSoA/interface/alpaka/TrackingRecHitsSoACollection.h index 2a7439c34c513..14d0a2e1aa8f0 100644 --- a/DataFormats/TrackingRecHitSoA/interface/alpaka/TrackingRecHitsSoACollection.h +++ b/DataFormats/TrackingRecHitSoA/interface/alpaka/TrackingRecHitsSoACollection.h @@ -2,6 +2,7 @@ #define DataFormats_TrackingRecHitSoA_interface_alpaka_TrackingRecHitsSoACollection_h #include +#include #include @@ -11,6 +12,7 @@ #include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "HeterogeneousCore/AlpakaInterface/interface/CopyToHost.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { diff --git a/DataFormats/TrackingRecHitSoA/test/alpaka/Hits_test.dev.cc b/DataFormats/TrackingRecHitSoA/test/alpaka/Hits_test.dev.cc index 1ea67ad822536..b987b0ee82a63 100644 --- a/DataFormats/TrackingRecHitSoA/test/alpaka/Hits_test.dev.cc +++ b/DataFormats/TrackingRecHitSoA/test/alpaka/Hits_test.dev.cc @@ -1,8 
+1,13 @@ -#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" +#include + +#include + #include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsDevice.h" +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" #include "DataFormats/TrackingRecHitSoA/interface/alpaka/TrackingRecHitsSoACollection.h" -#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "HeterogeneousCore/AlpakaInterface/interface/traits.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" using namespace alpaka; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelPhase2DigiToCluster.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelPhase2DigiToCluster.cc index 5d0b355d1eebc..575c5ab925145 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelPhase2DigiToCluster.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelPhase2DigiToCluster.cc @@ -5,6 +5,8 @@ #include #include +#include + #include "DataFormats/Common/interface/DetSetVector.h" #include "DataFormats/SiPixelClusterSoA/interface/alpaka/SiPixelClustersSoACollection.h" #include "DataFormats/SiPixelDigi/interface/PixelDigi.h" diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc index 11ae14f2fa6a5..c4b8562f2ca92 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc @@ -11,6 +11,8 @@ #include #include +#include + // CMSSW includes #include "HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h" #include "HeterogeneousCore/AlpakaInterface/interface/config.h" diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/BrokenLineFit.dev.cc 
b/RecoTracker/PixelSeeding/plugins/alpaka/BrokenLineFit.dev.cc index a21fed668b54c..9882c5c47b43e 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/BrokenLineFit.dev.cc +++ b/RecoTracker/PixelSeeding/plugins/alpaka/BrokenLineFit.dev.cc @@ -1,15 +1,13 @@ -// -// Author: Felice Pantaleo, CERN -// - //#define BROKENLINE_DEBUG //#define BL_DUMP_HITS -#include + #include +#include + #include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" -#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" #include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h" #include "RecoTracker/PixelTrackFitting/interface/alpaka/BrokenLine.h" diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CACell.h b/RecoTracker/PixelSeeding/plugins/alpaka/CACell.h index d0142f78415ae..4c83eef84fdfe 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CACell.h +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CACell.h @@ -1,21 +1,22 @@ -#ifndef RecoPixelVertexing_PixelTriplets_CACellT_h -#define RecoPixelVertexing_PixelTriplets_CACellT_h - -// -// Author: Felice Pantaleo, CERN -// +#ifndef RecoTracker_PixelSeeding_plugins_alpaka_CACell_h +#define RecoTracker_PixelSeeding_plugins_alpaka_CACell_h // #define ONLY_TRIPLETS_IN_HOLE +#include +#include + #include -#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" -#include "HeterogeneousCore/AlpakaInterface/interface/VecArray.h" -#include "HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h" -#include "RecoTracker/PixelSeeding/interface/CircleEq.h" #include "DataFormats/TrackSoA/interface/TrackDefinitions.h" #include "DataFormats/TrackSoA/interface/TracksSoA.h" +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include 
"HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h" +#include "HeterogeneousCore/AlpakaInterface/interface/VecArray.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "RecoTracker/PixelSeeding/interface/CircleEq.h" + #include "CAStructures.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { @@ -387,5 +388,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { hindex_type theOuterHitId; hindex_type theFishboneId; }; + } // namespace ALPAKA_ACCELERATOR_NAMESPACE -#endif // RecoPixelVertexing_PixelTriplets_plugins_CACellT_h + +#endif // RecoTracker_PixelSeeding_plugins_alpaka_CACell_h diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h index 6e89adf483ff7..0e04350651aa6 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h @@ -11,6 +11,7 @@ #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/AlpakaInterface/interface/VecArray.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "HeterogeneousCore/AlpakaInterface/interface/traits.h" #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" @@ -63,7 +64,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caPixelDoublets { const auto [firstElementIdxNoStrideX, endElementIdxNoStrideX] = cms::alpakatools::element_index_range_in_block(acc, 0u, dimIndexX); - // Outermost loop on Y + // Outermost parallel loop on the slower dimension (Y or 0 in a 2D grid) constexpr uint32_t dimIndexY = 0u; const uint32_t gridDimensionY(alpaka::getWorkDiv(acc)[dimIndexY]); const auto [firstElementIdxNoStrideY, endElementIdxNoStrideY] = @@ -106,6 +107,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caPixelDoublets { continue; // here we parallelize in X + // Innermost parallel loop on the faster dimension (X or 1 in a 2D grid) uint32_t firstElementIdxX = firstElementIdxNoStrideX; uint32_t endElementIdxX = endElementIdxNoStrideX; for 
(uint32_t ic = firstElementIdxX; (int)ic < sg - 1; ++ic) { diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.cc b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.cc index 1a62b0d26f904..c6615c08d73bf 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.cc +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.cc @@ -1,7 +1,3 @@ -// -// Original Author: Felice Pantaleo, CERN -// - //#define GPU_DEBUG //#define DUMP_GPU_TK_TUPLES @@ -10,12 +6,15 @@ #include #include -#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h" +#include + #include "DataFormats/TrackSoA/interface/TracksDevice.h" #include "DataFormats/TrackSoA/interface/TracksHost.h" +#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/Utilities/interface/Exception.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "CAHitNtupletGenerator.h" #include "CAHitNtupletGeneratorKernels.h" diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.h index 826b92d4a195a..ec3273a89dee6 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.h +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.h @@ -1,5 +1,5 @@ -#ifndef RecoPixelVertexing_PixelTriplets_Alpaka_CAHitNtupletGenerator_h -#define RecoPixelVertexing_PixelTriplets_Alpaka_CAHitNtupletGenerator_h +#ifndef RecoTracker_PixelSeeding_plugins_alpaka_CAHitNtupletGenerator_h +#define RecoTracker_PixelSeeding_plugins_alpaka_CAHitNtupletGenerator_h #include @@ -11,10 +11,11 @@ #include "FWCore/Framework/interface/EventSetup.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include 
"HeterogeneousCore/AlpakaInterface/interface/config.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h" -#include "CAHitNtupletGeneratorKernels.h" #include "CACell.h" +#include "CAHitNtupletGeneratorKernels.h" #include "HelixFit.h" namespace edm { @@ -83,4 +84,4 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // namespace ALPAKA_ACCELERATOR_NAMESPACE -#endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGenerator_h +#endif // RecoTracker_PixelSeeding_plugins_alpaka_CAHitNtupletGenerator_h diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.dev.cc b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.dev.cc index d5067476063b8..56bae962fbe06 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.dev.cc +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.dev.cc @@ -1,15 +1,23 @@ +// C++ headers +#ifdef DUMP_GPU_TK_TUPLES +#include +#endif + +// Alpaka headers #include + +// CMSSW headers +#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "HeterogeneousCore/AlpakaInterface/interface/devices.h" #include "HeterogeneousCore/AlpakaInterface/interface/host.h" #include "HeterogeneousCore/AlpakaInterface/interface/memory.h" -#include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" -#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" + +// local headers +#include "CAFishbone.h" #include "CAHitNtupletGeneratorKernels.h" #include "CAHitNtupletGeneratorKernelsImpl.h" -#ifdef DUMP_GPU_TK_TUPLES -#include -#endif //#define GPU_DEBUG //#define NTUPLE_DEBUG diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.h index c52768cca745e..ecf8e00c454ab 100644 --- 
a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.h +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.h @@ -1,5 +1,5 @@ -#ifndef RecoPixelVertexing_PixelTriplets_CAHitNtupletGeneratorKernels_h -#define RecoPixelVertexing_PixelTriplets_CAHitNtupletGeneratorKernels_h +#ifndef RecoTracker_PixelSeeding_plugins_alpaka_CAHitNtupletGeneratorKernels_h +#define RecoTracker_PixelSeeding_plugins_alpaka_CAHitNtupletGeneratorKernels_h //#define GPU_DEBUG //#define DUMP_GPU_TK_TUPLES @@ -268,6 +268,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { cms::alpakatools::AtomicPairCounter* device_hitToTuple_apc_; cms::alpakatools::device_view device_nCells_; }; + } // namespace ALPAKA_ACCELERATOR_NAMESPACE -#endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h +#endif // RecoTracker_PixelSeeding_plugins_alpaka_CAHitNtupletGeneratorKernels_h diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h index b809caa2e5736..0153c78868519 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h @@ -1,1044 +1,1047 @@ -// -// Original Author: Felice Pantaleo, CERN -// +#ifndef RecoTracker_PixelSeeding_plugins_alpaka_CAHitNtupletGeneratorKernelsImpl_h +#define RecoTracker_PixelSeeding_plugins_alpaka_CAHitNtupletGeneratorKernelsImpl_h //#define GPU_DEBUG //#define NTUPLE_DEBUG -#include +// C++ includes #include #include +#include #include +#include + +// Alpaka includes +#include +// CMSSW includes +#include "DataFormats/TrackSoA/interface/TrackDefinitions.h" +#include "DataFormats/TrackSoA/interface/TracksSoA.h" +#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" +#include "HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h" #include 
"HeterogeneousCore/AlpakaInterface/interface/config.h" -#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" -#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h" -#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" -#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" -#include "CAStructures.h" -#include "CAHitNtupletGeneratorKernels.h" +// local includes #include "CACell.h" -#include "CAFishbone.h" -#include "CAPixelDoublets.h" - -namespace ALPAKA_ACCELERATOR_NAMESPACE { - namespace caHitNtupletGeneratorKernels { - - constexpr uint32_t tkNotFound = std::numeric_limits::max(); - constexpr float maxScore = std::numeric_limits::max(); - constexpr float nSigma2 = 25.f; - - //all of these below are mostly to avoid brining around the relative namespace - - template - using HitToTuple = caStructures::HitToTupleT; - - template - using TupleMultiplicity = caStructures::TupleMultiplicityT; - - template - using CellNeighborsVector = caStructures::CellNeighborsVectorT; - - template - using CellTracksVector = caStructures::CellTracksVectorT; - - template - using OuterHitOfCell = caStructures::OuterHitOfCellT; - - using Quality = ::pixelTrack::Quality; - - template - using TkSoAView = reco::TrackSoAView; - - template - using HitContainer = typename reco::TrackSoA::HitContainer; - - template - using HitsConstView = typename CACellT::HitsConstView; - - template - using QualityCuts = ::pixelTrack::QualityCutsT; - - template - using CAParams = caHitNtupletGenerator::CAParamsT; - - using Counters = caHitNtupletGenerator::Counters; - - template - class Kernel_checkOverflows { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - TkSoAView tracks_view, - TupleMultiplicity const *tupleMultiplicity, - HitToTuple const *hitToTuple, - cms::alpakatools::AtomicPairCounter *apc, - CACellT const *__restrict__ cells, - uint32_t const 
*__restrict__ nCells, - CellNeighborsVector const *cellNeighbors, - CellTracksVector const *cellTracks, - OuterHitOfCell const *isOuterHitOfCell, - int32_t nHits, - uint32_t maxNumberOfDoublets, - Counters *counters) const { - auto &c = *counters; - // counters once per event - if (cms::alpakatools::once_per_grid(acc)) { - alpaka::atomicAdd(acc, &c.nEvents, 1ull, alpaka::hierarchy::Blocks{}); - alpaka::atomicAdd(acc, &c.nHits, static_cast(nHits), alpaka::hierarchy::Blocks{}); - alpaka::atomicAdd(acc, &c.nCells, static_cast(*nCells), alpaka::hierarchy::Blocks{}); - alpaka::atomicAdd( - acc, &c.nTuples, static_cast(apc->get().first), alpaka::hierarchy::Blocks{}); - alpaka::atomicAdd(acc, - &c.nFitTracks, - static_cast(tupleMultiplicity->size()), - alpaka::hierarchy::Blocks{}); - } +#include "CAHitNtupletGeneratorKernels.h" +#include "CAStructures.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { + + constexpr uint32_t tkNotFound = std::numeric_limits::max(); + constexpr float maxScore = std::numeric_limits::max(); + constexpr float nSigma2 = 25.f; + + // all of these below are mostly to avoid brining around the relative namespace + + template + using HitToTuple = caStructures::HitToTupleT; + + template + using TupleMultiplicity = caStructures::TupleMultiplicityT; + + template + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + + template + using CellTracksVector = caStructures::CellTracksVectorT; + + template + using OuterHitOfCell = caStructures::OuterHitOfCellT; + + using Quality = ::pixelTrack::Quality; + + template + using TkSoAView = reco::TrackSoAView; + + template + using HitContainer = typename reco::TrackSoA::HitContainer; + + template + using HitsConstView = typename CACellT::HitsConstView; + + template + using QualityCuts = ::pixelTrack::QualityCutsT; + + template + using CAParams = caHitNtupletGenerator::CAParamsT; + + using Counters = caHitNtupletGenerator::Counters; + + template + class Kernel_checkOverflows { 
+ public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + TupleMultiplicity const *tupleMultiplicity, + HitToTuple const *hitToTuple, + cms::alpakatools::AtomicPairCounter *apc, + CACellT const *__restrict__ cells, + uint32_t const *__restrict__ nCells, + CellNeighborsVector const *cellNeighbors, + CellTracksVector const *cellTracks, + OuterHitOfCell const *isOuterHitOfCell, + int32_t nHits, + uint32_t maxNumberOfDoublets, + Counters *counters) const { + auto &c = *counters; + // counters once per event + if (cms::alpakatools::once_per_grid(acc)) { + alpaka::atomicAdd(acc, &c.nEvents, 1ull, alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd(acc, &c.nHits, static_cast(nHits), alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd(acc, &c.nCells, static_cast(*nCells), alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd( + acc, &c.nTuples, static_cast(apc->get().first), alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd(acc, + &c.nFitTracks, + static_cast(tupleMultiplicity->size()), + alpaka::hierarchy::Blocks{}); + } #ifdef NTUPLE_DEBUGS - if (cms::alpakatools::once_per_grid(acc)) { - printf("number of found cells %d \n found tuples %d with total hits %d out of %d\n", - *nCells, - apc->get().first, - apc->get().second, - nHits); - if (apc->get().first < TrackerTraits::maxNumberOfQuadruplets) { - ALPAKA_ASSERT_OFFLOAD(tracks_view.hitIndices().size(apc->get().first) == 0); - ALPAKA_ASSERT_OFFLOAD(tracks_view.hitIndices().size() == apc->get().second); - } - } - const auto ntNbins = foundNtuplets->nbins(); - - for (auto idx : cms::alpakatools::elements_with_stride(acc, ntBins)) { - if (tracks_view.hitIndices().size(idx) > TrackerTraits::maxHitsOnTrack) // current real limit - printf("ERROR %d, %d\n", idx, tracks_view.hitIndices().size(idx)); - ALPAKA_ASSERT_OFFLOAD(ftracks_view.hitIndices().size(idx) <= TrackerTraits::maxHitsOnTrack); - for (auto ih = tracks_view.hitIndices().begin(idx); ih != tracks_view.hitIndices().end(idx); ++ih) - 
ALPAKA_ASSERT_OFFLOAD(int(*ih) < nHits); + if (cms::alpakatools::once_per_grid(acc)) { + printf("number of found cells %d \n found tuples %d with total hits %d out of %d\n", + *nCells, + apc->get().first, + apc->get().second, + nHits); + if (apc->get().first < TrackerTraits::maxNumberOfQuadruplets) { + ALPAKA_ASSERT_OFFLOAD(tracks_view.hitIndices().size(apc->get().first) == 0); + ALPAKA_ASSERT_OFFLOAD(tracks_view.hitIndices().size() == apc->get().second); } + } + const auto ntNbins = foundNtuplets->nbins(); + + for (auto idx : cms::alpakatools::elements_with_stride(acc, ntBins)) { + if (tracks_view.hitIndices().size(idx) > TrackerTraits::maxHitsOnTrack) // current real limit + printf("ERROR %d, %d\n", idx, tracks_view.hitIndices().size(idx)); + ALPAKA_ASSERT_OFFLOAD(ftracks_view.hitIndices().size(idx) <= TrackerTraits::maxHitsOnTrack); + for (auto ih = tracks_view.hitIndices().begin(idx); ih != tracks_view.hitIndices().end(idx); ++ih) + ALPAKA_ASSERT_OFFLOAD(int(*ih) < nHits); + } #endif - if (cms::alpakatools::once_per_grid(acc)) { - if (apc->get().first >= TrackerTraits::maxNumberOfQuadruplets) - printf("Tuples overflow\n"); - if (*nCells >= maxNumberOfDoublets) - printf("Cells overflow\n"); - if (cellNeighbors && cellNeighbors->full()) - printf("cellNeighbors overflow %d %d \n", cellNeighbors->capacity(), cellNeighbors->size()); - if (cellTracks && cellTracks->full()) - printf("cellTracks overflow\n"); - if (int(hitToTuple->nOnes()) < nHits) - printf("ERROR hitToTuple overflow %d %d\n", hitToTuple->nOnes(), nHits); + if (cms::alpakatools::once_per_grid(acc)) { + if (apc->get().first >= TrackerTraits::maxNumberOfQuadruplets) + printf("Tuples overflow\n"); + if (*nCells >= maxNumberOfDoublets) + printf("Cells overflow\n"); + if (cellNeighbors && cellNeighbors->full()) + printf("cellNeighbors overflow %d %d \n", cellNeighbors->capacity(), cellNeighbors->size()); + if (cellTracks && cellTracks->full()) + printf("cellTracks overflow\n"); + if 
(int(hitToTuple->nOnes()) < nHits) + printf("ERROR hitToTuple overflow %d %d\n", hitToTuple->nOnes(), nHits); #ifdef GPU_DEBUG - printf("size of cellNeighbors %d \n cellTracks %d \n hitToTuple %d \n", - cellNeighbors->size(), - cellTracks->size(), - hitToTuple->size()); + printf("size of cellNeighbors %d \n cellTracks %d \n hitToTuple %d \n", + cellNeighbors->size(), + cellTracks->size(), + hitToTuple->size()); #endif - } - - const auto ntNCells = (*nCells); - for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) { - auto const &thisCell = cells[idx]; - if (thisCell.hasFishbone() && !thisCell.isKilled()) - alpaka::atomicAdd(acc, &c.nFishCells, 1ull, alpaka::hierarchy::Blocks{}); - if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; - printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId()); - if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; - printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId()); - if (thisCell.isKilled()) - alpaka::atomicAdd(acc, &c.nKilledCells, 1ull, alpaka::hierarchy::Blocks{}); - if (!thisCell.unused()) - alpaka::atomicAdd(acc, &c.nEmptyCells, 1ull, alpaka::hierarchy::Blocks{}); - if ((0 == hitToTuple->size(thisCell.inner_hit_id())) && (0 == hitToTuple->size(thisCell.outer_hit_id()))) - alpaka::atomicAdd(acc, &c.nZeroTrackCells, 1ull, alpaka::hierarchy::Blocks{}); - } - - for (auto idx : cms::alpakatools::elements_with_stride(acc, nHits)) - if ((*isOuterHitOfCell).container[idx].full()) // ++tooManyOuterHitOfCell; - printf("OuterHitOfCell overflow %d\n", idx); } - }; - - template - class Kernel_fishboneCleaner { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - CACellT const *cells, - uint32_t const *__restrict__ nCells, - TkSoAView tracks_view) const { - constexpr auto reject = Quality::dup; - const auto ntNCells = (*nCells); - - for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) { - auto const 
&thisCell = cells[idx]; - if (!thisCell.isKilled()) - continue; - for (auto it : thisCell.tracks()) - tracks_view[it].quality() = reject; - } + const auto ntNCells = (*nCells); + for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) { + auto const &thisCell = cells[idx]; + if (thisCell.hasFishbone() && !thisCell.isKilled()) + alpaka::atomicAdd(acc, &c.nFishCells, 1ull, alpaka::hierarchy::Blocks{}); + if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; + printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId()); + if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; + printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId()); + if (thisCell.isKilled()) + alpaka::atomicAdd(acc, &c.nKilledCells, 1ull, alpaka::hierarchy::Blocks{}); + if (!thisCell.unused()) + alpaka::atomicAdd(acc, &c.nEmptyCells, 1ull, alpaka::hierarchy::Blocks{}); + if ((0 == hitToTuple->size(thisCell.inner_hit_id())) && (0 == hitToTuple->size(thisCell.outer_hit_id()))) + alpaka::atomicAdd(acc, &c.nZeroTrackCells, 1ull, alpaka::hierarchy::Blocks{}); } - }; - // remove shorter tracks if sharing a cell - // It does not seem to affect efficiency in any way! 
- template - class Kernel_earlyDuplicateRemover { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - CACellT const *cells, - uint32_t const *__restrict__ nCells, - TkSoAView tracks_view, - bool dupPassThrough) const { - // quality to mark rejected - constexpr auto reject = Quality::edup; /// cannot be loose - ALPAKA_ASSERT_OFFLOAD(nCells); - const auto ntNCells = (*nCells); - - for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) { - auto const &thisCell = cells[idx]; - - if (thisCell.tracks().size() < 2) - continue; - int8_t maxNl = 0; - - // find maxNl - for (auto it : thisCell.tracks()) { - auto nl = tracks_view[it].nLayers(); - maxNl = std::max(nl, maxNl); - } + for (auto idx : cms::alpakatools::elements_with_stride(acc, nHits)) + if ((*isOuterHitOfCell).container[idx].full()) // ++tooManyOuterHitOfCell; + printf("OuterHitOfCell overflow %d\n", idx); + } + }; + + template + class Kernel_fishboneCleaner { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + CACellT const *cells, + uint32_t const *__restrict__ nCells, + TkSoAView tracks_view) const { + constexpr auto reject = Quality::dup; + const auto ntNCells = (*nCells); + + for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) { + auto const &thisCell = cells[idx]; + if (!thisCell.isKilled()) + continue; + + for (auto it : thisCell.tracks()) + tracks_view[it].quality() = reject; + } + } + }; + // remove shorter tracks if sharing a cell + // It does not seem to affect efficiency in any way! 
+ template + class Kernel_earlyDuplicateRemover { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + CACellT const *cells, + uint32_t const *__restrict__ nCells, + TkSoAView tracks_view, + bool dupPassThrough) const { + // quality to mark rejected + constexpr auto reject = Quality::edup; /// cannot be loose + ALPAKA_ASSERT_OFFLOAD(nCells); + const auto ntNCells = (*nCells); + + for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) { + auto const &thisCell = cells[idx]; + + if (thisCell.tracks().size() < 2) + continue; + + int8_t maxNl = 0; + + // find maxNl + for (auto it : thisCell.tracks()) { + auto nl = tracks_view[it].nLayers(); + maxNl = std::max(nl, maxNl); + } - // if (maxNl<4) continue; - // quad pass through (leave it her for tests) - // maxNl = std::min(4, maxNl); + // if (maxNl<4) continue; + // quad pass through (leave it her for tests) + // maxNl = std::min(4, maxNl); - for (auto it : thisCell.tracks()) { - if (tracks_view[it].nLayers() < maxNl) - tracks_view[it].quality() = reject; //no race: simple assignment of the same constant - } + for (auto it : thisCell.tracks()) { + if (tracks_view[it].nLayers() < maxNl) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } - }; - - // assume the above (so, short tracks already removed) - template - class Kernel_fastDuplicateRemover { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - CACellT const *__restrict__ cells, - uint32_t const *__restrict__ nCells, - TkSoAView tracks_view, - bool dupPassThrough) const { - // quality to mark rejected - auto const reject = dupPassThrough ? 
Quality::loose : Quality::dup; - constexpr auto loose = Quality::loose; - - ALPAKA_ASSERT_OFFLOAD(nCells); - const auto ntNCells = (*nCells); - - for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) { - auto const &thisCell = cells[idx]; - if (thisCell.tracks().size() < 2) + } + }; + + // assume the above (so, short tracks already removed) + template + class Kernel_fastDuplicateRemover { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + CACellT const *__restrict__ cells, + uint32_t const *__restrict__ nCells, + TkSoAView tracks_view, + bool dupPassThrough) const { + // quality to mark rejected + auto const reject = dupPassThrough ? Quality::loose : Quality::dup; + constexpr auto loose = Quality::loose; + + ALPAKA_ASSERT_OFFLOAD(nCells); + const auto ntNCells = (*nCells); + + for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) { + auto const &thisCell = cells[idx]; + if (thisCell.tracks().size() < 2) + continue; + + float mc = maxScore; + uint16_t im = tkNotFound; + + auto score = [&](auto it) { return std::abs(reco::tip(tracks_view, it)); }; + + // full crazy combinatorics + int ntr = thisCell.tracks().size(); + for (int i = 0; i < ntr - 1; ++i) { + auto it = thisCell.tracks()[i]; + auto qi = tracks_view[it].quality(); + if (qi <= reject) continue; - - float mc = maxScore; - uint16_t im = tkNotFound; - - auto score = [&](auto it) { return std::abs(reco::tip(tracks_view, it)); }; - - // full crazy combinatorics - int ntr = thisCell.tracks().size(); - for (int i = 0; i < ntr - 1; ++i) { - auto it = thisCell.tracks()[i]; - auto qi = tracks_view[it].quality(); - if (qi <= reject) + auto opi = tracks_view[it].state()(2); + auto e2opi = tracks_view[it].covariance()(9); + auto cti = tracks_view[it].state()(3); + auto e2cti = tracks_view[it].covariance()(12); + for (auto j = i + 1; j < ntr; ++j) { + auto jt = thisCell.tracks()[j]; + auto qj = tracks_view[jt].quality(); + if (qj <= reject) + continue; + auto opj 
= tracks_view[jt].state()(2); + auto ctj = tracks_view[jt].state()(3); + auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); + if ((cti - ctj) * (cti - ctj) > dct) + continue; + auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); + if ((opi - opj) * (opi - opj) > dop) continue; - auto opi = tracks_view[it].state()(2); - auto e2opi = tracks_view[it].covariance()(9); - auto cti = tracks_view[it].state()(3); - auto e2cti = tracks_view[it].covariance()(12); - for (auto j = i + 1; j < ntr; ++j) { - auto jt = thisCell.tracks()[j]; - auto qj = tracks_view[jt].quality(); - if (qj <= reject) - continue; - auto opj = tracks_view[jt].state()(2); - auto ctj = tracks_view[jt].state()(3); - auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); - if ((cti - ctj) * (cti - ctj) > dct) - continue; - auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); - if ((opi - opj) * (opi - opj) > dop) - continue; - if ((qj < qi) || (qj == qi && score(it) < score(jt))) - tracks_view[jt].quality() = reject; - else { - tracks_view[it].quality() = reject; - break; - } + if ((qj < qi) || (qj == qi && score(it) < score(jt))) + tracks_view[jt].quality() = reject; + else { + tracks_view[it].quality() = reject; + break; } } + } - // find maxQual - auto maxQual = reject; // no duplicate! - for (auto it : thisCell.tracks()) { - if (tracks_view[it].quality() > maxQual) - maxQual = tracks_view[it].quality(); - } + // find maxQual + auto maxQual = reject; // no duplicate! 
+ for (auto it : thisCell.tracks()) { + if (tracks_view[it].quality() > maxQual) + maxQual = tracks_view[it].quality(); + } - if (maxQual <= loose) - continue; + if (maxQual <= loose) + continue; - // find min score - for (auto it : thisCell.tracks()) { - if (tracks_view[it].quality() == maxQual && score(it) < mc) { - mc = score(it); - im = it; - } + // find min score + for (auto it : thisCell.tracks()) { + if (tracks_view[it].quality() == maxQual && score(it) < mc) { + mc = score(it); + im = it; } + } - if (tkNotFound == im) - continue; + if (tkNotFound == im) + continue; - // mark all other duplicates (not yet, keep it loose) - for (auto it : thisCell.tracks()) { - if (tracks_view[it].quality() > loose && it != im) - tracks_view[it].quality() = loose; //no race: simple assignment of the same constant - } + // mark all other duplicates (not yet, keep it loose) + for (auto it : thisCell.tracks()) { + if (tracks_view[it].quality() > loose && it != im) + tracks_view[it].quality() = loose; //no race: simple assignment of the same constant } } - }; - - template - class Kernel_connect { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - cms::alpakatools::AtomicPairCounter *apc1, - cms::alpakatools::AtomicPairCounter *apc2, // just to zero them - HitsConstView hh, - CACellT *cells, - uint32_t *nCells, - CellNeighborsVector *cellNeighbors, - OuterHitOfCell const *isOuterHitOfCell, - CAParams params) const { - using Cell = CACellT; - - const uint32_t dimIndexY = 0u; - const uint32_t dimIndexX = 1u; - const uint32_t threadIdxY(alpaka::getIdx(acc)[dimIndexY]); - const uint32_t threadIdxLocalX(alpaka::getIdx(acc)[dimIndexX]); - - if (0 == (threadIdxY + threadIdxLocalX)) { - (*apc1) = 0; - (*apc2) = 0; - } // ready for next kernel - - constexpr uint32_t last_bpix1_detIndex = TrackerTraits::last_bpix1_detIndex; - constexpr uint32_t last_barrel_detIndex = TrackerTraits::last_barrel_detIndex; - - cms::alpakatools::for_each_element_in_grid_strided( - acc, - 
(*nCells), - 0u, - [&](uint32_t idx) { - auto cellIndex = idx; - auto &thisCell = cells[idx]; - auto innerHitId = thisCell.inner_hit_id(); - if (int(innerHitId) >= isOuterHitOfCell->offset) { - uint32_t numberOfPossibleNeighbors = (*isOuterHitOfCell)[innerHitId].size(); - auto vi = (*isOuterHitOfCell)[innerHitId].data(); - - auto ri = thisCell.inner_r(hh); - auto zi = thisCell.inner_z(hh); - - auto ro = thisCell.outer_r(hh); - auto zo = thisCell.outer_z(hh); - auto isBarrel = thisCell.inner_detIndex(hh) < last_barrel_detIndex; - - cms::alpakatools::for_each_element_in_block_strided( - acc, - numberOfPossibleNeighbors, - 0u, - [&](uint32_t j) { - auto otherCell = (vi[j]); - auto &oc = cells[otherCell]; - auto r1 = oc.inner_r(hh); - auto z1 = oc.inner_z(hh); - bool aligned = Cell::areAlignedRZ( - r1, - z1, - ri, - zi, - ro, - zo, - params.ptmin_, - isBarrel ? params.CAThetaCutBarrel_ - : params.CAThetaCutForward_); // 2.f*thetaCut); // FIXME tune cuts - if (aligned && - thisCell.dcaCut(hh, - oc, - oc.inner_detIndex(hh) < last_bpix1_detIndex ? 
params.dcaCutInnerTriplet_ - : params.dcaCutOuterTriplet_, - params.hardCurvCut_)) { // FIXME tune cuts - oc.addOuterNeighbor(acc, cellIndex, *cellNeighbors); - thisCell.setStatusBits(Cell::StatusBit::kUsed); - oc.setStatusBits(Cell::StatusBit::kUsed); - } - }, - dimIndexX); // loop on inner cells - } - }, - dimIndexY); // loop on outer cells - } - }; - template - class Kernel_find_ntuplets { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - HitsConstView hh, - TkSoAView tracks_view, - CACellT *__restrict__ cells, - uint32_t const *nCells, - CellTracksVector *cellTracks, - cms::alpakatools::AtomicPairCounter *apc, - CAParams params) const { - // recursive: not obvious to widen - - using Cell = CACellT; + } + }; + + template + class Kernel_connect { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + cms::alpakatools::AtomicPairCounter *apc1, + cms::alpakatools::AtomicPairCounter *apc2, // just to zero them + HitsConstView hh, + CACellT *cells, + uint32_t *nCells, + CellNeighborsVector *cellNeighbors, + OuterHitOfCell const *isOuterHitOfCell, + CAParams params) const { + using Cell = CACellT; + + const uint32_t dimIndexY = 0u; + const uint32_t dimIndexX = 1u; + const uint32_t threadIdxY(alpaka::getIdx(acc)[dimIndexY]); + const uint32_t threadIdxLocalX(alpaka::getIdx(acc)[dimIndexX]); + + if (0 == (threadIdxY + threadIdxLocalX)) { + (*apc1) = 0; + (*apc2) = 0; + } // ready for next kernel + + constexpr uint32_t last_bpix1_detIndex = TrackerTraits::last_bpix1_detIndex; + constexpr uint32_t last_barrel_detIndex = TrackerTraits::last_barrel_detIndex; + + cms::alpakatools::for_each_element_in_grid_strided( + acc, + (*nCells), + 0u, + [&](uint32_t idx) { + auto cellIndex = idx; + auto &thisCell = cells[idx]; + auto innerHitId = thisCell.inner_hit_id(); + if (int(innerHitId) >= isOuterHitOfCell->offset) { + uint32_t numberOfPossibleNeighbors = (*isOuterHitOfCell)[innerHitId].size(); + auto vi = 
(*isOuterHitOfCell)[innerHitId].data(); + auto ri = thisCell.inner_r(hh); + auto zi = thisCell.inner_z(hh); + auto ro = thisCell.outer_r(hh); + auto zo = thisCell.outer_z(hh); + auto isBarrel = thisCell.inner_detIndex(hh) < last_barrel_detIndex; + + cms::alpakatools::for_each_element_in_block_strided( + acc, + numberOfPossibleNeighbors, + 0u, + [&](uint32_t j) { + auto otherCell = (vi[j]); + auto &oc = cells[otherCell]; + auto r1 = oc.inner_r(hh); + auto z1 = oc.inner_z(hh); + bool aligned = + Cell::areAlignedRZ(r1, + z1, + ri, + zi, + ro, + zo, + params.ptmin_, + isBarrel ? params.CAThetaCutBarrel_ + : params.CAThetaCutForward_); // 2.f*thetaCut); // FIXME tune cuts + if (aligned && + thisCell.dcaCut(hh, + oc, + oc.inner_detIndex(hh) < last_bpix1_detIndex ? params.dcaCutInnerTriplet_ + : params.dcaCutOuterTriplet_, + params.hardCurvCut_)) { // FIXME tune cuts + oc.addOuterNeighbor(acc, cellIndex, *cellNeighbors); + thisCell.setStatusBits(Cell::StatusBit::kUsed); + oc.setStatusBits(Cell::StatusBit::kUsed); + } + }, + dimIndexX); // loop on inner cells + } + }, + dimIndexY); // loop on outer cells + } + }; + template + class Kernel_find_ntuplets { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + HitsConstView hh, + TkSoAView tracks_view, + CACellT *__restrict__ cells, + uint32_t const *nCells, + CellTracksVector *cellTracks, + cms::alpakatools::AtomicPairCounter *apc, + CAParams params) const { + // recursive: not obvious to widen + + using Cell = CACellT; #ifdef GPU_DEBUG - if (cms::alpakatools::once_per_grid(acc)) - printf("starting producing ntuplets from %d cells \n", *nCells); + if (cms::alpakatools::once_per_grid(acc)) + printf("starting producing ntuplets from %d cells \n", *nCells); #endif - for (auto idx : cms::alpakatools::elements_with_stride(acc, (*nCells))) { - auto const &thisCell = cells[idx]; - - if (thisCell.isKilled()) - continue; // cut by earlyFishbone - - // we require at least three hits... 
- - if (thisCell.outerNeighbors().empty()) - continue; - - auto pid = thisCell.layerPairId(); - bool doit = params.startingLayerPair(pid); - - constexpr uint32_t maxDepth = TrackerTraits::maxDepth; - - if (doit) { - typename Cell::TmpTuple stack; - stack.reset(); - bool bpix1Start = params.startAt0(pid); - thisCell.template find_ntuplets(acc, - hh, - cells, - *cellTracks, - tracks_view.hitIndices(), - *apc, - tracks_view.quality(), - stack, - params.minHitsPerNtuplet_, - bpix1Start); - ALPAKA_ASSERT_OFFLOAD(stack.empty()); - } + for (auto idx : cms::alpakatools::elements_with_stride(acc, (*nCells))) { + auto const &thisCell = cells[idx]; + + if (thisCell.isKilled()) + continue; // cut by earlyFishbone + + // we require at least three hits... + + if (thisCell.outerNeighbors().empty()) + continue; + + auto pid = thisCell.layerPairId(); + bool doit = params.startingLayerPair(pid); + + constexpr uint32_t maxDepth = TrackerTraits::maxDepth; + + if (doit) { + typename Cell::TmpTuple stack; + stack.reset(); + bool bpix1Start = params.startAt0(pid); + thisCell.template find_ntuplets(acc, + hh, + cells, + *cellTracks, + tracks_view.hitIndices(), + *apc, + tracks_view.quality(), + stack, + params.minHitsPerNtuplet_, + bpix1Start); + ALPAKA_ASSERT_OFFLOAD(stack.empty()); } } - }; - - template - class Kernel_mark_used { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - CACellT *__restrict__ cells, - uint32_t const *nCells) const { - using Cell = CACellT; - for (auto idx : cms::alpakatools::elements_with_stride(acc, (*nCells))) { - auto &thisCell = cells[idx]; - if (!thisCell.tracks().empty()) - thisCell.setStatusBits(Cell::StatusBit::kInTrack); - } + } + }; + + template + class Kernel_mark_used { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + CACellT *__restrict__ cells, + uint32_t const *nCells) const { + using Cell = CACellT; + for (auto idx : cms::alpakatools::elements_with_stride(acc, (*nCells))) { + auto &thisCell = 
cells[idx]; + if (!thisCell.tracks().empty()) + thisCell.setStatusBits(Cell::StatusBit::kInTrack); } - }; - - template - class Kernel_countMultiplicity { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - TkSoAView tracks_view, - TupleMultiplicity *tupleMultiplicity) const { - for (auto it : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { - auto nhits = tracks_view.hitIndices().size(it); - if (nhits < 3) - continue; - if (tracks_view[it].quality() == Quality::edup) - continue; - ALPAKA_ASSERT_OFFLOAD(tracks_view[it].quality() == Quality::bad); - if (nhits > TrackerTraits::maxHitsOnTrack) // current limit - printf("wrong mult %d %d\n", it, nhits); - ALPAKA_ASSERT_OFFLOAD(nhits <= TrackerTraits::maxHitsOnTrack); - tupleMultiplicity->count(acc, nhits); - } + } + }; + + template + class Kernel_countMultiplicity { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + TupleMultiplicity *tupleMultiplicity) const { + for (auto it : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + auto nhits = tracks_view.hitIndices().size(it); + if (nhits < 3) + continue; + if (tracks_view[it].quality() == Quality::edup) + continue; + ALPAKA_ASSERT_OFFLOAD(tracks_view[it].quality() == Quality::bad); + if (nhits > TrackerTraits::maxHitsOnTrack) // current limit + printf("wrong mult %d %d\n", it, nhits); + ALPAKA_ASSERT_OFFLOAD(nhits <= TrackerTraits::maxHitsOnTrack); + tupleMultiplicity->count(acc, nhits); } - }; - - template - class Kernel_fillMultiplicity { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - TkSoAView tracks_view, - TupleMultiplicity *tupleMultiplicity) const { - for (auto it : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { - auto nhits = tracks_view.hitIndices().size(it); - if (nhits < 3) - continue; - if (tracks_view[it].quality() == Quality::edup) - continue; - 
ALPAKA_ASSERT_OFFLOAD(tracks_view[it].quality() == Quality::bad); - if (nhits > TrackerTraits::maxHitsOnTrack) - printf("wrong mult %d %d\n", it, nhits); - ALPAKA_ASSERT_OFFLOAD(nhits <= TrackerTraits::maxHitsOnTrack); - tupleMultiplicity->fill(acc, nhits, it); - } + } + }; + + template + class Kernel_fillMultiplicity { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + TupleMultiplicity *tupleMultiplicity) const { + for (auto it : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + auto nhits = tracks_view.hitIndices().size(it); + if (nhits < 3) + continue; + if (tracks_view[it].quality() == Quality::edup) + continue; + ALPAKA_ASSERT_OFFLOAD(tracks_view[it].quality() == Quality::bad); + if (nhits > TrackerTraits::maxHitsOnTrack) + printf("wrong mult %d %d\n", it, nhits); + ALPAKA_ASSERT_OFFLOAD(nhits <= TrackerTraits::maxHitsOnTrack); + tupleMultiplicity->fill(acc, nhits, it); } - }; - - template - class Kernel_classifyTracks { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - TkSoAView tracks_view, - QualityCuts cuts) const { - for (auto it : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { - auto nhits = tracks_view.hitIndices().size(it); - if (nhits == 0) - break; // guard - - // if duplicate: not even fit - if (tracks_view[it].quality() == Quality::edup) - continue; - - ALPAKA_ASSERT_OFFLOAD(tracks_view[it].quality() == Quality::bad); - - // mark doublets as bad - if (nhits < 3) - continue; - - // if the fit has any invalid parameters, mark it as bad - bool isNaN = false; - for (int i = 0; i < 5; ++i) { - isNaN |= std::isnan(tracks_view[it].state()(i)); - } - if (isNaN) { + } + }; + + template + class Kernel_classifyTracks { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + QualityCuts cuts) const { + for (auto it : cms::alpakatools::elements_with_stride(acc, 
tracks_view.hitIndices().nOnes())) { + auto nhits = tracks_view.hitIndices().size(it); + if (nhits == 0) + break; // guard + + // if duplicate: not even fit + if (tracks_view[it].quality() == Quality::edup) + continue; + + ALPAKA_ASSERT_OFFLOAD(tracks_view[it].quality() == Quality::bad); + + // mark doublets as bad + if (nhits < 3) + continue; + + // if the fit has any invalid parameters, mark it as bad + bool isNaN = false; + for (int i = 0; i < 5; ++i) { + isNaN |= std::isnan(tracks_view[it].state()(i)); + } + if (isNaN) { #ifdef NTUPLE_DEBUG - printf("NaN in fit %d size %d chi2 %f\n", it, tracks_view.hitIndices().size(it), tracks_view[it].chi2()); + printf("NaN in fit %d size %d chi2 %f\n", it, tracks_view.hitIndices().size(it), tracks_view[it].chi2()); #endif - continue; - } + continue; + } - tracks_view[it].quality() = Quality::strict; + tracks_view[it].quality() = Quality::strict; - if (cuts.strictCut(tracks_view, it)) - continue; + if (cuts.strictCut(tracks_view, it)) + continue; - tracks_view[it].quality() = Quality::tight; + tracks_view[it].quality() = Quality::tight; - if (cuts.isHP(tracks_view, nhits, it)) - tracks_view[it].quality() = Quality::highPurity; - } + if (cuts.isHP(tracks_view, nhits, it)) + tracks_view[it].quality() = Quality::highPurity; } - }; - - template - class Kernel_doStatsForTracks { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, TkSoAView tracks_view, Counters *counters) const { - for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { - if (tracks_view.hitIndices().size(idx) == 0) - break; //guard - if (tracks_view[idx].quality() < Quality::loose) - continue; - alpaka::atomicAdd(acc, &(counters->nLooseTracks), 1ull, alpaka::hierarchy::Blocks{}); - if (tracks_view[idx].quality() < Quality::strict) - continue; - alpaka::atomicAdd(acc, &(counters->nGoodTracks), 1ull, alpaka::hierarchy::Blocks{}); - } + } + }; + + template + class Kernel_doStatsForTracks { + public: + 
template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, TkSoAView tracks_view, Counters *counters) const { + for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + if (tracks_view.hitIndices().size(idx) == 0) + break; //guard + if (tracks_view[idx].quality() < Quality::loose) + continue; + alpaka::atomicAdd(acc, &(counters->nLooseTracks), 1ull, alpaka::hierarchy::Blocks{}); + if (tracks_view[idx].quality() < Quality::strict) + continue; + alpaka::atomicAdd(acc, &(counters->nGoodTracks), 1ull, alpaka::hierarchy::Blocks{}); } - }; - - template - class Kernel_countHitInTracks { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - TkSoAView tracks_view, - HitToTuple *hitToTuple) const { - for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { - if (tracks_view.hitIndices().size(idx) == 0) - break; // guard - for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) - hitToTuple->count(acc, *h); - } + } + }; + + template + class Kernel_countHitInTracks { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + HitToTuple *hitToTuple) const { + for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + if (tracks_view.hitIndices().size(idx) == 0) + break; // guard + for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) + hitToTuple->count(acc, *h); } - }; - - template - class Kernel_fillHitInTracks { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - TkSoAView tracks_view, - HitToTuple *hitToTuple) const { - for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { - if (tracks_view.hitIndices().size(idx) == 0) - break; // guard - for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) - 
hitToTuple->fill(acc, *h, idx); - } + } + }; + + template + class Kernel_fillHitInTracks { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + HitToTuple *hitToTuple) const { + for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + if (tracks_view.hitIndices().size(idx) == 0) + break; // guard + for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) + hitToTuple->fill(acc, *h, idx); } - }; - - template - class Kernel_fillHitDetIndices { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - TkSoAView tracks_view, - HitsConstView hh) const { - // copy offsets - for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { - tracks_view.detIndices().off[idx] = tracks_view.hitIndices().off[idx]; - } - // fill hit indices - for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().size())) { - ALPAKA_ASSERT_OFFLOAD(tracks_view.hitIndices().content[idx] < (uint32_t)hh.metadata().size()); - tracks_view.detIndices().content[idx] = hh[tracks_view.hitIndices().content[idx]].detectorIndex(); - } + } + }; + + template + class Kernel_fillHitDetIndices { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + HitsConstView hh) const { + // copy offsets + for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + tracks_view.detIndices().off[idx] = tracks_view.hitIndices().off[idx]; } - }; - - template - class Kernel_fillNLayers { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - TkSoAView tracks_view, - cms::alpakatools::AtomicPairCounter *apc) const { - // clamp the number of tracks to the capacity of the SoA - auto ntracks = std::min(apc->get().first, tracks_view.metadata().size() - 1); - - if (cms::alpakatools::once_per_grid(acc)) - tracks_view.nTracks() = 
ntracks; - for (auto idx : cms::alpakatools::elements_with_stride(acc, ntracks)) { - ALPAKA_ASSERT_OFFLOAD(TracksUtilities::nHits(tracks_view, idx) >= 3); - tracks_view[idx].nLayers() = TracksUtilities::computeNumberOfLayers(tracks_view, idx); - } + // fill hit indices + for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().size())) { + ALPAKA_ASSERT_OFFLOAD(tracks_view.hitIndices().content[idx] < (uint32_t)hh.metadata().size()); + tracks_view.detIndices().content[idx] = hh[tracks_view.hitIndices().content[idx]].detectorIndex(); } - }; - - template - class Kernel_doStatsForHitInTracks { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - HitToTuple const *__restrict__ hitToTuple, - Counters *counters) const { - auto &c = *counters; - for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple->nOnes())) { - if (hitToTuple->size(idx) == 0) - continue; // SHALL NOT BE break - alpaka::atomicAdd(acc, &c.nUsedHits, 1ull, alpaka::hierarchy::Blocks{}); - if (hitToTuple->size(idx) > 1) - alpaka::atomicAdd(acc, &c.nDupHits, 1ull, alpaka::hierarchy::Blocks{}); - } + } + }; + + template + class Kernel_fillNLayers { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + cms::alpakatools::AtomicPairCounter *apc) const { + // clamp the number of tracks to the capacity of the SoA + auto ntracks = std::min(apc->get().first, tracks_view.metadata().size() - 1); + + if (cms::alpakatools::once_per_grid(acc)) + tracks_view.nTracks() = ntracks; + for (auto idx : cms::alpakatools::elements_with_stride(acc, ntracks)) { + ALPAKA_ASSERT_OFFLOAD(TracksUtilities::nHits(tracks_view, idx) >= 3); + tracks_view[idx].nLayers() = TracksUtilities::computeNumberOfLayers(tracks_view, idx); } - }; - - template - class Kernel_countSharedHit { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - int *__restrict__ nshared, - HitContainer const *__restrict__ ptuples, - Quality 
const *__restrict__ quality, - HitToTuple const *__restrict__ phitToTuple) const { - constexpr auto loose = Quality::loose; - - auto &hitToTuple = *phitToTuple; - auto const &foundNtuplets = *ptuples; - for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple->nbins())) { - if (hitToTuple.size(idx) < 2) - continue; - - int nt = 0; - - // count "good" tracks - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] < loose) - continue; - ++nt; - } - - if (nt < 2) + } + }; + + template + class Kernel_doStatsForHitInTracks { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + HitToTuple const *__restrict__ hitToTuple, + Counters *counters) const { + auto &c = *counters; + for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple->nOnes())) { + if (hitToTuple->size(idx) == 0) + continue; // SHALL NOT BE break + alpaka::atomicAdd(acc, &c.nUsedHits, 1ull, alpaka::hierarchy::Blocks{}); + if (hitToTuple->size(idx) > 1) + alpaka::atomicAdd(acc, &c.nDupHits, 1ull, alpaka::hierarchy::Blocks{}); + } + } + }; + + template + class Kernel_countSharedHit { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + int *__restrict__ nshared, + HitContainer const *__restrict__ ptuples, + Quality const *__restrict__ quality, + HitToTuple const *__restrict__ phitToTuple) const { + constexpr auto loose = Quality::loose; + + auto &hitToTuple = *phitToTuple; + auto const &foundNtuplets = *ptuples; + for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple->nbins())) { + if (hitToTuple.size(idx) < 2) + continue; + + int nt = 0; + + // count "good" tracks + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + if (quality[*it] < loose) continue; + ++nt; + } - // now mark each track triplet as sharing a hit - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (foundNtuplets.size(*it) > 3) - continue; - alpaka::atomicAdd(acc, 
&nshared[*it], 1ull, alpaka::hierarchy::Blocks{}); - } + if (nt < 2) + continue; - } // hit loop - } - }; - - template - class Kernel_markSharedHit { - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - int const *__restrict__ nshared, - HitContainer const *__restrict__ tuples, - Quality *__restrict__ quality, - bool dupPassThrough) const { - // constexpr auto bad = Quality::bad; - constexpr auto dup = Quality::dup; - constexpr auto loose = Quality::loose; - // constexpr auto strict = Quality::strict; - - // quality to mark rejected - auto const reject = dupPassThrough ? loose : dup; - for (auto idx : cms::alpakatools::elements_with_stride(acc, tuples->nbins())) { - if (tuples->size(idx) == 0) - break; //guard - if (quality[idx] <= reject) + // now mark each track triplet as sharing a hit + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + if (foundNtuplets.size(*it) > 3) continue; - if (nshared[idx] > 2) - quality[idx] = reject; + alpaka::atomicAdd(acc, &nshared[*it], 1ull, alpaka::hierarchy::Blocks{}); } + + } // hit loop + } + }; + + template + class Kernel_markSharedHit { + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + int const *__restrict__ nshared, + HitContainer const *__restrict__ tuples, + Quality *__restrict__ quality, + bool dupPassThrough) const { + // constexpr auto bad = Quality::bad; + constexpr auto dup = Quality::dup; + constexpr auto loose = Quality::loose; + // constexpr auto strict = Quality::strict; + + // quality to mark rejected + auto const reject = dupPassThrough ? loose : dup; + for (auto idx : cms::alpakatools::elements_with_stride(acc, tuples->nbins())) { + if (tuples->size(idx) == 0) + break; //guard + if (quality[idx] <= reject) + continue; + if (nshared[idx] > 2) + quality[idx] = reject; } - }; - - // mostly for very forward triplets..... 
- template - class Kernel_rejectDuplicate { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - TkSoAView tracks_view, - uint16_t nmin, - bool dupPassThrough, - HitToTuple const *__restrict__ phitToTuple) const { - // quality to mark rejected - auto const reject = dupPassThrough ? Quality::loose : Quality::dup; - - auto &hitToTuple = *phitToTuple; - - for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) { - if (hitToTuple.size(idx) < 2) + } + }; + + // mostly for very forward triplets..... + template + class Kernel_rejectDuplicate { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + uint16_t nmin, + bool dupPassThrough, + HitToTuple const *__restrict__ phitToTuple) const { + // quality to mark rejected + auto const reject = dupPassThrough ? Quality::loose : Quality::dup; + + auto &hitToTuple = *phitToTuple; + + for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) { + if (hitToTuple.size(idx) < 2) + continue; + + auto score = [&](auto it, auto nl) { return std::abs(reco::tip(tracks_view, it)); }; + + // full combinatorics + for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) { + auto const it = *ip; + auto qi = tracks_view[it].quality(); + if (qi <= reject) continue; - - auto score = [&](auto it, auto nl) { return std::abs(reco::tip(tracks_view, it)); }; - - // full combinatorics - for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) { - auto const it = *ip; - auto qi = tracks_view[it].quality(); - if (qi <= reject) + auto opi = tracks_view[it].state()(2); + auto e2opi = tracks_view[it].covariance()(9); + auto cti = tracks_view[it].state()(3); + auto e2cti = tracks_view[it].covariance()(12); + auto nli = tracks_view[it].nLayers(); + for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) { + auto const jt = *jp; + auto qj = tracks_view[jt].quality(); + if (qj <= reject) + continue; + auto opj = 
tracks_view[jt].state()(2); + auto ctj = tracks_view[jt].state()(3); + auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); + if ((cti - ctj) * (cti - ctj) > dct) + continue; + auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); + if ((opi - opj) * (opi - opj) > dop) continue; - auto opi = tracks_view[it].state()(2); - auto e2opi = tracks_view[it].covariance()(9); - auto cti = tracks_view[it].state()(3); - auto e2cti = tracks_view[it].covariance()(12); - auto nli = tracks_view[it].nLayers(); - for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) { - auto const jt = *jp; - auto qj = tracks_view[jt].quality(); - if (qj <= reject) - continue; - auto opj = tracks_view[jt].state()(2); - auto ctj = tracks_view[jt].state()(3); - auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); - if ((cti - ctj) * (cti - ctj) > dct) - continue; - auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); - if ((opi - opj) * (opi - opj) > dop) - continue; - auto nlj = tracks_view[jt].nLayers(); - if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj))))) - tracks_view[jt].quality() = reject; - else { - tracks_view[it].quality() = reject; - break; - } + auto nlj = tracks_view[jt].nLayers(); + if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj))))) + tracks_view[jt].quality() = reject; + else { + tracks_view[it].quality() = reject; + break; } } } } - }; - - template - class Kernel_sharedHitCleaner { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - HitsConstView hh, - TkSoAView tracks_view, - int nmin, - bool dupPassThrough, - HitToTuple const *__restrict__ phitToTuple) const { - // quality to mark rejected - auto const reject = dupPassThrough ? 
Quality::loose : Quality::dup; - // quality of longest track - auto const longTqual = Quality::highPurity; - - auto &hitToTuple = *phitToTuple; - - uint32_t l1end = hh.hitsLayerStart()[1]; - - for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) { - if (hitToTuple.size(idx) < 2) + } + }; + + template + class Kernel_sharedHitCleaner { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + HitsConstView hh, + TkSoAView tracks_view, + int nmin, + bool dupPassThrough, + HitToTuple const *__restrict__ phitToTuple) const { + // quality to mark rejected + auto const reject = dupPassThrough ? Quality::loose : Quality::dup; + // quality of longest track + auto const longTqual = Quality::highPurity; + + auto &hitToTuple = *phitToTuple; + + uint32_t l1end = hh.hitsLayerStart()[1]; + + for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) { + if (hitToTuple.size(idx) < 2) + continue; + + int8_t maxNl = 0; + + // find maxNl + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + if (tracks_view[*it].quality() < longTqual) continue; + // if (tracks_view[*it].nHits()==3) continue; + auto nl = tracks_view[*it].nLayers(); + maxNl = std::max(nl, maxNl); + } - int8_t maxNl = 0; - - // find maxNl - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (tracks_view[*it].quality() < longTqual) - continue; - // if (tracks_view[*it].nHits()==3) continue; - auto nl = tracks_view[*it].nLayers(); - maxNl = std::max(nl, maxNl); - } - - if (maxNl < 4) - continue; + if (maxNl < 4) + continue; - // quad pass through (leave for tests) - // maxNl = std::min(4, maxNl); + // quad pass through (leave for tests) + // maxNl = std::min(4, maxNl); - // kill all tracks shorter than maxHl (only triplets??? - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - auto nl = tracks_view[*it].nLayers(); + // kill all tracks shorter than maxHl (only triplets??? 
+ for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + auto nl = tracks_view[*it].nLayers(); - //checking if shared hit is on bpix1 and if the tuple is short enough - if (idx < l1end and nl > nmin) - continue; + //checking if shared hit is on bpix1 and if the tuple is short enough + if (idx < l1end and nl > nmin) + continue; - if (nl < maxNl && tracks_view[*it].quality() > reject) - tracks_view[*it].quality() = reject; - } + if (nl < maxNl && tracks_view[*it].quality() > reject) + tracks_view[*it].quality() = reject; } } - }; - template - class Kernel_tripletCleaner { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - TkSoAView tracks_view, - uint16_t nmin, - bool dupPassThrough, - HitToTuple const *__restrict__ phitToTuple) const { - // quality to mark rejected - auto const reject = Quality::loose; - /// min quality of good - auto const good = Quality::strict; - - auto &hitToTuple = *phitToTuple; - - for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) { - if (hitToTuple.size(idx) < 2) + } + }; + template + class Kernel_tripletCleaner { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + uint16_t nmin, + bool dupPassThrough, + HitToTuple const *__restrict__ phitToTuple) const { + // quality to mark rejected + auto const reject = Quality::loose; + /// min quality of good + auto const good = Quality::strict; + + auto &hitToTuple = *phitToTuple; + + for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) { + if (hitToTuple.size(idx) < 2) + continue; + + float mc = maxScore; + uint16_t im = tkNotFound; + bool onlyTriplets = true; + + // check if only triplets + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + if (tracks_view[*it].quality() <= good) continue; - - float mc = maxScore; - uint16_t im = tkNotFound; - bool onlyTriplets = true; - - // check if only triplets - for (auto it = 
hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (tracks_view[*it].quality() <= good) - continue; - onlyTriplets &= reco::isTriplet(tracks_view, *it); - if (!onlyTriplets) - break; - } - - // only triplets + onlyTriplets &= reco::isTriplet(tracks_view, *it); if (!onlyTriplets) - continue; - - // for triplets choose best tip! (should we first find best quality???) - for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { - auto const it = *ip; - if (tracks_view[it].quality() >= good && std::abs(reco::tip(tracks_view, it)) < mc) { - mc = std::abs(reco::tip(tracks_view, it)); - im = it; - } - } + break; + } - if (tkNotFound == im) - continue; + // only triplets + if (!onlyTriplets) + continue; - // mark worse ambiguities - for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { - auto const it = *ip; - if (tracks_view[it].quality() > reject && it != im) - tracks_view[it].quality() = reject; //no race: simple assignment of the same constant + // for triplets choose best tip! (should we first find best quality???) 
+ for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (tracks_view[it].quality() >= good && std::abs(reco::tip(tracks_view, it)) < mc) { + mc = std::abs(reco::tip(tracks_view, it)); + im = it; } + } - } // loop over hits - } - }; - - template - class Kernel_simpleTripletCleaner { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - TkSoAView tracks_view, - uint16_t nmin, - bool dupPassThrough, - HitToTuple const *__restrict__ phitToTuple) const { - // quality to mark rejected - auto const reject = Quality::loose; - /// min quality of good - auto const good = Quality::loose; - - auto &hitToTuple = *phitToTuple; - - for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) { - if (hitToTuple.size(idx) < 2) - continue; + if (tkNotFound == im) + continue; - float mc = maxScore; - uint16_t im = tkNotFound; + // mark worse ambiguities + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (tracks_view[it].quality() > reject && it != im) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant + } - // choose best tip! (should we first find best quality???) 
- for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { - auto const it = *ip; - if (tracks_view[it].quality() >= good && std::abs(reco::tip(tracks_view, it)) < mc) { - mc = std::abs(reco::tip(tracks_view, it)); - im = it; - } + } // loop over hits + } + }; + + template + class Kernel_simpleTripletCleaner { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + uint16_t nmin, + bool dupPassThrough, + HitToTuple const *__restrict__ phitToTuple) const { + // quality to mark rejected + auto const reject = Quality::loose; + /// min quality of good + auto const good = Quality::loose; + + auto &hitToTuple = *phitToTuple; + + for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) { + if (hitToTuple.size(idx) < 2) + continue; + + float mc = maxScore; + uint16_t im = tkNotFound; + + // choose best tip! (should we first find best quality???) + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (tracks_view[it].quality() >= good && std::abs(reco::tip(tracks_view, it)) < mc) { + mc = std::abs(reco::tip(tracks_view, it)); + im = it; } + } - if (tkNotFound == im) - continue; - - // mark worse ambiguities - for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { - auto const it = *ip; - if (tracks_view[it].quality() > reject && reco::isTriplet(tracks_view, it) && it != im) - tracks_view[it].quality() = reject; //no race: simple assignment of the same constant - } + if (tkNotFound == im) + continue; - } // loop over hits - } - }; - - template - class Kernel_print_found_ntuplets { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, - HitsConstView hh, - TkSoAView tracks_view, - HitToTuple const *__restrict__ phitToTuple, - int32_t firstPrint, - int32_t lastPrint, - int iev) const { - constexpr auto loose = Quality::loose; - - for (auto i : cms::alpakatools::elements_with_stride(acc, 
tracks_view.hitIndices().nbins())) { - auto nh = tracks_view.hitIndices().size(i); - if (nh < 3) - continue; - if (tracks_view[i].quality() < loose) - continue; - printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n", - 10000 * iev + i, - int(tracks_view[i].quality()), - nh, - tracks_view[i].nLayers(), - reco::charge(tracks_view, i), - tracks_view[i].pt(), - tracks_view[i].eta(), - reco::phi(tracks_view, i), - reco::tip(tracks_view, i), - reco::zip(tracks_view, i), - tracks_view[i].chi2(), - hh[*tracks_view.hitIndices().begin(i)].zGlobal(), - hh[*(tracks_view.hitIndices().begin(i) + 1)].zGlobal(), - hh[*(tracks_view.hitIndices().begin(i) + 2)].zGlobal(), - nh > 3 ? hh[int(*(tracks_view.hitIndices().begin(i) + 3))].zGlobal() : 0, - nh > 4 ? hh[int(*(tracks_view.hitIndices().begin(i) + 4))].zGlobal() : 0, - nh > 5 ? hh[int(*(tracks_view.hitIndices().begin(i) + 5))].zGlobal() : 0, - nh > 6 ? hh[int(*(tracks_view.hitIndices().begin(i) + nh - 1))].zGlobal() : 0); + // mark worse ambiguities + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (tracks_view[it].quality() > reject && reco::isTriplet(tracks_view, it) && it != im) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } + + } // loop over hits + } + }; + + template + class Kernel_print_found_ntuplets { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + HitsConstView hh, + TkSoAView tracks_view, + HitToTuple const *__restrict__ phitToTuple, + int32_t firstPrint, + int32_t lastPrint, + int iev) const { + constexpr auto loose = Quality::loose; + + for (auto i : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nbins())) { + auto nh = tracks_view.hitIndices().size(i); + if (nh < 3) + continue; + if (tracks_view[i].quality() < loose) + continue; + printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n", + 10000 * iev + i, + 
int(tracks_view[i].quality()), + nh, + tracks_view[i].nLayers(), + reco::charge(tracks_view, i), + tracks_view[i].pt(), + tracks_view[i].eta(), + reco::phi(tracks_view, i), + reco::tip(tracks_view, i), + reco::zip(tracks_view, i), + tracks_view[i].chi2(), + hh[*tracks_view.hitIndices().begin(i)].zGlobal(), + hh[*(tracks_view.hitIndices().begin(i) + 1)].zGlobal(), + hh[*(tracks_view.hitIndices().begin(i) + 2)].zGlobal(), + nh > 3 ? hh[int(*(tracks_view.hitIndices().begin(i) + 3))].zGlobal() : 0, + nh > 4 ? hh[int(*(tracks_view.hitIndices().begin(i) + 4))].zGlobal() : 0, + nh > 5 ? hh[int(*(tracks_view.hitIndices().begin(i) + 5))].zGlobal() : 0, + nh > 6 ? hh[int(*(tracks_view.hitIndices().begin(i) + nh - 1))].zGlobal() : 0); } - }; - - class Kernel_printCounters { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const &acc, Counters const *counters) const { - auto const &c = *counters; - printf( - "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks | nLooseTracks | nGoodTracks | " - "nUsedHits " - "| " - "nDupHits | " - "nFishCells | " - "nKilledCells | " - "nUsedCells | nZeroTrackCells ||\n"); - printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n", - c.nEvents, - c.nHits, - c.nCells, - c.nTuples, - c.nFitTracks, - c.nLooseTracks, - c.nGoodTracks, - c.nUsedHits, - c.nDupHits, - c.nFishCells, - c.nKilledCells, - c.nEmptyCells, - c.nZeroTrackCells); - printf( - "Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| " - "%.3f| " - "%.3f||\n", - c.nEvents, - c.nHits / double(c.nEvents), - c.nCells / double(c.nEvents), - c.nTuples / double(c.nEvents), - c.nFitTracks / double(c.nEvents), - c.nLooseTracks / double(c.nEvents), - c.nGoodTracks / double(c.nEvents), - c.nUsedHits / double(c.nEvents), - c.nDupHits / double(c.nEvents), - c.nFishCells / double(c.nCells), - c.nKilledCells / double(c.nCells), - c.nEmptyCells / double(c.nCells), - c.nZeroTrackCells / double(c.nCells)); - } - }; - } 
// namespace caHitNtupletGeneratorKernels -} // namespace ALPAKA_ACCELERATOR_NAMESPACE + } + }; + + class Kernel_printCounters { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, Counters const *counters) const { + auto const &c = *counters; + printf( + "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks | nLooseTracks | nGoodTracks | " + "nUsedHits " + "| " + "nDupHits | " + "nFishCells | " + "nKilledCells | " + "nUsedCells | nZeroTrackCells ||\n"); + printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n", + c.nEvents, + c.nHits, + c.nCells, + c.nTuples, + c.nFitTracks, + c.nLooseTracks, + c.nGoodTracks, + c.nUsedHits, + c.nDupHits, + c.nFishCells, + c.nKilledCells, + c.nEmptyCells, + c.nZeroTrackCells); + printf( + "Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| " + "%.3f| " + "%.3f||\n", + c.nEvents, + c.nHits / double(c.nEvents), + c.nCells / double(c.nEvents), + c.nTuples / double(c.nEvents), + c.nFitTracks / double(c.nEvents), + c.nLooseTracks / double(c.nEvents), + c.nGoodTracks / double(c.nEvents), + c.nUsedHits / double(c.nEvents), + c.nDupHits / double(c.nEvents), + c.nFishCells / double(c.nCells), + c.nKilledCells / double(c.nCells), + c.nEmptyCells / double(c.nCells), + c.nZeroTrackCells / double(c.nCells)); + } + }; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels + +#endif // RecoTracker_PixelSeeding_plugins_alpaka_CAHitNtupletGeneratorKernelsImpl_h diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h index 2bbdcdb7b084c..518a55c318402 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h @@ -1,10 +1,14 @@ -#ifndef RecoPixelVertexing_PixelTriplets_alpaka_CAPixelDoublets_h -#define RecoPixelVertexing_PixelTriplets_alpaka_CAPixelDoublets_h +#ifndef 
RecoTracker_PixelSeeding_plugins_alpaka_CAPixelDoublets_h +#define RecoTracker_PixelSeeding_plugins_alpaka_CAPixelDoublets_h + +#include #include -#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "HeterogeneousCore/AlpakaInterface/interface/traits.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + #include "CAPixelDoubletsAlgos.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { @@ -66,6 +70,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { acc, nActualPairs, maxNumOfDoublets, cells, nCells, cellNeighbors, cellTracks, hh, *isOuterHitOfCell, cuts); } }; + } // namespace caPixelDoublets + } // namespace ALPAKA_ACCELERATOR_NAMESPACE -#endif // RecoPixelVertexing_PixelTriplets_plugins_CAPixelDoublets_h + +#endif // RecoTracker_PixelSeeding_plugins_alpaka_CAPixelDoublets_h diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoubletsAlgos.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoubletsAlgos.h index 234b9b7527a3c..ddeb853a7ec93 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoubletsAlgos.h +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoubletsAlgos.h @@ -1,5 +1,5 @@ -#ifndef RecoPixelVertexing_PixelTriplets_alpaka_CAPixelDoubletsAlgos_h -#define RecoPixelVertexing_PixelTriplets_alpaka_CAPixelDoubletsAlgos_h +#ifndef RecoTracker_PixelSeeding_plugins_alpaka_CAPixelDoubletsAlgos_h +#define RecoTracker_PixelSeeding_plugins_alpaka_CAPixelDoubletsAlgos_h #include #include @@ -13,6 +13,7 @@ #include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "HeterogeneousCore/AlpakaInterface/interface/VecArray.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "HeterogeneousCore/AlpakaInterface/interface/traits.h" #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" @@ -22,323 +23,323 @@ //#define GPU_DEBUG 
//#define NTUPLE_DEBUG -namespace ALPAKA_ACCELERATOR_NAMESPACE { - namespace caPixelDoublets { - using namespace cms::alpakatools; - - template - using CellNeighbors = caStructures::CellNeighborsT; - template - using CellTracks = caStructures::CellTracksT; - template - using CellNeighborsVector = caStructures::CellNeighborsVectorT; - template - using CellTracksVector = caStructures::CellTracksVectorT; - template - using OuterHitOfCell = caStructures::OuterHitOfCellT; - template - using HitsConstView = typename CACellT::HitsConstView; - - template - struct CellCutsT { - using H = HitsConstView; - using T = TrackerTraits; - - CellCutsT() = default; - - CellCutsT(const bool doClusterCut, - const bool doZ0Cut, - const bool doPtCut, - const bool idealConditions, - const float z0Cut, - const float ptCut, - const std::vector& phiCutsV) - : doClusterCut_(doClusterCut), - doZ0Cut_(doZ0Cut), - doPtCut_(doPtCut), - idealConditions_(idealConditions), - z0Cut_(z0Cut), - ptCut_(ptCut) { - assert(phiCutsV.size() == TrackerTraits::nPairs); - std::copy(phiCutsV.begin(), phiCutsV.end(), &phiCuts[0]); - } - - bool doClusterCut_; - bool doZ0Cut_; - bool doPtCut_; - bool idealConditions_; //this is actually not used by phase2 - - float z0Cut_; //FIXME: check if could be const now - float ptCut_; - - int phiCuts[T::nPairs]; - - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool __attribute__((always_inline)) - zSizeCut(const TAcc& acc, H hh, int i, int o) const { - const uint32_t mi = hh[i].detectorIndex(); - - bool innerB1 = mi < T::last_bpix1_detIndex; - bool isOuterLadder = idealConditions_ ? true : 0 == (mi / 8) % 2; - auto mes = (!innerB1) || isOuterLadder ? 
hh[i].clusterSizeY() : -1; - - if (mes < 0) - return false; - - const uint32_t mo = hh[o].detectorIndex(); - auto so = hh[o].clusterSizeY(); - - auto dz = hh[i].zGlobal() - hh[o].zGlobal(); - auto dr = hh[i].rGlobal() - hh[o].rGlobal(); - - auto innerBarrel = mi < T::last_barrel_detIndex; - auto onlyBarrel = mo < T::last_barrel_detIndex; - - if (not innerBarrel and not onlyBarrel) - return false; - auto dy = innerB1 ? T::maxDYsize12 : T::maxDYsize; - - return onlyBarrel ? so > 0 && std::abs(so - mes) > dy - : innerBarrel && std::abs(mes - int(std::abs(dz / dr) * T::dzdrFact + 0.5f)) > T::maxDYPred; - } - - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool __attribute__((always_inline)) - clusterCut(const TAcc& acc, H hh, uint32_t i) const { - const uint32_t mi = hh[i].detectorIndex(); - bool innerB1orB2 = mi < T::last_bpix2_detIndex; - - if (!innerB1orB2) - return false; - - bool innerB1 = mi < T::last_bpix1_detIndex; - bool isOuterLadder = idealConditions_ ? true : 0 == (mi / 8) % 2; - auto mes = (!innerB1) || isOuterLadder ? 
hh[i].clusterSizeY() : -1; - - if (innerB1) // B1 - if (mes > 0 && mes < T::minYsizeB1) - return true; // only long cluster (5*8) - bool innerB2 = (mi >= T::last_bpix1_detIndex) && (mi < T::last_bpix2_detIndex); //FIXME number - if (innerB2) // B2 and F1 - if (mes > 0 && mes < T::minYsizeB2) - return true; - +namespace ALPAKA_ACCELERATOR_NAMESPACE::caPixelDoublets { + using namespace cms::alpakatools; + + template + using CellNeighbors = caStructures::CellNeighborsT; + template + using CellTracks = caStructures::CellTracksT; + template + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + template + using CellTracksVector = caStructures::CellTracksVectorT; + template + using OuterHitOfCell = caStructures::OuterHitOfCellT; + template + using HitsConstView = typename CACellT::HitsConstView; + + template + struct CellCutsT { + using H = HitsConstView; + using T = TrackerTraits; + + CellCutsT() = default; + + CellCutsT(const bool doClusterCut, + const bool doZ0Cut, + const bool doPtCut, + const bool idealConditions, + const float z0Cut, + const float ptCut, + const std::vector& phiCutsV) + : doClusterCut_(doClusterCut), + doZ0Cut_(doZ0Cut), + doPtCut_(doPtCut), + idealConditions_(idealConditions), + z0Cut_(z0Cut), + ptCut_(ptCut) { + assert(phiCutsV.size() == TrackerTraits::nPairs); + std::copy(phiCutsV.begin(), phiCutsV.end(), &phiCuts[0]); + } + + bool doClusterCut_; + bool doZ0Cut_; + bool doPtCut_; + bool idealConditions_; //this is actually not used by phase2 + + float z0Cut_; //FIXME: check if could be const now + float ptCut_; + + int phiCuts[T::nPairs]; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool __attribute__((always_inline)) + zSizeCut(const TAcc& acc, H hh, int i, int o) const { + const uint32_t mi = hh[i].detectorIndex(); + + bool innerB1 = mi < T::last_bpix1_detIndex; + bool isOuterLadder = idealConditions_ ? true : 0 == (mi / 8) % 2; + auto mes = (!innerB1) || isOuterLadder ? 
hh[i].clusterSizeY() : -1; + + if (mes < 0) return false; - } - }; - - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void __attribute__((always_inline)) - doubletsFromHisto(const TAcc& acc, - uint32_t nPairs, - const uint32_t maxNumOfDoublets, - CACellT* cells, - uint32_t* nCells, - CellNeighborsVector* cellNeighbors, - CellTracksVector* cellTracks, - HitsConstView hh, - OuterHitOfCell isOuterHitOfCell, - CellCutsT const& cuts) { // ysize cuts (z in the barrel) times 8 - // these are used if doClusterCut is true - - const bool doClusterCut = cuts.doClusterCut_; - const bool doZ0Cut = cuts.doZ0Cut_; - const bool doPtCut = cuts.doPtCut_; - - const float z0cut = cuts.z0Cut_; // cm - const float hardPtCut = cuts.ptCut_; // GeV - // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field) - const float minRadius = hardPtCut * 87.78f; - const float minRadius2T4 = 4.f * minRadius * minRadius; - - using PhiBinner = typename TrackingRecHitSoA::PhiBinner; - - auto const& __restrict__ phiBinner = hh.phiBinner(); - uint32_t const* __restrict__ offsets = hh.hitsLayerStart().data(); - ALPAKA_ASSERT_OFFLOAD(offsets); - - auto layerSize = [=](uint8_t li) { return offsets[li + 1] - offsets[li]; }; - - // nPairsMax to be optimized later (originally was 64). - // If it should much be bigger, consider using a block-wide parallel prefix scan, - // e.g. 
see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html - auto& innerLayerCumulativeSize = alpaka::declareSharedVar(acc); - auto& ntot = alpaka::declareSharedVar(acc); - - constexpr uint32_t dimIndexY = 0u; - constexpr uint32_t dimIndexX = 1u; - const uint32_t threadIdxLocalY(alpaka::getIdx(acc)[dimIndexY]); - const uint32_t threadIdxLocalX(alpaka::getIdx(acc)[dimIndexX]); - - if (threadIdxLocalY == 0 && threadIdxLocalX == 0) { - innerLayerCumulativeSize[0] = layerSize(TrackerTraits::layerPairs[0]); - for (uint32_t i = 1; i < nPairs; ++i) { - innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i - 1] + layerSize(TrackerTraits::layerPairs[2 * i]); - } - ntot = innerLayerCumulativeSize[nPairs - 1]; - } - alpaka::syncBlockThreads(acc); - - // x runs faster - const uint32_t blockDimensionX(alpaka::getWorkDiv(acc)[dimIndexX]); - const auto& [firstElementIdxNoStrideX, endElementIdxNoStrideX] = - cms::alpakatools::element_index_range_in_block(acc, 0u, dimIndexX); - - uint32_t pairLayerId = 0; // cannot go backward - // Outermost loop on Y - const uint32_t gridDimensionY(alpaka::getWorkDiv(acc)[dimIndexY]); - const auto& [firstElementIdxNoStrideY, endElementIdxNoStrideY] = - cms::alpakatools::element_index_range_in_grid(acc, 0u, dimIndexY); - uint32_t firstElementIdxY = firstElementIdxNoStrideY; - uint32_t endElementIdxY = endElementIdxNoStrideY; + const uint32_t mo = hh[o].detectorIndex(); + auto so = hh[o].clusterSizeY(); - //const uint32_t incY = cms::alpakatools::requires_single_thread_per_block_v ? 1 : gridDimensionY; - for (uint32_t j = firstElementIdxY; j < ntot; j++) { - if (not cms::alpakatools::next_valid_element_index_strided( - j, firstElementIdxY, endElementIdxY, gridDimensionY, ntot)) - break; + auto dz = hh[i].zGlobal() - hh[o].zGlobal(); + auto dr = hh[i].rGlobal() - hh[o].rGlobal(); - while (j >= innerLayerCumulativeSize[pairLayerId++]) - ; - --pairLayerId; // move to lower_bound ?? 
+ auto innerBarrel = mi < T::last_barrel_detIndex; + auto onlyBarrel = mo < T::last_barrel_detIndex; - ALPAKA_ASSERT_OFFLOAD(pairLayerId < nPairs); - ALPAKA_ASSERT_OFFLOAD(j < innerLayerCumulativeSize[pairLayerId]); - ALPAKA_ASSERT_OFFLOAD(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId - 1]); - - uint8_t inner = TrackerTraits::layerPairs[2 * pairLayerId]; - uint8_t outer = TrackerTraits::layerPairs[2 * pairLayerId + 1]; - ALPAKA_ASSERT_OFFLOAD(outer > inner); + if (not innerBarrel and not onlyBarrel) + return false; + auto dy = innerB1 ? T::maxDYsize12 : T::maxDYsize; - auto hoff = PhiBinner::histOff(outer); - auto i = (0 == pairLayerId) ? j : j - innerLayerCumulativeSize[pairLayerId - 1]; - i += offsets[inner]; + return onlyBarrel ? so > 0 && std::abs(so - mes) > dy + : innerBarrel && std::abs(mes - int(std::abs(dz / dr) * T::dzdrFact + 0.5f)) > T::maxDYPred; + } - ALPAKA_ASSERT_OFFLOAD(i >= offsets[inner]); - ALPAKA_ASSERT_OFFLOAD(i < offsets[inner + 1]); + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool __attribute__((always_inline)) + clusterCut(const TAcc& acc, H hh, uint32_t i) const { + const uint32_t mi = hh[i].detectorIndex(); + bool innerB1orB2 = mi < T::last_bpix2_detIndex; - // found hit corresponding to our cuda thread, now do the job - if (hh[i].detectorIndex() > pixelClustering::maxNumModules) - continue; // invalid + if (!innerB1orB2) + return false; - /* maybe clever, not effective when zoCut is on + bool innerB1 = mi < T::last_bpix1_detIndex; + bool isOuterLadder = idealConditions_ ? true : 0 == (mi / 8) % 2; + auto mes = (!innerB1) || isOuterLadder ? 
hh[i].clusterSizeY() : -1; + + if (innerB1) // B1 + if (mes > 0 && mes < T::minYsizeB1) + return true; // only long cluster (5*8) + bool innerB2 = (mi >= T::last_bpix1_detIndex) && (mi < T::last_bpix2_detIndex); //FIXME number + if (innerB2) // B2 and F1 + if (mes > 0 && mes < T::minYsizeB2) + return true; + + return false; + } + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void __attribute__((always_inline)) + doubletsFromHisto(const TAcc& acc, + uint32_t nPairs, + const uint32_t maxNumOfDoublets, + CACellT* cells, + uint32_t* nCells, + CellNeighborsVector* cellNeighbors, + CellTracksVector* cellTracks, + HitsConstView hh, + OuterHitOfCell isOuterHitOfCell, + CellCutsT const& cuts) { // ysize cuts (z in the barrel) times 8 + // these are used if doClusterCut is true + + const bool doClusterCut = cuts.doClusterCut_; + const bool doZ0Cut = cuts.doZ0Cut_; + const bool doPtCut = cuts.doPtCut_; + + const float z0cut = cuts.z0Cut_; // cm + const float hardPtCut = cuts.ptCut_; // GeV + // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field) + const float minRadius = hardPtCut * 87.78f; + const float minRadius2T4 = 4.f * minRadius * minRadius; + + using PhiBinner = typename TrackingRecHitSoA::PhiBinner; + + auto const& __restrict__ phiBinner = hh.phiBinner(); + uint32_t const* __restrict__ offsets = hh.hitsLayerStart().data(); + ALPAKA_ASSERT_OFFLOAD(offsets); + + auto layerSize = [=](uint8_t li) { return offsets[li + 1] - offsets[li]; }; + + // nPairsMax to be optimized later (originally was 64). + // If it should much be bigger, consider using a block-wide parallel prefix scan, + // e.g. 
see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html + auto& innerLayerCumulativeSize = alpaka::declareSharedVar(acc); + auto& ntot = alpaka::declareSharedVar(acc); + + constexpr uint32_t dimIndexY = 0u; + constexpr uint32_t dimIndexX = 1u; + const uint32_t threadIdxLocalY(alpaka::getIdx(acc)[dimIndexY]); + const uint32_t threadIdxLocalX(alpaka::getIdx(acc)[dimIndexX]); + + if (threadIdxLocalY == 0 && threadIdxLocalX == 0) { + innerLayerCumulativeSize[0] = layerSize(TrackerTraits::layerPairs[0]); + for (uint32_t i = 1; i < nPairs; ++i) { + innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i - 1] + layerSize(TrackerTraits::layerPairs[2 * i]); + } + ntot = innerLayerCumulativeSize[nPairs - 1]; + } + alpaka::syncBlockThreads(acc); + + // x runs faster + const uint32_t blockDimensionX(alpaka::getWorkDiv(acc)[dimIndexX]); + const auto& [firstElementIdxNoStrideX, endElementIdxNoStrideX] = + cms::alpakatools::element_index_range_in_block(acc, 0u, dimIndexX); + + uint32_t pairLayerId = 0; // cannot go backward + + // Outermost loop on Y + const uint32_t gridDimensionY(alpaka::getWorkDiv(acc)[dimIndexY]); + const auto& [firstElementIdxNoStrideY, endElementIdxNoStrideY] = + cms::alpakatools::element_index_range_in_grid(acc, 0u, dimIndexY); + uint32_t firstElementIdxY = firstElementIdxNoStrideY; + uint32_t endElementIdxY = endElementIdxNoStrideY; + + //const uint32_t incY = cms::alpakatools::requires_single_thread_per_block_v ? 1 : gridDimensionY; + for (uint32_t j = firstElementIdxY; j < ntot; j++) { + if (not cms::alpakatools::next_valid_element_index_strided( + j, firstElementIdxY, endElementIdxY, gridDimensionY, ntot)) + break; + + while (j >= innerLayerCumulativeSize[pairLayerId++]) + ; + --pairLayerId; // move to lower_bound ?? 
+ + ALPAKA_ASSERT_OFFLOAD(pairLayerId < nPairs); + ALPAKA_ASSERT_OFFLOAD(j < innerLayerCumulativeSize[pairLayerId]); + ALPAKA_ASSERT_OFFLOAD(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId - 1]); + + uint8_t inner = TrackerTraits::layerPairs[2 * pairLayerId]; + uint8_t outer = TrackerTraits::layerPairs[2 * pairLayerId + 1]; + ALPAKA_ASSERT_OFFLOAD(outer > inner); + + auto hoff = PhiBinner::histOff(outer); + auto i = (0 == pairLayerId) ? j : j - innerLayerCumulativeSize[pairLayerId - 1]; + i += offsets[inner]; + + ALPAKA_ASSERT_OFFLOAD(i >= offsets[inner]); + ALPAKA_ASSERT_OFFLOAD(i < offsets[inner + 1]); + + // found hit corresponding to our cuda thread, now do the job + if (hh[i].detectorIndex() > pixelClustering::maxNumModules) + continue; // invalid + + /* maybe clever, not effective when zoCut is on auto bpos = (mi%8)/4; // if barrel is 1 for z>0 auto fpos = (outer>3) & (outer<7); if ( ((inner<3) & (outer>3)) && bpos!=fpos) continue; */ - auto mez = hh[i].zGlobal(); + auto mez = hh[i].zGlobal(); - if (mez < TrackerTraits::minz[pairLayerId] || mez > TrackerTraits::maxz[pairLayerId]) - continue; + if (mez < TrackerTraits::minz[pairLayerId] || mez > TrackerTraits::maxz[pairLayerId]) + continue; - if (doClusterCut && outer > pixelTopology::last_barrel_layer && cuts.clusterCut(acc, hh, i)) - continue; + if (doClusterCut && outer > pixelTopology::last_barrel_layer && cuts.clusterCut(acc, hh, i)) + continue; - auto mep = hh[i].iphi(); - auto mer = hh[i].rGlobal(); + auto mep = hh[i].iphi(); + auto mer = hh[i].rGlobal(); - // all cuts: true if fails - auto ptcut = [&](int j, int16_t idphi) { - auto r2t4 = minRadius2T4; - auto ri = mer; - auto ro = hh[j].rGlobal(); - auto dphi = short2phi(idphi); - return dphi * dphi * (r2t4 - ri * ro) > (ro - ri) * (ro - ri); - }; - auto z0cutoff = [&](int j) { - auto zo = hh[j].zGlobal(); - auto ro = hh[j].rGlobal(); - auto dr = ro - mer; - return dr > TrackerTraits::maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - 
mer * zo)) > z0cut * dr; - }; + // all cuts: true if fails + auto ptcut = [&](int j, int16_t idphi) { + auto r2t4 = minRadius2T4; + auto ri = mer; + auto ro = hh[j].rGlobal(); + auto dphi = short2phi(idphi); + return dphi * dphi * (r2t4 - ri * ro) > (ro - ri) * (ro - ri); + }; + auto z0cutoff = [&](int j) { + auto zo = hh[j].zGlobal(); + auto ro = hh[j].rGlobal(); + auto dr = ro - mer; + return dr > TrackerTraits::maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr; + }; - auto iphicut = cuts.phiCuts[pairLayerId]; + auto iphicut = cuts.phiCuts[pairLayerId]; - auto kl = PhiBinner::bin(int16_t(mep - iphicut)); - auto kh = PhiBinner::bin(int16_t(mep + iphicut)); - auto incr = [](auto& k) { return k = (k + 1) % PhiBinner::nbins(); }; + auto kl = PhiBinner::bin(int16_t(mep - iphicut)); + auto kh = PhiBinner::bin(int16_t(mep + iphicut)); + auto incr = [](auto& k) { return k = (k + 1) % PhiBinner::nbins(); }; #ifdef GPU_DEBUG - int tot = 0; - int nmin = 0; - int tooMany = 0; + int tot = 0; + int nmin = 0; + int tooMany = 0; #endif - auto khh = kh; - incr(khh); - for (auto kk = kl; kk != khh; incr(kk)) { + auto khh = kh; + incr(khh); + for (auto kk = kl; kk != khh; incr(kk)) { #ifdef GPU_DEBUG - if (kk != kl && kk != kh) - nmin += phiBinner.size(kk + hoff); + if (kk != kl && kk != kh) + nmin += phiBinner.size(kk + hoff); #endif - auto const* __restrict__ p = phiBinner.begin(kk + hoff); - auto const* __restrict__ e = phiBinner.end(kk + hoff); - auto const maxpIndex = e - p; - - // Here we parallelize in X - uint32_t firstElementIdxX = firstElementIdxNoStrideX; - uint32_t endElementIdxX = endElementIdxNoStrideX; - - for (uint32_t pIndex = firstElementIdxX; pIndex < maxpIndex; ++pIndex) { - if (not cms::alpakatools::next_valid_element_index_strided( - pIndex, firstElementIdxX, endElementIdxX, blockDimensionX, maxpIndex)) - break; - auto oi = p[pIndex]; // auto oi = __ldg(p); is not allowed since __ldg is device-only - ALPAKA_ASSERT_OFFLOAD(oi >= 
offsets[outer]); - ALPAKA_ASSERT_OFFLOAD(oi < offsets[outer + 1]); - auto mo = hh[oi].detectorIndex(); - - if (mo > pixelClustering::maxNumModules) - continue; // invalid - - if (doZ0Cut && z0cutoff(oi)) - continue; - - auto mop = hh[oi].iphi(); - uint16_t idphi = std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop))); - - if (idphi > iphicut) - continue; - - if (doClusterCut && cuts.zSizeCut(acc, hh, i, oi)) - continue; - - if (doPtCut && ptcut(oi, idphi)) - continue; - - auto ind = alpaka::atomicAdd(acc, nCells, (uint32_t)1, alpaka::hierarchy::Blocks{}); - if (ind >= maxNumOfDoublets) { - alpaka::atomicSub(acc, nCells, (uint32_t)1, alpaka::hierarchy::Blocks{}); - break; - } // move to SimpleVector?? - cells[ind].init(*cellNeighbors, *cellTracks, hh, pairLayerId, i, oi); - isOuterHitOfCell[oi].push_back(acc, ind); + auto const* __restrict__ p = phiBinner.begin(kk + hoff); + auto const* __restrict__ e = phiBinner.end(kk + hoff); + auto const maxpIndex = e - p; + + // Here we parallelize in X + uint32_t firstElementIdxX = firstElementIdxNoStrideX; + uint32_t endElementIdxX = endElementIdxNoStrideX; + + for (uint32_t pIndex = firstElementIdxX; pIndex < maxpIndex; ++pIndex) { + if (not cms::alpakatools::next_valid_element_index_strided( + pIndex, firstElementIdxX, endElementIdxX, blockDimensionX, maxpIndex)) + break; + auto oi = p[pIndex]; // auto oi = __ldg(p); is not allowed since __ldg is device-only + ALPAKA_ASSERT_OFFLOAD(oi >= offsets[outer]); + ALPAKA_ASSERT_OFFLOAD(oi < offsets[outer + 1]); + auto mo = hh[oi].detectorIndex(); + + if (mo > pixelClustering::maxNumModules) + continue; // invalid + + if (doZ0Cut && z0cutoff(oi)) + continue; + + auto mop = hh[oi].iphi(); + uint16_t idphi = std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop))); + + if (idphi > iphicut) + continue; + + if (doClusterCut && cuts.zSizeCut(acc, hh, i, oi)) + continue; + + if (doPtCut && ptcut(oi, idphi)) + continue; + + auto ind = alpaka::atomicAdd(acc, 
nCells, (uint32_t)1, alpaka::hierarchy::Blocks{}); + if (ind >= maxNumOfDoublets) { + alpaka::atomicSub(acc, nCells, (uint32_t)1, alpaka::hierarchy::Blocks{}); + break; + } // move to SimpleVector?? + cells[ind].init(*cellNeighbors, *cellTracks, hh, pairLayerId, i, oi); + isOuterHitOfCell[oi].push_back(acc, ind); #ifdef GPU_DEBUG - if (isOuterHitOfCell[oi].full()) - ++tooMany; - ++tot; + if (isOuterHitOfCell[oi].full()) + ++tooMany; + ++tot; #endif - } } + } // #endif #ifdef GPU_DEBUG - if (tooMany > 0 or tot > 0) - printf("OuterHitOfCell for %d in layer %d/%d, %d,%d %d, %d %.3f %.3f %s\n", - i, - inner, - outer, - nmin, - tot, - tooMany, - iphicut, - TrackerTraits::minz[pairLayerId], - TrackerTraits::maxz[pairLayerId], - tooMany > 0 ? "FULL!!" : "not full."); + if (tooMany > 0 or tot > 0) + printf("OuterHitOfCell for %d in layer %d/%d, %d,%d %d, %d %.3f %.3f %s\n", + i, + inner, + outer, + nmin, + tot, + tooMany, + iphicut, + TrackerTraits::minz[pairLayerId], + TrackerTraits::maxz[pairLayerId], + tooMany > 0 ? "FULL!!" : "not full."); #endif - } // loop in block... - } // namespace caPixelDoublets - } // namespace caPixelDoublets -} // namespace ALPAKA_ACCELERATOR_NAMESPACE -#endif // RecoPixelVertexing_PixelTriplets_CAPixelDoubletsAlgos_h + } // loop in block... 
+ } + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::caPixelDoublets + +#endif // RecoTracker_PixelSeeding_plugins_alpaka_CAPixelDoubletsAlgos_h diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAStructures.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAStructures.h index 6ac7a90c724fc..fcc4fab8ead54 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAStructures.h +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAStructures.h @@ -1,5 +1,5 @@ -#ifndef RecoPixelVertexing_PixelTriplets_CAStructures_h -#define RecoPixelVertexing_PixelTriplets_CAStructures_h +#ifndef RecoTracker_PixelSeeding_plugins_alpaka_CAStructures_h +#define RecoTracker_PixelSeeding_plugins_alpaka_CAStructures_h #include "HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h" #include "HeterogeneousCore/AlpakaInterface/interface/VecArray.h" @@ -49,4 +49,4 @@ namespace caStructures { } // namespace caStructures -#endif +#endif // RecoTracker_PixelSeeding_plugins_alpaka_CAStructures_h diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/HelixFit.h b/RecoTracker/PixelSeeding/plugins/alpaka/HelixFit.h index 908124bb83081..f3e75e83106a7 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/HelixFit.h +++ b/RecoTracker/PixelSeeding/plugins/alpaka/HelixFit.h @@ -1,7 +1,10 @@ -#ifndef RecoPixelVertexing_PixelTriplets_HelixFit_h -#define RecoPixelVertexing_PixelTriplets_HelixFit_h +#ifndef RecoTracker_PixelSeeding_plugins_alpaka_HelixFit_h +#define RecoTracker_PixelSeeding_plugins_alpaka_HelixFit_h #include + +#include + #include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" #include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" #include "RecoTracker/PixelTrackFitting/interface/alpaka/FitResult.h" @@ -10,7 +13,9 @@ #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h" #include "CAStructures.h" + namespace riemannFit { + // TODO: Can this be taken from TrackerTraits or somewhere else? 
// in case of memory issue can be made smaller constexpr uint32_t maxNumberOfConcurrentFits = 32 * 1024; @@ -89,5 +94,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const bool fitNas4_; }; + } // namespace ALPAKA_ACCELERATOR_NAMESPACE -#endif // RecoPixelVertexing_PixelTriplets_plugins_HelixFit_h + +#endif // RecoTracker_PixelSeeding_plugins_alpaka_HelixFit_h diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/RiemannFit.dev.cc b/RecoTracker/PixelSeeding/plugins/alpaka/RiemannFit.dev.cc index 5aa202700580c..3a1d5dacd8435 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/RiemannFit.dev.cc +++ b/RecoTracker/PixelSeeding/plugins/alpaka/RiemannFit.dev.cc @@ -1,16 +1,15 @@ -// -// Author: Felice Pantaleo, CERN -// +#include #include -#include +#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "HeterogeneousCore/AlpakaInterface/interface/memory.h" #include "HeterogeneousCore/AlpakaInterface/interface/traits.h" -#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" -#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h" #include "RecoTracker/PixelTrackFitting/interface/alpaka/RiemannFit.h" + #include "HelixFit.h" #include "CAStructures.h" diff --git a/RecoTracker/PixelSeeding/test/alpaka/CAsizes_t.cpp b/RecoTracker/PixelSeeding/test/alpaka/CAsizes_t.cpp index 770957d9a79c0..9e164d1eb09e1 100644 --- a/RecoTracker/PixelSeeding/test/alpaka/CAsizes_t.cpp +++ b/RecoTracker/PixelSeeding/test/alpaka/CAsizes_t.cpp @@ -1,7 +1,9 @@ -#include "RecoTracker/PixelSeeding/plugins/alpaka/CACell.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include #include +#include + +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include 
"HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "RecoTracker/PixelSeeding/plugins/alpaka/CACell.h" using namespace ALPAKA_ACCELERATOR_NAMESPACE; @@ -13,6 +15,7 @@ void print() { int main() { using namespace pixelTopology; using namespace caStructures; + //for Phase-I print>(); print>(); @@ -21,11 +24,9 @@ int main() { print>(); print>(); print>(); - print>(); //for Phase-II - print>(); print>(); print>(); @@ -33,7 +34,6 @@ int main() { print>(); print>(); print>(); - print>(); return 0; diff --git a/RecoTracker/PixelTrackFitting/interface/BrokenLine.h b/RecoTracker/PixelTrackFitting/interface/BrokenLine.h index 872f02b290f85..fe856867ecfaf 100644 --- a/RecoTracker/PixelTrackFitting/interface/BrokenLine.h +++ b/RecoTracker/PixelTrackFitting/interface/BrokenLine.h @@ -1,6 +1,7 @@ #ifndef RecoTracker_PixelTrackFitting_BrokenLine_h #define RecoTracker_PixelTrackFitting_BrokenLine_h +#include #include #include "RecoTracker/PixelTrackFitting/interface/FitUtils.h" diff --git a/RecoTracker/PixelTrackFitting/interface/FitResult.h b/RecoTracker/PixelTrackFitting/interface/FitResult.h index 86941b00a28d7..e7d956b86e4d0 100644 --- a/RecoTracker/PixelTrackFitting/interface/FitResult.h +++ b/RecoTracker/PixelTrackFitting/interface/FitResult.h @@ -5,6 +5,7 @@ #include #include + #include #include diff --git a/RecoTracker/PixelTrackFitting/interface/RiemannFit.h b/RecoTracker/PixelTrackFitting/interface/RiemannFit.h index 3eef5db71dfea..611c52d05b68b 100644 --- a/RecoTracker/PixelTrackFitting/interface/RiemannFit.h +++ b/RecoTracker/PixelTrackFitting/interface/RiemannFit.h @@ -1,6 +1,9 @@ #ifndef RecoTracker_PixelTrackFitting_RiemannFit_h #define RecoTracker_PixelTrackFitting_RiemannFit_h +#include +#include + #include "RecoTracker/PixelTrackFitting/interface/FitUtils.h" namespace riemannFit { diff --git a/RecoTracker/PixelTrackFitting/interface/alpaka/BrokenLine.h b/RecoTracker/PixelTrackFitting/interface/alpaka/BrokenLine.h index 9e656e2de18dc..21b1ac1564ff9 
100644 --- a/RecoTracker/PixelTrackFitting/interface/alpaka/BrokenLine.h +++ b/RecoTracker/PixelTrackFitting/interface/alpaka/BrokenLine.h @@ -1,40 +1,43 @@ -#ifndef RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h -#define RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h +#ifndef RecoTracker_PixelTrackFitting_interface_alpaka_BrokenLine_h +#define RecoTracker_PixelTrackFitting_interface_alpaka_BrokenLine_h + #include -#include + +#include + #include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h" -namespace ALPAKA_ACCELERATOR_NAMESPACE { - namespace brokenline { - using namespace cms::alpakatools; - using namespace ::riemannFit; +namespace ALPAKA_ACCELERATOR_NAMESPACE::brokenline { + + using namespace cms::alpakatools; + using namespace ::riemannFit; - //!< Karimäki's parameters: (phi, d, k=1/R) - /*!< covariance matrix: \n + //!< Karimäki's parameters: (phi, d, k=1/R) + /*!< covariance matrix: \n |cov(phi,phi)|cov( d ,phi)|cov( k ,phi)| \n |cov(phi, d )|cov( d , d )|cov( k , d )| \n |cov(phi, k )|cov( d , k )|cov( k , k )| \n as defined in Karimäki V., 1990, Effective circle fitting for particle trajectories, Nucl. Instr. and Meth. A305 (1991) 187. */ - using karimaki_circle_fit = riemannFit::CircleFit; + using karimaki_circle_fit = riemannFit::CircleFit; - /*! + /*! \brief data needed for the Broken Line fit procedure. 
*/ - template - struct PreparedBrokenLineData { - int qCharge; //!< particle charge - riemannFit::Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin - riemannFit::VectorNd sTransverse; //!< total distance traveled in the transverse plane - // starting from the pre-fitted closest approach - riemannFit::VectorNd sTotal; //!< total distance traveled (three-dimensional) - riemannFit::VectorNd zInSZplane; //!< orthogonal coordinate to the pre-fitted line in the sz plane - riemannFit::VectorNd varBeta; //!< kink angles in the SZ plane - }; - - /*! + template + struct PreparedBrokenLineData { + int qCharge; //!< particle charge + riemannFit::Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin + riemannFit::VectorNd sTransverse; //!< total distance traveled in the transverse plane + // starting from the pre-fitted closest approach + riemannFit::VectorNd sTotal; //!< total distance traveled (three-dimensional) + riemannFit::VectorNd zInSZplane; //!< orthogonal coordinate to the pre-fitted line in the sz plane + riemannFit::VectorNd varBeta; //!< kink angles in the SZ plane + }; + + /*! \brief Computes the Coulomb multiple scattering variance of the planar angle. \param length length of the track in the material. @@ -53,42 +56,42 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { \return the variance of the planar angle ((theta_0)^2 /3). */ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE double multScatt( - const TAcc& acc, const double& length, const double bField, const double radius, int layer, double slope) { - // limit R to 20GeV... - auto pt2 = alpaka::math::min(acc, 20., bField * radius); - pt2 *= pt2; - constexpr double inv_X0 = 0.06 / 16.; //!< inverse of radiation length of the material in cm - //if(Layer==1) XXI_0=0.06/16.; - // else XXI_0=0.06/16.; - //XX_0*=1; - - //! 
number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned - constexpr double geometry_factor = 0.7; - constexpr double fact = geometry_factor * riemannFit::sqr(13.6 / 1000.); - return fact / (pt2 * (1. + riemannFit::sqr(slope))) * (alpaka::math::abs(acc, length) * inv_X0) * - riemannFit::sqr(1. + 0.038 * log(alpaka::math::abs(acc, length) * inv_X0)); - } - - /*! + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE double multScatt( + const TAcc& acc, const double& length, const double bField, const double radius, int layer, double slope) { + // limit R to 20GeV... + auto pt2 = alpaka::math::min(acc, 20., bField * radius); + pt2 *= pt2; + constexpr double inv_X0 = 0.06 / 16.; //!< inverse of radiation length of the material in cm + //if(Layer==1) XXI_0=0.06/16.; + // else XXI_0=0.06/16.; + //XX_0*=1; + + //! number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned + constexpr double geometry_factor = 0.7; + constexpr double fact = geometry_factor * riemannFit::sqr(13.6 / 1000.); + return fact / (pt2 * (1. + riemannFit::sqr(slope))) * (alpaka::math::abs(acc, length) * inv_X0) * + riemannFit::sqr(1. + 0.038 * log(alpaka::math::abs(acc, length) * inv_X0)); + } + + /*! \brief Computes the 2D rotation matrix that transforms the line y=slope*x into the line y=0. \param slope tangent of the angle of rotation. \return 2D rotation matrix. */ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE riemannFit::Matrix2d rotationMatrix(const TAcc& acc, double slope) { - riemannFit::Matrix2d rot; - rot(0, 0) = 1. / alpaka::math::sqrt(acc, 1. + riemannFit::sqr(slope)); - rot(0, 1) = slope * rot(0, 0); - rot(1, 0) = -rot(0, 1); - rot(1, 1) = rot(0, 0); - return rot; - } - - /*! + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE riemannFit::Matrix2d rotationMatrix(const TAcc& acc, double slope) { + riemannFit::Matrix2d rot; + rot(0, 0) = 1. / alpaka::math::sqrt(acc, 1. 
+ riemannFit::sqr(slope)); + rot(0, 1) = slope * rot(0, 0); + rot(1, 0) = -rot(0, 1); + rot(1, 1) = rot(0, 0); + return rot; + } + + /*! \brief Changes the Karimäki parameters (and consequently their covariance matrix) under a * translation of the coordinate system, such that the old origin has coordinates (x0,y0) * in the new coordinate system. The formulas are taken from Karimäki V., 1990, Effective @@ -99,50 +102,50 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { \param y0 y coordinate of the translation vector. \param jacobian passed by reference in order to save stack. */ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void translateKarimaki( - const TAcc& acc, karimaki_circle_fit& circle, double x0, double y0, riemannFit::Matrix3d& jacobian) { - // Avoid multiple access to the circle.par vector. - using scalar = typename std::remove_reference::type; - scalar phi = circle.par(0); - scalar dee = circle.par(1); - scalar rho = circle.par(2); - - // Avoid repeated trig. computations - scalar sinPhi = alpaka::math::sin(acc, phi); - scalar cosPhi = alpaka::math::cos(acc, phi); - - // Intermediate computations for the circle parameters - scalar deltaPara = x0 * cosPhi + y0 * sinPhi; - scalar deltaOrth = x0 * sinPhi - y0 * cosPhi + dee; - scalar tempSmallU = 1 + rho * dee; - scalar tempC = -rho * y0 + tempSmallU * cosPhi; - scalar tempB = rho * x0 + tempSmallU * sinPhi; - scalar tempA = 2. * deltaOrth + rho * (riemannFit::sqr(deltaOrth) + riemannFit::sqr(deltaPara)); - scalar tempU = alpaka::math::sqrt(acc, 1. + rho * tempA); - - // Intermediate computations for the error matrix transform - scalar xi = 1. / (riemannFit::sqr(tempB) + riemannFit::sqr(tempC)); - scalar tempV = 1. + rho * deltaOrth; - scalar lambda = (0.5 * tempA) / (riemannFit::sqr(1. + tempU) * tempU); - scalar mu = 1. / (tempU * (1. 
+ tempU)) + rho * lambda; - scalar zeta = riemannFit::sqr(deltaOrth) + riemannFit::sqr(deltaPara); - jacobian << xi * tempSmallU * tempV, -xi * riemannFit::sqr(rho) * deltaOrth, xi * deltaPara, - 2. * mu * tempSmallU * deltaPara, 2. * mu * tempV, mu * zeta - lambda * tempA, 0, 0, 1.; - - // translated circle parameters - // phi - circle.par(0) = alpaka::math::atan2(acc, tempB, tempC); - // d - circle.par(1) = tempA / (1 + tempU); - // rho after translation. It is invariant, so noop - // circle.par(2)= rho; - - // translated error matrix - circle.cov = jacobian * circle.cov * jacobian.transpose(); - } - - /*! + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void translateKarimaki( + const TAcc& acc, karimaki_circle_fit& circle, double x0, double y0, riemannFit::Matrix3d& jacobian) { + // Avoid multiple access to the circle.par vector. + using scalar = typename std::remove_reference::type; + scalar phi = circle.par(0); + scalar dee = circle.par(1); + scalar rho = circle.par(2); + + // Avoid repeated trig. computations + scalar sinPhi = alpaka::math::sin(acc, phi); + scalar cosPhi = alpaka::math::cos(acc, phi); + + // Intermediate computations for the circle parameters + scalar deltaPara = x0 * cosPhi + y0 * sinPhi; + scalar deltaOrth = x0 * sinPhi - y0 * cosPhi + dee; + scalar tempSmallU = 1 + rho * dee; + scalar tempC = -rho * y0 + tempSmallU * cosPhi; + scalar tempB = rho * x0 + tempSmallU * sinPhi; + scalar tempA = 2. * deltaOrth + rho * (riemannFit::sqr(deltaOrth) + riemannFit::sqr(deltaPara)); + scalar tempU = alpaka::math::sqrt(acc, 1. + rho * tempA); + + // Intermediate computations for the error matrix transform + scalar xi = 1. / (riemannFit::sqr(tempB) + riemannFit::sqr(tempC)); + scalar tempV = 1. + rho * deltaOrth; + scalar lambda = (0.5 * tempA) / (riemannFit::sqr(1. + tempU) * tempU); + scalar mu = 1. / (tempU * (1. 
+ tempU)) + rho * lambda; + scalar zeta = riemannFit::sqr(deltaOrth) + riemannFit::sqr(deltaPara); + jacobian << xi * tempSmallU * tempV, -xi * riemannFit::sqr(rho) * deltaOrth, xi * deltaPara, + 2. * mu * tempSmallU * deltaPara, 2. * mu * tempV, mu * zeta - lambda * tempA, 0, 0, 1.; + + // translated circle parameters + // phi + circle.par(0) = alpaka::math::atan2(acc, tempB, tempC); + // d + circle.par(1) = tempA / (1 + tempU); + // rho after translation. It is invariant, so noop + // circle.par(2)= rho; + + // translated error matrix + circle.cov = jacobian * circle.cov * jacobian.transpose(); + } + + /*! \brief Computes the data needed for the Broken Line fit procedure that are mainly common for the circle and the line fit. \param hits hits coordinates. @@ -150,65 +153,60 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { \param bField magnetic field in Gev/cm/c. \param results PreparedBrokenLineData to be filled (see description of PreparedBrokenLineData). */ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void __attribute__((always_inline)) - prepareBrokenLineData(const TAcc& acc, - const M3xN& hits, - const V4& fast_fit, - const double bField, - PreparedBrokenLineData& results) { - riemannFit::Vector2d dVec; - riemannFit::Vector2d eVec; - - int mId = 1; - - if constexpr (n > 3) { - riemannFit::Vector2d middle = 0.5 * (hits.block(0, n - 1, 2, 1) + hits.block(0, 0, 2, 1)); - auto d1 = (hits.block(0, n / 2, 2, 1) - middle).squaredNorm(); - auto d2 = (hits.block(0, n / 2 - 1, 2, 1) - middle).squaredNorm(); - mId = d1 < d2 ? 
n / 2 : n / 2 - 1; - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void __attribute__((always_inline)) prepareBrokenLineData( + const TAcc& acc, const M3xN& hits, const V4& fast_fit, const double bField, PreparedBrokenLineData& results) { + riemannFit::Vector2d dVec; + riemannFit::Vector2d eVec; + + int mId = 1; + + if constexpr (n > 3) { + riemannFit::Vector2d middle = 0.5 * (hits.block(0, n - 1, 2, 1) + hits.block(0, 0, 2, 1)); + auto d1 = (hits.block(0, n / 2, 2, 1) - middle).squaredNorm(); + auto d2 = (hits.block(0, n / 2 - 1, 2, 1) - middle).squaredNorm(); + mId = d1 < d2 ? n / 2 : n / 2 - 1; + } - dVec = hits.block(0, mId, 2, 1) - hits.block(0, 0, 2, 1); - eVec = hits.block(0, n - 1, 2, 1) - hits.block(0, mId, 2, 1); - results.qCharge = riemannFit::cross2D(acc, dVec, eVec) > 0 ? -1 : 1; + dVec = hits.block(0, mId, 2, 1) - hits.block(0, 0, 2, 1); + eVec = hits.block(0, n - 1, 2, 1) - hits.block(0, mId, 2, 1); + results.qCharge = riemannFit::cross2D(acc, dVec, eVec) > 0 ? -1 : 1; - const double slope = -results.qCharge / fast_fit(3); + const double slope = -results.qCharge / fast_fit(3); - riemannFit::Matrix2d rotMat = rotationMatrix(acc, slope); + riemannFit::Matrix2d rotMat = rotationMatrix(acc, slope); - // calculate radii and s - results.radii = hits.block(0, 0, 2, n) - fast_fit.head(2) * riemannFit::MatrixXd::Constant(1, n, 1); - eVec = -fast_fit(2) * fast_fit.head(2) / fast_fit.head(2).norm(); - for (u_int i = 0; i < n; i++) { - dVec = results.radii.block(0, i, 2, 1); - results.sTransverse(i) = - results.qCharge * fast_fit(2) * - alpaka::math::atan2( - acc, riemannFit::cross2D(acc, dVec, eVec), dVec.dot(eVec)); // calculates the arc length - } - riemannFit::VectorNd zVec = hits.block(2, 0, 1, n).transpose(); - - //calculate sTotal and zVec - riemannFit::Matrix2xNd pointsSZ = riemannFit::Matrix2xNd::Zero(); - for (u_int i = 0; i < n; i++) { - pointsSZ(0, i) = results.sTransverse(i); - pointsSZ(1, i) = zVec(i); - pointsSZ.block(0, i, 2, 1) = rotMat * 
pointsSZ.block(0, i, 2, 1); - } - results.sTotal = pointsSZ.block(0, 0, 1, n).transpose(); - results.zInSZplane = pointsSZ.block(1, 0, 1, n).transpose(); - - //calculate varBeta - results.varBeta(0) = results.varBeta(n - 1) = 0; - for (u_int i = 1; i < n - 1; i++) { - results.varBeta(i) = - multScatt(acc, results.sTotal(i + 1) - results.sTotal(i), bField, fast_fit(2), i + 2, slope) + - multScatt(acc, results.sTotal(i) - results.sTotal(i - 1), bField, fast_fit(2), i + 1, slope); - } + // calculate radii and s + results.radii = hits.block(0, 0, 2, n) - fast_fit.head(2) * riemannFit::MatrixXd::Constant(1, n, 1); + eVec = -fast_fit(2) * fast_fit.head(2) / fast_fit.head(2).norm(); + for (u_int i = 0; i < n; i++) { + dVec = results.radii.block(0, i, 2, 1); + results.sTransverse(i) = + results.qCharge * fast_fit(2) * + alpaka::math::atan2(acc, riemannFit::cross2D(acc, dVec, eVec), dVec.dot(eVec)); // calculates the arc length + } + riemannFit::VectorNd zVec = hits.block(2, 0, 1, n).transpose(); + + //calculate sTotal and zVec + riemannFit::Matrix2xNd pointsSZ = riemannFit::Matrix2xNd::Zero(); + for (u_int i = 0; i < n; i++) { + pointsSZ(0, i) = results.sTransverse(i); + pointsSZ(1, i) = zVec(i); + pointsSZ.block(0, i, 2, 1) = rotMat * pointsSZ.block(0, i, 2, 1); } + results.sTotal = pointsSZ.block(0, 0, 1, n).transpose(); + results.zInSZplane = pointsSZ.block(1, 0, 1, n).transpose(); + + //calculate varBeta + results.varBeta(0) = results.varBeta(n - 1) = 0; + for (u_int i = 1; i < n - 1; i++) { + results.varBeta(i) = + multScatt(acc, results.sTotal(i + 1) - results.sTotal(i), bField, fast_fit(2), i + 2, slope) + + multScatt(acc, results.sTotal(i) - results.sTotal(i - 1), bField, fast_fit(2), i + 1, slope); + } + } - /*! + /*! \brief Computes the n-by-n band matrix obtained minimizing the Broken Line's cost function w.r.t u. * This is the whole matrix in the case of the line fit and the main n-by-n block in the case * of the circle fit. 
@@ -220,41 +218,41 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { \return the n-by-n matrix of the linear system */ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE riemannFit::MatrixNd matrixC_u(const TAcc& acc, - const riemannFit::VectorNd& weights, - const riemannFit::VectorNd& sTotal, - const riemannFit::VectorNd& varBeta) { - riemannFit::MatrixNd c_uMat = riemannFit::MatrixNd::Zero(); - for (u_int i = 0; i < n; i++) { - c_uMat(i, i) = weights(i); - if (i > 1) - c_uMat(i, i) += 1. / (varBeta(i - 1) * riemannFit::sqr(sTotal(i) - sTotal(i - 1))); - if (i > 0 && i < n - 1) - c_uMat(i, i) += - (1. / varBeta(i)) * riemannFit::sqr((sTotal(i + 1) - sTotal(i - 1)) / - ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1)))); - if (i < n - 2) - c_uMat(i, i) += 1. / (varBeta(i + 1) * riemannFit::sqr(sTotal(i + 1) - sTotal(i))); - - if (i > 0 && i < n - 1) - c_uMat(i, i + 1) = - 1. / (varBeta(i) * (sTotal(i + 1) - sTotal(i))) * - (-(sTotal(i + 1) - sTotal(i - 1)) / ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1)))); - if (i < n - 2) - c_uMat(i, i + 1) += - 1. / (varBeta(i + 1) * (sTotal(i + 1) - sTotal(i))) * - (-(sTotal(i + 2) - sTotal(i)) / ((sTotal(i + 2) - sTotal(i + 1)) * (sTotal(i + 1) - sTotal(i)))); - - if (i < n - 2) - c_uMat(i, i + 2) = 1. / (varBeta(i + 1) * (sTotal(i + 2) - sTotal(i + 1)) * (sTotal(i + 1) - sTotal(i))); - - c_uMat(i, i) *= 0.5; - } - return c_uMat + c_uMat.transpose(); + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE riemannFit::MatrixNd matrixC_u(const TAcc& acc, + const riemannFit::VectorNd& weights, + const riemannFit::VectorNd& sTotal, + const riemannFit::VectorNd& varBeta) { + riemannFit::MatrixNd c_uMat = riemannFit::MatrixNd::Zero(); + for (u_int i = 0; i < n; i++) { + c_uMat(i, i) = weights(i); + if (i > 1) + c_uMat(i, i) += 1. / (varBeta(i - 1) * riemannFit::sqr(sTotal(i) - sTotal(i - 1))); + if (i > 0 && i < n - 1) + c_uMat(i, i) += + (1. 
/ varBeta(i)) * riemannFit::sqr((sTotal(i + 1) - sTotal(i - 1)) / + ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1)))); + if (i < n - 2) + c_uMat(i, i) += 1. / (varBeta(i + 1) * riemannFit::sqr(sTotal(i + 1) - sTotal(i))); + + if (i > 0 && i < n - 1) + c_uMat(i, i + 1) = + 1. / (varBeta(i) * (sTotal(i + 1) - sTotal(i))) * + (-(sTotal(i + 1) - sTotal(i - 1)) / ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1)))); + if (i < n - 2) + c_uMat(i, i + 1) += + 1. / (varBeta(i + 1) * (sTotal(i + 1) - sTotal(i))) * + (-(sTotal(i + 2) - sTotal(i)) / ((sTotal(i + 2) - sTotal(i + 1)) * (sTotal(i + 1) - sTotal(i)))); + + if (i < n - 2) + c_uMat(i, i + 2) = 1. / (varBeta(i + 1) * (sTotal(i + 2) - sTotal(i + 1)) * (sTotal(i + 1) - sTotal(i))); + + c_uMat(i, i) *= 0.5; } + return c_uMat + c_uMat.transpose(); + } - /*! + /*! \brief A very fast helix fit. \param hits the measured hits. @@ -264,40 +262,40 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { \warning sign of theta is (intentionally, for now) mistaken for negative charges. */ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void fastFit(const TAcc& acc, const M3xN& hits, V4& result) { - constexpr uint32_t n = M3xN::ColsAtCompileTime; + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fastFit(const TAcc& acc, const M3xN& hits, V4& result) { + constexpr uint32_t n = M3xN::ColsAtCompileTime; - int mId = 1; + int mId = 1; - if constexpr (n > 3) { - riemannFit::Vector2d middle = 0.5 * (hits.block(0, n - 1, 2, 1) + hits.block(0, 0, 2, 1)); - auto d1 = (hits.block(0, n / 2, 2, 1) - middle).squaredNorm(); - auto d2 = (hits.block(0, n / 2 - 1, 2, 1) - middle).squaredNorm(); - mId = d1 < d2 ? n / 2 : n / 2 - 1; - } + if constexpr (n > 3) { + riemannFit::Vector2d middle = 0.5 * (hits.block(0, n - 1, 2, 1) + hits.block(0, 0, 2, 1)); + auto d1 = (hits.block(0, n / 2, 2, 1) - middle).squaredNorm(); + auto d2 = (hits.block(0, n / 2 - 1, 2, 1) - middle).squaredNorm(); + mId = d1 < d2 ? 
n / 2 : n / 2 - 1; + } - const riemannFit::Vector2d a = hits.block(0, mId, 2, 1) - hits.block(0, 0, 2, 1); - const riemannFit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, mId, 2, 1); - const riemannFit::Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); + const riemannFit::Vector2d a = hits.block(0, mId, 2, 1) - hits.block(0, 0, 2, 1); + const riemannFit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, mId, 2, 1); + const riemannFit::Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); - auto tmp = 0.5 / riemannFit::cross2D(acc, c, a); - result(0) = hits(0, 0) - (a(1) * c.squaredNorm() + c(1) * a.squaredNorm()) * tmp; - result(1) = hits(1, 0) + (a(0) * c.squaredNorm() + c(0) * a.squaredNorm()) * tmp; - // check Wikipedia for these formulas + auto tmp = 0.5 / riemannFit::cross2D(acc, c, a); + result(0) = hits(0, 0) - (a(1) * c.squaredNorm() + c(1) * a.squaredNorm()) * tmp; + result(1) = hits(1, 0) + (a(0) * c.squaredNorm() + c(0) * a.squaredNorm()) * tmp; + // check Wikipedia for these formulas - result(2) = alpaka::math::sqrt(acc, a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / - (2. * alpaka::math::abs(acc, riemannFit::cross2D(acc, b, a))); - // Using Math Olympiad's formula R=abc/(4A) + result(2) = alpaka::math::sqrt(acc, a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / + (2. 
* alpaka::math::abs(acc, riemannFit::cross2D(acc, b, a))); + // Using Math Olympiad's formula R=abc/(4A) - const riemannFit::Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); - const riemannFit::Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); + const riemannFit::Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); + const riemannFit::Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); - result(3) = result(2) * atan2(riemannFit::cross2D(acc, d, e), d.dot(e)) / (hits(2, n - 1) - hits(2, 0)); - // ds/dz slope between last and first point - } + result(3) = result(2) * atan2(riemannFit::cross2D(acc, d, e), d.dot(e)) / (hits(2, n - 1) - hits(2, 0)); + // ds/dz slope between last and first point + } - /*! + /*! \brief Performs the Broken Line fit in the curved track case (that is, the fit * parameters are the interceptions u and the curvature correction \Delta\kappa). @@ -321,134 +319,134 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { * in which the first hit is the origin) and then the parameters and their * covariance matrix are transformed to the original coordinate system. */ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void circleFit(const TAcc& acc, - const M3xN& hits, - const M6xN& hits_ge, - const V4& fast_fit, - const double bField, - PreparedBrokenLineData& data, - karimaki_circle_fit& circle_results) { - circle_results.qCharge = data.qCharge; - auto& radii = data.radii; - const auto& sTransverse = data.sTransverse; - const auto& sTotal = data.sTotal; - auto& zInSZplane = data.zInSZplane; - auto& varBeta = data.varBeta; - const double slope = -circle_results.qCharge / fast_fit(3); - varBeta *= 1. + riemannFit::sqr(slope); // the kink angles are projected! 
- - for (u_int i = 0; i < n; i++) { - zInSZplane(i) = radii.block(0, i, 2, 1).norm() - fast_fit(2); - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void circleFit(const TAcc& acc, + const M3xN& hits, + const M6xN& hits_ge, + const V4& fast_fit, + const double bField, + PreparedBrokenLineData& data, + karimaki_circle_fit& circle_results) { + circle_results.qCharge = data.qCharge; + auto& radii = data.radii; + const auto& sTransverse = data.sTransverse; + const auto& sTotal = data.sTotal; + auto& zInSZplane = data.zInSZplane; + auto& varBeta = data.varBeta; + const double slope = -circle_results.qCharge / fast_fit(3); + varBeta *= 1. + riemannFit::sqr(slope); // the kink angles are projected! + + for (u_int i = 0; i < n; i++) { + zInSZplane(i) = radii.block(0, i, 2, 1).norm() - fast_fit(2); + } - riemannFit::Matrix2d vMat; // covariance matrix - riemannFit::VectorNd weightsVec; // weights - riemannFit::Matrix2d rotMat; // rotation matrix point by point - for (u_int i = 0; i < n; i++) { - vMat(0, 0) = hits_ge.col(i)[0]; // x errors - vMat(0, 1) = vMat(1, 0) = hits_ge.col(i)[1]; // cov_xy - vMat(1, 1) = hits_ge.col(i)[2]; // y errors - rotMat = rotationMatrix(acc, -radii(0, i) / radii(1, i)); - weightsVec(i) = - 1. / ((rotMat * vMat * rotMat.transpose())(1, 1)); // compute the orthogonal weight point by point - } + riemannFit::Matrix2d vMat; // covariance matrix + riemannFit::VectorNd weightsVec; // weights + riemannFit::Matrix2d rotMat; // rotation matrix point by point + for (u_int i = 0; i < n; i++) { + vMat(0, 0) = hits_ge.col(i)[0]; // x errors + vMat(0, 1) = vMat(1, 0) = hits_ge.col(i)[1]; // cov_xy + vMat(1, 1) = hits_ge.col(i)[2]; // y errors + rotMat = rotationMatrix(acc, -radii(0, i) / radii(1, i)); + weightsVec(i) = + 1. 
/ ((rotMat * vMat * rotMat.transpose())(1, 1)); // compute the orthogonal weight point by point + } - riemannFit::VectorNplusONEd r_uVec; - r_uVec(n) = 0; - for (u_int i = 0; i < n; i++) { - r_uVec(i) = weightsVec(i) * zInSZplane(i); - } + riemannFit::VectorNplusONEd r_uVec; + r_uVec(n) = 0; + for (u_int i = 0; i < n; i++) { + r_uVec(i) = weightsVec(i) * zInSZplane(i); + } - riemannFit::MatrixNplusONEd c_uMat; - c_uMat.block(0, 0, n, n) = matrixC_u(acc, weightsVec, sTransverse, varBeta); - c_uMat(n, n) = 0; - //add the border to the c_uMat matrix - for (u_int i = 0; i < n; i++) { - c_uMat(i, n) = 0; - if (i > 0 && i < n - 1) { - c_uMat(i, n) += - -(sTransverse(i + 1) - sTransverse(i - 1)) * (sTransverse(i + 1) - sTransverse(i - 1)) / - (2. * varBeta(i) * (sTransverse(i + 1) - sTransverse(i)) * (sTransverse(i) - sTransverse(i - 1))); - } - if (i > 1) { - c_uMat(i, n) += - (sTransverse(i) - sTransverse(i - 2)) / (2. * varBeta(i - 1) * (sTransverse(i) - sTransverse(i - 1))); - } - if (i < n - 2) { - c_uMat(i, n) += - (sTransverse(i + 2) - sTransverse(i)) / (2. * varBeta(i + 1) * (sTransverse(i + 1) - sTransverse(i))); - } - c_uMat(n, i) = c_uMat(i, n); - if (i > 0 && i < n - 1) - c_uMat(n, n) += riemannFit::sqr(sTransverse(i + 1) - sTransverse(i - 1)) / (4. * varBeta(i)); + riemannFit::MatrixNplusONEd c_uMat; + c_uMat.block(0, 0, n, n) = matrixC_u(acc, weightsVec, sTransverse, varBeta); + c_uMat(n, n) = 0; + //add the border to the c_uMat matrix + for (u_int i = 0; i < n; i++) { + c_uMat(i, n) = 0; + if (i > 0 && i < n - 1) { + c_uMat(i, n) += + -(sTransverse(i + 1) - sTransverse(i - 1)) * (sTransverse(i + 1) - sTransverse(i - 1)) / + (2. * varBeta(i) * (sTransverse(i + 1) - sTransverse(i)) * (sTransverse(i) - sTransverse(i - 1))); + } + if (i > 1) { + c_uMat(i, n) += + (sTransverse(i) - sTransverse(i - 2)) / (2. * varBeta(i - 1) * (sTransverse(i) - sTransverse(i - 1))); + } + if (i < n - 2) { + c_uMat(i, n) += + (sTransverse(i + 2) - sTransverse(i)) / (2. 
* varBeta(i + 1) * (sTransverse(i + 1) - sTransverse(i))); } + c_uMat(n, i) = c_uMat(i, n); + if (i > 0 && i < n - 1) + c_uMat(n, n) += riemannFit::sqr(sTransverse(i + 1) - sTransverse(i - 1)) / (4. * varBeta(i)); + } #ifdef CPP_DUMP - std::cout << "CU5\n" << c_uMat << std::endl; + std::cout << "CU5\n" << c_uMat << std::endl; #endif - riemannFit::MatrixNplusONEd iMat; - math::cholesky::invert(c_uMat, iMat); + riemannFit::MatrixNplusONEd iMat; + math::cholesky::invert(c_uMat, iMat); #ifdef CPP_DUMP - std::cout << "I5\n" << iMat << std::endl; + std::cout << "I5\n" << iMat << std::endl; #endif - riemannFit::VectorNplusONEd uVec = iMat * r_uVec; // obtain the fitted parameters by solving the linear system + riemannFit::VectorNplusONEd uVec = iMat * r_uVec; // obtain the fitted parameters by solving the linear system - // compute (phi, d_ca, k) in the system in which the midpoint of the first two corrected hits is the origin... + // compute (phi, d_ca, k) in the system in which the midpoint of the first two corrected hits is the origin... - radii.block(0, 0, 2, 1) /= radii.block(0, 0, 2, 1).norm(); - radii.block(0, 1, 2, 1) /= radii.block(0, 1, 2, 1).norm(); + radii.block(0, 0, 2, 1) /= radii.block(0, 0, 2, 1).norm(); + radii.block(0, 1, 2, 1) /= radii.block(0, 1, 2, 1).norm(); - riemannFit::Vector2d dVec = hits.block(0, 0, 2, 1) + (-zInSZplane(0) + uVec(0)) * radii.block(0, 0, 2, 1); - riemannFit::Vector2d eVec = hits.block(0, 1, 2, 1) + (-zInSZplane(1) + uVec(1)) * radii.block(0, 1, 2, 1); - auto eMinusd = eVec - dVec; - auto eMinusd2 = eMinusd.squaredNorm(); - auto tmp1 = 1. 
/ eMinusd2; - auto tmp2 = alpaka::math::sqrt(acc, riemannFit::sqr(fast_fit(2)) - 0.25 * eMinusd2); + riemannFit::Vector2d dVec = hits.block(0, 0, 2, 1) + (-zInSZplane(0) + uVec(0)) * radii.block(0, 0, 2, 1); + riemannFit::Vector2d eVec = hits.block(0, 1, 2, 1) + (-zInSZplane(1) + uVec(1)) * radii.block(0, 1, 2, 1); + auto eMinusd = eVec - dVec; + auto eMinusd2 = eMinusd.squaredNorm(); + auto tmp1 = 1. / eMinusd2; + auto tmp2 = alpaka::math::sqrt(acc, riemannFit::sqr(fast_fit(2)) - 0.25 * eMinusd2); - circle_results.par << atan2(eMinusd(1), eMinusd(0)), circle_results.qCharge * (tmp2 - fast_fit(2)), - circle_results.qCharge * (1. / fast_fit(2) + uVec(n)); + circle_results.par << atan2(eMinusd(1), eMinusd(0)), circle_results.qCharge * (tmp2 - fast_fit(2)), + circle_results.qCharge * (1. / fast_fit(2) + uVec(n)); - tmp2 = 1. / tmp2; + tmp2 = 1. / tmp2; - riemannFit::Matrix3d jacobian; - jacobian << (radii(1, 0) * eMinusd(0) - eMinusd(1) * radii(0, 0)) * tmp1, - (radii(1, 1) * eMinusd(0) - eMinusd(1) * radii(0, 1)) * tmp1, 0, - circle_results.qCharge * (eMinusd(0) * radii(0, 0) + eMinusd(1) * radii(1, 0)) * tmp2, - circle_results.qCharge * (eMinusd(0) * radii(0, 1) + eMinusd(1) * radii(1, 1)) * tmp2, 0, 0, 0, - circle_results.qCharge; - - circle_results.cov << iMat(0, 0), iMat(0, 1), iMat(0, n), iMat(1, 0), iMat(1, 1), iMat(1, n), iMat(n, 0), - iMat(n, 1), iMat(n, n); - - circle_results.cov = jacobian * circle_results.cov * jacobian.transpose(); - - //...Translate in the system in which the first corrected hit is the origin, adding the m.s. correction... 
- - translateKarimaki(acc, circle_results, 0.5 * eMinusd(0), 0.5 * eMinusd(1), jacobian); - circle_results.cov(0, 0) += - (1 + riemannFit::sqr(slope)) * multScatt(acc, sTotal(1) - sTotal(0), bField, fast_fit(2), 2, slope); - - //...And translate back to the original system - - translateKarimaki(acc, circle_results, dVec(0), dVec(1), jacobian); - - // compute chi2 - circle_results.chi2 = 0; - for (u_int i = 0; i < n; i++) { - circle_results.chi2 += weightsVec(i) * riemannFit::sqr(zInSZplane(i) - uVec(i)); - if (i > 0 && i < n - 1) - circle_results.chi2 += - riemannFit::sqr(uVec(i - 1) / (sTransverse(i) - sTransverse(i - 1)) - - uVec(i) * (sTransverse(i + 1) - sTransverse(i - 1)) / - ((sTransverse(i + 1) - sTransverse(i)) * (sTransverse(i) - sTransverse(i - 1))) + - uVec(i + 1) / (sTransverse(i + 1) - sTransverse(i)) + - (sTransverse(i + 1) - sTransverse(i - 1)) * uVec(n) / 2) / - varBeta(i); - } + riemannFit::Matrix3d jacobian; + jacobian << (radii(1, 0) * eMinusd(0) - eMinusd(1) * radii(0, 0)) * tmp1, + (radii(1, 1) * eMinusd(0) - eMinusd(1) * radii(0, 1)) * tmp1, 0, + circle_results.qCharge * (eMinusd(0) * radii(0, 0) + eMinusd(1) * radii(1, 0)) * tmp2, + circle_results.qCharge * (eMinusd(0) * radii(0, 1) + eMinusd(1) * radii(1, 1)) * tmp2, 0, 0, 0, + circle_results.qCharge; + + circle_results.cov << iMat(0, 0), iMat(0, 1), iMat(0, n), iMat(1, 0), iMat(1, 1), iMat(1, n), iMat(n, 0), + iMat(n, 1), iMat(n, n); + + circle_results.cov = jacobian * circle_results.cov * jacobian.transpose(); + + //...Translate in the system in which the first corrected hit is the origin, adding the m.s. correction... 
+ + translateKarimaki(acc, circle_results, 0.5 * eMinusd(0), 0.5 * eMinusd(1), jacobian); + circle_results.cov(0, 0) += + (1 + riemannFit::sqr(slope)) * multScatt(acc, sTotal(1) - sTotal(0), bField, fast_fit(2), 2, slope); + + //...And translate back to the original system + + translateKarimaki(acc, circle_results, dVec(0), dVec(1), jacobian); + + // compute chi2 + circle_results.chi2 = 0; + for (u_int i = 0; i < n; i++) { + circle_results.chi2 += weightsVec(i) * riemannFit::sqr(zInSZplane(i) - uVec(i)); + if (i > 0 && i < n - 1) + circle_results.chi2 += + riemannFit::sqr(uVec(i - 1) / (sTransverse(i) - sTransverse(i - 1)) - + uVec(i) * (sTransverse(i + 1) - sTransverse(i - 1)) / + ((sTransverse(i + 1) - sTransverse(i)) * (sTransverse(i) - sTransverse(i - 1))) + + uVec(i + 1) / (sTransverse(i + 1) - sTransverse(i)) + + (sTransverse(i + 1) - sTransverse(i - 1)) * uVec(n) / 2) / + varBeta(i); } + } - /*! + /*! \brief Performs the Broken Line fit in the straight track case (that is, the fit parameters are only the interceptions u). \param hits hits coordinates. @@ -470,95 +468,95 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { * in which the first hit is the origin) and then the parameters and their covariance * matrix are transformed to the original coordinate system. 
*/ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void lineFit(const TAcc& acc, - const M6xN& hits_ge, - const V4& fast_fit, - const double bField, - const PreparedBrokenLineData& data, - riemannFit::LineFit& line_results) { - const auto& radii = data.radii; - const auto& sTotal = data.sTotal; - const auto& zInSZplane = data.zInSZplane; - const auto& varBeta = data.varBeta; - - const double slope = -data.qCharge / fast_fit(3); - riemannFit::Matrix2d rotMat = rotationMatrix(acc, slope); - - riemannFit::Matrix3d vMat = riemannFit::Matrix3d::Zero(); // covariance matrix XYZ - riemannFit::Matrix2x3d jacobXYZtosZ = - riemannFit::Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) - riemannFit::VectorNd weights = riemannFit::VectorNd::Zero(); - for (u_int i = 0; i < n; i++) { - vMat(0, 0) = hits_ge.col(i)[0]; // x errors - vMat(0, 1) = vMat(1, 0) = hits_ge.col(i)[1]; // cov_xy - vMat(0, 2) = vMat(2, 0) = hits_ge.col(i)[3]; // cov_xz - vMat(1, 1) = hits_ge.col(i)[2]; // y errors - vMat(2, 1) = vMat(1, 2) = hits_ge.col(i)[4]; // cov_yz - vMat(2, 2) = hits_ge.col(i)[5]; // z errors - auto tmp = 1. / radii.block(0, i, 2, 1).norm(); - jacobXYZtosZ(0, 0) = radii(1, i) * tmp; - jacobXYZtosZ(0, 1) = -radii(0, i) * tmp; - jacobXYZtosZ(1, 2) = 1.; - weights(i) = 1. 
/ ((rotMat * jacobXYZtosZ * vMat * jacobXYZtosZ.transpose() * rotMat.transpose())( - 1, 1)); // compute the orthogonal weight point by point - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void lineFit(const TAcc& acc, + const M6xN& hits_ge, + const V4& fast_fit, + const double bField, + const PreparedBrokenLineData& data, + riemannFit::LineFit& line_results) { + const auto& radii = data.radii; + const auto& sTotal = data.sTotal; + const auto& zInSZplane = data.zInSZplane; + const auto& varBeta = data.varBeta; + + const double slope = -data.qCharge / fast_fit(3); + riemannFit::Matrix2d rotMat = rotationMatrix(acc, slope); + + riemannFit::Matrix3d vMat = riemannFit::Matrix3d::Zero(); // covariance matrix XYZ + riemannFit::Matrix2x3d jacobXYZtosZ = + riemannFit::Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) + riemannFit::VectorNd weights = riemannFit::VectorNd::Zero(); + for (u_int i = 0; i < n; i++) { + vMat(0, 0) = hits_ge.col(i)[0]; // x errors + vMat(0, 1) = vMat(1, 0) = hits_ge.col(i)[1]; // cov_xy + vMat(0, 2) = vMat(2, 0) = hits_ge.col(i)[3]; // cov_xz + vMat(1, 1) = hits_ge.col(i)[2]; // y errors + vMat(2, 1) = vMat(1, 2) = hits_ge.col(i)[4]; // cov_yz + vMat(2, 2) = hits_ge.col(i)[5]; // z errors + auto tmp = 1. / radii.block(0, i, 2, 1).norm(); + jacobXYZtosZ(0, 0) = radii(1, i) * tmp; + jacobXYZtosZ(0, 1) = -radii(0, i) * tmp; + jacobXYZtosZ(1, 2) = 1.; + weights(i) = 1. 
/ ((rotMat * jacobXYZtosZ * vMat * jacobXYZtosZ.transpose() * rotMat.transpose())( + 1, 1)); // compute the orthogonal weight point by point + } - riemannFit::VectorNd r_u; - for (u_int i = 0; i < n; i++) { - r_u(i) = weights(i) * zInSZplane(i); - } + riemannFit::VectorNd r_u; + for (u_int i = 0; i < n; i++) { + r_u(i) = weights(i) * zInSZplane(i); + } #ifdef CPP_DUMP - std::cout << "CU4\n" << matrixC_u(w, sTotal, varBeta) << std::endl; + std::cout << "CU4\n" << matrixC_u(w, sTotal, varBeta) << std::endl; #endif - riemannFit::MatrixNd iMat; - math::cholesky::invert(matrixC_u(acc, weights, sTotal, varBeta), iMat); + riemannFit::MatrixNd iMat; + math::cholesky::invert(matrixC_u(acc, weights, sTotal, varBeta), iMat); #ifdef CPP_DUMP - std::cout << "I4\n" << iMat << std::endl; + std::cout << "I4\n" << iMat << std::endl; #endif - riemannFit::VectorNd uVec = iMat * r_u; // obtain the fitted parameters by solving the linear system - - // line parameters in the system in which the first hit is the origin and with axis along SZ - line_results.par << (uVec(1) - uVec(0)) / (sTotal(1) - sTotal(0)), uVec(0); - auto idiff = 1. / (sTotal(1) - sTotal(0)); - line_results.cov << (iMat(0, 0) - 2 * iMat(0, 1) + iMat(1, 1)) * riemannFit::sqr(idiff) + - multScatt(acc, sTotal(1) - sTotal(0), bField, fast_fit(2), 2, slope), - (iMat(0, 1) - iMat(0, 0)) * idiff, (iMat(0, 1) - iMat(0, 0)) * idiff, iMat(0, 0); - - // translate to the original SZ system - riemannFit::Matrix2d jacobian; - jacobian(0, 0) = 1.; - jacobian(0, 1) = 0; - jacobian(1, 0) = -sTotal(0); - jacobian(1, 1) = 1.; - line_results.par(1) += -line_results.par(0) * sTotal(0); - line_results.cov = jacobian * line_results.cov * jacobian.transpose(); - - // rotate to the original sz system - auto tmp = rotMat(0, 0) - line_results.par(0) * rotMat(0, 1); - jacobian(1, 1) = 1. 
/ tmp; - jacobian(0, 0) = jacobian(1, 1) * jacobian(1, 1); - jacobian(0, 1) = 0; - jacobian(1, 0) = line_results.par(1) * rotMat(0, 1) * jacobian(0, 0); - line_results.par(1) = line_results.par(1) * jacobian(1, 1); - line_results.par(0) = (rotMat(0, 1) + line_results.par(0) * rotMat(0, 0)) * jacobian(1, 1); - line_results.cov = jacobian * line_results.cov * jacobian.transpose(); - - // compute chi2 - line_results.chi2 = 0; - for (u_int i = 0; i < n; i++) { - line_results.chi2 += weights(i) * riemannFit::sqr(zInSZplane(i) - uVec(i)); - if (i > 0 && i < n - 1) - line_results.chi2 += riemannFit::sqr(uVec(i - 1) / (sTotal(i) - sTotal(i - 1)) - - uVec(i) * (sTotal(i + 1) - sTotal(i - 1)) / - ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1))) + - uVec(i + 1) / (sTotal(i + 1) - sTotal(i))) / - varBeta(i); - } + riemannFit::VectorNd uVec = iMat * r_u; // obtain the fitted parameters by solving the linear system + + // line parameters in the system in which the first hit is the origin and with axis along SZ + line_results.par << (uVec(1) - uVec(0)) / (sTotal(1) - sTotal(0)), uVec(0); + auto idiff = 1. / (sTotal(1) - sTotal(0)); + line_results.cov << (iMat(0, 0) - 2 * iMat(0, 1) + iMat(1, 1)) * riemannFit::sqr(idiff) + + multScatt(acc, sTotal(1) - sTotal(0), bField, fast_fit(2), 2, slope), + (iMat(0, 1) - iMat(0, 0)) * idiff, (iMat(0, 1) - iMat(0, 0)) * idiff, iMat(0, 0); + + // translate to the original SZ system + riemannFit::Matrix2d jacobian; + jacobian(0, 0) = 1.; + jacobian(0, 1) = 0; + jacobian(1, 0) = -sTotal(0); + jacobian(1, 1) = 1.; + line_results.par(1) += -line_results.par(0) * sTotal(0); + line_results.cov = jacobian * line_results.cov * jacobian.transpose(); + + // rotate to the original sz system + auto tmp = rotMat(0, 0) - line_results.par(0) * rotMat(0, 1); + jacobian(1, 1) = 1. 
/ tmp; + jacobian(0, 0) = jacobian(1, 1) * jacobian(1, 1); + jacobian(0, 1) = 0; + jacobian(1, 0) = line_results.par(1) * rotMat(0, 1) * jacobian(0, 0); + line_results.par(1) = line_results.par(1) * jacobian(1, 1); + line_results.par(0) = (rotMat(0, 1) + line_results.par(0) * rotMat(0, 0)) * jacobian(1, 1); + line_results.cov = jacobian * line_results.cov * jacobian.transpose(); + + // compute chi2 + line_results.chi2 = 0; + for (u_int i = 0; i < n; i++) { + line_results.chi2 += weights(i) * riemannFit::sqr(zInSZplane(i) - uVec(i)); + if (i > 0 && i < n - 1) + line_results.chi2 += riemannFit::sqr(uVec(i - 1) / (sTotal(i) - sTotal(i - 1)) - + uVec(i) * (sTotal(i + 1) - sTotal(i - 1)) / + ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1))) + + uVec(i + 1) / (sTotal(i + 1) - sTotal(i))) / + varBeta(i); } + } - /*! + /*! \brief Helix fit by three step: -fast pre-fit (see Fast_fit() for further info); \n -circle fit of the hits projected in the transverse plane by Broken Line algorithm (see BL_Circle_fit() for further info); \n @@ -593,42 +591,43 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { \return (phi,Tip,p_t,cot(theta)),Zip), their covariance matrix and the chi2's of the circle and line fits. 
*/ - template - class helixFit { - public: - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const TAcc& acc, - const riemannFit::Matrix3xNd* hits, - const Eigen::Matrix* hits_ge, - const double bField, - riemannFit::HelixFit* helix) const { - riemannFit::Vector4d fast_fit; - fastFit(acc, *hits, fast_fit); - - PreparedBrokenLineData data; - karimaki_circle_fit circle; - riemannFit::LineFit line; - riemannFit::Matrix3d jacobian; - - prepareBrokenLineData(acc, *hits, fast_fit, bField, data); - lineFit(acc, *hits_ge, fast_fit, bField, data, line); - circleFit(acc, *hits, *hits_ge, fast_fit, bField, data, circle); - - // the circle fit gives k, but here we want p_t, so let's change the parameter and the covariance matrix - jacobian << 1., 0, 0, 0, 1., 0, 0, 0, - -alpaka::math::abs(acc, circle.par(2)) * bField / (riemannFit::sqr(circle.par(2)) * circle.par(2)); - circle.par(2) = bField / alpaka::math::abs(acc, circle.par(2)); - circle.cov = jacobian * circle.cov * jacobian.transpose(); - - helix->par << circle.par, line.par; - helix->cov = riemannFit::MatrixXd::Zero(5, 5); - helix->cov.block(0, 0, 3, 3) = circle.cov; - helix->cov.block(3, 3, 2, 2) = line.cov; - helix->qCharge = circle.qCharge; - helix->chi2_circle = circle.chi2; - helix->chi2_line = line.chi2; - } - }; - } // namespace brokenline -} // namespace ALPAKA_ACCELERATOR_NAMESPACE -#endif // RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h + template + class helixFit { + public: + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const TAcc& acc, + const riemannFit::Matrix3xNd* hits, + const Eigen::Matrix* hits_ge, + const double bField, + riemannFit::HelixFit* helix) const { + riemannFit::Vector4d fast_fit; + fastFit(acc, *hits, fast_fit); + + PreparedBrokenLineData data; + karimaki_circle_fit circle; + riemannFit::LineFit line; + riemannFit::Matrix3d jacobian; + + prepareBrokenLineData(acc, *hits, fast_fit, bField, data); + lineFit(acc, *hits_ge, fast_fit, bField, data, line); + 
circleFit(acc, *hits, *hits_ge, fast_fit, bField, data, circle); + + // the circle fit gives k, but here we want p_t, so let's change the parameter and the covariance matrix + jacobian << 1., 0, 0, 0, 1., 0, 0, 0, + -alpaka::math::abs(acc, circle.par(2)) * bField / (riemannFit::sqr(circle.par(2)) * circle.par(2)); + circle.par(2) = bField / alpaka::math::abs(acc, circle.par(2)); + circle.cov = jacobian * circle.cov * jacobian.transpose(); + + helix->par << circle.par, line.par; + helix->cov = riemannFit::MatrixXd::Zero(5, 5); + helix->cov.block(0, 0, 3, 3) = circle.cov; + helix->cov.block(3, 3, 2, 2) = line.cov; + helix->qCharge = circle.qCharge; + helix->chi2_circle = circle.chi2; + helix->chi2_line = line.chi2; + } + }; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::brokenline + +#endif // RecoTracker_PixelTrackFitting_interface_alpaka_BrokenLine_h diff --git a/RecoTracker/PixelTrackFitting/interface/alpaka/FitResult.h b/RecoTracker/PixelTrackFitting/interface/alpaka/FitResult.h index 3daf271a5ca13..4feb8e38d50c7 100644 --- a/RecoTracker/PixelTrackFitting/interface/alpaka/FitResult.h +++ b/RecoTracker/PixelTrackFitting/interface/alpaka/FitResult.h @@ -1,11 +1,9 @@ -#ifndef RecoPixelVertexing_PixelTrackFitting_interface_FitResult_h -#define RecoPixelVertexing_PixelTrackFitting_interface_FitResult_h +#ifndef RecoTracker_PixelTrackFitting_interface_alpaka_FitResult_h +#define RecoTracker_PixelTrackFitting_interface_alpaka_FitResult_h -#include #include #include -#include namespace riemannFit { @@ -61,4 +59,5 @@ namespace riemannFit { }; // __attribute__((aligned(16))); } // namespace riemannFit -#endif + +#endif // RecoTracker_PixelTrackFitting_interface_alpaka_FitResult_h diff --git a/RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h b/RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h index 5dfa609ad3905..98922e47b6702 100644 --- a/RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h +++ 
b/RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h @@ -1,8 +1,14 @@ -#ifndef RecoPixelVertexing_PixelTrackFitting_alpaka_FitUtils_h -#define RecoPixelVertexing_PixelTrackFitting_alpaka_FitUtils_h +#ifndef RecoTracker_PixelTrackFitting_interface_alpaka_FitUtils_h +#define RecoTracker_PixelTrackFitting_interface_alpaka_FitUtils_h + #include + +#include + #include "DataFormats/Math/interface/choleskyInversion.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "RecoTracker/PixelTrackFitting/interface/alpaka/FitResult.h" + namespace riemannFit { constexpr double epsilon = 1.e-4; //!< used in numerical derivative (J2 in Circle_fit()) @@ -250,4 +256,5 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // namespace riemannFit } // namespace ALPAKA_ACCELERATOR_NAMESPACE -#endif // RecoPixelVertexing_PixelTrackFitting_interface_FitUtils_h + +#endif // RecoTracker_PixelTrackFitting_interface_alpaka_FitUtils_h diff --git a/RecoTracker/PixelTrackFitting/interface/alpaka/RiemannFit.h b/RecoTracker/PixelTrackFitting/interface/alpaka/RiemannFit.h index 8455a03e9f58f..131c47697d885 100644 --- a/RecoTracker/PixelTrackFitting/interface/alpaka/RiemannFit.h +++ b/RecoTracker/PixelTrackFitting/interface/alpaka/RiemannFit.h @@ -1,13 +1,18 @@ -#ifndef RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h -#define RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h +#ifndef RecoTracker_PixelTrackFitting_interface_alpaka_RiemannFit_h +#define RecoTracker_PixelTrackFitting_interface_alpaka_RiemannFit_h + #include + +#include +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h" -namespace ALPAKA_ACCELERATOR_NAMESPACE { +namespace ALPAKA_ACCELERATOR_NAMESPACE::riemannFit { + using namespace ::riemannFit; - namespace riemannFit { - using namespace ::riemannFit; - /*! Compute the Radiation length in the uniform hypothesis + /*! 
Compute the Radiation length in the uniform hypothesis * * The Pixel detector, barrel and forward, is considered as an homogeneous * cylinder of material, whose radiation lengths has been derived from the TDR @@ -29,21 +34,21 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { * \return incremental radiation lengths that correspond to each segment. */ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void computeRadLenUniformMaterial(const TAcc& acc, - const VNd1& length_values, - VNd2& rad_lengths) { - // Radiation length of the pixel detector in the uniform assumption, with - // 0.06 rad_len at 16 cm - constexpr double xx_0_inv = 0.06 / 16.; - uint n = length_values.rows(); - rad_lengths(0) = length_values(0) * xx_0_inv; - for (uint j = 1; j < n; ++j) { - rad_lengths(j) = alpaka::math::abs(acc, length_values(j) - length_values(j - 1)) * xx_0_inv; - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void computeRadLenUniformMaterial(const TAcc& acc, + const VNd1& length_values, + VNd2& rad_lengths) { + // Radiation length of the pixel detector in the uniform assumption, with + // 0.06 rad_len at 16 cm + constexpr double xx_0_inv = 0.06 / 16.; + uint n = length_values.rows(); + rad_lengths(0) = length_values(0) * xx_0_inv; + for (uint j = 1; j < n; ++j) { + rad_lengths(j) = alpaka::math::abs(acc, length_values(j) - length_values(j - 1)) * xx_0_inv; } + } - /*! + /*! \brief Compute the covariance matrix along cartesian S-Z of points due to multiple Coulomb scattering to be used in the line_fit, for the barrel and forward cases. @@ -62,57 +67,57 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { correspond to the case at eta = 0. 
*/ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE auto scatterCovLine(const TAcc& acc, - Matrix2d const* cov_sz, - const V4& fast_fit, - VNd1 const& s_arcs, - VNd2 const& z_values, - const double theta, - const double bField, - MatrixNd& ret) { + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE auto scatterCovLine(const TAcc& acc, + Matrix2d const* cov_sz, + const V4& fast_fit, + VNd1 const& s_arcs, + VNd2 const& z_values, + const double theta, + const double bField, + MatrixNd& ret) { #ifdef RFIT_DEBUG - riemannFit::printIt(&s_arcs, "Scatter_cov_line - s_arcs: "); + riemannFit::printIt(&s_arcs, "Scatter_cov_line - s_arcs: "); #endif - constexpr uint n = N; - double p_t = alpaka::math::min(acc, 20., fast_fit(2) * bField); // limit pt to avoid too small error!!! - double p_2 = p_t * p_t * (1. + 1. / sqr(fast_fit(3))); - VectorNd rad_lengths_S; - // See documentation at http://eigen.tuxfamily.org/dox/group__TutorialArrayClass.html - // Basically, to perform cwise operations on Matrices and Vectors, you need - // to transform them into Array-like objects. - VectorNd s_values = s_arcs.array() * s_arcs.array() + z_values.array() * z_values.array(); - s_values = s_values.array().sqrt(); - computeRadLenUniformMaterial(acc, s_values, rad_lengths_S); - VectorNd sig2_S; - sig2_S = .000225 / p_2 * (1. + 0.038 * rad_lengths_S.array().log()).abs2() * rad_lengths_S.array(); + constexpr uint n = N; + double p_t = alpaka::math::min(acc, 20., fast_fit(2) * bField); // limit pt to avoid too small error!!! + double p_2 = p_t * p_t * (1. + 1. / sqr(fast_fit(3))); + VectorNd rad_lengths_S; + // See documentation at http://eigen.tuxfamily.org/dox/group__TutorialArrayClass.html + // Basically, to perform cwise operations on Matrices and Vectors, you need + // to transform them into Array-like objects. 
+ VectorNd s_values = s_arcs.array() * s_arcs.array() + z_values.array() * z_values.array(); + s_values = s_values.array().sqrt(); + computeRadLenUniformMaterial(acc, s_values, rad_lengths_S); + VectorNd sig2_S; + sig2_S = .000225 / p_2 * (1. + 0.038 * rad_lengths_S.array().log()).abs2() * rad_lengths_S.array(); #ifdef RFIT_DEBUG - riemannFit::printIt(cov_sz, "Scatter_cov_line - cov_sz: "); + riemannFit::printIt(cov_sz, "Scatter_cov_line - cov_sz: "); #endif - Matrix2Nd tmp = Matrix2Nd::Zero(); - for (uint k = 0; k < n; ++k) { - tmp(k, k) = cov_sz[k](0, 0); - tmp(k + n, k + n) = cov_sz[k](1, 1); - tmp(k, k + n) = tmp(k + n, k) = cov_sz[k](0, 1); - } - for (uint k = 0; k < n; ++k) { - for (uint l = k; l < n; ++l) { - for (uint i = 0; i < uint(alpaka::math::min(acc, k, l)); ++i) { - tmp(k + n, l + n) += alpaka::math::abs(acc, s_values(k) - s_values(i)) * - alpaka::math::abs(acc, s_values(l) - s_values(i)) * sig2_S(i); - } - tmp(l + n, k + n) = tmp(k + n, l + n); + Matrix2Nd tmp = Matrix2Nd::Zero(); + for (uint k = 0; k < n; ++k) { + tmp(k, k) = cov_sz[k](0, 0); + tmp(k + n, k + n) = cov_sz[k](1, 1); + tmp(k, k + n) = tmp(k + n, k) = cov_sz[k](0, 1); + } + for (uint k = 0; k < n; ++k) { + for (uint l = k; l < n; ++l) { + for (uint i = 0; i < uint(alpaka::math::min(acc, k, l)); ++i) { + tmp(k + n, l + n) += alpaka::math::abs(acc, s_values(k) - s_values(i)) * + alpaka::math::abs(acc, s_values(l) - s_values(i)) * sig2_S(i); } + tmp(l + n, k + n) = tmp(k + n, l + n); } - // We are interested only in the errors orthogonal to the rotated s-axis - // which, in our formalism, are in the lower square matrix. + } + // We are interested only in the errors orthogonal to the rotated s-axis + // which, in our formalism, are in the lower square matrix. #ifdef RFIT_DEBUG - riemannFit::printIt(&tmp, "Scatter_cov_line - tmp: "); + riemannFit::printIt(&tmp, "Scatter_cov_line - tmp: "); #endif - ret = tmp.block(n, n, n, n); - } + ret = tmp.block(n, n, n, n); + } - /*! + /*! 
\brief Compute the covariance matrix (in radial coordinates) of points in the transverse plane due to multiple Coulomb scattering. \param p2D 2D points in the transverse plane. @@ -125,45 +130,45 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { \details Only the tangential component is computed (the radial one is negligible). */ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE MatrixNd scatter_cov_rad( - const TAcc& acc, const M2xN& p2D, const V4& fast_fit, VectorNd const& rad, double B) { - constexpr uint n = N; - double p_t = alpaka::math::min(acc, 20., fast_fit(2) * B); // limit pt to avoid too small error!!! - double p_2 = p_t * p_t * (1. + 1. / sqr(fast_fit(3))); - double theta = atan(fast_fit(3)); - theta = theta < 0. ? theta + M_PI : theta; - VectorNd s_values; - VectorNd rad_lengths; - const Vector2d oVec(fast_fit(0), fast_fit(1)); - - // associated Jacobian, used in weights and errors computation - for (uint i = 0; i < n; ++i) { // x - Vector2d pVec = p2D.block(0, i, 2, 1) - oVec; - const double cross = cross2D(acc, -oVec, pVec); - const double dot = (-oVec).dot(pVec); - const double tempAtan2 = atan2(cross, dot); - s_values(i) = alpaka::math::abs(acc, tempAtan2 * fast_fit(2)); - } - computeRadLenUniformMaterial(acc, s_values * sqrt(1. + 1. / sqr(fast_fit(3))), rad_lengths); - MatrixNd scatter_cov_rad = MatrixNd::Zero(); - VectorNd sig2 = (1. 
+ 0.038 * rad_lengths.array().log()).abs2() * rad_lengths.array(); - sig2 *= 0.000225 / (p_2 * sqr(sin(theta))); - for (uint k = 0; k < n; ++k) { - for (uint l = k; l < n; ++l) { - for (uint i = 0; i < uint(alpaka::math::min(acc, k, l)); ++i) { - scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2(i); - } - scatter_cov_rad(l, k) = scatter_cov_rad(k, l); + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE MatrixNd scatter_cov_rad( + const TAcc& acc, const M2xN& p2D, const V4& fast_fit, VectorNd const& rad, double B) { + constexpr uint n = N; + double p_t = alpaka::math::min(acc, 20., fast_fit(2) * B); // limit pt to avoid too small error!!! + double p_2 = p_t * p_t * (1. + 1. / sqr(fast_fit(3))); + double theta = atan(fast_fit(3)); + theta = theta < 0. ? theta + M_PI : theta; + VectorNd s_values; + VectorNd rad_lengths; + const Vector2d oVec(fast_fit(0), fast_fit(1)); + + // associated Jacobian, used in weights and errors computation + for (uint i = 0; i < n; ++i) { // x + Vector2d pVec = p2D.block(0, i, 2, 1) - oVec; + const double cross = cross2D(acc, -oVec, pVec); + const double dot = (-oVec).dot(pVec); + const double tempAtan2 = atan2(cross, dot); + s_values(i) = alpaka::math::abs(acc, tempAtan2 * fast_fit(2)); + } + computeRadLenUniformMaterial(acc, s_values * sqrt(1. + 1. / sqr(fast_fit(3))), rad_lengths); + MatrixNd scatter_cov_rad = MatrixNd::Zero(); + VectorNd sig2 = (1. 
+ 0.038 * rad_lengths.array().log()).abs2() * rad_lengths.array(); + sig2 *= 0.000225 / (p_2 * sqr(sin(theta))); + for (uint k = 0; k < n; ++k) { + for (uint l = k; l < n; ++l) { + for (uint i = 0; i < uint(alpaka::math::min(acc, k, l)); ++i) { + scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2(i); } + scatter_cov_rad(l, k) = scatter_cov_rad(k, l); } + } #ifdef RFIT_DEBUG - riemannFit::printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: "); + riemannFit::printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: "); #endif - return scatter_cov_rad; - } + return scatter_cov_rad; + } - /*! + /*! \brief Transform covariance matrix from radial (only tangential component) to Cartesian coordinates (only transverse plane component). \param p2D 2D points in the transverse plane. @@ -171,35 +176,35 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { \return cov_cart covariance matrix in Cartesian coordinates. */ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE Matrix2Nd cov_radtocart(const TAcc& acc, - const M2xN& p2D, - const MatrixNd& cov_rad, - const VectorNd& rad) { + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE Matrix2Nd cov_radtocart(const TAcc& acc, + const M2xN& p2D, + const MatrixNd& cov_rad, + const VectorNd& rad) { #ifdef RFIT_DEBUG - printf("Address of p2D: %p\n", &p2D); + printf("Address of p2D: %p\n", &p2D); #endif - printIt(&p2D, "cov_radtocart - p2D:"); - constexpr uint n = N; - Matrix2Nd cov_cart = Matrix2Nd::Zero(); - VectorNd rad_inv = rad.cwiseInverse(); - printIt(&rad_inv, "cov_radtocart - rad_inv:"); - for (uint i = 0; i < n; ++i) { - for (uint j = i; j < n; ++j) { - cov_cart(i, j) = cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); - cov_cart(i + n, j + n) = cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); - cov_cart(i, j + n) = -cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); - cov_cart(i + n, j) = -cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); - cov_cart(j, i) = 
cov_cart(i, j); - cov_cart(j + n, i + n) = cov_cart(i + n, j + n); - cov_cart(j + n, i) = cov_cart(i, j + n); - cov_cart(j, i + n) = cov_cart(i + n, j); - } + printIt(&p2D, "cov_radtocart - p2D:"); + constexpr uint n = N; + Matrix2Nd cov_cart = Matrix2Nd::Zero(); + VectorNd rad_inv = rad.cwiseInverse(); + printIt(&rad_inv, "cov_radtocart - rad_inv:"); + for (uint i = 0; i < n; ++i) { + for (uint j = i; j < n; ++j) { + cov_cart(i, j) = cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); + cov_cart(i + n, j + n) = cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); + cov_cart(i, j + n) = -cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); + cov_cart(i + n, j) = -cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); + cov_cart(j, i) = cov_cart(i, j); + cov_cart(j + n, i + n) = cov_cart(i + n, j + n); + cov_cart(j + n, i) = cov_cart(i, j + n); + cov_cart(j, i + n) = cov_cart(i + n, j); } - return cov_cart; } + return cov_cart; + } - /*! + /*! \brief Transform covariance matrix from Cartesian coordinates (only transverse plane component) to radial coordinates (both radial and tangential component but only diagonal terms, correlation between different @@ -209,27 +214,27 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { \return cov_rad covariance matrix in raidal coordinate. \warning correlation between different point are not computed. */ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE VectorNd cov_carttorad(const TAcc& acc, - const M2xN& p2D, - const Matrix2Nd& cov_cart, - const VectorNd& rad) { - constexpr uint n = N; - VectorNd cov_rad; - const VectorNd rad_inv2 = rad.cwiseInverse().array().square(); - for (uint i = 0; i < n; ++i) { - //!< in case you have (0,0) to avoid dividing by 0 radius - if (rad(i) < 1.e-4) - cov_rad(i) = cov_cart(i, i); - else { - cov_rad(i) = rad_inv2(i) * (cov_cart(i, i) * sqr(p2D(1, i)) + cov_cart(i + n, i + n) * sqr(p2D(0, i)) - - 2. 
* cov_cart(i, i + n) * p2D(0, i) * p2D(1, i)); - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE VectorNd cov_carttorad(const TAcc& acc, + const M2xN& p2D, + const Matrix2Nd& cov_cart, + const VectorNd& rad) { + constexpr uint n = N; + VectorNd cov_rad; + const VectorNd rad_inv2 = rad.cwiseInverse().array().square(); + for (uint i = 0; i < n; ++i) { + //!< in case you have (0,0) to avoid dividing by 0 radius + if (rad(i) < 1.e-4) + cov_rad(i) = cov_cart(i, i); + else { + cov_rad(i) = rad_inv2(i) * (cov_cart(i, i) * sqr(p2D(1, i)) + cov_cart(i + n, i + n) * sqr(p2D(0, i)) - + 2. * cov_cart(i, i + n) * p2D(0, i) * p2D(1, i)); } - return cov_rad; } + return cov_rad; + } - /*! + /*! \brief Transform covariance matrix from Cartesian coordinates (only transverse plane component) to coordinates system orthogonal to the pre-fitted circle in each point. @@ -241,30 +246,30 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { \return cov_rad covariance matrix in the pre-fitted circle's orthogonal system. */ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE VectorNd cov_carttorad_prefit( - const TAcc& acc, const M2xN& p2D, const Matrix2Nd& cov_cart, V4& fast_fit, const VectorNd& rad) { - constexpr uint n = N; - VectorNd cov_rad; - for (uint i = 0; i < n; ++i) { - //!< in case you have (0,0) to avoid dividing by 0 radius - if (rad(i) < 1.e-4) - cov_rad(i) = cov_cart(i, i); // TO FIX - else { - Vector2d a = p2D.col(i); - Vector2d b = p2D.col(i) - fast_fit.head(2); - const double x2 = a.dot(b); - const double y2 = cross2D(acc, a, b); - const double tan_c = -y2 / x2; - const double tan_c2 = sqr(tan_c); - cov_rad(i) = - 1. / (1. 
+ tan_c2) * (cov_cart(i, i) + cov_cart(i + n, i + n) * tan_c2 + 2 * cov_cart(i, i + n) * tan_c); - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE VectorNd cov_carttorad_prefit( + const TAcc& acc, const M2xN& p2D, const Matrix2Nd& cov_cart, V4& fast_fit, const VectorNd& rad) { + constexpr uint n = N; + VectorNd cov_rad; + for (uint i = 0; i < n; ++i) { + //!< in case you have (0,0) to avoid dividing by 0 radius + if (rad(i) < 1.e-4) + cov_rad(i) = cov_cart(i, i); // TO FIX + else { + Vector2d a = p2D.col(i); + Vector2d b = p2D.col(i) - fast_fit.head(2); + const double x2 = a.dot(b); + const double y2 = cross2D(acc, a, b); + const double tan_c = -y2 / x2; + const double tan_c2 = sqr(tan_c); + cov_rad(i) = + 1. / (1. + tan_c2) * (cov_cart(i, i) + cov_cart(i + n, i + n) * tan_c2 + 2 * cov_cart(i, i + n) * tan_c); } - return cov_rad; } + return cov_rad; + } - /*! + /*! \brief Compute the points' weights' vector for the circle fit when multiple scattering is managed. Further information in attached documentation. @@ -275,12 +280,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { diagonal cov matrix. Further investigation needed. */ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE VectorNd weightCircle(const TAcc& acc, const MatrixNd& cov_rad_inv) { - return cov_rad_inv.colwise().sum().transpose(); - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE VectorNd weightCircle(const TAcc& acc, const MatrixNd& cov_rad_inv) { + return cov_rad_inv.colwise().sum().transpose(); + } - /*! + /*! \brief Find particle q considering the sign of cross product between particles velocity (estimated by the first 2 hits) and the vector radius between the first hit and the center of the fitted circle. @@ -288,16 +293,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { \param par_uvr result of the circle fit in this form: (X0,Y0,R). \return q int 1 or -1. 
*/ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE int32_t charge(const TAcc& acc, const M2xN& p2D, const Vector3d& par_uvr) { - return ((p2D(0, 1) - p2D(0, 0)) * (par_uvr.y() - p2D(1, 0)) - - (p2D(1, 1) - p2D(1, 0)) * (par_uvr.x() - p2D(0, 0)) > - 0) - ? -1 - : 1; - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE int32_t charge(const TAcc& acc, const M2xN& p2D, const Vector3d& par_uvr) { + return ((p2D(0, 1) - p2D(0, 0)) * (par_uvr.y() - p2D(1, 0)) - (p2D(1, 1) - p2D(1, 0)) * (par_uvr.x() - p2D(0, 0)) > + 0) + ? -1 + : 1; + } - /*! + /*! \brief Compute the eigenvector associated to the minimum eigenvalue. \param A the Matrix you want to know eigenvector and eigenvalue. \param chi2 the double were the chi2-related quantity will be stored. @@ -311,22 +315,22 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { fast closed-form algorithm. For this optimization the matrix type must be known at compiling time. */ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE Vector3d min_eigen3D(const TAcc& acc, const Matrix3d& A, double& chi2) { + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE Vector3d min_eigen3D(const TAcc& acc, const Matrix3d& A, double& chi2) { #ifdef RFIT_DEBUG - printf("min_eigen3D - enter\n"); + printf("min_eigen3D - enter\n"); #endif - Eigen::SelfAdjointEigenSolver solver(3); - solver.computeDirect(A); - int min_index; - chi2 = solver.eigenvalues().minCoeff(&min_index); + Eigen::SelfAdjointEigenSolver solver(3); + solver.computeDirect(A); + int min_index; + chi2 = solver.eigenvalues().minCoeff(&min_index); #ifdef RFIT_DEBUG - printf("min_eigen3D - exit\n"); + printf("min_eigen3D - exit\n"); #endif - return solver.eigenvectors().col(min_index); - } + return solver.eigenvectors().col(min_index); + } - /*! + /*! \brief A faster version of min_eigen3D() where double precision is not needed. \param A the Matrix you want to know eigenvector and eigenvalue. @@ -337,16 +341,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { speed up in single precision. 
*/ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE Vector3d min_eigen3D_fast(const TAcc& acc, const Matrix3d& A) { - Eigen::SelfAdjointEigenSolver solver(3); - solver.computeDirect(A.cast()); - int min_index; - solver.eigenvalues().minCoeff(&min_index); - return solver.eigenvectors().col(min_index).cast(); - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE Vector3d min_eigen3D_fast(const TAcc& acc, const Matrix3d& A) { + Eigen::SelfAdjointEigenSolver solver(3); + solver.computeDirect(A.cast()); + int min_index; + solver.eigenvalues().minCoeff(&min_index); + return solver.eigenvectors().col(min_index).cast(); + } - /*! + /*! \brief 2D version of min_eigen3D(). \param aMat the Matrix you want to know eigenvector and eigenvalue. \param chi2 the double were the chi2-related quantity will be stored @@ -355,16 +359,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { do not use special math function (just sqrt) therefore it doesn't speed up significantly in single precision. */ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE Vector2d min_eigen2D(const TAcc& acc, const Matrix2d& aMat, double& chi2) { - Eigen::SelfAdjointEigenSolver solver(2); - solver.computeDirect(aMat); - int min_index; - chi2 = solver.eigenvalues().minCoeff(&min_index); - return solver.eigenvectors().col(min_index); - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE Vector2d min_eigen2D(const TAcc& acc, const Matrix2d& aMat, double& chi2) { + Eigen::SelfAdjointEigenSolver solver(2); + solver.computeDirect(aMat); + int min_index; + chi2 = solver.eigenvalues().minCoeff(&min_index); + return solver.eigenvectors().col(min_index); + } - /*! + /*! \brief A very fast helix fit: it fits a circle by three points (first, middle and last point) and a line by two points (first and last). \param hits points to be fitted @@ -377,60 +381,60 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { - computation of error due to multiple scattering. 
*/ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void fastFit(const TAcc& acc, const M3xN& hits, V4& result) { - constexpr uint32_t N = M3xN::ColsAtCompileTime; - constexpr auto n = N; // get the number of hits - printIt(&hits, "Fast_fit - hits: "); - - // CIRCLE FIT - // Make segments between middle-to-first(b) and last-to-first(c) hits - const Vector2d bVec = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); - const Vector2d cVec = hits.block(0, n - 1, 2, 1) - hits.block(0, 0, 2, 1); - printIt(&bVec, "Fast_fit - b: "); - printIt(&cVec, "Fast_fit - c: "); - // Compute their lengths - auto b2 = bVec.squaredNorm(); - auto c2 = cVec.squaredNorm(); - // The algebra has been verified (MR). The usual approach has been followed: - // * use an orthogonal reference frame passing from the first point. - // * build the segments (chords) - // * build orthogonal lines through mid points - // * make a system and solve for X0 and Y0. - // * add the initial point - bool flip = abs(bVec.x()) < abs(bVec.y()); - auto bx = flip ? bVec.y() : bVec.x(); - auto by = flip ? bVec.x() : bVec.y(); - auto cx = flip ? cVec.y() : cVec.x(); - auto cy = flip ? cVec.x() : cVec.y(); - //!< in case b.x is 0 (2 hits with same x) - auto div = 2. * (cx * by - bx * cy); - // if aligned TO FIX - auto y0 = (cx * b2 - bx * c2) / div; - auto x0 = (0.5 * b2 - y0 * by) / bx; - result(0) = hits(0, 0) + (flip ? y0 : x0); - result(1) = hits(1, 0) + (flip ? 
x0 : y0); - result(2) = sqrt(sqr(x0) + sqr(y0)); - printIt(&result, "Fast_fit - result: "); - - // LINE FIT - const Vector2d dVec = hits.block(0, 0, 2, 1) - result.head(2); - const Vector2d eVec = hits.block(0, n - 1, 2, 1) - result.head(2); - printIt(&eVec, "Fast_fit - e: "); - printIt(&dVec, "Fast_fit - d: "); - // Compute the arc-length between first and last point: L = R * theta = R * atan (tan (Theta) ) - auto dr = result(2) * atan2(cross2D(acc, dVec, eVec), dVec.dot(eVec)); - // Simple difference in Z between last and first hit - auto dz = hits(2, n - 1) - hits(2, 0); - - result(3) = (dr / dz); + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fastFit(const TAcc& acc, const M3xN& hits, V4& result) { + constexpr uint32_t N = M3xN::ColsAtCompileTime; + constexpr auto n = N; // get the number of hits + printIt(&hits, "Fast_fit - hits: "); + + // CIRCLE FIT + // Make segments between middle-to-first(b) and last-to-first(c) hits + const Vector2d bVec = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); + const Vector2d cVec = hits.block(0, n - 1, 2, 1) - hits.block(0, 0, 2, 1); + printIt(&bVec, "Fast_fit - b: "); + printIt(&cVec, "Fast_fit - c: "); + // Compute their lengths + auto b2 = bVec.squaredNorm(); + auto c2 = cVec.squaredNorm(); + // The algebra has been verified (MR). The usual approach has been followed: + // * use an orthogonal reference frame passing from the first point. + // * build the segments (chords) + // * build orthogonal lines through mid points + // * make a system and solve for X0 and Y0. + // * add the initial point + bool flip = abs(bVec.x()) < abs(bVec.y()); + auto bx = flip ? bVec.y() : bVec.x(); + auto by = flip ? bVec.x() : bVec.y(); + auto cx = flip ? cVec.y() : cVec.x(); + auto cy = flip ? cVec.x() : cVec.y(); + //!< in case b.x is 0 (2 hits with same x) + auto div = 2. * (cx * by - bx * cy); + // if aligned TO FIX + auto y0 = (cx * b2 - bx * c2) / div; + auto x0 = (0.5 * b2 - y0 * by) / bx; + result(0) = hits(0, 0) + (flip ? 
y0 : x0); + result(1) = hits(1, 0) + (flip ? x0 : y0); + result(2) = sqrt(sqr(x0) + sqr(y0)); + printIt(&result, "Fast_fit - result: "); + + // LINE FIT + const Vector2d dVec = hits.block(0, 0, 2, 1) - result.head(2); + const Vector2d eVec = hits.block(0, n - 1, 2, 1) - result.head(2); + printIt(&eVec, "Fast_fit - e: "); + printIt(&dVec, "Fast_fit - d: "); + // Compute the arc-length between first and last point: L = R * theta = R * atan (tan (Theta) ) + auto dr = result(2) * atan2(cross2D(acc, dVec, eVec), dVec.dot(eVec)); + // Simple difference in Z between last and first hit + auto dz = hits(2, n - 1) - hits(2, 0); + + result(3) = (dr / dz); #ifdef RFIT_DEBUG - printf("Fast_fit: [%f, %f, %f, %f]\n", result(0), result(1), result(2), result(3)); + printf("Fast_fit: [%f, %f, %f, %f]\n", result(0), result(1), result(2), result(3)); #endif - } + } - /*! + /*! \brief Fit a generic number of 2D points with a circle using Riemann-Chernov algorithm. Covariance matrix of fitted parameter is optionally computed. Multiple scattering (currently only in barrel layer) is optionally handled. @@ -457,322 +461,322 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { \bug further investigation needed for error propagation with multiple scattering. 
*/ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE CircleFit circleFit(const TAcc& acc, - const M2xN& hits2D, - const Matrix2Nd& hits_cov2D, - const V4& fast_fit, - const VectorNd& rad, - const double bField, - const bool error) { + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE CircleFit circleFit(const TAcc& acc, + const M2xN& hits2D, + const Matrix2Nd& hits_cov2D, + const V4& fast_fit, + const VectorNd& rad, + const double bField, + const bool error) { #ifdef RFIT_DEBUG - printf("circle_fit - enter\n"); + printf("circle_fit - enter\n"); #endif - // INITIALIZATION - Matrix2Nd vMat = hits_cov2D; - constexpr uint n = N; - printIt(&hits2D, "circle_fit - hits2D:"); - printIt(&hits_cov2D, "circle_fit - hits_cov2D:"); + // INITIALIZATION + Matrix2Nd vMat = hits_cov2D; + constexpr uint n = N; + printIt(&hits2D, "circle_fit - hits2D:"); + printIt(&hits_cov2D, "circle_fit - hits_cov2D:"); #ifdef RFIT_DEBUG - printf("circle_fit - WEIGHT COMPUTATION\n"); + printf("circle_fit - WEIGHT COMPUTATION\n"); #endif - // WEIGHT COMPUTATION - VectorNd weight; - MatrixNd gMat; - double renorm; - { - MatrixNd cov_rad = cov_carttorad_prefit(acc, hits2D, vMat, fast_fit, rad).asDiagonal(); - MatrixNd scatterCovRadMat = scatter_cov_rad(acc, hits2D, fast_fit, rad, bField); - printIt(&scatterCovRadMat, "circle_fit - scatter_cov_rad:"); - printIt(&hits2D, "circle_fit - hits2D bis:"); + // WEIGHT COMPUTATION + VectorNd weight; + MatrixNd gMat; + double renorm; + { + MatrixNd cov_rad = cov_carttorad_prefit(acc, hits2D, vMat, fast_fit, rad).asDiagonal(); + MatrixNd scatterCovRadMat = scatter_cov_rad(acc, hits2D, fast_fit, rad, bField); + printIt(&scatterCovRadMat, "circle_fit - scatter_cov_rad:"); + printIt(&hits2D, "circle_fit - hits2D bis:"); #ifdef RFIT_DEBUG - printf("Address of hits2D: a) %p\n", &hits2D); + printf("Address of hits2D: a) %p\n", &hits2D); #endif - vMat += cov_radtocart(acc, hits2D, scatterCovRadMat, rad); - printIt(&vMat, "circle_fit - V:"); - cov_rad += scatterCovRadMat; - 
printIt(&cov_rad, "circle_fit - cov_rad:"); - math::cholesky::invert(cov_rad, gMat); - // gMat = cov_rad.inverse(); - renorm = gMat.sum(); - gMat *= 1. / renorm; - weight = weightCircle(acc, gMat); - } - printIt(&weight, "circle_fit - weight:"); + vMat += cov_radtocart(acc, hits2D, scatterCovRadMat, rad); + printIt(&vMat, "circle_fit - V:"); + cov_rad += scatterCovRadMat; + printIt(&cov_rad, "circle_fit - cov_rad:"); + math::cholesky::invert(cov_rad, gMat); + // gMat = cov_rad.inverse(); + renorm = gMat.sum(); + gMat *= 1. / renorm; + weight = weightCircle(acc, gMat); + } + printIt(&weight, "circle_fit - weight:"); - // SPACE TRANSFORMATION + // SPACE TRANSFORMATION #ifdef RFIT_DEBUG - printf("circle_fit - SPACE TRANSFORMATION\n"); + printf("circle_fit - SPACE TRANSFORMATION\n"); #endif - // center + // center #ifdef RFIT_DEBUG - printf("Address of hits2D: b) %p\n", &hits2D); + printf("Address of hits2D: b) %p\n", &hits2D); #endif - const Vector2d hCentroid = hits2D.rowwise().mean(); // centroid - printIt(&hCentroid, "circle_fit - h_:"); - Matrix3xNd p3D; - p3D.block(0, 0, 2, n) = hits2D.colwise() - hCentroid; - printIt(&p3D, "circle_fit - p3D: a)"); - Vector2Nd mc; // centered hits, used in error computation - mc << p3D.row(0).transpose(), p3D.row(1).transpose(); - printIt(&mc, "circle_fit - mc(centered hits):"); - - // scale - const double tempQ = mc.squaredNorm(); - const double tempS = sqrt(n * 1. 
/ tempQ); // scaling factor - p3D.block(0, 0, 2, n) *= tempS; - - // project on paraboloid - p3D.row(2) = p3D.block(0, 0, 2, n).colwise().squaredNorm(); - printIt(&p3D, "circle_fit - p3D: b)"); + const Vector2d hCentroid = hits2D.rowwise().mean(); // centroid + printIt(&hCentroid, "circle_fit - h_:"); + Matrix3xNd p3D; + p3D.block(0, 0, 2, n) = hits2D.colwise() - hCentroid; + printIt(&p3D, "circle_fit - p3D: a)"); + Vector2Nd mc; // centered hits, used in error computation + mc << p3D.row(0).transpose(), p3D.row(1).transpose(); + printIt(&mc, "circle_fit - mc(centered hits):"); + + // scale + const double tempQ = mc.squaredNorm(); + const double tempS = sqrt(n * 1. / tempQ); // scaling factor + p3D.block(0, 0, 2, n) *= tempS; + + // project on paraboloid + p3D.row(2) = p3D.block(0, 0, 2, n).colwise().squaredNorm(); + printIt(&p3D, "circle_fit - p3D: b)"); #ifdef RFIT_DEBUG - printf("circle_fit - COST FUNCTION\n"); + printf("circle_fit - COST FUNCTION\n"); #endif - // COST FUNCTION + // COST FUNCTION - // compute - Vector3d r0; - r0.noalias() = p3D * weight; // center of gravity - const Matrix3xNd xMat = p3D.colwise() - r0; - Matrix3d aMat = xMat * gMat * xMat.transpose(); - printIt(&aMat, "circle_fit - A:"); + // compute + Vector3d r0; + r0.noalias() = p3D * weight; // center of gravity + const Matrix3xNd xMat = p3D.colwise() - r0; + Matrix3d aMat = xMat * gMat * xMat.transpose(); + printIt(&aMat, "circle_fit - A:"); #ifdef RFIT_DEBUG - printf("circle_fit - MINIMIZE\n"); + printf("circle_fit - MINIMIZE\n"); #endif - // minimize - double chi2; - Vector3d vVec = min_eigen3D(acc, aMat, chi2); + // minimize + double chi2; + Vector3d vVec = min_eigen3D(acc, aMat, chi2); #ifdef RFIT_DEBUG - printf("circle_fit - AFTER MIN_EIGEN\n"); + printf("circle_fit - AFTER MIN_EIGEN\n"); #endif - printIt(&vVec, "v BEFORE INVERSION"); - vVec *= (vVec(2) > 0) ? 
1 : -1; // TO FIX dovrebbe essere N(3)>0 - printIt(&vVec, "v AFTER INVERSION"); - // This hack to be able to run on GPU where the automatic assignment to a - // double from the vector multiplication is not working. + printIt(&vVec, "v BEFORE INVERSION"); + vVec *= (vVec(2) > 0) ? 1 : -1; // TO FIX dovrebbe essere N(3)>0 + printIt(&vVec, "v AFTER INVERSION"); + // This hack to be able to run on GPU where the automatic assignment to a + // double from the vector multiplication is not working. #ifdef RFIT_DEBUG - printf("circle_fit - AFTER MIN_EIGEN 1\n"); + printf("circle_fit - AFTER MIN_EIGEN 1\n"); #endif - Eigen::Matrix cm; + Eigen::Matrix cm; #ifdef RFIT_DEBUG - printf("circle_fit - AFTER MIN_EIGEN 2\n"); + printf("circle_fit - AFTER MIN_EIGEN 2\n"); #endif - cm = -vVec.transpose() * r0; + cm = -vVec.transpose() * r0; #ifdef RFIT_DEBUG - printf("circle_fit - AFTER MIN_EIGEN 3\n"); + printf("circle_fit - AFTER MIN_EIGEN 3\n"); #endif - const double tempC = cm(0, 0); + const double tempC = cm(0, 0); #ifdef RFIT_DEBUG - printf("circle_fit - COMPUTE CIRCLE PARAMETER\n"); + printf("circle_fit - COMPUTE CIRCLE PARAMETER\n"); #endif - // COMPUTE CIRCLE PARAMETER - - // auxiliary quantities - const double tempH = sqrt(1. - sqr(vVec(2)) - 4. * tempC * vVec(2)); - const double v2x2_inv = 1. / (2. * vVec(2)); - const double s_inv = 1. / tempS; - Vector3d par_uvr; // used in error propagation - par_uvr << -vVec(0) * v2x2_inv, -vVec(1) * v2x2_inv, tempH * v2x2_inv; - - CircleFit circle; - circle.par << par_uvr(0) * s_inv + hCentroid(0), par_uvr(1) * s_inv + hCentroid(1), par_uvr(2) * s_inv; - circle.qCharge = charge(acc, hits2D, circle.par); - circle.chi2 = abs(chi2) * renorm / sqr(2 * vVec(2) * par_uvr(2) * tempS); - printIt(&circle.par, "circle_fit - CIRCLE PARAMETERS:"); - printIt(&circle.cov, "circle_fit - CIRCLE COVARIANCE:"); + // COMPUTE CIRCLE PARAMETER + + // auxiliary quantities + const double tempH = sqrt(1. - sqr(vVec(2)) - 4. 
* tempC * vVec(2)); + const double v2x2_inv = 1. / (2. * vVec(2)); + const double s_inv = 1. / tempS; + Vector3d par_uvr; // used in error propagation + par_uvr << -vVec(0) * v2x2_inv, -vVec(1) * v2x2_inv, tempH * v2x2_inv; + + CircleFit circle; + circle.par << par_uvr(0) * s_inv + hCentroid(0), par_uvr(1) * s_inv + hCentroid(1), par_uvr(2) * s_inv; + circle.qCharge = charge(acc, hits2D, circle.par); + circle.chi2 = abs(chi2) * renorm / sqr(2 * vVec(2) * par_uvr(2) * tempS); + printIt(&circle.par, "circle_fit - CIRCLE PARAMETERS:"); + printIt(&circle.cov, "circle_fit - CIRCLE COVARIANCE:"); #ifdef RFIT_DEBUG - printf("circle_fit - CIRCLE CHARGE: %d\n", circle.qCharge); + printf("circle_fit - CIRCLE CHARGE: %d\n", circle.qCharge); #endif #ifdef RFIT_DEBUG - printf("circle_fit - ERROR PROPAGATION\n"); + printf("circle_fit - ERROR PROPAGATION\n"); #endif - // ERROR PROPAGATION - if (error) { + // ERROR PROPAGATION + if (error) { #ifdef RFIT_DEBUG - printf("circle_fit - ERROR PRPAGATION ACTIVATED\n"); + printf("circle_fit - ERROR PRPAGATION ACTIVATED\n"); #endif - ArrayNd vcsMat[2][2]; // cov matrix of center & scaled points - MatrixNd cMat[3][3]; // cov matrix of 3D transformed points + ArrayNd vcsMat[2][2]; // cov matrix of center & scaled points + MatrixNd cMat[3][3]; // cov matrix of 3D transformed points #ifdef RFIT_DEBUG - printf("circle_fit - ERROR PRPAGATION ACTIVATED 2\n"); + printf("circle_fit - ERROR PRPAGATION ACTIVATED 2\n"); #endif - { - Eigen::Matrix cm; - Eigen::Matrix cm2; - cm = mc.transpose() * vMat * mc; - const double tempC2 = cm(0, 0); - Matrix2Nd tempVcsMat; - tempVcsMat.template triangularView() = - (sqr(tempS) * vMat + sqr(sqr(tempS)) * 1. / (4. * tempQ * n) * - (2. * vMat.squaredNorm() + 4. 
* tempC2) * // mc.transpose() * V * mc) * - (mc * mc.transpose())); - - printIt(&tempVcsMat, "circle_fit - Vcs:"); - cMat[0][0] = tempVcsMat.block(0, 0, n, n).template selfadjointView(); - vcsMat[0][1] = tempVcsMat.block(0, n, n, n); - cMat[1][1] = tempVcsMat.block(n, n, n, n).template selfadjointView(); - vcsMat[1][0] = vcsMat[0][1].transpose(); - printIt(&tempVcsMat, "circle_fit - Vcs:"); - } + { + Eigen::Matrix cm; + Eigen::Matrix cm2; + cm = mc.transpose() * vMat * mc; + const double tempC2 = cm(0, 0); + Matrix2Nd tempVcsMat; + tempVcsMat.template triangularView() = + (sqr(tempS) * vMat + sqr(sqr(tempS)) * 1. / (4. * tempQ * n) * + (2. * vMat.squaredNorm() + 4. * tempC2) * // mc.transpose() * V * mc) * + (mc * mc.transpose())); + + printIt(&tempVcsMat, "circle_fit - Vcs:"); + cMat[0][0] = tempVcsMat.block(0, 0, n, n).template selfadjointView(); + vcsMat[0][1] = tempVcsMat.block(0, n, n, n); + cMat[1][1] = tempVcsMat.block(n, n, n, n).template selfadjointView(); + vcsMat[1][0] = vcsMat[0][1].transpose(); + printIt(&tempVcsMat, "circle_fit - Vcs:"); + } - { - const ArrayNd t0 = (VectorXd::Constant(n, 1.) * p3D.row(0)); - const ArrayNd t1 = (VectorXd::Constant(n, 1.) * p3D.row(1)); - const ArrayNd t00 = p3D.row(0).transpose() * p3D.row(0); - const ArrayNd t01 = p3D.row(0).transpose() * p3D.row(1); - const ArrayNd t11 = p3D.row(1).transpose() * p3D.row(1); - const ArrayNd t10 = t01.transpose(); - vcsMat[0][0] = cMat[0][0]; - cMat[0][1] = vcsMat[0][1]; - cMat[0][2] = 2. * (vcsMat[0][0] * t0 + vcsMat[0][1] * t1); - vcsMat[1][1] = cMat[1][1]; - cMat[1][2] = 2. * (vcsMat[1][0] * t0 + vcsMat[1][1] * t1); - MatrixNd tmp; - tmp.template triangularView() = - (2. * (vcsMat[0][0] * vcsMat[0][0] + vcsMat[0][0] * vcsMat[0][1] + vcsMat[1][1] * vcsMat[1][0] + - vcsMat[1][1] * vcsMat[1][1]) + - 4. 
* (vcsMat[0][0] * t00 + vcsMat[0][1] * t01 + vcsMat[1][0] * t10 + vcsMat[1][1] * t11)) - .matrix(); - cMat[2][2] = tmp.template selfadjointView(); + { + const ArrayNd t0 = (VectorXd::Constant(n, 1.) * p3D.row(0)); + const ArrayNd t1 = (VectorXd::Constant(n, 1.) * p3D.row(1)); + const ArrayNd t00 = p3D.row(0).transpose() * p3D.row(0); + const ArrayNd t01 = p3D.row(0).transpose() * p3D.row(1); + const ArrayNd t11 = p3D.row(1).transpose() * p3D.row(1); + const ArrayNd t10 = t01.transpose(); + vcsMat[0][0] = cMat[0][0]; + cMat[0][1] = vcsMat[0][1]; + cMat[0][2] = 2. * (vcsMat[0][0] * t0 + vcsMat[0][1] * t1); + vcsMat[1][1] = cMat[1][1]; + cMat[1][2] = 2. * (vcsMat[1][0] * t0 + vcsMat[1][1] * t1); + MatrixNd tmp; + tmp.template triangularView() = + (2. * (vcsMat[0][0] * vcsMat[0][0] + vcsMat[0][0] * vcsMat[0][1] + vcsMat[1][1] * vcsMat[1][0] + + vcsMat[1][1] * vcsMat[1][1]) + + 4. * (vcsMat[0][0] * t00 + vcsMat[0][1] * t01 + vcsMat[1][0] * t10 + vcsMat[1][1] * t11)) + .matrix(); + cMat[2][2] = tmp.template selfadjointView(); + } + printIt(&cMat[0][0], "circle_fit - C[0][0]:"); + + Matrix3d c0Mat; // cov matrix of center of gravity (r0.x,r0.y,r0.z) + for (uint i = 0; i < 3; ++i) { + for (uint j = i; j < 3; ++j) { + Eigen::Matrix tmp; + tmp = weight.transpose() * cMat[i][j] * weight; + // Workaround to get things working in GPU + const double tempC = tmp(0, 0); + c0Mat(i, j) = tempC; //weight.transpose() * C[i][j] * weight; + c0Mat(j, i) = c0Mat(i, j); } - printIt(&cMat[0][0], "circle_fit - C[0][0]:"); + } + printIt(&c0Mat, "circle_fit - C0:"); + + const MatrixNd wMat = weight * weight.transpose(); + const MatrixNd hMat = MatrixNd::Identity().rowwise() - weight.transpose(); + const MatrixNx3d s_v = hMat * p3D.transpose(); + printIt(&wMat, "circle_fit - W:"); + printIt(&hMat, "circle_fit - H:"); + printIt(&s_v, "circle_fit - s_v:"); + + MatrixNd dMat[3][3]; // cov(s_v) + dMat[0][0] = (hMat * cMat[0][0] * hMat.transpose()).cwiseProduct(wMat); + dMat[0][1] = (hMat * 
cMat[0][1] * hMat.transpose()).cwiseProduct(wMat); + dMat[0][2] = (hMat * cMat[0][2] * hMat.transpose()).cwiseProduct(wMat); + dMat[1][1] = (hMat * cMat[1][1] * hMat.transpose()).cwiseProduct(wMat); + dMat[1][2] = (hMat * cMat[1][2] * hMat.transpose()).cwiseProduct(wMat); + dMat[2][2] = (hMat * cMat[2][2] * hMat.transpose()).cwiseProduct(wMat); + dMat[1][0] = dMat[0][1].transpose(); + dMat[2][0] = dMat[0][2].transpose(); + dMat[2][1] = dMat[1][2].transpose(); + printIt(&dMat[0][0], "circle_fit - D_[0][0]:"); + + constexpr uint nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; + + Matrix6d eMat; // cov matrix of the 6 independent elements of A + for (uint a = 0; a < 6; ++a) { + const uint i = nu[a][0], j = nu[a][1]; + for (uint b = a; b < 6; ++b) { + const uint k = nu[b][0], l = nu[b][1]; + VectorNd t0(n); + VectorNd t1(n); + if (l == k) { + t0 = 2. * dMat[j][l] * s_v.col(l); + if (i == j) + t1 = t0; + else + t1 = 2. * dMat[i][l] * s_v.col(l); + } else { + t0 = dMat[j][l] * s_v.col(k) + dMat[j][k] * s_v.col(l); + if (i == j) + t1 = t0; + else + t1 = dMat[i][l] * s_v.col(k) + dMat[i][k] * s_v.col(l); + } - Matrix3d c0Mat; // cov matrix of center of gravity (r0.x,r0.y,r0.z) - for (uint i = 0; i < 3; ++i) { - for (uint j = i; j < 3; ++j) { - Eigen::Matrix tmp; - tmp = weight.transpose() * cMat[i][j] * weight; + if (i == j) { + Eigen::Matrix cm; + cm = s_v.col(i).transpose() * (t0 + t1); // Workaround to get things working in GPU - const double tempC = tmp(0, 0); - c0Mat(i, j) = tempC; //weight.transpose() * C[i][j] * weight; - c0Mat(j, i) = c0Mat(i, j); - } - } - printIt(&c0Mat, "circle_fit - C0:"); - - const MatrixNd wMat = weight * weight.transpose(); - const MatrixNd hMat = MatrixNd::Identity().rowwise() - weight.transpose(); - const MatrixNx3d s_v = hMat * p3D.transpose(); - printIt(&wMat, "circle_fit - W:"); - printIt(&hMat, "circle_fit - H:"); - printIt(&s_v, "circle_fit - s_v:"); - - MatrixNd dMat[3][3]; // cov(s_v) - dMat[0][0] = (hMat * cMat[0][0] * 
hMat.transpose()).cwiseProduct(wMat); - dMat[0][1] = (hMat * cMat[0][1] * hMat.transpose()).cwiseProduct(wMat); - dMat[0][2] = (hMat * cMat[0][2] * hMat.transpose()).cwiseProduct(wMat); - dMat[1][1] = (hMat * cMat[1][1] * hMat.transpose()).cwiseProduct(wMat); - dMat[1][2] = (hMat * cMat[1][2] * hMat.transpose()).cwiseProduct(wMat); - dMat[2][2] = (hMat * cMat[2][2] * hMat.transpose()).cwiseProduct(wMat); - dMat[1][0] = dMat[0][1].transpose(); - dMat[2][0] = dMat[0][2].transpose(); - dMat[2][1] = dMat[1][2].transpose(); - printIt(&dMat[0][0], "circle_fit - D_[0][0]:"); - - constexpr uint nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; - - Matrix6d eMat; // cov matrix of the 6 independent elements of A - for (uint a = 0; a < 6; ++a) { - const uint i = nu[a][0], j = nu[a][1]; - for (uint b = a; b < 6; ++b) { - const uint k = nu[b][0], l = nu[b][1]; - VectorNd t0(n); - VectorNd t1(n); - if (l == k) { - t0 = 2. * dMat[j][l] * s_v.col(l); - if (i == j) - t1 = t0; - else - t1 = 2. * dMat[i][l] * s_v.col(l); - } else { - t0 = dMat[j][l] * s_v.col(k) + dMat[j][k] * s_v.col(l); - if (i == j) - t1 = t0; - else - t1 = dMat[i][l] * s_v.col(k) + dMat[i][k] * s_v.col(l); - } - - if (i == j) { - Eigen::Matrix cm; - cm = s_v.col(i).transpose() * (t0 + t1); - // Workaround to get things working in GPU - const double tempC = cm(0, 0); - eMat(a, b) = 0. + tempC; - } else { - Eigen::Matrix cm; - cm = (s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); - // Workaround to get things working in GPU - const double tempC = cm(0, 0); - eMat(a, b) = 0. + tempC; //(s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); - } - if (b != a) - eMat(b, a) = eMat(a, b); + const double tempC = cm(0, 0); + eMat(a, b) = 0. + tempC; + } else { + Eigen::Matrix cm; + cm = (s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + // Workaround to get things working in GPU + const double tempC = cm(0, 0); + eMat(a, b) = 0. 
+ tempC; //(s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); } + if (b != a) + eMat(b, a) = eMat(a, b); } - printIt(&eMat, "circle_fit - E:"); - - Eigen::Matrix j2Mat; // Jacobian of min_eigen() (numerically computed) - for (uint a = 0; a < 6; ++a) { - const uint i = nu[a][0], j = nu[a][1]; - Matrix3d delta = Matrix3d::Zero(); - delta(i, j) = delta(j, i) = abs(aMat(i, j) * epsilon); - j2Mat.col(a) = min_eigen3D_fast(acc, aMat + delta); - const int sign = (j2Mat.col(a)(2) > 0) ? 1 : -1; - j2Mat.col(a) = (j2Mat.col(a) * sign - vVec) / delta(i, j); - } - printIt(&j2Mat, "circle_fit - J2:"); - - Matrix4d cvcMat; // joint cov matrix of (v0,v1,v2,c) - { - Matrix3d t0 = j2Mat * eMat * j2Mat.transpose(); - Vector3d t1 = -t0 * r0; - cvcMat.block(0, 0, 3, 3) = t0; - cvcMat.block(0, 3, 3, 1) = t1; - cvcMat.block(3, 0, 1, 3) = t1.transpose(); - Eigen::Matrix cm1; - Eigen::Matrix cm3; - cm1 = (vVec.transpose() * c0Mat * vVec); - // cm2 = (c0Mat.cwiseProduct(t0)).sum(); - cm3 = (r0.transpose() * t0 * r0); - // Workaround to get things working in GPU - const double tempC = cm1(0, 0) + (c0Mat.cwiseProduct(t0)).sum() + cm3(0, 0); - cvcMat(3, 3) = tempC; - // (v.transpose() * c0Mat * v) + (c0Mat.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0); - } - printIt(&cvcMat, "circle_fit - Cvc:"); - - Eigen::Matrix j3Mat; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) - { - const double t = 1. / tempH; - j3Mat << -v2x2_inv, 0, vVec(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, vVec(1) * sqr(v2x2_inv) * 2., 0, - vVec(0) * v2x2_inv * t, vVec(1) * v2x2_inv * t, - -tempH * sqr(v2x2_inv) * 2. - (2. * tempC + vVec(2)) * v2x2_inv * t, -t; - } - printIt(&j3Mat, "circle_fit - J3:"); - - const RowVector2Nd Jq = mc.transpose() * tempS * 1. 
/ n; // var(q) - printIt(&Jq, "circle_fit - Jq:"); + } + printIt(&eMat, "circle_fit - E:"); + + Eigen::Matrix j2Mat; // Jacobian of min_eigen() (numerically computed) + for (uint a = 0; a < 6; ++a) { + const uint i = nu[a][0], j = nu[a][1]; + Matrix3d delta = Matrix3d::Zero(); + delta(i, j) = delta(j, i) = abs(aMat(i, j) * epsilon); + j2Mat.col(a) = min_eigen3D_fast(acc, aMat + delta); + const int sign = (j2Mat.col(a)(2) > 0) ? 1 : -1; + j2Mat.col(a) = (j2Mat.col(a) * sign - vVec) / delta(i, j); + } + printIt(&j2Mat, "circle_fit - J2:"); - Matrix3d cov_uvr = j3Mat * cvcMat * j3Mat.transpose() * sqr(s_inv) // cov(X0,Y0,R) - + (par_uvr * par_uvr.transpose()) * (Jq * vMat * Jq.transpose()); + Matrix4d cvcMat; // joint cov matrix of (v0,v1,v2,c) + { + Matrix3d t0 = j2Mat * eMat * j2Mat.transpose(); + Vector3d t1 = -t0 * r0; + cvcMat.block(0, 0, 3, 3) = t0; + cvcMat.block(0, 3, 3, 1) = t1; + cvcMat.block(3, 0, 1, 3) = t1.transpose(); + Eigen::Matrix cm1; + Eigen::Matrix cm3; + cm1 = (vVec.transpose() * c0Mat * vVec); + // cm2 = (c0Mat.cwiseProduct(t0)).sum(); + cm3 = (r0.transpose() * t0 * r0); + // Workaround to get things working in GPU + const double tempC = cm1(0, 0) + (c0Mat.cwiseProduct(t0)).sum() + cm3(0, 0); + cvcMat(3, 3) = tempC; + // (v.transpose() * c0Mat * v) + (c0Mat.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0); + } + printIt(&cvcMat, "circle_fit - Cvc:"); - circle.cov = cov_uvr; + Eigen::Matrix j3Mat; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) + { + const double t = 1. / tempH; + j3Mat << -v2x2_inv, 0, vVec(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, vVec(1) * sqr(v2x2_inv) * 2., 0, + vVec(0) * v2x2_inv * t, vVec(1) * v2x2_inv * t, + -tempH * sqr(v2x2_inv) * 2. - (2. * tempC + vVec(2)) * v2x2_inv * t, -t; } + printIt(&j3Mat, "circle_fit - J3:"); + + const RowVector2Nd Jq = mc.transpose() * tempS * 1. 
/ n; // var(q) + printIt(&Jq, "circle_fit - Jq:"); + + Matrix3d cov_uvr = j3Mat * cvcMat * j3Mat.transpose() * sqr(s_inv) // cov(X0,Y0,R) + + (par_uvr * par_uvr.transpose()) * (Jq * vMat * Jq.transpose()); - printIt(&circle.cov, "Circle cov:"); + circle.cov = cov_uvr; + } + + printIt(&circle.cov, "Circle cov:"); #ifdef RFIT_DEBUG - printf("circle_fit - exit\n"); + printf("circle_fit - exit\n"); #endif - return circle; - } + return circle; + } - /*! \brief Perform an ordinary least square fit in the s-z plane to compute + /*! \brief Perform an ordinary least square fit in the s-z plane to compute * the parameters cotTheta and Zip. * * The fit is performed in the rotated S3D-Z' plane, following the formalism of @@ -788,166 +792,165 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { * what is done in the same fit in the Broken Line approach. */ - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE LineFit lineFit(const TAcc& acc, - const M3xN& hits, - const M6xN& hits_ge, - const CircleFit& circle, - const V4& fast_fit, - const double bField, - const bool error) { - constexpr uint32_t N = M3xN::ColsAtCompileTime; - constexpr auto n = N; - double theta = -circle.qCharge * atan(fast_fit(3)); - theta = theta < 0. ? theta + M_PI : theta; - - // Prepare the Rotation Matrix to rotate the points - Eigen::Matrix rot; - rot << sin(theta), cos(theta), -cos(theta), sin(theta); - - // PROJECTION ON THE CILINDER - // - // p2D will be: - // [s1, s2, s3, ..., sn] - // [z1, z2, z3, ..., zn] - // s values will be ordinary x-values - // z values will be ordinary y-values - - Matrix2xNd p2D = Matrix2xNd::Zero(); - Eigen::Matrix jxMat; + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE LineFit lineFit(const TAcc& acc, + const M3xN& hits, + const M6xN& hits_ge, + const CircleFit& circle, + const V4& fast_fit, + const double bField, + const bool error) { + constexpr uint32_t N = M3xN::ColsAtCompileTime; + constexpr auto n = N; + double theta = -circle.qCharge * atan(fast_fit(3)); + theta = theta < 0. ? 
theta + M_PI : theta; + + // Prepare the Rotation Matrix to rotate the points + Eigen::Matrix rot; + rot << sin(theta), cos(theta), -cos(theta), sin(theta); + + // PROJECTION ON THE CILINDER + // + // p2D will be: + // [s1, s2, s3, ..., sn] + // [z1, z2, z3, ..., zn] + // s values will be ordinary x-values + // z values will be ordinary y-values + + Matrix2xNd p2D = Matrix2xNd::Zero(); + Eigen::Matrix jxMat; #ifdef RFIT_DEBUG - printf("Line_fit - B: %g\n", bField); - printIt(&hits, "Line_fit points: "); - printIt(&hits_ge, "Line_fit covs: "); - printIt(&rot, "Line_fit rot: "); + printf("Line_fit - B: %g\n", bField); + printIt(&hits, "Line_fit points: "); + printIt(&hits_ge, "Line_fit covs: "); + printIt(&rot, "Line_fit rot: "); #endif - // x & associated Jacobian - // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf - // Slide 11 - // a ==> -o i.e. the origin of the circle in XY plane, negative - // b ==> p i.e. distances of the points wrt the origin of the circle. - const Vector2d oVec(circle.par(0), circle.par(1)); - - // associated Jacobian, used in weights and errors computation - Matrix6d covMat = Matrix6d::Zero(); - Matrix2d cov_sz[N]; - for (uint i = 0; i < n; ++i) { - Vector2d pVec = hits.block(0, i, 2, 1) - oVec; - const double cross = cross2D(acc, -oVec, pVec); - const double dot = (-oVec).dot(pVec); - // atan2(cross, dot) give back the angle in the transverse plane so tha the - // final equation reads: x_i = -q*R*theta (theta = angle returned by atan2) - const double tempQAtan2 = -circle.qCharge * atan2(cross, dot); - // p2D.coeffRef(1, i) = atan2_ * circle.par(2); - p2D(0, i) = tempQAtan2 * circle.par(2); - - // associated Jacobian, used in weights and errors- computation - const double temp0 = -circle.qCharge * circle.par(2) * 1. 
/ (sqr(dot) + sqr(cross)); - double d_X0 = 0., d_Y0 = 0., d_R = 0.; // good approximation for big pt and eta - if (error) { - d_X0 = -temp0 * ((pVec(1) + oVec(1)) * dot - (pVec(0) - oVec(0)) * cross); - d_Y0 = temp0 * ((pVec(0) + oVec(0)) * dot - (oVec(1) - pVec(1)) * cross); - d_R = tempQAtan2; - } - const double d_x = temp0 * (oVec(1) * dot + oVec(0) * cross); - const double d_y = temp0 * (-oVec(0) * dot + oVec(1) * cross); - jxMat << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; - - covMat.block(0, 0, 3, 3) = circle.cov; - covMat(3, 3) = hits_ge.col(i)[0]; // x errors - covMat(4, 4) = hits_ge.col(i)[2]; // y errors - covMat(5, 5) = hits_ge.col(i)[5]; // z errors - covMat(3, 4) = covMat(4, 3) = hits_ge.col(i)[1]; // cov_xy - covMat(3, 5) = covMat(5, 3) = hits_ge.col(i)[3]; // cov_xz - covMat(4, 5) = covMat(5, 4) = hits_ge.col(i)[4]; // cov_yz - Matrix2d tmp = jxMat * covMat * jxMat.transpose(); - cov_sz[i].noalias() = rot * tmp * rot.transpose(); + // x & associated Jacobian + // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf + // Slide 11 + // a ==> -o i.e. the origin of the circle in XY plane, negative + // b ==> p i.e. distances of the points wrt the origin of the circle. 
+ const Vector2d oVec(circle.par(0), circle.par(1)); + + // associated Jacobian, used in weights and errors computation + Matrix6d covMat = Matrix6d::Zero(); + Matrix2d cov_sz[N]; + for (uint i = 0; i < n; ++i) { + Vector2d pVec = hits.block(0, i, 2, 1) - oVec; + const double cross = cross2D(acc, -oVec, pVec); + const double dot = (-oVec).dot(pVec); + // atan2(cross, dot) give back the angle in the transverse plane so tha the + // final equation reads: x_i = -q*R*theta (theta = angle returned by atan2) + const double tempQAtan2 = -circle.qCharge * atan2(cross, dot); + // p2D.coeffRef(1, i) = atan2_ * circle.par(2); + p2D(0, i) = tempQAtan2 * circle.par(2); + + // associated Jacobian, used in weights and errors- computation + const double temp0 = -circle.qCharge * circle.par(2) * 1. / (sqr(dot) + sqr(cross)); + double d_X0 = 0., d_Y0 = 0., d_R = 0.; // good approximation for big pt and eta + if (error) { + d_X0 = -temp0 * ((pVec(1) + oVec(1)) * dot - (pVec(0) - oVec(0)) * cross); + d_Y0 = temp0 * ((pVec(0) + oVec(0)) * dot - (oVec(1) - pVec(1)) * cross); + d_R = tempQAtan2; } - // Math of d_{X0,Y0,R,x,y} all verified by hand - p2D.row(1) = hits.row(2); + const double d_x = temp0 * (oVec(1) * dot + oVec(0) * cross); + const double d_y = temp0 * (-oVec(0) * dot + oVec(1) * cross); + jxMat << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; + + covMat.block(0, 0, 3, 3) = circle.cov; + covMat(3, 3) = hits_ge.col(i)[0]; // x errors + covMat(4, 4) = hits_ge.col(i)[2]; // y errors + covMat(5, 5) = hits_ge.col(i)[5]; // z errors + covMat(3, 4) = covMat(4, 3) = hits_ge.col(i)[1]; // cov_xy + covMat(3, 5) = covMat(5, 3) = hits_ge.col(i)[3]; // cov_xz + covMat(4, 5) = covMat(5, 4) = hits_ge.col(i)[4]; // cov_yz + Matrix2d tmp = jxMat * covMat * jxMat.transpose(); + cov_sz[i].noalias() = rot * tmp * rot.transpose(); + } + // Math of d_{X0,Y0,R,x,y} all verified by hand + p2D.row(1) = hits.row(2); - // The following matrix will contain errors orthogonal to the rotated S - 
// component only, with the Multiple Scattering properly treated!! - MatrixNd cov_with_ms; - scatterCovLine(acc, cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, bField, cov_with_ms); + // The following matrix will contain errors orthogonal to the rotated S + // component only, with the Multiple Scattering properly treated!! + MatrixNd cov_with_ms; + scatterCovLine(acc, cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, bField, cov_with_ms); #ifdef RFIT_DEBUG - printIt(cov_sz, "line_fit - cov_sz:"); - printIt(&cov_with_ms, "line_fit - cov_with_ms: "); + printIt(cov_sz, "line_fit - cov_sz:"); + printIt(&cov_with_ms, "line_fit - cov_with_ms: "); #endif - // Rotate Points with the shape [2, n] - Matrix2xNd p2D_rot = rot * p2D; + // Rotate Points with the shape [2, n] + Matrix2xNd p2D_rot = rot * p2D; #ifdef RFIT_DEBUG - printf("Fast fit Tan(theta): %g\n", fast_fit(3)); - printf("Rotation angle: %g\n", theta); - printIt(&rot, "Rotation Matrix:"); - printIt(&p2D, "Original Hits(s,z):"); - printIt(&p2D_rot, "Rotated hits(S3D, Z'):"); - printIt(&rot, "Rotation Matrix:"); + printf("Fast fit Tan(theta): %g\n", fast_fit(3)); + printf("Rotation angle: %g\n", theta); + printIt(&rot, "Rotation Matrix:"); + printIt(&p2D, "Original Hits(s,z):"); + printIt(&p2D_rot, "Rotated hits(S3D, Z'):"); + printIt(&rot, "Rotation Matrix:"); #endif - // Build the A Matrix - Matrix2xNd aMat; - aMat << MatrixXd::Ones(1, n), p2D_rot.row(0); // rotated s values + // Build the A Matrix + Matrix2xNd aMat; + aMat << MatrixXd::Ones(1, n), p2D_rot.row(0); // rotated s values #ifdef RFIT_DEBUG - printIt(&aMat, "A Matrix:"); + printIt(&aMat, "A Matrix:"); #endif - // Build A^T V-1 A, where V-1 is the covariance of only the Y components. 
- MatrixNd vyInvMat; - math::cholesky::invert(cov_with_ms, vyInvMat); - // MatrixNd vyInvMat = cov_with_ms.inverse(); - Eigen::Matrix covParamsMat = aMat * vyInvMat * aMat.transpose(); - // Compute the Covariance Matrix of the fit parameters - math::cholesky::invert(covParamsMat, covParamsMat); + // Build A^T V-1 A, where V-1 is the covariance of only the Y components. + MatrixNd vyInvMat; + math::cholesky::invert(cov_with_ms, vyInvMat); + // MatrixNd vyInvMat = cov_with_ms.inverse(); + Eigen::Matrix covParamsMat = aMat * vyInvMat * aMat.transpose(); + // Compute the Covariance Matrix of the fit parameters + math::cholesky::invert(covParamsMat, covParamsMat); - // Now Compute the Parameters in the form [2,1] - // The first component is q. - // The second component is m. - Eigen::Matrix sol = covParamsMat * aMat * vyInvMat * p2D_rot.row(1).transpose(); + // Now Compute the Parameters in the form [2,1] + // The first component is q. + // The second component is m. + Eigen::Matrix sol = covParamsMat * aMat * vyInvMat * p2D_rot.row(1).transpose(); #ifdef RFIT_DEBUG - printIt(&sol, "Rotated solutions:"); + printIt(&sol, "Rotated solutions:"); #endif - // We need now to transfer back the results in the original s-z plane - const auto sinTheta = sin(theta); - const auto cosTheta = cos(theta); - auto common_factor = 1. / (sinTheta - sol(1, 0) * cosTheta); - Eigen::Matrix jMat; - jMat << 0., common_factor * common_factor, common_factor, sol(0, 0) * cosTheta * common_factor * common_factor; + // We need now to transfer back the results in the original s-z plane + const auto sinTheta = sin(theta); + const auto cosTheta = cos(theta); + auto common_factor = 1. 
/ (sinTheta - sol(1, 0) * cosTheta); + Eigen::Matrix jMat; + jMat << 0., common_factor * common_factor, common_factor, sol(0, 0) * cosTheta * common_factor * common_factor; - double tempM = common_factor * (sol(1, 0) * sinTheta + cosTheta); - double tempQ = common_factor * sol(0, 0); - auto cov_mq = jMat * covParamsMat * jMat.transpose(); + double tempM = common_factor * (sol(1, 0) * sinTheta + cosTheta); + double tempQ = common_factor * sol(0, 0); + auto cov_mq = jMat * covParamsMat * jMat.transpose(); - VectorNd res = p2D_rot.row(1).transpose() - aMat.transpose() * sol; - double chi2 = res.transpose() * vyInvMat * res; + VectorNd res = p2D_rot.row(1).transpose() - aMat.transpose() * sol; + double chi2 = res.transpose() * vyInvMat * res; - LineFit line; - line.par << tempM, tempQ; - line.cov << cov_mq; - line.chi2 = chi2; + LineFit line; + line.par << tempM, tempQ; + line.cov << cov_mq; + line.chi2 = chi2; #ifdef RFIT_DEBUG - printf("Common_factor: %g\n", common_factor); - printIt(&jMat, "Jacobian:"); - printIt(&sol, "Rotated solutions:"); - printIt(&covParamsMat, "Cov_params:"); - printIt(&cov_mq, "Rotated Covariance Matrix:"); - printIt(&(line.par), "Real Parameters:"); - printIt(&(line.cov), "Real Covariance Matrix:"); - printf("Chi2: %g\n", chi2); + printf("Common_factor: %g\n", common_factor); + printIt(&jMat, "Jacobian:"); + printIt(&sol, "Rotated solutions:"); + printIt(&covParamsMat, "Cov_params:"); + printIt(&cov_mq, "Rotated Covariance Matrix:"); + printIt(&(line.par), "Real Parameters:"); + printIt(&(line.cov), "Real Covariance Matrix:"); + printf("Chi2: %g\n", chi2); #endif - return line; - } + return line; + } - } // namespace riemannFit -} // namespace ALPAKA_ACCELERATOR_NAMESPACE +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::riemannFit namespace riemannFit { /*! 
@@ -1019,5 +1022,7 @@ namespace riemannFit { helix->chi2_line = line.chi2; } }; + } // namespace riemannFit -#endif // RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h + +#endif // RecoTracker_PixelTrackFitting_interface_alpaka_RiemannFit_h diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h index 29cd537ac4aa7..447a3d6c89c07 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h @@ -1,248 +1,253 @@ -#ifndef RecoPixelVertexing_PixelVertexFinding_alpaka_clusterTracksByDensity_h -#define RecoPixelVertexing_PixelVertexFinding_alpaka_clusterTracksByDensity_h +#ifndef RecoTracker_PixelVertexFinding_plugins_alpaka_clusterTracksByDensity_h +#define RecoTracker_PixelVertexFinding_plugins_alpaka_clusterTracksByDensity_h #include #include #include + #include + #include "DataFormats/VertexSoA/interface/ZVertexSoA.h" -#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" + #include "vertexFinder.h" -namespace ALPAKA_ACCELERATOR_NAMESPACE { - namespace vertexFinder { - using VtxSoAView = ::reco::ZVertexSoAView; - using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; - // this algo does not really scale as it works in a single block... 
- // enough for <10K tracks we have - // - // based on Rodrighez&Laio algo - // - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void __attribute__((always_inline)) - clusterTracksByDensity(const TAcc& acc, - VtxSoAView& pdata, - WsSoAView& pws, - int minT, // min number of neighbours to be "seed" - float eps, // max absolute distance to cluster - float errmax, // max error to be "seed" - float chi2max // max normalized distance to cluster - ) { - using namespace vertexFinder; - constexpr bool verbose = false; // in principle the compiler should optmize out if false - const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); - - if constexpr (verbose) { - if (cms::alpakatools::once_per_block(acc)) - printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); - } - auto er2mx = errmax * errmax; +namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { + + using VtxSoAView = ::reco::ZVertexSoAView; + using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; + // this algo does not really scale as it works in a single block... 
+ // enough for <10K tracks we have + // + // based on Rodrighez&Laio algo + // + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void __attribute__((always_inline)) + clusterTracksByDensity(const TAcc& acc, + VtxSoAView& pdata, + WsSoAView& pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) { + using namespace vertexFinder; + constexpr bool verbose = false; // in principle the compiler should optmize out if false + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); + } + auto er2mx = errmax * errmax; - auto& __restrict__ data = pdata; - auto& __restrict__ ws = pws; - auto nt = ws.ntrks(); - float const* __restrict__ zt = ws.zt(); - float const* __restrict__ ezt2 = ws.ezt2(); + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); - uint32_t& nvFinal = data.nvFinal(); - uint32_t& nvIntermediate = ws.nvIntermediate(); + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt(); - int32_t* __restrict__ nn = data.ndof(); - int32_t* __restrict__ iv = ws.iv(); + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - ALPAKA_ASSERT_OFFLOAD(zt); - ALPAKA_ASSERT_OFFLOAD(ezt2); - ALPAKA_ASSERT_OFFLOAD(izt); - ALPAKA_ASSERT_OFFLOAD(nn); - ALPAKA_ASSERT_OFFLOAD(iv); + ALPAKA_ASSERT_OFFLOAD(zt); + ALPAKA_ASSERT_OFFLOAD(ezt2); + ALPAKA_ASSERT_OFFLOAD(izt); + ALPAKA_ASSERT_OFFLOAD(nn); + ALPAKA_ASSERT_OFFLOAD(iv); - using Hist = cms::alpakatools::HistoContainer; - auto& hist = alpaka::declareSharedVar(acc); - auto& hws = 
alpaka::declareSharedVar(acc); + using Hist = cms::alpakatools::HistoContainer; + auto& hist = alpaka::declareSharedVar(acc); + auto& hws = alpaka::declareSharedVar(acc); - for (auto j : cms::alpakatools::elements_with_stride(acc, Hist::totbins())) { - hist.off[j] = 0; - } - alpaka::syncBlockThreads(acc); + for (auto j : cms::alpakatools::elements_with_stride(acc, Hist::totbins())) { + hist.off[j] = 0; + } + alpaka::syncBlockThreads(acc); - if constexpr (verbose) { - if (cms::alpakatools::once_per_block(acc)) - printf("booked hist with %d bins, size %d for %d tracks\n", hist.totbins(), hist.capacity(), nt); - } - ALPAKA_ASSERT_OFFLOAD(static_cast(nt) <= hist.capacity()); - - // fill hist (bin shall be wider than "eps") - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - ALPAKA_ASSERT_OFFLOAD(i < ::zVertex::MAXTRACKS); - int iz = int(zt[i] * 10.); // valid if eps<=0.1 - // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only - iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); - izt[i] = iz - INT8_MIN; - ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN >= 0); - ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN < 256); - hist.count(acc, izt[i]); - iv[i] = i; - nn[i] = 0; - } - alpaka::syncBlockThreads(acc); - if (threadIdxLocal < 32) - hws[threadIdxLocal] = 0; // used by prefix scan... 
- alpaka::syncBlockThreads(acc); - hist.finalize(acc, hws); - alpaka::syncBlockThreads(acc); - ALPAKA_ASSERT_OFFLOAD(hist.size() == nt); - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - hist.fill(acc, izt[i], uint16_t(i)); - } - alpaka::syncBlockThreads(acc); - // count neighbours - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - if (ezt2[i] > er2mx) - continue; - auto loop = [&](uint32_t j) { - if (i == j) - return; - auto dist = std::abs(zt[i] - zt[j]); - if (dist > eps) - return; - if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) - return; - nn[i]++; - }; - - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); - } - alpaka::syncBlockThreads(acc); - - // find closest above me .... (we ignore the possibility of two j at same distance from i) - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - float mdist = eps; - auto loop = [&](uint32_t j) { - if (nn[j] < nn[i]) - return; - if (nn[j] == nn[i] && zt[j] >= zt[i]) - return; // if equal use natural order... - auto dist = std::abs(zt[i] - zt[j]); - if (dist > mdist) - return; - if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) - return; // (break natural order???) - mdist = dist; - iv[i] = j; // assign to cluster (better be unique??) 
- }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); - } - alpaka::syncBlockThreads(acc); + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("booked hist with %d bins, size %d for %d tracks\n", hist.totbins(), hist.capacity(), nt); + } + ALPAKA_ASSERT_OFFLOAD(static_cast(nt) <= hist.capacity()); + + // fill hist (bin shall be wider than "eps") + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + ALPAKA_ASSERT_OFFLOAD(i < ::zVertex::MAXTRACKS); + int iz = int(zt[i] * 10.); // valid if eps<=0.1 + // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only + iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); + izt[i] = iz - INT8_MIN; + ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN >= 0); + ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN < 256); + hist.count(acc, izt[i]); + iv[i] = i; + nn[i] = 0; + } + alpaka::syncBlockThreads(acc); + if (threadIdxLocal < 32) + hws[threadIdxLocal] = 0; // used by prefix scan... + alpaka::syncBlockThreads(acc); + hist.finalize(acc, hws); + alpaka::syncBlockThreads(acc); + ALPAKA_ASSERT_OFFLOAD(hist.size() == nt); + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + hist.fill(acc, izt[i], uint16_t(i)); + } + alpaka::syncBlockThreads(acc); + // count neighbours + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (ezt2[i] > er2mx) + continue; + auto loop = [&](uint32_t j) { + if (i == j) + return; + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + nn[i]++; + }; + + cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + } + alpaka::syncBlockThreads(acc); + + // find closest above me .... (we ignore the possibility of two j at same distance from i) + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + float mdist = eps; + auto loop = [&](uint32_t j) { + if (nn[j] < nn[i]) + return; + if (nn[j] == nn[i] && zt[j] >= zt[i]) + return; // if equal use natural order... 
+ auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; // (break natural order???) + mdist = dist; + iv[i] = j; // assign to cluster (better be unique??) + }; + cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + } + alpaka::syncBlockThreads(acc); #ifdef GPU_DEBUG - // mini verification - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - if (iv[i] != int(i)) - ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); - } - alpaka::syncBlockThreads(acc); + // mini verification + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] != int(i)) + ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); + } + alpaka::syncBlockThreads(acc); #endif - // consolidate graph (percolate index of seed) - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - auto m = iv[i]; - while (m != iv[m]) - m = iv[m]; - iv[i] = m; - } + // consolidate graph (percolate index of seed) + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + auto m = iv[i]; + while (m != iv[m]) + m = iv[m]; + iv[i] = m; + } #ifdef GPU_DEBUG - alpaka::syncBlockThreads(acc); - // mini verification - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - if (iv[i] != int(i)) - ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); - } + alpaka::syncBlockThreads(acc); + // mini verification + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] != int(i)) + ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); + } #endif #ifdef GPU_DEBUG - // and verify that we did not spit any cluster... - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - auto minJ = i; - auto mdist = eps; - auto loop = [&](uint32_t j) { - if (nn[j] < nn[i]) - return; - if (nn[j] == nn[i] && zt[j] >= zt[i]) - return; // if equal use natural order... 
- auto dist = std::abs(zt[i] - zt[j]); - if (dist > mdist) - return; - if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) - return; - mdist = dist; - minJ = j; - }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); - // should belong to the same cluster... - ALPAKA_ASSERT_OFFLOAD(iv[i] == iv[minJ]); - ALPAKA_ASSERT_OFFLOAD(nn[i] <= nn[iv[i]]); - } - alpaka::syncBlockThreads(acc); + // and verify that we did not spit any cluster... + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + auto minJ = i; + auto mdist = eps; + auto loop = [&](uint32_t j) { + if (nn[j] < nn[i]) + return; + if (nn[j] == nn[i] && zt[j] >= zt[i]) + return; // if equal use natural order... + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + mdist = dist; + minJ = j; + }; + cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + // should belong to the same cluster... + ALPAKA_ASSERT_OFFLOAD(iv[i] == iv[minJ]); + ALPAKA_ASSERT_OFFLOAD(nn[i] <= nn[iv[i]]); + } + alpaka::syncBlockThreads(acc); #endif - auto& foundClusters = alpaka::declareSharedVar(acc); - foundClusters = 0; - alpaka::syncBlockThreads(acc); - - // find the number of different clusters, identified by a tracks with clus[i] == i and density larger than threshold; - // mark these tracks with a negative id. - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - if (iv[i] == int(i)) { - if (nn[i] >= minT) { - auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{}); - iv[i] = -(old + 1); - } else { // noise - iv[i] = -9998; - } + auto& foundClusters = alpaka::declareSharedVar(acc); + foundClusters = 0; + alpaka::syncBlockThreads(acc); + + // find the number of different clusters, identified by a tracks with clus[i] == i and density larger than threshold; + // mark these tracks with a negative id. 
+ for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] == int(i)) { + if (nn[i] >= minT) { + auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{}); + iv[i] = -(old + 1); + } else { // noise + iv[i] = -9998; } } - alpaka::syncBlockThreads(acc); + } + alpaka::syncBlockThreads(acc); - ALPAKA_ASSERT_OFFLOAD(foundClusters < ::zVertex::MAXVTX); + ALPAKA_ASSERT_OFFLOAD(foundClusters < ::zVertex::MAXVTX); - // propagate the negative id to all the tracks in the cluster. - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - if (iv[i] >= 0) { - // mark each track in a cluster with the same id as the first one - iv[i] = iv[iv[i]]; - } + // propagate the negative id to all the tracks in the cluster. + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] >= 0) { + // mark each track in a cluster with the same id as the first one + iv[i] = iv[iv[i]]; } - alpaka::syncBlockThreads(acc); + } + alpaka::syncBlockThreads(acc); - // adjust the cluster id to be a positive value starting from 0 - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - iv[i] = -iv[i] - 1; - } + // adjust the cluster id to be a positive value starting from 0 + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + iv[i] = -iv[i] - 1; + } - nvIntermediate = nvFinal = foundClusters; - if constexpr (verbose) { - if (cms::alpakatools::once_per_block(acc)) - printf("found %d proto vertices\n", foundClusters); - } + nvIntermediate = nvFinal = foundClusters; + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("found %d proto vertices\n", foundClusters); } - class ClusterTracksByDensityKernel { - public: - template - ALPAKA_FN_ACC void operator()(const TAcc& acc, - VtxSoAView pdata, - WsSoAView pws, - int minT, // min number of neighbours to be "seed" - float eps, // max absolute distance to cluster - float errmax, // max error to be "seed" - float chi2max // max 
normalized distance to cluster - ) const { - clusterTracksByDensity(acc, pdata, pws, minT, eps, errmax, chi2max); - } - }; - } // namespace vertexFinder -} // namespace ALPAKA_ACCELERATOR_NAMESPACE -#endif // RecoPixelVertexing_PixelVertexFinding_alpaka_clusterTracksByDensity_h + } + class ClusterTracksByDensityKernel { + public: + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + VtxSoAView pdata, + WsSoAView pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) const { + clusterTracksByDensity(acc, pdata, pws, minT, eps, errmax, chi2max); + } + }; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder + +#endif // RecoTracker_PixelVertexFinding_plugins_alpaka_clusterTracksByDensity_h diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h index 46ae2ad80ecc9..769896aa97252 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h @@ -1,255 +1,261 @@ -#ifndef RecoPixelVertexing_PixelVertexFinding_gpuClusterTracksDBSCAN_h -#define RecoPixelVertexing_PixelVertexFinding_gpuClusterTracksDBSCAN_h +#ifndef RecoTracker_PixelVertexFinding_plugins_alpaka_clusterTracksDBSCAN_h +#define RecoTracker_PixelVertexFinding_plugins_alpaka_clusterTracksDBSCAN_h #include #include #include + #include + #include "DataFormats/VertexSoA/interface/ZVertexSoA.h" -#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" + #include 
"vertexFinder.h" -namespace ALPAKA_ACCELERATOR_NAMESPACE { - namespace vertexFinder { - using VtxSoAView = ::reco::ZVertexSoAView; - using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; - // this algo does not really scale as it works in a single block... - // enough for <10K tracks we have - class ClusterTracksDBSCAN { - public: - template - ALPAKA_FN_ACC void operator()(const TAcc& acc, - VtxSoAView pdata, - WsSoAView pws, - int minT, // min number of neighbours to be "core" - float eps, // max absolute distance to cluster - float errmax, // max error to be "seed" - float chi2max // max normalized distance to cluster - ) const { - constexpr bool verbose = false; // in principle the compiler should optmize out if false - const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); - if constexpr (verbose) { - if (cms::alpakatools::once_per_block(acc)) - printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); - } - auto er2mx = errmax * errmax; - auto& __restrict__ data = pdata; - auto& __restrict__ ws = pws; - auto nt = ws.ntrks(); - float const* __restrict__ zt = ws.zt(); - float const* __restrict__ ezt2 = ws.ezt2(); +namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { - uint32_t& nvFinal = data.nvFinal(); - uint32_t& nvIntermediate = ws.nvIntermediate(); + using VtxSoAView = ::reco::ZVertexSoAView; + using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; + // this algo does not really scale as it works in a single block... 
+ // enough for <10K tracks we have + class ClusterTracksDBSCAN { + public: + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + VtxSoAView pdata, + WsSoAView pws, + int minT, // min number of neighbours to be "core" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) const { + constexpr bool verbose = false; // in principle the compiler should optmize out if false + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); + } + auto er2mx = errmax * errmax; - uint8_t* __restrict__ izt = ws.izt(); - int32_t* __restrict__ nn = data.ndof(); - int32_t* __restrict__ iv = ws.iv(); + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); - ALPAKA_ASSERT_OFFLOAD(zt); - ALPAKA_ASSERT_OFFLOAD(iv); - ALPAKA_ASSERT_OFFLOAD(nn); - ALPAKA_ASSERT_OFFLOAD(ezt2); + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - using Hist = cms::alpakatools::HistoContainer; - auto& hist = alpaka::declareSharedVar(acc); - auto& hws = alpaka::declareSharedVar(acc); + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - for (auto j : cms::alpakatools::elements_with_stride(acc, Hist::totbins())) { - hist.off[j] = 0; - } - alpaka::syncBlockThreads(acc); + ALPAKA_ASSERT_OFFLOAD(zt); + ALPAKA_ASSERT_OFFLOAD(iv); + ALPAKA_ASSERT_OFFLOAD(nn); + ALPAKA_ASSERT_OFFLOAD(ezt2); - if constexpr (verbose) { - if (cms::alpakatools::once_per_block(acc)) - printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); - } + using Hist = cms::alpakatools::HistoContainer; + auto& hist = alpaka::declareSharedVar(acc); + auto& 
hws = alpaka::declareSharedVar(acc); - ALPAKA_ASSERT_OFFLOAD(static_cast(nt) <= hist.capacity()); - - // fill hist (bin shall be wider than "eps") - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - ALPAKA_ASSERT_OFFLOAD(i < ::zVertex::MAXTRACKS); - int iz = int(zt[i] * 10.); // valid if eps<=0.1 - iz = std::clamp(iz, INT8_MIN, INT8_MAX); - izt[i] = iz - INT8_MIN; - ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN >= 0); - ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN < 256); - hist.count(acc, izt[i]); - iv[i] = i; - nn[i] = 0; - } - alpaka::syncBlockThreads(acc); - if (threadIdxLocal < 32) - hws[threadIdxLocal] = 0; // used by prefix scan... - alpaka::syncBlockThreads(acc); - hist.finalize(acc, hws); - alpaka::syncBlockThreads(acc); - ALPAKA_ASSERT_OFFLOAD(hist.size() == nt); - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - hist.fill(acc, izt[i], uint32_t(i)); - } - alpaka::syncBlockThreads(acc); - - // count neighbours - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - if (ezt2[i] > er2mx) - continue; - auto loop = [&](uint32_t j) { - if (i == j) - return; - auto dist = std::abs(zt[i] - zt[j]); - if (dist > eps) - return; - // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; - nn[i]++; - }; - - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); - } + for (auto j : cms::alpakatools::elements_with_stride(acc, Hist::totbins())) { + hist.off[j] = 0; + } + alpaka::syncBlockThreads(acc); - alpaka::syncBlockThreads(acc); - - // find NN with smaller z... - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - if (nn[i] < minT) - continue; // DBSCAN core rule - float mz = zt[i]; - auto loop = [&](uint32_t j) { - if (zt[j] >= mz) - return; - if (nn[j] < minT) - return; // DBSCAN core rule - auto dist = std::abs(zt[i] - zt[j]); - if (dist > eps) - return; - // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; - mz = zt[j]; - iv[i] = j; // assign to cluster (better be unique??) 
- }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); - } + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); + } - alpaka::syncBlockThreads(acc); + ALPAKA_ASSERT_OFFLOAD(static_cast(nt) <= hist.capacity()); + + // fill hist (bin shall be wider than "eps") + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + ALPAKA_ASSERT_OFFLOAD(i < ::zVertex::MAXTRACKS); + int iz = int(zt[i] * 10.); // valid if eps<=0.1 + iz = std::clamp(iz, INT8_MIN, INT8_MAX); + izt[i] = iz - INT8_MIN; + ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN >= 0); + ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN < 256); + hist.count(acc, izt[i]); + iv[i] = i; + nn[i] = 0; + } + alpaka::syncBlockThreads(acc); + if (threadIdxLocal < 32) + hws[threadIdxLocal] = 0; // used by prefix scan... + alpaka::syncBlockThreads(acc); + hist.finalize(acc, hws); + alpaka::syncBlockThreads(acc); + ALPAKA_ASSERT_OFFLOAD(hist.size() == nt); + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + hist.fill(acc, izt[i], uint32_t(i)); + } + alpaka::syncBlockThreads(acc); + + // count neighbours + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (ezt2[i] > er2mx) + continue; + auto loop = [&](uint32_t j) { + if (i == j) + return; + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; + nn[i]++; + }; + + cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + } + + alpaka::syncBlockThreads(acc); + + // find NN with smaller z... 
+ for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (nn[i] < minT) + continue; // DBSCAN core rule + float mz = zt[i]; + auto loop = [&](uint32_t j) { + if (zt[j] >= mz) + return; + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; + mz = zt[j]; + iv[i] = j; // assign to cluster (better be unique??) + }; + cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + } + + alpaka::syncBlockThreads(acc); #ifdef GPU_DEBUG - // mini verification - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - if (iv[i] != int(i)) - ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); - } - alpaka::syncBlockThreads(acc); + // mini verification + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] != int(i)) + ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); + } + alpaka::syncBlockThreads(acc); #endif - // consolidate graph (percolate index of seed) - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - auto m = iv[i]; - while (m != iv[m]) - m = iv[m]; - iv[i] = m; - } + // consolidate graph (percolate index of seed) + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + auto m = iv[i]; + while (m != iv[m]) + m = iv[m]; + iv[i] = m; + } - alpaka::syncBlockThreads(acc); + alpaka::syncBlockThreads(acc); #ifdef GPU_DEBUG - // mini verification - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - if (iv[i] != int(i)) - ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); - } - alpaka::syncBlockThreads(acc); + // mini verification + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] != int(i)) + ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); + } + alpaka::syncBlockThreads(acc); #endif #ifdef GPU_DEBUG - // and verify that we did not spit any cluster... 
- for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - if (nn[i] < minT) - continue; // DBSCAN core rule - ALPAKA_ASSERT_OFFLOAD(zt[iv[i]] <= zt[i]); - auto loop = [&](uint32_t j) { - if (nn[j] < minT) - return; // DBSCAN core rule - auto dist = std::abs(zt[i] - zt[j]); - if (dist > eps) - return; - // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; - // they should belong to the same cluster, isn't it? - if (iv[i] != iv[j]) { - printf("ERROR %d %d %f %f %d\n", i, iv[i], zt[i], zt[iv[i]], iv[iv[i]]); - printf(" %d %d %f %f %d\n", j, iv[j], zt[j], zt[iv[j]], iv[iv[j]]); - ; - } - ALPAKA_ASSERT_OFFLOAD(iv[i] == iv[j]); - }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); - } - alpaka::syncBlockThreads(acc); + // and verify that we did not spit any cluster... + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (nn[i] < minT) + continue; // DBSCAN core rule + ALPAKA_ASSERT_OFFLOAD(zt[iv[i]] <= zt[i]); + auto loop = [&](uint32_t j) { + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; + // they should belong to the same cluster, isn't it? + if (iv[i] != iv[j]) { + printf("ERROR %d %d %f %f %d\n", i, iv[i], zt[i], zt[iv[i]], iv[iv[i]]); + printf(" %d %d %f %f %d\n", j, iv[j], zt[j], zt[iv[j]], iv[iv[j]]); + ; + } + ALPAKA_ASSERT_OFFLOAD(iv[i] == iv[j]); + }; + cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + } + alpaka::syncBlockThreads(acc); #endif - // collect edges (assign to closest cluster of closest point??? 
here to closest point) - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule - if (nn[i] >= minT) - continue; // DBSCAN edge rule - float mdist = eps; - auto loop = [&](uint32_t j) { - if (nn[j] < minT) - return; // DBSCAN core rule - auto dist = std::abs(zt[i] - zt[j]); - if (dist > mdist) - return; - if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) - return; // needed? - mdist = dist; - iv[i] = iv[j]; // assign to cluster (better be unique??) - }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); - } + // collect edges (assign to closest cluster of closest point??? here to closest point) + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule + if (nn[i] >= minT) + continue; // DBSCAN edge rule + float mdist = eps; + auto loop = [&](uint32_t j) { + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; // needed? + mdist = dist; + iv[i] = iv[j]; // assign to cluster (better be unique??) + }; + cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + } + + auto& foundClusters = alpaka::declareSharedVar(acc); + foundClusters = 0; + alpaka::syncBlockThreads(acc); - auto& foundClusters = alpaka::declareSharedVar(acc); - foundClusters = 0; - alpaka::syncBlockThreads(acc); - - // find the number of different clusters, identified by a tracks with clus[i] == i; - // mark these tracks with a negative id. 
- for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - if (iv[i] == int(i)) { - if (nn[i] >= minT) { - auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{}); - iv[i] = -(old + 1); - } else { // noise - iv[i] = -9998; - } + // find the number of different clusters, identified by a tracks with clus[i] == i; + // mark these tracks with a negative id. + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] == int(i)) { + if (nn[i] >= minT) { + auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{}); + iv[i] = -(old + 1); + } else { // noise + iv[i] = -9998; } } - alpaka::syncBlockThreads(acc); + } + alpaka::syncBlockThreads(acc); - ALPAKA_ASSERT_OFFLOAD(foundClusters < ::zVertex::MAXVTX); + ALPAKA_ASSERT_OFFLOAD(foundClusters < ::zVertex::MAXVTX); - // propagate the negative id to all the tracks in the cluster. - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - if (iv[i] >= 0) { - // mark each track in a cluster with the same id as the first one - iv[i] = iv[iv[i]]; - } + // propagate the negative id to all the tracks in the cluster. 
+ for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] >= 0) { + // mark each track in a cluster with the same id as the first one + iv[i] = iv[iv[i]]; } - alpaka::syncBlockThreads(acc); + } + alpaka::syncBlockThreads(acc); - // adjust the cluster id to be a positive value starting from 0 - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - iv[i] = -iv[i] - 1; - } + // adjust the cluster id to be a positive value starting from 0 + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + iv[i] = -iv[i] - 1; + } - nvIntermediate = nvFinal = foundClusters; + nvIntermediate = nvFinal = foundClusters; - if constexpr (verbose) { - if (cms::alpakatools::once_per_block(acc)) - printf("found %d proto vertices\n", foundClusters); - } + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("found %d proto vertices\n", foundClusters); } - }; - } // namespace vertexFinder -} // namespace ALPAKA_ACCELERATOR_NAMESPACE -#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h + } + }; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder + +#endif // RecoTracker_PixelVertexFinding_plugins_alpaka_clusterTracksDBSCAN_h diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h index 3fe0202121f80..6468fb9e185c4 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h @@ -4,12 +4,15 @@ #include #include #include + #include #include "DataFormats/VertexSoA/interface/ZVertexDefinitions.h" -#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include 
"RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" + #include "vertexFinder.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h index 9ff4656b9718e..5ee24f610c1aa 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h @@ -1,123 +1,126 @@ -#ifndef RecoPixelVertexing_PixelVertexFinding_gpuFitVertices_h -#define RecoPixelVertexing_PixelVertexFinding_gpuFitVertices_h +#ifndef RecoTracker_PixelVertexFinding_plugins_alpaka_fitVertices_h +#define RecoTracker_PixelVertexFinding_plugins_alpaka_fitVertices_h #include #include #include + #include -#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + #include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "vertexFinder.h" -namespace ALPAKA_ACCELERATOR_NAMESPACE { - namespace vertexFinder { - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void fitVertices( - const TAcc& acc, - VtxSoAView& pdata, - WsSoAView& pws, - float chi2Max // for outlier rejection - ) { - constexpr bool verbose = false; // in principle the compiler should optmize out if false - - auto& __restrict__ data = pdata; - auto& __restrict__ ws = pws; - auto nt = ws.ntrks(); - float const* __restrict__ zt = ws.zt(); - float const* __restrict__ ezt2 = ws.ezt2(); - float* __restrict__ zv = data.zv(); - float* __restrict__ wv = data.wv(); - float* __restrict__ chi2 = data.chi2(); - uint32_t& nvFinal = data.nvFinal(); - uint32_t& nvIntermediate = ws.nvIntermediate(); - - int32_t* __restrict__ nn = data.ndof(); - int32_t* __restrict__ iv = ws.iv(); - - ALPAKA_ASSERT_OFFLOAD(nvFinal <= nvIntermediate); - nvFinal = nvIntermediate; - auto 
foundClusters = nvFinal; - - // zero - for (auto i : cms::alpakatools::elements_with_stride(acc, foundClusters)) { - zv[i] = 0; - wv[i] = 0; - chi2[i] = 0; - } +namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { - // only for test - auto& noise = alpaka::declareSharedVar(acc); + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void fitVertices(const TAcc& acc, + VtxSoAView& pdata, + WsSoAView& pws, + float chi2Max // for outlier rejection + ) { + constexpr bool verbose = false; // in principle the compiler should optmize out if false - if constexpr (verbose) { - if (cms::alpakatools::once_per_block(acc)) - noise = 0; - } - alpaka::syncBlockThreads(acc); - - // compute cluster location - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - if (iv[i] > 9990) { - if constexpr (verbose) - alpaka::atomicAdd(acc, &noise, 1, alpaka::hierarchy::Threads{}); - continue; - } - ALPAKA_ASSERT_OFFLOAD(iv[i] >= 0); - ALPAKA_ASSERT_OFFLOAD(iv[i] < int(foundClusters)); - auto w = 1.f / ezt2[i]; - alpaka::atomicAdd(acc, &zv[iv[i]], zt[i] * w, alpaka::hierarchy::Threads{}); - alpaka::atomicAdd(acc, &wv[iv[i]], w, alpaka::hierarchy::Threads{}); - } + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + float* __restrict__ zv = data.zv(); + float* __restrict__ wv = data.wv(); + float* __restrict__ chi2 = data.chi2(); + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); + + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - alpaka::syncBlockThreads(acc); - // reuse nn - for (auto i : cms::alpakatools::elements_with_stride(acc, foundClusters)) { - ALPAKA_ASSERT_OFFLOAD(wv[i] > 0.f); - zv[i] /= wv[i]; - nn[i] = -1; // ndof + ALPAKA_ASSERT_OFFLOAD(nvFinal <= nvIntermediate); + nvFinal = nvIntermediate; + auto foundClusters = nvFinal; + + // zero + for (auto i : 
cms::alpakatools::elements_with_stride(acc, foundClusters)) { + zv[i] = 0; + wv[i] = 0; + chi2[i] = 0; + } + + // only for test + auto& noise = alpaka::declareSharedVar(acc); + + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + noise = 0; + } + alpaka::syncBlockThreads(acc); + + // compute cluster location + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] > 9990) { + if constexpr (verbose) + alpaka::atomicAdd(acc, &noise, 1, alpaka::hierarchy::Threads{}); + continue; } - alpaka::syncBlockThreads(acc); - - // compute chi2 - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - if (iv[i] > 9990) - continue; - - auto c2 = zv[iv[i]] - zt[i]; - c2 *= c2 / ezt2[i]; - if (c2 > chi2Max) { - iv[i] = 9999; - continue; - } - alpaka::atomicAdd(acc, &chi2[iv[i]], c2, alpaka::hierarchy::Blocks{}); - alpaka::atomicAdd(acc, &nn[iv[i]], 1, alpaka::hierarchy::Blocks{}); + ALPAKA_ASSERT_OFFLOAD(iv[i] >= 0); + ALPAKA_ASSERT_OFFLOAD(iv[i] < int(foundClusters)); + auto w = 1.f / ezt2[i]; + alpaka::atomicAdd(acc, &zv[iv[i]], zt[i] * w, alpaka::hierarchy::Threads{}); + alpaka::atomicAdd(acc, &wv[iv[i]], w, alpaka::hierarchy::Threads{}); + } + + alpaka::syncBlockThreads(acc); + // reuse nn + for (auto i : cms::alpakatools::elements_with_stride(acc, foundClusters)) { + ALPAKA_ASSERT_OFFLOAD(wv[i] > 0.f); + zv[i] /= wv[i]; + nn[i] = -1; // ndof + } + alpaka::syncBlockThreads(acc); + + // compute chi2 + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] > 9990) + continue; + + auto c2 = zv[iv[i]] - zt[i]; + c2 *= c2 / ezt2[i]; + if (c2 > chi2Max) { + iv[i] = 9999; + continue; } - alpaka::syncBlockThreads(acc); + alpaka::atomicAdd(acc, &chi2[iv[i]], c2, alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd(acc, &nn[iv[i]], 1, alpaka::hierarchy::Blocks{}); + } + alpaka::syncBlockThreads(acc); - for (auto i : cms::alpakatools::elements_with_stride(acc, foundClusters)) { - if (nn[i] > 0) { - wv[i] *= 
float(nn[i]) / chi2[i]; - } + for (auto i : cms::alpakatools::elements_with_stride(acc, foundClusters)) { + if (nn[i] > 0) { + wv[i] *= float(nn[i]) / chi2[i]; } - if constexpr (verbose) { - if (cms::alpakatools::once_per_block(acc)) { - printf("found %d proto clusters ", foundClusters); - printf("and %d noise\n", noise); - } + } + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) { + printf("found %d proto clusters ", foundClusters); + printf("and %d noise\n", noise); } } + } - class FitVerticesKernel { - public: - template - ALPAKA_FN_ACC void operator()(const TAcc& acc, - VtxSoAView pdata, - WsSoAView pws, - float chi2Max // for outlier rejection - ) const { - fitVertices(acc, pdata, pws, chi2Max); - } - }; - } // namespace vertexFinder -} // namespace ALPAKA_ACCELERATOR_NAMESPACE -#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h + class FitVerticesKernel { + public: + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + VtxSoAView pdata, + WsSoAView pws, + float chi2Max // for outlier rejection + ) const { + fitVertices(acc, pdata, pws, chi2Max); + } + }; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder + +#endif // RecoTracker_PixelVertexFinding_plugins_alpaka_fitVertices_h diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h index 2c6f0cb0597e4..5d5765ed3d4b8 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h @@ -1,80 +1,84 @@ -#ifndef RecoPixelVertexing_PixelVertexFinding_sortByPt2_h -#define RecoPixelVertexing_PixelVertexFinding_sortByPt2_h +#ifndef RecoTracker_PixelVertexFinding_plugins_alpaka_sortByPt2_h +#define RecoTracker_PixelVertexFinding_plugins_alpaka_sortByPt2_h #include #include #include #include + #include -#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +#include 
"DataFormats/VertexSoA/interface/ZVertexSoA.h" #include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "HeterogeneousCore/AlpakaInterface/interface/radixSort.h" -#include "DataFormats/VertexSoA/interface/ZVertexSoA.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" #include "vertexFinder.h" -namespace ALPAKA_ACCELERATOR_NAMESPACE { - namespace vertexFinder { - using VtxSoAView = ::reco::ZVertexSoAView; - using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; +namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void sortByPt2(const TAcc& acc, VtxSoAView& data, WsSoAView& ws) { - auto nt = ws.ntrks(); - float const* __restrict__ ptt2 = ws.ptt2(); - uint32_t const& nvFinal = data.nvFinal(); - - int32_t const* __restrict__ iv = ws.iv(); - float* __restrict__ ptv2 = data.ptv2(); - uint16_t* __restrict__ sortInd = data.sortInd(); - - if (nvFinal < 1) - return; - - // fill indexing - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - data.idv()[ws.itrk()[i]] = iv[i]; - }; - - // can be done asynchronously at the end of previous event - for (auto i : cms::alpakatools::elements_with_stride(acc, nvFinal)) { - ptv2[i] = 0; - }; - alpaka::syncBlockThreads(acc); - - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { - if (iv[i] <= 9990) { - alpaka::atomicAdd(acc, &ptv2[iv[i]], ptt2[i], alpaka::hierarchy::Blocks{}); - } - }; - alpaka::syncBlockThreads(acc); - - const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); - if (1 == nvFinal) { - if (threadIdxLocal == 0) - sortInd[0] = 0; - return; - } + using VtxSoAView = ::reco::ZVertexSoAView; + using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; - if constexpr (not cms::alpakatools::requires_single_thread_per_block_v) { - auto& sws = 
alpaka::declareSharedVar(acc); - // sort using only 16 bits - cms::alpakatools::radixSort(acc, ptv2, sortInd, sws, nvFinal); - } else { - for (uint16_t i = 0; i < nvFinal; ++i) - sortInd[i] = i; - std::sort(sortInd, sortInd + nvFinal, [&](auto i, auto j) { return ptv2[i] < ptv2[j]; }); - } - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void sortByPt2(const TAcc& acc, VtxSoAView& data, WsSoAView& ws) { + auto nt = ws.ntrks(); + float const* __restrict__ ptt2 = ws.ptt2(); + uint32_t const& nvFinal = data.nvFinal(); - class SortByPt2Kernel { - public: - template - ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, WsSoAView pws) const { - sortByPt2(acc, pdata, pws); + int32_t const* __restrict__ iv = ws.iv(); + float* __restrict__ ptv2 = data.ptv2(); + uint16_t* __restrict__ sortInd = data.sortInd(); + + if (nvFinal < 1) + return; + + // fill indexing + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + data.idv()[ws.itrk()[i]] = iv[i]; + }; + + // can be done asynchronously at the end of previous event + for (auto i : cms::alpakatools::elements_with_stride(acc, nvFinal)) { + ptv2[i] = 0; + }; + alpaka::syncBlockThreads(acc); + + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] <= 9990) { + alpaka::atomicAdd(acc, &ptv2[iv[i]], ptt2[i], alpaka::hierarchy::Blocks{}); } }; - } // namespace vertexFinder -} // namespace ALPAKA_ACCELERATOR_NAMESPACE -#endif // RecoPixelVertexing_PixelVertexFinding_sortByPt2_h + alpaka::syncBlockThreads(acc); + + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + if (1 == nvFinal) { + if (threadIdxLocal == 0) + sortInd[0] = 0; + return; + } + + if constexpr (not cms::alpakatools::requires_single_thread_per_block_v) { + auto& sws = alpaka::declareSharedVar(acc); + // sort using only 16 bits + cms::alpakatools::radixSort(acc, ptv2, sortInd, sws, nvFinal); + } else { + for (uint16_t i = 0; i < nvFinal; ++i) + sortInd[i] = i; + std::sort(sortInd, sortInd + nvFinal, [&](auto i, 
auto j) { return ptv2[i] < ptv2[j]; }); + } + } + + class SortByPt2Kernel { + public: + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, WsSoAView pws) const { + sortByPt2(acc, pdata, pws); + } + }; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder + +#endif // RecoTracker_PixelVertexFinding_plugins_alpaka_sortByPt2_h diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h index f5b05e17bb038..5a16d9c57a20d 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h @@ -1,162 +1,166 @@ -#ifndef RecoPixelVertexing_PixelVertexFinding_splitVertices_h -#define RecoPixelVertexing_PixelVertexFinding_splitVertices_h +#ifndef RecoTracker_PixelVertexFinding_plugins_alpaka_splitVertices_h +#define RecoTracker_PixelVertexFinding_plugins_alpaka_splitVertices_h #include #include #include + #include -#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + #include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "vertexFinder.h" -namespace ALPAKA_ACCELERATOR_NAMESPACE { - namespace vertexFinder { - using VtxSoAView = ::reco::ZVertexSoAView; - using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void splitVertices(const TAcc& acc, - VtxSoAView& pdata, - WsSoAView& pws, - float maxChi2) { - constexpr bool verbose = false; // in principle the compiler should optmize out if false - const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); - - auto& __restrict__ data = pdata; - auto& __restrict__ ws = pws; - auto nt = ws.ntrks(); - float const* __restrict__ zt = ws.zt(); - float const* __restrict__ ezt2 = ws.ezt2(); - float* 
__restrict__ zv = data.zv(); - float* __restrict__ wv = data.wv(); - float const* __restrict__ chi2 = data.chi2(); - uint32_t& nvFinal = data.nvFinal(); - - int32_t const* __restrict__ nn = data.ndof(); - int32_t* __restrict__ iv = ws.iv(); - - ALPAKA_ASSERT_OFFLOAD(zt); - ALPAKA_ASSERT_OFFLOAD(wv); - ALPAKA_ASSERT_OFFLOAD(chi2); - ALPAKA_ASSERT_OFFLOAD(nn); - - constexpr uint32_t MAXTK = 512; - - auto& it = alpaka::declareSharedVar(acc); // track index - auto& zz = alpaka::declareSharedVar(acc); // z pos - auto& newV = alpaka::declareSharedVar(acc); // 0 or 1 - auto& ww = alpaka::declareSharedVar(acc); // z weight - auto& nq = alpaka::declareSharedVar(acc); // number of track for this vertex - - const uint32_t blockIdx(alpaka::getIdx(acc)[0u]); - const uint32_t gridDimension(alpaka::getWorkDiv(acc)[0u]); - - // one vertex per block - for (auto kv = blockIdx; kv < nvFinal; kv += gridDimension) { - if (nn[kv] < 4) - continue; - if (chi2[kv] < maxChi2 * float(nn[kv])) - continue; - - ALPAKA_ASSERT_OFFLOAD(nn[kv] < int32_t(MAXTK)); - - if ((uint32_t)nn[kv] >= MAXTK) - continue; // too bad FIXME - - nq = 0u; - alpaka::syncBlockThreads(acc); +namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { + + using VtxSoAView = ::reco::ZVertexSoAView; + using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void splitVertices(const TAcc& acc, + VtxSoAView& pdata, + WsSoAView& pws, + float maxChi2) { + constexpr bool verbose = false; // in principle the compiler should optmize out if false + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + float* __restrict__ zv = data.zv(); + float* __restrict__ wv = data.wv(); + float const* __restrict__ chi2 = data.chi2(); + uint32_t& nvFinal = data.nvFinal(); + + int32_t 
const* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); + + ALPAKA_ASSERT_OFFLOAD(zt); + ALPAKA_ASSERT_OFFLOAD(wv); + ALPAKA_ASSERT_OFFLOAD(chi2); + ALPAKA_ASSERT_OFFLOAD(nn); + + constexpr uint32_t MAXTK = 512; + + auto& it = alpaka::declareSharedVar(acc); // track index + auto& zz = alpaka::declareSharedVar(acc); // z pos + auto& newV = alpaka::declareSharedVar(acc); // 0 or 1 + auto& ww = alpaka::declareSharedVar(acc); // z weight + auto& nq = alpaka::declareSharedVar(acc); // number of track for this vertex + + const uint32_t blockIdx(alpaka::getIdx(acc)[0u]); + const uint32_t gridDimension(alpaka::getWorkDiv(acc)[0u]); + + // one vertex per block + for (auto kv = blockIdx; kv < nvFinal; kv += gridDimension) { + if (nn[kv] < 4) + continue; + if (chi2[kv] < maxChi2 * float(nn[kv])) + continue; + + ALPAKA_ASSERT_OFFLOAD(nn[kv] < int32_t(MAXTK)); + + if ((uint32_t)nn[kv] >= MAXTK) + continue; // too bad FIXME + + nq = 0u; + alpaka::syncBlockThreads(acc); + + // copy to local + for (auto k : cms::alpakatools::independent_group_elements(acc, nt)) { + if (iv[k] == int(kv)) { + auto old = alpaka::atomicInc(acc, &nq, MAXTK, alpaka::hierarchy::Threads{}); + zz[old] = zt[k] - zv[kv]; + newV[old] = zz[old] < 0 ? 0 : 1; + ww[old] = 1.f / ezt2[k]; + it[old] = k; + } + } - // copy to local - for (auto k : cms::alpakatools::independent_group_elements(acc, nt)) { - if (iv[k] == int(kv)) { - auto old = alpaka::atomicInc(acc, &nq, MAXTK, alpaka::hierarchy::Threads{}); - zz[old] = zt[k] - zv[kv]; - newV[old] = zz[old] < 0 ? 0 : 1; - ww[old] = 1.f / ezt2[k]; - it[old] = k; - } + // the new vertices + auto& znew = alpaka::declareSharedVar(acc); + auto& wnew = alpaka::declareSharedVar(acc); + alpaka::syncBlockThreads(acc); + + ALPAKA_ASSERT_OFFLOAD(int(nq) == nn[kv] + 1); + + int maxiter = 20; + // kt-min.... 
+ bool more = true; + while (alpaka::syncBlockThreadsPredicate(acc, more)) { + more = false; + if (0 == threadIdxLocal) { + znew[0] = 0; + znew[1] = 0; + wnew[0] = 0; + wnew[1] = 0; } + alpaka::syncBlockThreads(acc); - // the new vertices - auto& znew = alpaka::declareSharedVar(acc); - auto& wnew = alpaka::declareSharedVar(acc); + for (auto k : cms::alpakatools::elements_with_stride(acc, nq)) { + auto i = newV[k]; + alpaka::atomicAdd(acc, &znew[i], zz[k] * ww[k], alpaka::hierarchy::Threads{}); + alpaka::atomicAdd(acc, &wnew[i], ww[k], alpaka::hierarchy::Threads{}); + } alpaka::syncBlockThreads(acc); - ALPAKA_ASSERT_OFFLOAD(int(nq) == nn[kv] + 1); + if (0 == threadIdxLocal) { + znew[0] /= wnew[0]; + znew[1] /= wnew[1]; + } + alpaka::syncBlockThreads(acc); - int maxiter = 20; - // kt-min.... - bool more = true; - while (alpaka::syncBlockThreadsPredicate(acc, more)) { - more = false; - if (0 == threadIdxLocal) { - znew[0] = 0; - znew[1] = 0; - wnew[0] = 0; - wnew[1] = 0; - } - alpaka::syncBlockThreads(acc); - - for (auto k : cms::alpakatools::elements_with_stride(acc, nq)) { - auto i = newV[k]; - alpaka::atomicAdd(acc, &znew[i], zz[k] * ww[k], alpaka::hierarchy::Threads{}); - alpaka::atomicAdd(acc, &wnew[i], ww[k], alpaka::hierarchy::Threads{}); - } - alpaka::syncBlockThreads(acc); - - if (0 == threadIdxLocal) { - znew[0] /= wnew[0]; - znew[1] /= wnew[1]; - } - alpaka::syncBlockThreads(acc); - - for (auto k : cms::alpakatools::elements_with_stride(acc, nq)) { - auto d0 = fabs(zz[k] - znew[0]); - auto d1 = fabs(zz[k] - znew[1]); - auto newer = d0 < d1 ? 0 : 1; - more |= newer != newV[k]; - newV[k] = newer; - } - --maxiter; - if (maxiter <= 0) - more = false; + for (auto k : cms::alpakatools::elements_with_stride(acc, nq)) { + auto d0 = fabs(zz[k] - znew[0]); + auto d1 = fabs(zz[k] - znew[1]); + auto newer = d0 < d1 ? 
0 : 1; + more |= newer != newV[k]; + newV[k] = newer; } + --maxiter; + if (maxiter <= 0) + more = false; + } - // avoid empty vertices - if (0 == wnew[0] || 0 == wnew[1]) - continue; + // avoid empty vertices + if (0 == wnew[0] || 0 == wnew[1]) + continue; - // quality cut - auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]); + // quality cut + auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]); - auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]); + auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]); - if (verbose && 0 == threadIdxLocal) - printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]); + if (verbose && 0 == threadIdxLocal) + printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]); - if (chi2Dist < 4) - continue; + if (chi2Dist < 4) + continue; - // get a new global vertex - auto& igv = alpaka::declareSharedVar(acc); - if (0 == threadIdxLocal) - igv = alpaka::atomicAdd(acc, &ws.nvIntermediate(), 1u, alpaka::hierarchy::Blocks{}); - alpaka::syncBlockThreads(acc); - for (auto k : cms::alpakatools::elements_with_stride(acc, nq)) { - if (1 == newV[k]) - iv[it[k]] = igv; - } + // get a new global vertex + auto& igv = alpaka::declareSharedVar(acc); + if (0 == threadIdxLocal) + igv = alpaka::atomicAdd(acc, &ws.nvIntermediate(), 1u, alpaka::hierarchy::Blocks{}); + alpaka::syncBlockThreads(acc); + for (auto k : cms::alpakatools::elements_with_stride(acc, nq)) { + if (1 == newV[k]) + iv[it[k]] = igv; + } + + } // loop on vertices + } - } // loop on vertices + class SplitVerticesKernel { + public: + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, WsSoAView pws, float maxChi2) const { + splitVertices(acc, pdata, pws, maxChi2); } + }; - class SplitVerticesKernel { - public: - template - ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, WsSoAView pws, float maxChi2) const { - splitVertices(acc, pdata, pws, maxChi2); - } - }; - } // namespace vertexFinder -} // namespace 
ALPAKA_ACCELERATOR_NAMESPACE -#endif // RecoPixelVertexing_PixelVertexFinding_plugins_splitVertices.h +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder + +#endif // RecoTracker_PixelVertexFinding_plugins_alpaka_splitVertices_h diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc index c40d9adda93c5..89a8ee676e35b 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc @@ -1,12 +1,12 @@ #include -#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" -#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "HeterogeneousCore/AlpakaInterface/interface/traits.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" #include "RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h" -#include "vertexFinder.h" #include "vertexFinder.h" #include "clusterTracksDBSCAN.h" #include "clusterTracksIterative.h" diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.h index 23e5db1e706c4..92890b89bb9c4 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.h @@ -1,76 +1,77 @@ -#ifndef RecoPixelVertexing_PixelVertexFinding_vertexFinder_h -#define RecoPixelVertexing_PixelVertexFinding_vertexFinder_h +#ifndef RecoTracker_PixelVertexFinding_plugins_alpaka_vertexFinder_h +#define RecoTracker_PixelVertexFinding_plugins_alpaka_vertexFinder_h #include #include + #include + #include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" 
+#include "DataFormats/VertexSoA/interface/ZVertexDevice.h" #include "DataFormats/VertexSoA/interface/ZVertexHost.h" #include "DataFormats/VertexSoA/interface/ZVertexSoA.h" #include "DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h" -#include "DataFormats/VertexSoA/interface/ZVertexDevice.h" - +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" #include "RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h" -#include "HeterogeneousCore/AlpakaInterface/interface/config.h" -namespace ALPAKA_ACCELERATOR_NAMESPACE { - namespace vertexFinder { - using namespace cms::alpakatools; - using VtxSoAView = ::reco::ZVertexSoAView; - using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; +namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { + + using namespace cms::alpakatools; + using VtxSoAView = ::reco::ZVertexSoAView; + using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; + + class Init { + public: + template >> + ALPAKA_FN_ACC void operator()(const TAcc &acc, VtxSoAView pdata, WsSoAView pws) const { + pdata.nvFinal() = 0; // initialization + ::vertexFinder::init(pws); + } + }; - class Init { - public: - template >> - ALPAKA_FN_ACC void operator()(const TAcc &acc, VtxSoAView pdata, WsSoAView pws) const { - pdata.nvFinal() = 0; // initialization - ::vertexFinder::init(pws); - } - }; + template + class Producer { + using TkSoAConstView = reco::TrackSoAConstView; - template - class Producer { - using TkSoAConstView = reco::TrackSoAConstView; + public: + Producer(bool oneKernel, + bool useDensity, + bool useDBSCAN, + bool useIterative, + bool doSplitting, + int iminT, // min number of neighbours to be "core" + float ieps, // max absolute distance to cluster + float ierrmax, // max error to be "seed" + float ichi2max // max normalized distance to cluster + ) + : oneKernel_(oneKernel && !(useDBSCAN || useIterative)), + 
useDensity_(useDensity), + useDBSCAN_(useDBSCAN), + useIterative_(useIterative), + doSplitting_(doSplitting), + minT(iminT), + eps(ieps), + errmax(ierrmax), + chi2max(ichi2max) {} - public: - Producer(bool oneKernel, - bool useDensity, - bool useDBSCAN, - bool useIterative, - bool doSplitting, - int iminT, // min number of neighbours to be "core" - float ieps, // max absolute distance to cluster - float ierrmax, // max error to be "seed" - float ichi2max // max normalized distance to cluster - ) - : oneKernel_(oneKernel && !(useDBSCAN || useIterative)), - useDensity_(useDensity), - useDBSCAN_(useDBSCAN), - useIterative_(useIterative), - doSplitting_(doSplitting), - minT(iminT), - eps(ieps), - errmax(ierrmax), - chi2max(ichi2max) {} + ~Producer() = default; - ~Producer() = default; + ZVertexSoACollection makeAsync(Queue &queue, const TkSoAConstView &tracks_view, float ptMin, float ptMax) const; - ZVertexSoACollection makeAsync(Queue &queue, const TkSoAConstView &tracks_view, float ptMin, float ptMax) const; + private: + const bool oneKernel_; // run everything (cluster,fit,split,sort) in one kernel. Uses only density clusterizer + const bool useDensity_; // use density clusterizer + const bool useDBSCAN_; // use DBScan clusterizer + const bool useIterative_; // use iterative clusterizer + const bool doSplitting_; //run vertex splitting - private: - const bool oneKernel_; // run everything (cluster,fit,split,sort) in one kernel. 
Uses only density clusterizer - const bool useDensity_; // use density clusterizer - const bool useDBSCAN_; // use DBScan clusterizer - const bool useIterative_; // use iterative clusterizer - const bool doSplitting_; //run vertex splitting + int minT; // min number of neighbours to be "core" + float eps; // max absolute distance to cluster + float errmax; // max error to be "seed" + float chi2max; // max normalized distance to cluster + }; - int minT; // min number of neighbours to be "core" - float eps; // max absolute distance to cluster - float errmax; // max error to be "seed" - float chi2max; // max normalized distance to cluster - }; +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder - } // namespace vertexFinder -} // namespace ALPAKA_ACCELERATOR_NAMESPACE -#endif +#endif // RecoTracker_PixelVertexFinding_plugins_alpaka_vertexFinder_h diff --git a/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc b/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc index e92d586dc1833..b632eb50ce158 100644 --- a/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc +++ b/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc @@ -3,11 +3,14 @@ #include #include #include + #include -#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" -#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" + // TrackUtilities only included in order to compile SoALayout with Eigen columns #include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #ifdef USE_DBSCAN #include "RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h" #define CLUSTERIZE ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder::ClusterTracksDBSCAN @@ -18,11 +21,9 @@ #include 
"RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h" #define CLUSTERIZE ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder::ClusterTracksByDensityKernel #endif - #include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" #include "RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHostAlpaka.h" #include "RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h" - #include "RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h" #include "RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h" #include "RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h" From ff7e93097731e10600f5a75ccb180da53ea940fb Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 9 Feb 2024 09:45:47 +0100 Subject: [PATCH 17/25] Adjust the size of the collection created by CopyToHost::copyAsync --- .../interface/alpaka/SiPixelClustersSoACollection.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DataFormats/SiPixelClusterSoA/interface/alpaka/SiPixelClustersSoACollection.h b/DataFormats/SiPixelClusterSoA/interface/alpaka/SiPixelClustersSoACollection.h index c5e35475b5330..2563f08810eca 100644 --- a/DataFormats/SiPixelClusterSoA/interface/alpaka/SiPixelClustersSoACollection.h +++ b/DataFormats/SiPixelClusterSoA/interface/alpaka/SiPixelClustersSoACollection.h @@ -20,7 +20,8 @@ namespace cms::alpakatools { struct CopyToHost> { template static auto copyAsync(TQueue &queue, SiPixelClustersDevice const &srcData) { - SiPixelClustersHost dstData(srcData->metadata().size(), queue); + // SiPixelClustersHost and SiPixelClustersDevice have a capacity larger than the ctor argument by one + SiPixelClustersHost dstData(srcData->metadata().size() - 1, queue); alpaka::memcpy(queue, dstData.buffer(), srcData.buffer()); dstData.setNClusters(srcData.nClusters(), srcData.offsetBPIX2()); #ifdef GPU_DEBUG //keeping this untiil copies are in the Tracer From bd9fe2d251e5369646ac4dc414655c8ab1be8629 Mon Sep 17 
00:00:00 2001 From: Andrea Bocci Date: Sat, 10 Feb 2024 17:48:35 +0100 Subject: [PATCH 18/25] Synchronise the treatment of pixel errors 26, 27, 30 with legacy code --- .../alpaka/SiPixelRawToClusterKernel.dev.cc | 43 ++++++++++--------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc index c4b8562f2ca92..9725ee10d2855 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc @@ -117,10 +117,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { uint32_t gRow = rowOffset + slopeRow * local.row; uint32_t gCol = colOffset + slopeCol * local.col; + // inside frameConversion row: gRow, column: gCol ::pixelDetails::Pixel global = {gRow, gCol}; return global; } + // error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc template ALPAKA_FN_ACC uint8_t conversionError(uint8_t fedId, uint8_t status) { uint8_t errorType = 0; @@ -159,15 +161,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } ALPAKA_FN_ACC bool rocRowColIsValid(uint32_t rocRow, uint32_t rocCol) { - uint32_t numRowsInRoc = 80; - uint32_t numColsInRoc = 52; - /// row and column in ROC representation - return ((rocRow < numRowsInRoc) & (rocCol < numColsInRoc)); + return ((rocRow < ::pixelDetails::numRowsInRoc) & (rocCol < ::pixelDetails::numColsInRoc)); } ALPAKA_FN_ACC bool dcolIsValid(uint32_t dcol, uint32_t pxid) { return ((dcol < 26) & (2 <= pxid) & (pxid < 162)); } + // error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc template ALPAKA_FN_ACC uint8_t checkROC(uint32_t errorWord, uint8_t fedId, uint32_t link, const SiPixelMappingSoAConstView &cablingMap) { @@ -177,7 +177,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { bool errorFound = false; 
switch (errorType) { - case (25): { + case 25: { errorFound = true; uint32_t index = fedId * ::pixelDetails::MAX_LINK * ::pixelDetails::MAX_ROC + (link - 1) * ::pixelDetails::MAX_ROC + 1; @@ -185,29 +185,28 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { if (!(link == cablingMap.link()[index] && 1 == cablingMap.roc()[index])) errorFound = false; } - if (debug and errorFound) - printf("Invalid ROC = 25 found (errorType = 25)\n"); + if constexpr (debug) + if (errorFound) + printf("Invalid ROC = 25 found (errorType = 25)\n"); break; } - case (26): { + case 26: { if constexpr (debug) printf("Gap word found (errorType = 26)\n"); - errorFound = true; break; } - case (27): { + case 27: { if constexpr (debug) printf("Dummy word found (errorType = 27)\n"); - errorFound = true; break; } - case (28): { + case 28: { if constexpr (debug) printf("Error fifo nearly full (errorType = 28)\n"); errorFound = true; break; } - case (29): { + case 29: { if constexpr (debug) printf("Timeout on a channel (errorType = 29)\n"); if (!((errorWord >> sipixelconstants::OMIT_ERR_shift) & sipixelconstants::OMIT_ERR_mask)) { @@ -218,23 +217,24 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { errorFound = true; break; } - case (30): { + case 30: { if constexpr (debug) printf("TBM error trailer (errorType = 30)\n"); - int StateMatch_bits = 4; - int StateMatch_shift = 8; - uint32_t StateMatch_mask = ~(~uint32_t(0) << StateMatch_bits); - int StateMatch = (errorWord >> StateMatch_shift) & StateMatch_mask; - if (StateMatch != 1 && StateMatch != 8) { + int stateMatch_bits = 4; + int stateMatch_shift = 8; + uint32_t stateMatch_mask = ~(~uint32_t(0) << stateMatch_bits); + int stateMatch = (errorWord >> stateMatch_shift) & stateMatch_mask; + if (stateMatch != 1 && stateMatch != 8) { if constexpr (debug) printf("FED error 30 with unexpected State Bits (errorType = 30)\n"); + break; } - if (StateMatch == 1) + if (stateMatch == 1) errorType = 40; // 1=Overflow -> 40, 8=number of ROCs -> 30 errorFound = true; break; } - 
case (31): { + case 31: { if constexpr (debug) printf("Event number error (errorType = 31)\n"); errorFound = true; @@ -247,6 +247,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { return errorFound ? errorType : 0; } + // error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc template ALPAKA_FN_ACC uint32_t getErrRawID(uint8_t fedId, uint32_t errWord, uint32_t errorType, const SiPixelMappingSoAConstView &cablingMap) { From b3de2a3ec968d546c3cecfe71cae74130ccd5180 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 9 Feb 2024 16:28:04 +0100 Subject: [PATCH 19/25] Rewrite the uniform element kernel loops Generalise elements_with_stride, blocks_with_stride and elements_in_block to work on any single dimension, and rename them to uniform_elements_along, uniform_groups_along and uniform_group_elements_along. Introduce uniform_elements[_x|_y|_z], uniform_groups[_x|_y|_z] and uniform_group_elements[_x|_y|_z] as specialisations of uniform_elements_along, uniform_groups_along and uniform_group_elements_along. Reintrouce elements_with_stride, blocks_with_stride, elements_in_block as legacy names for uniform_elements, uniform_groups, and uniform_group_elements. Rename elements_with_stride_nd to uniform_elements_nd, and reintroduce elements_with_stride_nd as a legacy name for uniform_elements_nd. Update the unit tests accordingly. 
--- .../AlpakaInterface/interface/workdivision.h | 629 ++++++++++++++---- .../test/alpaka/testKernel.dev.cc | 20 +- .../test/alpaka/testWorkDivision.dev.cc | 60 +- 3 files changed, 558 insertions(+), 151 deletions(-) diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h index ad950999517f4..2c7b439f8e545 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h +++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h @@ -77,7 +77,8 @@ namespace cms::alpakatools { /* ElementIndex * - * an aggregate that containes the .global and .local indices of an element; returned by iterating over elements_in_block. + * an aggregate that contains the `.global` and `.local` indices of an element; returned by iterating over the objects + * returned by `elements_in_block` and similar functions. */ struct ElementIndex { @@ -85,51 +86,84 @@ namespace cms::alpakatools { Idx local; }; - /* elements_with_stride + /* uniform_elements_along + * + * `uniform_elements_along(acc [, first], extent)` returns a one-dimensional iteratable range that spans the + * element indices from `first` (inclusive) to `extent` (exclusive) along the `Dim` dimension. + * If `first` is not specified, it defaults to 0. + * If `extent` is not specified, it defaults to the kernel grid size along the `Dim` dimension. + * + * In a 1-dimensional kernel, `uniform_elements(acc, ...)` is a shorthand for `uniform_elements_along<0>(acc, ...)`. * - * `elements_with_stride(acc, [first, ]extent)` returns an iteratable range that spans the element indices required to - * cover the given problem size: - * - `first` (optional) is index to the first element; if not specified, the loop starts from 0; - * - `extent` is the total size of the problem, including any elements that may come before `first`. + * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g.
the outer loop), followed + * by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). + * For convenience when converting CUDA or HIP code, `uniform_elements_x(acc, ...)`, `_y` and `_z` are shorthands for + * `uniform_elements_along(acc, ...)`, `` and ``. * * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not - * safe to call alpaka::syncBlockThreads() within this loop. If a block synchronisation is needed, one should split - * the loop into an outer loop on the blocks and an inner loop on the threads, and call the syncronisation only in the - * outer loop: + * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. + * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner + * loop over each group's elements, and synchronise only in the outer loop: * - * for (auto group : uniform_groups(acc, extent) { - * for (auto element : uniform_group_elements(acc, group, extent) { + * for (auto group : uniform_groups_along(acc, extent)) { + * for (auto element : uniform_group_elements_along(acc, group, extent)) { + * // first part of the computation * // no synchronisations here * ... * } + * // wait for all threads to complete the first part * alpaka::syncBlockThreads(); - * for (auto element : uniform_group_elements(acc, group, extent) { + * for (auto element : uniform_group_elements_along(acc, group, extent)) { + * // second part of the computation * // no synchronisations here * ... * } + * // wait for all threads to complete the second part * alpaka::syncBlockThreads(); + * ... * } + * + * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a multiple + * of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the kernel may + * hang. 
To avoid this problem, round up `extent` to a multiple of the warp size, and check the element index + * explicitly inside the loop: + * + * for (auto element : uniform_elements_along(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) { + * bool flag = false; + * if (element < extent) { + * // do some work and compute a result flag only for the valid elements + * flag = do_some_work(); + * } + * // check if any valid element had a positive result + * if (alpaka::warp::any(acc, flag)) { + * // ... + * } + * } + * + * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`. */ - template and alpaka::Dim::value == 1>> - class elements_with_stride { + template and alpaka::Dim::value >= Dim>> + class uniform_elements_along { public: - ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc) - : elements_{alpaka::getWorkDiv(acc)[0u]}, - first_{alpaka::getIdx(acc)[0u] * elements_}, - stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, + ALPAKA_FN_ACC inline uniform_elements_along(TAcc const& acc) + : elements_{alpaka::getWorkDiv(acc)[Dim]}, + first_{alpaka::getIdx(acc)[Dim] * elements_}, + stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, extent_{stride_} {} - ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx extent) - : elements_{alpaka::getWorkDiv(acc)[0u]}, - first_{alpaka::getIdx(acc)[0u] * elements_}, - stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, + ALPAKA_FN_ACC inline uniform_elements_along(TAcc const& acc, Idx extent) + : elements_{alpaka::getWorkDiv(acc)[Dim]}, + first_{alpaka::getIdx(acc)[Dim] * elements_}, + stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, extent_{extent} {} - ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx first, Idx extent) - : elements_{alpaka::getWorkDiv(acc)[0u]}, - first_{alpaka::getIdx(acc)[0u] * elements_ + first}, - stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, + ALPAKA_FN_ACC inline uniform_elements_along(TAcc const& acc, Idx first, Idx 
extent) + : elements_{alpaka::getWorkDiv(acc)[Dim]}, + first_{alpaka::getIdx(acc)[Dim] * elements_ + first}, + stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, extent_{extent} {} class const_iterator; @@ -140,7 +174,7 @@ namespace cms::alpakatools { ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); } class const_iterator { - friend class elements_with_stride; + friend class uniform_elements_along; ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first) : elements_{elements}, @@ -207,19 +241,162 @@ namespace cms::alpakatools { const Idx extent_; }; + /* uniform_elements + * + * `uniform_elements(acc [, first], extent)` returns a one-dimensional iteratable range that spans the element indices + * from `first` (inclusive) to `extent` (exlusive). + * If `first` is not specified, it defaults to 0. + * If `extent` is not specified, it defaults to the kernel grid size. + * + * `uniform_elements(acc, ...)` is a shorthand for `uniform_elements_along<0>(acc, ...)`. + * + * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not + * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. + * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner + * loop over each group's elements, and synchronise only in the outer loop: + * + * for (auto group : uniform_groups(acc, extent)) { + * for (auto element : uniform_group_elements(acc, group, extent)) { + * // first part of the computation + * // no synchronisations here + * ... + * } + * // wait for all threads to complete the first part + * alpaka::syncBlockThreads(); + * for (auto element : uniform_group_elements(acc, group, extent)) { + * // second part of the computation + * // no synchronisations here + * ... 
+ * } + * // wait for all threads to complete the second part + * alpaka::syncBlockThreads(); + * ... + * } + * + * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a multiple + * of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the kernel may + * hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the element index + * explicitly inside the loop: + * + * for (auto element : uniform_elements(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) { + * bool flag = false; + * if (element < extent) { + * // do some work and compute a result flag only for elements up to extent + * flag = do_some_work(); + * } + * // check if any valid element had a positive result + * if (alpaka::warp::any(acc, flag)) { + * // ... + * } + * } + * + * Note that `uniform_elements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels, use + * - `uniform_elements_nd(acc, ...)` to cover an N-dimensional problem space with a single loop; + * - `uniform_elements_along(acc, ...)` to perform the iteration explicitly along dimension `Dim`; + * - `uniform_elements_x(acc, ...)`, `uniform_elements_y(acc, ...)`, or `uniform_elements_z(acc, ...)` to loop + * along the fastest, second-fastest, or third-fastest dimension. + */ + + template and alpaka::Dim::value == 1>> + ALPAKA_FN_ACC inline auto uniform_elements(TAcc const& acc, TArgs... args) { + return uniform_elements_along(acc, static_cast(args)...); + } + + /* uniform_elements_x, _y, _z + * + * Like `uniform_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest dimensions. + */ + + template and (alpaka::Dim::value > 0)>> + ALPAKA_FN_ACC inline auto uniform_elements_x(TAcc const& acc, TArgs... 
args) { + return uniform_elements_along::value - 1>(acc, static_cast(args)...); + } + + template and (alpaka::Dim::value > 1)>> + ALPAKA_FN_ACC inline auto uniform_elements_y(TAcc const& acc, TArgs... args) { + return uniform_elements_along::value - 2>(acc, static_cast(args)...); + } + + template and (alpaka::Dim::value > 2)>> + ALPAKA_FN_ACC inline auto uniform_elements_z(TAcc const& acc, TArgs... args) { + return uniform_elements_along::value - 3>(acc, static_cast(args)...); + } + + /* elements_with_stride + * + * `elements_with_stride(acc [, first], extent)` returns a one-dimensional iteratable range that spans the element + * indices from `first` (inclusive) to `extent` (exlusive). + * If `first` is not specified, it defaults to 0. + * If `extent` is not specified, it defaults to the kernel grid size. + * + * `elements_with_stride(acc, ...)` is a legacy name for `uniform_elements(acc, ...)`. + */ + + template and alpaka::Dim::value == 1>> + ALPAKA_FN_ACC inline auto elements_with_stride(TAcc const& acc, TArgs... args) { + return uniform_elements_along(acc, static_cast(args)...); + } + + /* uniform_elements_nd + * + * `uniform_elements_nd(acc, extent)` returns an N-dimensional iteratable range that spans the element indices + * required to cover the given problem size, indicated by `extent`. + * + * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not + * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. 
+ * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner + * loop over each group's elements, and synchronise only in the outer loop: + * + * for (auto group0 : uniform_groups_along<0>(acc, extent[0])) { + * for (auto group1 : uniform_groups_along<1>(acc, extent[1])) { + * for (auto element0 : uniform_group_elements_along<0>(acc, group0, extent[0])) { + * for (auto element1 : uniform_group_elements_along<1>(acc, group1, extent[1])) { + * // first part of the computation + * // no synchronisations here + * ... + * } + * } + * // wait for all threads to complete the first part + * alpaka::syncBlockThreads(); + * for (auto element0 : uniform_group_elements_along<0>(acc, group0, extent[0])) { + * for (auto element1 : uniform_group_elements_along<1>(acc, group1, extent[1])) { + * // second part of the computation + * // no synchronisations here + * ... + * } + * } + * // wait for all threads to complete the second part + * alpaka::syncBlockThreads(); + * ... + * } + * } + * + * For more details, see `uniform_elements_along(acc, ...)`. 
+ */ + template and (alpaka::Dim::value > 0)>> - class elements_with_stride_nd { + class uniform_elements_nd { public: using Dim = alpaka::Dim; using Vec = alpaka::Vec; - ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc) + ALPAKA_FN_ACC inline uniform_elements_nd(TAcc const& acc) : elements_{alpaka::getWorkDiv(acc)}, thread_{alpaka::getIdx(acc) * elements_}, stride_{alpaka::getWorkDiv(acc) * elements_}, extent_{stride_} {} - ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc, Vec extent) + ALPAKA_FN_ACC inline uniform_elements_nd(TAcc const& acc, Vec extent) : elements_{alpaka::getWorkDiv(acc)}, thread_{alpaka::getIdx(acc) * elements_}, stride_{alpaka::getWorkDiv(acc) * elements_}, @@ -248,7 +425,7 @@ namespace cms::alpakatools { } class const_iterator { - friend class elements_with_stride_nd; + friend class uniform_elements_nd; public: ALPAKA_FN_ACC inline Vec operator*() const { return index_; } @@ -274,14 +451,14 @@ namespace cms::alpakatools { private: // construct an iterator pointing to the first element to be processed by the current thread - ALPAKA_FN_ACC inline const_iterator(elements_with_stride_nd const* loop, Vec first) + ALPAKA_FN_ACC inline const_iterator(uniform_elements_nd const* loop, Vec first) : loop_{loop}, first_{alpaka::elementwise_min(first, loop->extent_)}, range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)}, index_{first_} {} // construct an end iterator, pointing post the end of the extent - ALPAKA_FN_ACC inline const_iterator(elements_with_stride_nd const* loop, at_end_t const&) + ALPAKA_FN_ACC inline const_iterator(uniform_elements_nd const* loop, at_end_t const&) : loop_{loop}, first_{loop_->extent_}, range_{loop_->extent_}, index_{loop_->extent_} {} template @@ -360,8 +537,8 @@ namespace cms::alpakatools { index_ = loop_->extent_; } - // const pointer to the elements_with_stride_nd that the iterator refers to - const elements_with_stride_nd* loop_; + // const pointer to the uniform_elements_nd 
that the iterator refers to + const uniform_elements_nd* loop_; // modified by the pre/post-increment operator Vec first_; // first element processed by this thread @@ -376,34 +553,88 @@ namespace cms::alpakatools { const Vec extent_; }; - /* blocks_with_stride + /* elements_with_stride_nd + * + * `elements_with_stride_nd(acc, extent)` returns an N-dimensional iteratable range that spans the element indices + * required to cover the given problem size, indicated by `extent`. + * + * `elements_with_stride_nd(acc, ...)` is a legacy name for `uniform_elements_nd(acc, ...)`. + */ + + template and (alpaka::Dim::value > 0)>> + ALPAKA_FN_ACC inline auto elements_with_stride_nd(TAcc const& acc) { + return uniform_elements_nd(acc); + } + + template and (alpaka::Dim::value > 0)>> + ALPAKA_FN_ACC inline auto elements_with_stride_nd(TAcc const& acc, alpaka::Vec, Idx> extent) { + return uniform_elements_nd(acc, extent); + } + + /* uniform_groups_along + * + * `uniform_groups_along(acc, elements)` returns a one-dimensional iteratable range than spans the group indices + * required to cover the given problem size along the `Dim` dimension, in units of the block size. `elements` + * indicates the total number of elements, across all groups; if not specified, it defaults to the kernel grid size + * along the `Dim` dimension. + * + * In a 1-dimensional kernel, `uniform_groups(acc, ...)` is a shorthand for `uniform_groups_along<0>(acc, ...)`. + * + * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by + * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). + * For convenience when converting CUDA or HIP code, `uniform_groups_x(acc, ...)`, `_y` and `_z` are shorthands for + * `uniform_groups_along(acc, ...)`, `` and ``. + * + * `uniform_groups_along` should be called consistently by all the threads in a block. 
All threads in a block see + * the same loop iterations, while threads in different blocks may see a different number of iterations. + * If the work division has more blocks than the required number of groups, the first blocks will perform one + * iteration of the loop, while the other blocks will exit the loop immediately. + * If the work division has less blocks than the required number of groups, some of the blocks will perform more than + * one iteration, in order to cover then whole problem space. + * + * If the problem size is not a multiple of the block size, the last group will process a number of elements smaller + * than the block size. However, also in this case all threads in the block will execute the same number of iterations + * of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to the inner loop + * (or the user) to ensure that only the correct number of threads process any data; this logic is implemented by + * `uniform_group_elements_along(acc, group, elements)`. + * + * For example, if the block size is 64 and there are 400 elements + * + * for (auto group: uniform_groups_along(acc, 400) * - * `blocks_with_stride(acc, size)` returns a range than spans the (virtual) block indices required to cover the given - * problem size. + * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should cover + * the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group, group 6, + * should cover the elements from 384 to 399. All the threads of the block will process this last group; it is up to + * the inner loop to not process the non-existing elements after 399. * - * For example, if size is 1000 and the block size is 16, it will return the range from 1 to 62. - * If the work division has more than 63 blocks, only the first 63 will perform one iteration of the loop, and the - * other will exit immediately. 
- If the work division has less than 63 blocks, some of the blocks will perform more than one iteration, in order to - cover then whole problem space. + * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other + * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will + * process one group while block 7 will not process any. * - * All threads in a block see the same loop iterations, while threads in different blocks may see a different number - * of iterations. + * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the loop, + * in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will process the + * groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3 will process + * group 3. + * + * See `uniform_elements_along(acc, ...)` for a concrete example using `uniform_groups_along` and + * `uniform_group_elements_along`.
*/ - template and alpaka::Dim::value == 1>> - class blocks_with_stride { + template and alpaka::Dim::value >= Dim>> + class uniform_groups_along { public: - ALPAKA_FN_ACC inline blocks_with_stride(TAcc const& acc) - : first_{alpaka::getIdx(acc)[0u]}, - stride_{alpaka::getWorkDiv(acc)[0u]}, + ALPAKA_FN_ACC inline uniform_groups_along(TAcc const& acc) + : first_{alpaka::getIdx(acc)[Dim]}, + stride_{alpaka::getWorkDiv(acc)[Dim]}, extent_{stride_} {} // extent is the total number of elements (not blocks) - ALPAKA_FN_ACC inline blocks_with_stride(TAcc const& acc, Idx extent) - : first_{alpaka::getIdx(acc)[0u]}, - stride_{alpaka::getWorkDiv(acc)[0u]}, - extent_{divide_up_by(extent, alpaka::getWorkDiv(acc)[0u])} {} + ALPAKA_FN_ACC inline uniform_groups_along(TAcc const& acc, Idx extent) + : first_{alpaka::getIdx(acc)[Dim]}, + stride_{alpaka::getWorkDiv(acc)[Dim]}, + extent_{divide_up_by(extent, alpaka::getWorkDiv(acc)[Dim])} {} class const_iterator; using iterator = const_iterator; @@ -413,7 +644,7 @@ namespace cms::alpakatools { ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); } class const_iterator { - friend class blocks_with_stride; + friend class uniform_groups_along; ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first) : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {} @@ -458,36 +689,167 @@ namespace cms::alpakatools { const Idx extent_; }; - /* elements_in_block + /* uniform_groups + * + * `uniform_groups(acc, elements)` returns a one-dimensional iteratable range than spans the group indices required to + * cover the given problem size, in units of the block size. `elements` indicates the total number of elements, across + * all groups; if not specified, it defaults to the kernel grid size. + * + * `uniform_groups(acc, ...)` is a shorthand for `uniform_groups_along<0>(acc, ...)`. + * + * `uniform_groups` should be called consistently by all the threads in a block. 
All threads in a block see the same + * loop iterations, while threads in different blocks may see a different number of iterations. + * If the work division has more blocks than the required number of groups, the first blocks will perform one + * iteration of the loop, while the other blocks will exit the loop immediately. + * If the work division has less blocks than the required number of groups, some of the blocks will perform more than + * one iteration, in order to cover then whole problem space. + * + * If the problem size is not a multiple of the block size, the last group will process a number of elements smaller + * than the block size. However, also in this case all threads in the block will execute the same number of iterations + * of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to the inner loop + * (or the user) to ensure that only the correct number of threads process any data; this logic is implemented by + * `uniform_group_elements(acc, group, elements)`. + * + * For example, if the block size is 64 and there are 400 elements + * + * for (auto group: uniform_groups(acc, 400) + * + * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should cover + * the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group, group 6, + * should cover the elements from 384 to 399. All the threads of the block will process this last group; it is up to + * the inner loop to not process the non-existing elements after 399. + * + * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other + * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will + * process one group while block 7 will no process any. 
+ * + * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the loop, + * in order to cover then whole problem space. For example if the work division has 4 blocks, block 0 will process the + * groups 0 and 4, block 1 will process groups 1 and 5, group 2 will process groups 2 and 6, and block 3 will process + * group 3. + * + * See `uniform_elements(acc, ...)` for a concrete example using `uniform_groups` and `uniform_group_elements`. + * + * Note that `uniform_groups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels, use + * - `uniform_groups_along(acc, ...)` to perform the iteration explicitly along dimension `Dim`; + * - `uniform_groups_x(acc, ...)`, `uniform_groups_y(acc, ...)`, or `uniform_groups_z(acc, ...)` to loop + * along the fastest, second-fastest, or third-fastest dimension. + */ + + template and alpaka::Dim::value == 1>> + ALPAKA_FN_ACC inline auto uniform_groups(TAcc const& acc, TArgs... args) { + return uniform_groups_along(acc, static_cast(args)...); + } + + /* uniform_groups_x, _y, _z + * + * Like `uniform_groups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest dimensions. + */ + + template and (alpaka::Dim::value > 0)>> + ALPAKA_FN_ACC inline auto uniform_groups_x(TAcc const& acc, TArgs... args) { + return uniform_groups_along::value - 1>(acc, static_cast(args)...); + } + + template and (alpaka::Dim::value > 1)>> + ALPAKA_FN_ACC inline auto uniform_groups_y(TAcc const& acc, TArgs... args) { + return uniform_groups_along::value - 2>(acc, static_cast(args)...); + } + + template and (alpaka::Dim::value > 2)>> + ALPAKA_FN_ACC inline auto uniform_groups_z(TAcc const& acc, TArgs... 
args) { + return uniform_groups_along::value - 3>(acc, static_cast(args)...); + } + + /* blocks_with_stride + * + * `blocks_with_stride(acc, elements)` returns a one-dimensional iteratable range than spans the group indices + * required to cover the given problem size, in units of the block size. `elements` indicates the total number of + * elements, across all groups; if not specified, it defaults to the kernel grid size. + * + * `blocks_with_stride(acc, ...)` is a legacy name for `uniform_groups(acc, ...)`. + */ + + template and alpaka::Dim::value == 1>> + ALPAKA_FN_ACC inline auto blocks_with_stride(TAcc const& acc, TArgs... args) { + return uniform_groups_along(acc, static_cast(args)...); + } + + /* uniform_group_elements_along + * + * `uniform_group_elements_along(acc, group, elements)` returns a one-dimensional iteratable range that spans all + * the elements within the given `group` along dimension `Dim`, as obtained from `uniform_groups_along`, up to + * `elements` (exclusive). `elements` indicates the total number of elements across all groups; if not specified, it + * defaults to the kernel grid size. + * + * In a 1-dimensional kernel, `uniform_group_elements(acc, ...)` is a shorthand for + * `uniform_group_elements_along<0>(acc, ...)`. + * + * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by + * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). + * For convenience when converting CUDA or HIP code, `uniform_group_elements_x(acc, ...)`, `_y` and `_z` are + * shorthands for `uniform_group_elements_along(acc, ...)`, `` and ``. * - * `elements_in_block(acc, block, size)` returns a range that spans all the elements within the given block. - * Iterating over the range yields values of type ElementIndex, that contain both .global and .local indices - * of the corresponding element. 
+ * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices of + * the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded), while the + * local index spans the range from 0 to the block size (excluded). * - * If the work division has only one element per thread, the loop will perform at most one iteration. - * If the work division has more than one elements per thread, the loop will perform that number of iterations, - * or less if it reaches size. + * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the + * global element index reaches `elements`. * * If the problem size is not a multiple of the block size, different threads may execute a different number of - * iterations. As a result, it is not safe to call alpaka::syncBlockThreads() within this loop. If a block + * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block * synchronisation is needed, one should split the loop, and synchronise the threads between the loops. + * See `uniform_elements_along(acc, ...)` for a concrete example using `uniform_groups_along` and + * `uniform_group_elements_along`. + * + * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a + * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the + * kernel may hang. 
To avoid this problem, round up `elements` to a multiple of the warp size, and check the element + * index explicitly inside the loop: + * + * for (auto element : uniform_group_elements_along(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) { + * bool flag = false; + * if (element < elements) { + * // do some work and compute a result flag only for the valid elements + * flag = do_some_work(); + * } + * // check if any valid element had a positive result + * if (alpaka::warp::any(acc, flag)) { + * // ... + * } + * } + * + * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`. */ - template and alpaka::Dim::value == 1>> - class elements_in_block { + template and alpaka::Dim::value >= Dim>> + class uniform_group_elements_along { public: - ALPAKA_FN_ACC inline elements_in_block(TAcc const& acc, Idx block) - : first_{block * alpaka::getWorkDiv(acc)[0u]}, - local_{alpaka::getIdx(acc)[0u] * - alpaka::getWorkDiv(acc)[0u]}, - range_{local_ + alpaka::getWorkDiv(acc)[0u]} {} - - ALPAKA_FN_ACC inline elements_in_block(TAcc const& acc, Idx block, Idx extent) - : first_{block * alpaka::getWorkDiv(acc)[0u]}, + ALPAKA_FN_ACC inline uniform_group_elements_along(TAcc const& acc, Idx block) + : first_{block * alpaka::getWorkDiv(acc)[Dim]}, + local_{alpaka::getIdx(acc)[Dim] * + alpaka::getWorkDiv(acc)[Dim]}, + range_{local_ + alpaka::getWorkDiv(acc)[Dim]} {} + + ALPAKA_FN_ACC inline uniform_group_elements_along(TAcc const& acc, Idx block, Idx extent) + : first_{block * alpaka::getWorkDiv(acc)[Dim]}, local_{std::min(extent - first_, - alpaka::getIdx(acc)[0u] * - alpaka::getWorkDiv(acc)[0u])}, - range_{std::min(extent - first_, local_ + alpaka::getWorkDiv(acc)[0u])} {} + alpaka::getIdx(acc)[Dim] * + alpaka::getWorkDiv(acc)[Dim])}, + range_{std::min(extent - first_, local_ + alpaka::getWorkDiv(acc)[Dim])} {} class const_iterator; using iterator = const_iterator; @@ -497,7 +859,7 @@ namespace cms::alpakatools { 
ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(range_, first_, range_); } class const_iterator { - friend class elements_in_block; + friend class uniform_group_elements_along; ALPAKA_FN_ACC inline const_iterator(Idx local, Idx first, Idx range) : index_{local}, first_{first}, range_{range} {} @@ -544,54 +906,99 @@ namespace cms::alpakatools { const Idx range_; }; - /* uniform_groups + /* uniform_group_elements * - * `uniform_groups(acc, elements)` returns a range than spans the group indices required to cover the given problem - * size, in units of the block size: - * - the `elements` argument indicates the total number of elements, across all groups. + * `uniform_group_elements(acc, group, elements)` returns a one-dimensional iteratable range that spans all the + * elements within the given `group`, as obtained from `uniform_groups`, up to `elements` (exclusive). `elements` + * indicates the total number of elements across all groups; if not specified, it defaults to the kernel grid size. * - * `uniform_groups` should be called consistently by all the threads in a block. All threads in a block see the same - * loop iterations, while threads in different blocks may see a different number of iterations. + * `uniform_group_elements(acc, ...)` is a shorthand for `uniform_group_elements_along<0>(acc, ...)`. * - * For example, if `size` is 1000 and the block size is 16, + * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices of + * the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded), while the + * local index spans the range from 0 to the block size (excluded). * - * for (auto group: uniform_groups(acc, 1000) + * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the + * global element index reaches `elements`. 
* - * will return the range from 0 to 62, split across all blocks in the work division. + * If the problem size is not a multiple of the block size, different threads may execute a different number of + * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block + * synchronisation is needed, one should split the loop, and synchronise the threads between the loops. + * See `uniform_elements(acc, ...)` for a concrete example using `uniform_groups` and `uniform_group_elements`. * - * If the work division has more than 63 blocks, the first 63 will perform one iteration of the loop, while the other - * blocks will exit immediately. - * If the work division has less than 63 blocks, some of the blocks will perform more than one iteration, in order to - * cover then whole problem space. + * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a + * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the + * kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the element + * index explicitly inside the loop: * - * If the problem size is not a multiple of the block size, the last group will process a number of elements smaller - * than the block size. Also in this case all threads in the block will execute the same number of iterations of this - * loop: this makes it safe to use block-level synchronisations in the loop body. It is left to the inner loop (or the - * user) to ensure that only the correct number of threads process any data. 
+ * for (auto element : uniform_group_elements(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) { + * bool flag = false; + * if (element < elements) { + * // do some work and compute a result flag only for the valid elements + * flag = do_some_work(); + * } + * // check if any valid element had a positive result + * if (alpaka::warp::any(acc, flag)) { + * // ... + * } + * } + * + * Note that `uniform_group_elements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional + * kernels, use + * - `uniform_group_elements_along(acc, ...)` to perform the iteration explicitly along dimension `Dim`; + * - `uniform_group_elements_x(acc, ...)`, `uniform_group_elements_y(acc, ...)`, or + * `uniform_group_elements_z(acc, ...)` to loop along the fastest, second-fastest, or third-fastest dimension. */ - template and alpaka::Dim::value == 1>> - using uniform_groups = blocks_with_stride; + template and alpaka::Dim::value == 1>> + ALPAKA_FN_ACC inline auto uniform_group_elements(TAcc const& acc, TArgs... args) { + return uniform_group_elements_along(acc, static_cast(args)...); + } - /* uniform_group_elements - * - * `uniform_group_elements(acc, group, elements)` returns a range that spans all the elements within the given group: - * - the `group` argument indicates the id of the current group, for example as obtained from `uniform_groups`; - * - the `elements` argument indicates the total number of elements, across all groups. + /* uniform_group_elements_x, _y, _z * - * Iterating over the range yields values of type `ElementIndex`, that contain the `.global` and `.local` indices of - * the corresponding element. + * Like `uniform_group_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest + * dimensions. + */ + + template and (alpaka::Dim::value > 0)>> + ALPAKA_FN_ACC inline auto uniform_group_elements_x(TAcc const& acc, TArgs... 
args) { + return uniform_group_elements_along::value - 1>(acc, static_cast(args)...); + } + + template and (alpaka::Dim::value > 1)>> + ALPAKA_FN_ACC inline auto uniform_group_elements_y(TAcc const& acc, TArgs... args) { + return uniform_group_elements_along::value - 2>(acc, static_cast(args)...); + } + + template and (alpaka::Dim::value > 2)>> + ALPAKA_FN_ACC inline auto uniform_group_elements_z(TAcc const& acc, TArgs... args) { + return uniform_group_elements_along::value - 3>(acc, static_cast(args)...); + } + + /* elements_in_block * - * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier when the - * element index reaches `size`. + * `elements_in_block(acc, group, elements)` returns a one-dimensional iteratable range that spans all the elements + * within the given `group`, as obtained from `uniform_groups`, up to `elements` (exclusive). `elements` indicates the + * total number of elements across all groups; if not specified, it defaults to the kernel grid size. * - * If the problem size is not a multiple of the block size, different threads may execute a different number of - * iterations. As a result, it is not safe to call alpaka::syncBlockThreads() within this loop. If a block - * synchronisation is needed, one should split the loop, and synchronise the threads between the loops. + * `elements_in_block(acc, ...)` is a legacy for `uniform_group_elements(acc, ...)`. */ - template and alpaka::Dim::value == 1>> - using uniform_group_elements = elements_in_block; + template and alpaka::Dim::value == 1>> + ALPAKA_FN_ACC inline auto elements_in_block(TAcc const& acc, TArgs... args) { + return uniform_group_elements_along(acc, static_cast(args)...); + } /* independent_groups * @@ -682,7 +1089,7 @@ namespace cms::alpakatools { * per block, compared with the total number of elements. * * If the problem size is not a multiple of the block size, different threads may execute a different number of - * iterations. 
As a result, it is not safe to call alpaka::syncBlockThreads() within this loop. If a block + * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block * synchronisation is needed, one should split the loop, and synchronise the threads between the loops. */ diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc index a730e4b515a76..a2b38da93fc18 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc @@ -17,7 +17,7 @@ struct VectorAddKernel { template ALPAKA_FN_ACC void operator()( TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const { - for (auto index : cms::alpakatools::elements_with_stride(acc, size)) { + for (auto index : cms::alpakatools::uniform_elements(acc, size)) { out[index] = in1[index] + in2[index]; } } @@ -31,7 +31,7 @@ struct VectorAddKernelSkip { T* __restrict__ out, size_t first, size_t size) const { - for (auto index : cms::alpakatools::elements_with_stride(acc, first, size)) { + for (auto index : cms::alpakatools::uniform_elements(acc, first, size)) { out[index] = in1[index] + in2[index]; } } @@ -41,7 +41,7 @@ struct VectorAddKernel1D { template ALPAKA_FN_ACC void operator()( TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec1D size) const { - for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, size)) { + for (auto ndindex : cms::alpakatools::uniform_elements_nd(acc, size)) { auto index = ndindex[0]; out[index] = in1[index] + in2[index]; } @@ -52,7 +52,7 @@ struct VectorAddKernel2D { template ALPAKA_FN_ACC void operator()( TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec2D size) const { - for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, 
size)) { + for (auto ndindex : cms::alpakatools::uniform_elements_nd(acc, size)) { auto index = ndindex[0] * size[1] + ndindex[1]; out[index] = in1[index] + in2[index]; } @@ -63,7 +63,7 @@ struct VectorAddKernel3D { template ALPAKA_FN_ACC void operator()( TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec3D size) const { - for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, size)) { + for (auto ndindex : cms::alpakatools::uniform_elements_nd(acc, size)) { auto index = (ndindex[0] * size[1] + ndindex[1]) * size[2] + ndindex[2]; out[index] = in1[index] + in2[index]; } @@ -84,7 +84,7 @@ struct VectorAddBlockKernel { T* buffer = alpaka::getDynSharedMem(acc); // the outer loop is needed to repeat the "block" as many times as needed to cover the whole problem space // the inner loop is needed for backends that use more than one element per thread - for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) { + for (auto block : cms::alpakatools::uniform_groups(acc, size)) { // only one thread per block: initialise the shared memory if (cms::alpakatools::once_per_block(acc)) { // not really necessary, just to show how to use "once_per_block" @@ -94,19 +94,19 @@ struct VectorAddBlockKernel { // synchronise all threads in the block alpaka::syncBlockThreads(acc); // read the first set of data into shared memory - for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) { + for (auto index : cms::alpakatools::uniform_group_elements(acc, block, size)) { buffer[index.local] = in1[index.global]; } // synchronise all threads in the block alpaka::syncBlockThreads(acc); // add the second set of data into shared memory - for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) { + for (auto index : cms::alpakatools::uniform_group_elements(acc, block, size)) { buffer[index.local] += in2[index.global]; } // synchronise all threads in the block alpaka::syncBlockThreads(acc); // store the 
results into global memory - for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) { + for (auto index : cms::alpakatools::uniform_group_elements(acc, block, size)) { out[index.global] = buffer[index.local]; } } @@ -142,7 +142,7 @@ struct VectorAddKernelBlockSerial { // block size auto const blockSize = alpaka::getWorkDiv(acc)[0u]; // the loop is used to repeat the "block" as many times as needed to cover the whole problem space - for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) { + for (auto block : cms::alpakatools::uniform_groups(acc, size)) { // the operations are performed by a single thread in each "logical" block const auto first = blockSize * block; const auto range = std::min(first + blockSize, size); diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc index ce85ad42cb0f4..1d62bcb57c2cb 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc @@ -11,7 +11,7 @@ using namespace ALPAKA_ACCELERATOR_NAMESPACE; // Kernel running a loop over threads/elements // One function with multiple flavors -// The type of elements_with_stride +// The type of uniform_elements enum class RangeType { Default, ExtentLimited, ExtentLimitedWithShift }; // The concurrency scope between threads @@ -28,9 +28,9 @@ bool constexpr firstInLoopRange(TAcc const& acc) { } template -size_t constexpr expectedCount(TAcc const& acc, size_t size, size_t shift) { +size_t constexpr expectedCount(TAcc const& acc, size_t skip, size_t size) { if constexpr (rangeType == RangeType::ExtentLimitedWithShift) - return shift < size ? size - shift : 0; + return skip < size ? 
size - skip : 0; else if constexpr (rangeType == RangeType::ExtentLimited) return size; else /* rangeType == RangeType::Default */ @@ -41,9 +41,9 @@ size_t constexpr expectedCount(TAcc const& acc, size_t size, size_t shift) { } template -size_t constexpr expectedCount(WorkDiv1D const& workDiv, size_t size, size_t shift) { +size_t constexpr expectedCount(WorkDiv1D const& workDiv, size_t skip, size_t size) { if constexpr (rangeType == RangeType::ExtentLimitedWithShift) - return shift < size ? size - shift : 0; + return skip < size ? size - skip : 0; else if constexpr (rangeType == RangeType::ExtentLimited) return size; else /* rangeType == RangeType::Default */ @@ -56,7 +56,7 @@ size_t constexpr expectedCount(WorkDiv1D const& workDiv, size_t size, size_t shi template struct testWordDivisionDefaultRange { template - ALPAKA_FN_ACC void operator()(TAcc const& acc, size_t size, size_t shift, size_t* globalCounter) const { + ALPAKA_FN_ACC void operator()(TAcc const& acc, size_t size, size_t skip, size_t* globalCounter) const { size_t& counter = (loopScope == LoopScope::Grid ? *globalCounter : alpaka::declareSharedVar(acc)); // Init the counter for block range. Grid range does so my mean of memset. 
@@ -67,19 +67,19 @@ struct testWordDivisionDefaultRange { } // The loop we are testing if constexpr (rangeType == RangeType::Default) - for ([[maybe_unused]] auto idx : elements_with_stride(acc)) + for ([[maybe_unused]] auto idx : uniform_elements(acc)) alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); else if constexpr (rangeType == RangeType::ExtentLimited) - for ([[maybe_unused]] auto idx : elements_with_stride(acc, size)) + for ([[maybe_unused]] auto idx : uniform_elements(acc, size)) alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); else if constexpr (rangeType == RangeType::ExtentLimitedWithShift) - for ([[maybe_unused]] auto idx : elements_with_stride(acc, shift, size)) + for ([[maybe_unused]] auto idx : uniform_elements(acc, skip, size)) alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); alpaka::syncBlockThreads(acc); // Check the result. Grid range will check by memcpy-ing the result. if constexpr (loopScope == LoopScope::Block) { if (firstInLoopRange(acc)) { - auto expected = expectedCount(acc, size, shift); + auto expected = expectedCount(acc, skip, size); assert(counter == expected); } } @@ -106,16 +106,16 @@ int main() { for (size_t blocks = 1; blocks < GridSize * 3; blocks++) for (auto sizeFuzz : std::initializer_list{-10 * BlockSize / 13, -BlockSize / 2, -1, 0, 1, BlockSize / 2}) - for (auto shift : std::initializer_list{0, - 1, - BlockSize / 2, - BlockSize - 1, - BlockSize, - BlockSize + 1, - BlockSize + BlockSize / 2, - 2 * BlockSize - 1, - 2 * BlockSize, - 2 * BlockSize + 1}) { + for (auto skip : std::initializer_list{0, + 1, + BlockSize / 2, + BlockSize - 1, + BlockSize, + BlockSize + 1, + BlockSize + BlockSize / 2, + 2 * BlockSize - 1, + 2 * BlockSize, + 2 * BlockSize + 1}) { // Grid level iteration: we need to initialize/check at the grid level // Default range alpaka::memset(queue, counter_d, 0); @@ -125,12 +125,12 @@ int main() { alpaka::createTaskKernel(workdiv, 
testWordDivisionDefaultRange{}, blocks * BlockSize + sizeFuzz, - shift, + skip, counter_d.data())); alpaka::memcpy(queue, counter_h, counter_d); alpaka::wait(queue); auto expected = - expectedCount(workdiv, blocks * BlockSize + sizeFuzz, shift); + expectedCount(workdiv, skip, blocks * BlockSize + sizeFuzz); assert(*counter_h.data() == expected); // ExtentLimited range @@ -140,12 +140,12 @@ int main() { alpaka::createTaskKernel(workdiv, testWordDivisionDefaultRange{}, blocks * BlockSize + sizeFuzz, - shift, + skip, counter_d.data())); alpaka::memcpy(queue, counter_h, counter_d); alpaka::wait(queue); expected = - expectedCount(workdiv, blocks * BlockSize + sizeFuzz, shift); + expectedCount(workdiv, skip, blocks * BlockSize + sizeFuzz); assert(*counter_h.data() == expected); // ExtentLimitedWithShift range @@ -155,12 +155,12 @@ int main() { workdiv, testWordDivisionDefaultRange{}, blocks * BlockSize + sizeFuzz, - shift, + skip, counter_d.data())); alpaka::memcpy(queue, counter_h, counter_d); alpaka::wait(queue); expected = expectedCount( - workdiv, blocks * BlockSize + sizeFuzz, shift); + workdiv, skip, blocks * BlockSize + sizeFuzz); assert(*counter_h.data() == expected); // Block level auto tests @@ -169,23 +169,23 @@ int main() { alpaka::createTaskKernel(workdiv, testWordDivisionDefaultRange{}, blocks * BlockSize + sizeFuzz, - shift, + skip, counter_d.data())); alpaka::enqueue( queue, alpaka::createTaskKernel(workdiv, testWordDivisionDefaultRange{}, blocks * BlockSize + sizeFuzz, - shift, + skip, counter_d.data())); alpaka::enqueue(queue, alpaka::createTaskKernel( workdiv, testWordDivisionDefaultRange{}, blocks * BlockSize + sizeFuzz, - shift, + skip, counter_d.data())); } alpaka::wait(queue); } -} \ No newline at end of file +} From 5a87cef82768464e901153e702c0571ddcc4fad3 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 9 Feb 2024 16:28:04 +0100 Subject: [PATCH 20/25] Rewrite the independent element kernel loops Generalise independent_groups and 
independent_group_elements to work on any single dimension, and rename them to independent_groups_along and independent_group_elements_along. Introduce independent_groups[_x|_y|_z] and independent_group_elements[_x|_y|_z] as specialisations of independent_groups_along and independent_group_elements_along. --- .../AlpakaInterface/interface/workdivision.h | 208 ++++++++++++++---- 1 file changed, 164 insertions(+), 44 deletions(-) diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h index 2c7b439f8e545..6927e202d0954 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h +++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h @@ -697,8 +697,8 @@ namespace cms::alpakatools { * * `uniform_groups(acc, ...)` is a shorthand for `uniform_groups_along<0>(acc, ...)`. * - * `uniform_groups` should be called consistently by all the threads in a block. All threads in a block see the same - * loop iterations, while threads in different blocks may see a different number of iterations. + * `uniform_groups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block see + * the same loop iterations, while threads in different blocks may see a different number of iterations. * If the work division has more blocks than the required number of groups, the first blocks will perform one * iteration of the loop, while the other blocks will exit the loop immediately. 
* If the work division has less blocks than the required number of groups, some of the blocks will perform more than @@ -1000,30 +1000,54 @@ namespace cms::alpakatools { return uniform_group_elements_along(acc, static_cast(args)...); } - /* independent_groups + /* independent_groups_along + * + * `independent_groups_along(acc, groups)` returns a one-dimensional iteratable range than spans the group + * indices from 0 to `groups`; the groups are assigned to the blocks along the `Dim` dimension. If `groups` is not + * specified, it defaults to the number of blocks along the `Dim` dimension. + * + * In a 1-dimensional kernel, `independent_groups(acc, ...)` is a shorthand for + * `independent_groups_along<0>(acc, ...)`. + * + * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by + * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). + * For convenience when converting CUDA or HIP code, `independent_groups_x(acc, ...)`, `_y` and `_z` are shorthands + * for `independent_groups_along(acc, ...)`, `` and ``. * - * `independent_groups(acc, groups)` returns a range than spans the group indices from 0 to `groups`, with one group - * per block: - * - the `groups` argument indicates the total number of groups. + * `independent_groups_along` should be called consistently by all the threads in a block. All threads in a block + * see the same loop iterations, while threads in different blocks may see a different number of iterations. + * If the work division has more blocks than the required number of groups, the first blocks will perform one + * iteration of the loop, while the other blocks will exit the loop immediately. + * If the work division has less blocks than the required number of groups, some of the blocks will perform more than + * one iteration, in order to cover then whole problem space. 
* - * If the work division has more blocks than `groups`, only the first `groups` blocks will perform one iteration of - * the loop, while the other blocks will exit immediately. - * If the work division has less blocks than `groups`, some of the blocks will perform more than one iteration, in - * order to cover then whole problem space. + * For example, + * + * for (auto group: independent_groups_along(acc, 7)) + * + * will return the group range from 0 to 6, distributed across all blocks in the work division. + * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other + * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will + * process one group while block 7 will no process any. + * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the loop, + * in order to cover then whole problem space. For example if the work division has 4 blocks, block 0 will process the + * groups 0 and 4, block 1 will process groups 1 and 5, group 2 will process groups 2 and 6, and block 3 will process + * group 3. 
*/ - template and alpaka::Dim::value == 1>> - class independent_groups { + template and alpaka::Dim::value >= Dim>> + class independent_groups_along { public: - ALPAKA_FN_ACC inline independent_groups(TAcc const& acc) - : first_{alpaka::getIdx(acc)[0u]}, - stride_{alpaka::getWorkDiv(acc)[0u]}, + ALPAKA_FN_ACC inline independent_groups_along(TAcc const& acc) + : first_{alpaka::getIdx(acc)[Dim]}, + stride_{alpaka::getWorkDiv(acc)[Dim]}, extent_{stride_} {} - // extent is the total number of elements (not blocks) - ALPAKA_FN_ACC inline independent_groups(TAcc const& acc, Idx groups) - : first_{alpaka::getIdx(acc)[0u]}, - stride_{alpaka::getWorkDiv(acc)[0u]}, + ALPAKA_FN_ACC inline independent_groups_along(TAcc const& acc, Idx groups) + : first_{alpaka::getIdx(acc)[Dim]}, + stride_{alpaka::getWorkDiv(acc)[Dim]}, extent_{groups} {} class const_iterator; @@ -1034,7 +1058,7 @@ namespace cms::alpakatools { ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); } class const_iterator { - friend class independent_groups; + friend class independent_groups_along; ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first) : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {} @@ -1079,39 +1103,98 @@ namespace cms::alpakatools { const Idx extent_; }; - /* independent_group_elements + /* independent_groups * - * `independent_group_elements(acc, elements)` returns a range that spans all the elements within the given group: - * - the `elements` argument indicates the number of elements in the current group. + * `independent_groups(acc, groups)` returns a one-dimensional iteratable range than spans the group indices from 0 to + * `groups`. If `groups` is not specified, it defaults to the number of blocks. * - * Iterating over the range yields the local element index, between `0` and `elements - 1`. 
The threads in the block - * will perform one or more iterations, depending on the number of elements per thread, and on the number of threads - * per block, compared with the total number of elements. + * `independent_groups(acc, ...)` is a shorthand for `independent_groups_along<0>(acc, ...)`. * - * If the problem size is not a multiple of the block size, different threads may execute a different number of - * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block - * synchronisation is needed, one should split the loop, and synchronise the threads between the loops. + * `independent_groups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block + * see the same loop iterations, while threads in different blocks may see a different number of iterations. + * If the work division has more blocks than the required number of groups, the first blocks will perform one + * iteration of the loop, while the other blocks will exit the loop immediately. + * If the work division has less blocks than the required number of groups, some of the blocks will perform more than + * one iteration, in order to cover then whole problem space. + * + * For example, + * + * for (auto group: independent_groups(acc, 7)) + * + * will return the group range from 0 to 6, distributed across all blocks in the work division. + * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other + * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will + * process one group while block 7 will no process any. + * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the loop, + * in order to cover then whole problem space. 
For example if the work division has 4 blocks, block 0 will process the + * groups 0 and 4, block 1 will process groups 1 and 5, group 2 will process groups 2 and 6, and block 3 will process + * group 3. + * + * Note that `independent_groups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels, + * use + * - `independent_groups_along(acc, ...)` to perform the iteration explicitly along dimension `Dim`; + * - `independent_groups_x(acc, ...)`, `independent_groups_y(acc, ...)`, or `independent_groups_z(acc, ...)` to loop + * along the fastest, second-fastest, or third-fastest dimension. */ - template and alpaka::Dim::value == 1>> - class independent_group_elements { + template and alpaka::Dim::value == 1>> + ALPAKA_FN_ACC inline auto independent_groups(TAcc const& acc, TArgs... args) { + return independent_groups_along(acc, static_cast(args)...); + } + + /* independent_groups_x, _y, _z + * + * Like `independent_groups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest + * dimensions. + */ + + template and (alpaka::Dim::value > 0)>> + ALPAKA_FN_ACC inline auto independent_groups_x(TAcc const& acc, TArgs... args) { + return independent_groups_along::value - 1>(acc, static_cast(args)...); + } + + template and (alpaka::Dim::value > 1)>> + ALPAKA_FN_ACC inline auto independent_groups_y(TAcc const& acc, TArgs... args) { + return independent_groups_along::value - 2>(acc, static_cast(args)...); + } + + template and (alpaka::Dim::value > 2)>> + ALPAKA_FN_ACC inline auto independent_groups_z(TAcc const& acc, TArgs... 
args) { + return independent_groups_along::value - 3>(acc, static_cast(args)...); + } + + /* independent_group_elements_along + */ + + template and alpaka::Dim::value >= Dim>> + class independent_group_elements_along { public: - ALPAKA_FN_ACC inline independent_group_elements(TAcc const& acc) - : elements_{alpaka::getWorkDiv(acc)[0u]}, - thread_{alpaka::getIdx(acc)[0u] * elements_}, - stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, + ALPAKA_FN_ACC inline independent_group_elements_along(TAcc const& acc) + : elements_{alpaka::getWorkDiv(acc)[Dim]}, + thread_{alpaka::getIdx(acc)[Dim] * elements_}, + stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, extent_{stride_} {} - ALPAKA_FN_ACC inline independent_group_elements(TAcc const& acc, Idx extent) - : elements_{alpaka::getWorkDiv(acc)[0u]}, - thread_{alpaka::getIdx(acc)[0u] * elements_}, - stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, + ALPAKA_FN_ACC inline independent_group_elements_along(TAcc const& acc, Idx extent) + : elements_{alpaka::getWorkDiv(acc)[Dim]}, + thread_{alpaka::getIdx(acc)[Dim] * elements_}, + stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, extent_{extent} {} - ALPAKA_FN_ACC inline independent_group_elements(TAcc const& acc, Idx first, Idx extent) - : elements_{alpaka::getWorkDiv(acc)[0u]}, - thread_{alpaka::getIdx(acc)[0u] * elements_ + first}, - stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, + ALPAKA_FN_ACC inline independent_group_elements_along(TAcc const& acc, Idx first, Idx extent) + : elements_{alpaka::getWorkDiv(acc)[Dim]}, + thread_{alpaka::getIdx(acc)[Dim] * elements_ + first}, + stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, extent_{extent} {} class const_iterator; @@ -1122,7 +1205,7 @@ namespace cms::alpakatools { ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); } class const_iterator { - friend class independent_group_elements; + friend class independent_group_elements_along; ALPAKA_FN_ACC inline 
const_iterator(Idx elements, Idx stride, Idx extent, Idx first) : elements_{elements}, @@ -1189,6 +1272,43 @@ namespace cms::alpakatools { const Idx extent_; }; + /* independent_group_elements + */ + + template and alpaka::Dim::value == 1>> + ALPAKA_FN_ACC inline auto independent_group_elements(TAcc const& acc, TArgs... args) { + return independent_group_elements_along(acc, static_cast(args)...); + } + + /* independent_group_elements_x, _y, _z + * + * Like `independent_group_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest + * dimensions. + */ + + template and (alpaka::Dim::value > 0)>> + ALPAKA_FN_ACC inline auto independent_group_elements_x(TAcc const& acc, TArgs... args) { + return independent_group_elements_along::value - 1>(acc, static_cast(args)...); + } + + template and (alpaka::Dim::value > 1)>> + ALPAKA_FN_ACC inline auto independent_group_elements_y(TAcc const& acc, TArgs... args) { + return independent_group_elements_along::value - 2>(acc, static_cast(args)...); + } + + template and (alpaka::Dim::value > 2)>> + ALPAKA_FN_ACC inline auto independent_group_elements_z(TAcc const& acc, TArgs... args) { + return independent_group_elements_along::value - 3>(acc, static_cast(args)...); + } + /* once_per_grid * * `once_per_grid(acc)` returns true for a single thread within the kernel execution grid. 
From 70371a8e449917b4b3583331e4c4fc4b93af7f99 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sun, 11 Feb 2024 07:24:24 +0100 Subject: [PATCH 21/25] Rewrite zeroAndInit kernel using alpakatools utilities --- .../AlpakaInterface/interface/OneToManyAssoc.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h b/HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h index e7834d39a8910..866564d3f896e 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h +++ b/HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h @@ -96,20 +96,15 @@ namespace cms::alpakatools { struct zeroAndInit { template ALPAKA_FN_ACC void operator()(const TAcc &acc, View view) const { - auto h = view.assoc; ALPAKA_ASSERT_OFFLOAD((1 == alpaka::getWorkDiv(acc)[0])); ALPAKA_ASSERT_OFFLOAD((0 == alpaka::getIdx(acc)[0])); - - auto first = alpaka::getIdx(acc)[0]; - - if (0 == first) { + auto h = view.assoc; + if (cms::alpakatools::once_per_block(acc)) { h->psws = 0; h->initStorage(view); } alpaka::syncBlockThreads(acc); - // TODO use for_each_element_in_grid_strided (or similar) - for (int i = first, nt = h->totOnes(); i < nt; - i += alpaka::getWorkDiv(acc)[0]) { + for (int i : cms::alpakatools::independent_group_elements(acc, h->totOnes())) { h->off[i] = 0; } } From 2d01108d07b6c1b2e06e2179e66f66e3539368cb Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 9 Feb 2024 15:36:25 +0100 Subject: [PATCH 22/25] Rewrite pixel clustering and rechits using alpakatools utilities Rewrite the alpaka pixel clustering and rechit-building code using the uniform and independent kernel utilities from cms::alpakatools. 
--- .../plugins/alpaka/CalibPixel.h | 15 +- .../alpaka/SiPixelRawToClusterKernel.dev.cc | 55 +-- .../plugins/alpaka/PixelRecHitKernels.dev.cc | 16 +- .../plugins/alpaka/PixelRecHits.h | 368 +++++++++--------- 4 files changed, 227 insertions(+), 227 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h index 2808255782bc9..e34df782db4dc 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h @@ -4,18 +4,17 @@ #include #include #include -#include +#include #include #include "CondFormats/SiPixelObjects/interface/SiPixelGainCalibrationForHLTLayout.h" -#include "CondFormats/SiPixelObjects/interface/alpaka/SiPixelGainCalibrationForHLTDevice.h" #include "CondFormats/SiPixelObjects/interface/alpaka/SiPixelGainCalibrationForHLTUtilities.h" #include "DataFormats/SiPixelClusterSoA/interface/ClusteringConstants.h" #include "DataFormats/SiPixelClusterSoA/interface/SiPixelClustersSoA.h" -#include "DataFormats/SiPixelDigiSoA/interface/SiPixelDigiErrorsSoA.h" #include "DataFormats/SiPixelDigiSoA/interface/SiPixelDigisSoA.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelClusterThresholds.h" //#define GPU_DEBUG @@ -107,9 +106,11 @@ namespace calibPixel { clus_view[0].clusModuleStart() = clus_view[0].moduleStart() = 0; } - cms::alpakatools::for_each_element_in_grid_strided( - acc, phase2PixelTopology::numberOfModules, [&](uint32_t i) { clus_view[i].clusInModule() = 0; }); - cms::alpakatools::for_each_element_in_grid_strided(acc, numElements, [&](uint32_t i) { + for (uint32_t i : cms::alpakatools::elements_with_stride(acc, phase2PixelTopology::numberOfModules)) { + clus_view[i].clusInModule() = 0; + } + + for (uint32_t i : 
cms::alpakatools::elements_with_stride(acc, numElements)) { auto dvgi = view[i]; if (pixelClustering::invalidModuleId != dvgi.moduleId()) { const int mode = (Phase2ReadoutMode < -1 ? -1 : Phase2ReadoutMode); @@ -131,7 +132,7 @@ namespace calibPixel { } dvgi.adc() = std::min(adc_int, int(std::numeric_limits::max())); } - }); + } } }; } // namespace calibPixel diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc index 9725ee10d2855..1cb55b0a27955 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc @@ -1,30 +1,33 @@ // C++ includes #include #include -#include +#include #include -#include -#include -#include -#include -#include -#include -#include +#include +// Alpaka includes #include // CMSSW includes -#include "HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h" -#include "HeterogeneousCore/AlpakaInterface/interface/config.h" -#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" -#include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h" -#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "CondFormats/SiPixelObjects/interface/SiPixelGainCalibrationForHLTLayout.h" #include "CondFormats/SiPixelObjects/interface/SiPixelMappingLayout.h" #include "DataFormats/DetId/interface/DetId.h" #include "DataFormats/SiPixelClusterSoA/interface/ClusteringConstants.h" +#include "DataFormats/SiPixelClusterSoA/interface/SiPixelClustersSoA.h" +#include "DataFormats/SiPixelClusterSoA/interface/alpaka/SiPixelClustersSoACollection.h" #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" #include "DataFormats/SiPixelDigi/interface/SiPixelDigiConstants.h" +#include "DataFormats/SiPixelDigiSoA/interface/SiPixelDigiErrorsSoA.h" +#include 
"DataFormats/SiPixelDigiSoA/interface/SiPixelDigisSoA.h" +#include "DataFormats/SiPixelDigiSoA/interface/alpaka/SiPixelDigiErrorsSoACollection.h" +#include "DataFormats/SiPixelDigiSoA/interface/alpaka/SiPixelDigisSoACollection.h" +#include "DataFormats/SiPixelRawData/interface/SiPixelErrorCompact.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelClusterThresholds.h" // local includes #include "CalibPixel.h" @@ -442,9 +445,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #endif // limit to maxHitsInModule; - cms::alpakatools::for_each_element_in_block_strided(acc, numberOfModules, [&](uint32_t i) { + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, numberOfModules)) { clus_view[i + 1].clusModuleStart() = std::min(maxHitsInModule, clus_view[i].clusInModule()); - }); + } constexpr bool isPhase2 = std::is_base_of::value; constexpr auto leftModules = isPhase2 ? 1024 : numberOfModules - 1024; @@ -468,20 +471,20 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } constexpr auto lastModule = isPhase2 ? 
2049u : numberOfModules + 1; - cms::alpakatools::for_each_element_in_block_strided(acc, lastModule, 1025u, [&](uint32_t i) { + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, 1025u, lastModule)) { clus_view[i].clusModuleStart() += clus_view[1024].clusModuleStart(); - }); + } alpaka::syncBlockThreads(acc); if constexpr (isPhase2) { - cms::alpakatools::for_each_element_in_block_strided(acc, 3073u, 2049u, [&](uint32_t i) { + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, 2049u, 3073u)) { clus_view[i].clusModuleStart() += clus_view[2048].clusModuleStart(); - }); + } alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block_strided(acc, numberOfModules + 1, 3073u, [&](uint32_t i) { + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, 3073u, numberOfModules + 1)) { clus_view[i].clusModuleStart() += clus_view[3072].clusModuleStart(); - }); + } alpaka::syncBlockThreads(acc); } #ifdef GPU_DEBUG @@ -492,22 +495,22 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(clus_view[1025].moduleStart() >= clus_view[1024].moduleStart()); ALPAKA_ASSERT_OFFLOAD(clus_view[numberOfModules].moduleStart() >= clus_view[1025].moduleStart()); - cms::alpakatools::for_each_element_in_block_strided(acc, numberOfModules + 1, [&](uint32_t i) { + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, numberOfModules + 1)) { if (0 != i) - ALPAKA_ASSERT_OFFLOAD(clus_view[i].moduleStart() >= clus_view[i - i].moduleStart()); + ALPAKA_ASSERT_OFFLOAD(clus_view[i].moduleStart() >= clus_view[i - 1].moduleStart()); // Check BPX2 (1), FP1 (4) constexpr auto bpix2 = TrackerTraits::layerStart[1]; constexpr auto fpix1 = TrackerTraits::layerStart[4]; if (i == bpix2 || i == fpix1) printf("moduleStart %d %d\n", i, clus_view[i].moduleStart()); - }); + } #endif // avoid overflow constexpr auto MAX_HITS = TrackerTraits::maxNumberOfHits; - cms::alpakatools::for_each_element_in_block_strided(acc, numberOfModules + 1, 
[&](uint32_t i) { + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, numberOfModules + 1)) { if (clus_view[i].clusModuleStart() > MAX_HITS) clus_view[i].clusModuleStart() = MAX_HITS; - }); + } } // end of FillHitsModuleStart kernel operator() }; // end of FillHitsModuleStart struct diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHitKernels.dev.cc b/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHitKernels.dev.cc index 95592e1d5b3a2..63e269cc79453 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHitKernels.dev.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHitKernels.dev.cc @@ -1,16 +1,24 @@ // C++ headers -#include -#include +#include +#include +#include // Alpaka headers #include // CMSSW headers #include "DataFormats/BeamSpot/interface/BeamSpotPOD.h" -#include "DataFormats/SiPixelClusterSoA/interface/ClusteringConstants.h" +#include "DataFormats/SiPixelClusterSoA/interface/alpaka/SiPixelClustersSoACollection.h" +#include "DataFormats/SiPixelDigiSoA/interface/alpaka/SiPixelDigisSoACollection.h" +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" +#include "DataFormats/TrackingRecHitSoA/interface/alpaka/TrackingRecHitsSoACollection.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" #include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h" +// local headers #include "PixelRecHitKernel.h" #include "PixelRecHits.h" @@ -63,6 +71,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // protect from empty events if (activeModulesWithDigis) { int threadsPerBlock = 128; + // note: the kernel should work with an arbitrary number of blocks int blocks = activeModulesWithDigis; const auto workDiv1D = cms::alpakatools::make_workdiv(blocks, 
threadsPerBlock); @@ -77,6 +86,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { bs_d, digis_d.view(), digis_d.nDigis(), + digis_d.nModules(), clusters_d.view(), hits_d.view()); #ifdef GPU_DEBUG diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHits.h index 45587034b572b..d90f38c11c984 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHits.h @@ -1,20 +1,22 @@ #ifndef RecoLocalTracker_SiPixelRecHits_alpaka_PixelRecHits_h #define RecoLocalTracker_SiPixelRecHits_alpaka_PixelRecHits_h -#include +// C++ headers +#include #include -#include #include +#include +// Alpaka headers #include +// CMSSW headers #include "DataFormats/BeamSpot/interface/BeamSpotPOD.h" #include "DataFormats/Math/interface/approx_atan2.h" #include "DataFormats/SiPixelClusterSoA/interface/ClusteringConstants.h" -#include "DataFormats/SiPixelDigiSoA/interface/SiPixelDigisDevice.h" -#include "DataFormats/SiPixelDigiSoA/interface/alpaka/SiPixelDigisSoACollection.h" +#include "DataFormats/SiPixelClusterSoA/interface/SiPixelClustersSoA.h" +#include "DataFormats/SiPixelDigiSoA/interface/SiPixelDigisSoA.h" #include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h" @@ -33,205 +35,189 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { BeamSpotPOD const* __restrict__ bs, SiPixelDigisSoAConstView digis, uint32_t numElements, + uint32_t nonEmptyModules, SiPixelClustersSoAConstView clusters, TrackingRecHitSoAView hits) const { - // FIXME - // the compiler seems NOT to optimize loads from views (even in a simple test case) - // The whole gimnastic here of copying or not is a 
pure heuristic exercise that seems to produce the fastest code with the above signature - // not using views (passing a gazzilion of array pointers) seems to produce the fastest code (but it is harder to mantain) - ALPAKA_ASSERT_OFFLOAD(cpeParams); - const uint32_t blockIdx(alpaka::getIdx(acc)[0u]); - - // copy average geometry corrected by beamspot . FIXME (move it somewhere else???) - if (0 == blockIdx) { - auto& agc = hits.averageGeometry(); - auto const& ag = cpeParams->averageGeometry(); - auto nLadders = TrackerTraits::numberOfLaddersInBarrel; - - cms::alpakatools::for_each_element_in_block_strided(acc, nLadders, [&](uint32_t il) { - agc.ladderZ[il] = ag.ladderZ[il] - bs->z; - agc.ladderX[il] = ag.ladderX[il] - bs->x; - agc.ladderY[il] = ag.ladderY[il] - bs->y; - agc.ladderR[il] = sqrt(agc.ladderX[il] * agc.ladderX[il] + agc.ladderY[il] * agc.ladderY[il]); - agc.ladderMinZ[il] = ag.ladderMinZ[il] - bs->z; - agc.ladderMaxZ[il] = ag.ladderMaxZ[il] - bs->z; - }); - - if (cms::alpakatools::once_per_block(acc)) { - agc.endCapZ[0] = ag.endCapZ[0] - bs->z; - agc.endCapZ[1] = ag.endCapZ[1] - bs->z; + // outer loop: one block per module + for (uint32_t module : cms::alpakatools::independent_groups(acc, nonEmptyModules)) { + // This is necessary only once - consider moving it somewhere else. + // Copy the average geometry corrected by the beamspot. 
+ if (0 == module) { + auto& agc = hits.averageGeometry(); + auto const& ag = cpeParams->averageGeometry(); + auto nLadders = TrackerTraits::numberOfLaddersInBarrel; + + for (uint32_t il : cms::alpakatools::independent_group_elements(acc, nLadders)) { + agc.ladderZ[il] = ag.ladderZ[il] - bs->z; + agc.ladderX[il] = ag.ladderX[il] - bs->x; + agc.ladderY[il] = ag.ladderY[il] - bs->y; + agc.ladderR[il] = sqrt(agc.ladderX[il] * agc.ladderX[il] + agc.ladderY[il] * agc.ladderY[il]); + agc.ladderMinZ[il] = ag.ladderMinZ[il] - bs->z; + agc.ladderMaxZ[il] = ag.ladderMaxZ[il] - bs->z; + } + + if (cms::alpakatools::once_per_block(acc)) { + agc.endCapZ[0] = ag.endCapZ[0] - bs->z; + agc.endCapZ[1] = ag.endCapZ[1] - bs->z; + } } - } - - // to be moved in common namespace... - using pixelClustering::invalidModuleId; - constexpr int32_t MaxHitsInIter = pixelCPEforDevice::MaxHitsInIter; - using ClusParams = pixelCPEforDevice::ClusParams; + // to be moved in common namespace... + using pixelClustering::invalidModuleId; + constexpr int32_t maxHitsInIter = pixelCPEforDevice::MaxHitsInIter; - // as usual one block per module - auto& clusParams = alpaka::declareSharedVar(acc); + auto me = clusters[module].moduleId(); + int nclus = clusters[me].clusInModule(); - auto me = clusters[blockIdx].moduleId(); - int nclus = clusters[me].clusInModule(); + // skip empty modules + if (0 == nclus) + continue; - if (0 == nclus) - return; #ifdef GPU_DEBUG - if (cms::alpakatools::once_per_block(acc)) { - auto k = clusters[1 + blockIdx].moduleStart(); - while (digis[k].moduleId() == invalidModuleId) - ++k; - ALPAKA_ASSERT_OFFLOAD(digis[k].moduleId() == me); - } - - if (me % 100 == 1) - if (cms::alpakatools::once_per_block(acc)) - printf( - "hitbuilder: %d clusters in module %d. 
will write at %d\n", nclus, me, clusters[me].clusModuleStart()); -#endif - - for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) { - auto first = clusters[1 + blockIdx].moduleStart(); - - int nClusInIter = alpaka::math::min(acc, MaxHitsInIter, endClus - startClus); - int lastClus = startClus + nClusInIter; - assert(nClusInIter <= nclus); - assert(nClusInIter > 0); - assert(lastClus <= nclus); - - assert(nclus > MaxHitsInIter || (0 == startClus && nClusInIter == nclus && lastClus == nclus)); - - // init - cms::alpakatools::for_each_element_in_block_strided(acc, nClusInIter, [&](uint32_t ic) { - clusParams.minRow[ic] = std::numeric_limits::max(); - clusParams.maxRow[ic] = 0; - clusParams.minCol[ic] = std::numeric_limits::max(); - clusParams.maxCol[ic] = 0; - clusParams.charge[ic] = 0; - clusParams.q_f_X[ic] = 0; - clusParams.q_l_X[ic] = 0; - clusParams.q_f_Y[ic] = 0; - clusParams.q_l_Y[ic] = 0; - }); - - alpaka::syncBlockThreads(acc); - - // one thread per "digi" - const uint32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_index_range_in_block(acc, first); - uint32_t rowsColsFirstElementIdx = firstElementIdxNoStride; - uint32_t rowsColsEndElementIdx = endElementIdxNoStride; - for (uint32_t i = rowsColsFirstElementIdx; i < numElements; ++i) { - if (not cms::alpakatools::next_valid_element_index_strided( - i, rowsColsFirstElementIdx, rowsColsEndElementIdx, blockDimension, numElements)) - break; - auto id = digis[i].moduleId(); - if (id == invalidModuleId) - continue; // not valid - if (id != me) - break; // end of module - auto cl = digis[i].clus(); - if (cl < startClus || cl >= lastClus) - continue; - cl -= startClus; - ALPAKA_ASSERT_OFFLOAD(cl >= 0); - ALPAKA_ASSERT_OFFLOAD(cl < MaxHitsInIter); - auto x = digis[i].xx(); - auto y = digis[i].yy(); - alpaka::atomicMin(acc, &clusParams.minRow[cl], (uint32_t)x, alpaka::hierarchy::Threads{}); 
- alpaka::atomicMax(acc, &clusParams.maxRow[cl], (uint32_t)x, alpaka::hierarchy::Threads{}); - alpaka::atomicMin(acc, &clusParams.minCol[cl], (uint32_t)y, alpaka::hierarchy::Threads{}); - alpaka::atomicMax(acc, &clusParams.maxCol[cl], (uint32_t)y, alpaka::hierarchy::Threads{}); - } - - alpaka::syncBlockThreads(acc); - - auto pixmx = cpeParams->detParams(me).pixmx; - uint32_t chargeFirstElementIdx = firstElementIdxNoStride; - uint32_t chargeEndElementIdx = endElementIdxNoStride; - for (uint32_t i = chargeFirstElementIdx; i < numElements; ++i) { - if (not cms::alpakatools::next_valid_element_index_strided( - i, chargeFirstElementIdx, chargeEndElementIdx, blockDimension, numElements)) - break; - auto id = digis[i].moduleId(); - if (id == invalidModuleId) - continue; // not valid - if (id != me) - break; // end of module - auto cl = digis[i].clus(); - if (cl < startClus || cl >= lastClus) - continue; - cl -= startClus; - ALPAKA_ASSERT_OFFLOAD(cl >= 0); - ALPAKA_ASSERT_OFFLOAD(cl < MaxHitsInIter); - auto x = digis[i].xx(); - auto y = digis[i].yy(); - auto ch = digis[i].adc(); - alpaka::atomicAdd(acc, &clusParams.charge[cl], (int32_t)ch, alpaka::hierarchy::Threads{}); - ch = alpaka::math::min(acc, ch, pixmx); - if (clusParams.minRow[cl] == x) - alpaka::atomicAdd(acc, &clusParams.q_f_X[cl], (int32_t)ch, alpaka::hierarchy::Threads{}); - if (clusParams.maxRow[cl] == x) - alpaka::atomicAdd(acc, &clusParams.q_l_X[cl], (int32_t)ch, alpaka::hierarchy::Threads{}); - if (clusParams.minCol[cl] == y) - alpaka::atomicAdd(acc, &clusParams.q_f_Y[cl], (int32_t)ch, alpaka::hierarchy::Threads{}); - if (clusParams.maxCol[cl] == y) - alpaka::atomicAdd(acc, &clusParams.q_l_Y[cl], (int32_t)ch, alpaka::hierarchy::Threads{}); + if (cms::alpakatools::once_per_block(acc)) { + auto k = clusters[1 + module].moduleStart(); + while (digis[k].moduleId() == invalidModuleId) + ++k; + ALPAKA_ASSERT_OFFLOAD(digis[k].moduleId() == me); } - alpaka::syncBlockThreads(acc); - - // next one cluster per 
thread... - first = clusters[me].clusModuleStart() + startClus; - cms::alpakatools::for_each_element_in_block_strided(acc, nClusInIter, [&](uint32_t ic) { - auto h = first + ic; // output index in global memory - - assert(h < (uint32_t)hits.metadata().size()); - assert(h < clusters[me + 1].clusModuleStart()); - - pixelCPEforDevice::position( - cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); - - pixelCPEforDevice::errorFromDB( - cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); - - // store it - hits[h].chargeAndStatus().charge = clusParams.charge[ic]; - hits[h].chargeAndStatus().status = clusParams.status[ic]; - hits[h].detectorIndex() = me; - - float xl, yl; - hits[h].xLocal() = xl = clusParams.xpos[ic]; - hits[h].yLocal() = yl = clusParams.ypos[ic]; - - hits[h].clusterSizeX() = clusParams.xsize[ic]; - hits[h].clusterSizeY() = clusParams.ysize[ic]; - - hits[h].xerrLocal() = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX; - hits[h].yerrLocal() = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY; - - // keep it local for computations - float xg, yg, zg; - // to global and compute phi... - cpeParams->detParams(me).frame.toGlobal(xl, yl, xg, yg, zg); - // here correct for the beamspot... - xg -= bs->x; - yg -= bs->y; - zg -= bs->z; - - hits[h].xGlobal() = xg; - hits[h].yGlobal() = yg; - hits[h].zGlobal() = zg; + if (me % 100 == 1) + if (cms::alpakatools::once_per_block(acc)) + printf("hitbuilder: %d clusters in module %d. 
will write at %d\n", + nclus, + me, + clusters[me].clusModuleStart()); +#endif - hits[h].rGlobal() = alpaka::math::sqrt(acc, xg * xg + yg * yg); - hits[h].iphi() = unsafe_atan2s<7>(yg, xg); - }); - alpaka::syncBlockThreads(acc); - } // end loop on batches + auto& clusParams = alpaka::declareSharedVar(acc); + for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += maxHitsInIter) { + auto first = clusters[1 + module].moduleStart(); + + int nClusInIter = alpaka::math::min(acc, maxHitsInIter, endClus - startClus); + int lastClus = startClus + nClusInIter; + assert(nClusInIter <= nclus); + assert(nClusInIter > 0); + assert(lastClus <= nclus); + + assert(nclus > maxHitsInIter || (0 == startClus && nClusInIter == nclus && lastClus == nclus)); + + // init + for (uint32_t ic : cms::alpakatools::independent_group_elements(acc, nClusInIter)) { + clusParams.minRow[ic] = std::numeric_limits::max(); + clusParams.maxRow[ic] = 0; + clusParams.minCol[ic] = std::numeric_limits::max(); + clusParams.maxCol[ic] = 0; + clusParams.charge[ic] = 0; + clusParams.q_f_X[ic] = 0; + clusParams.q_l_X[ic] = 0; + clusParams.q_f_Y[ic] = 0; + clusParams.q_l_Y[ic] = 0; + } + + alpaka::syncBlockThreads(acc); + + // one thread or element per "digi" + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, first, numElements)) { + auto id = digis[i].moduleId(); + if (id == invalidModuleId) + continue; // not valid + if (id != me) + break; // end of module + auto cl = digis[i].clus(); + if (cl < startClus || cl >= lastClus) + continue; + cl -= startClus; + ALPAKA_ASSERT_OFFLOAD(cl >= 0); + ALPAKA_ASSERT_OFFLOAD(cl < maxHitsInIter); + auto x = digis[i].xx(); + auto y = digis[i].yy(); + alpaka::atomicMin(acc, &clusParams.minRow[cl], (uint32_t)x, alpaka::hierarchy::Threads{}); + alpaka::atomicMax(acc, &clusParams.maxRow[cl], (uint32_t)x, alpaka::hierarchy::Threads{}); + alpaka::atomicMin(acc, &clusParams.minCol[cl], (uint32_t)y, alpaka::hierarchy::Threads{}); + 
alpaka::atomicMax(acc, &clusParams.maxCol[cl], (uint32_t)y, alpaka::hierarchy::Threads{}); + } + + alpaka::syncBlockThreads(acc); + + auto pixmx = cpeParams->detParams(me).pixmx; + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, first, numElements)) { + auto id = digis[i].moduleId(); + if (id == invalidModuleId) + continue; // not valid + if (id != me) + break; // end of module + auto cl = digis[i].clus(); + if (cl < startClus || cl >= lastClus) + continue; + cl -= startClus; + ALPAKA_ASSERT_OFFLOAD(cl >= 0); + ALPAKA_ASSERT_OFFLOAD(cl < maxHitsInIter); + auto x = digis[i].xx(); + auto y = digis[i].yy(); + auto ch = digis[i].adc(); + alpaka::atomicAdd(acc, &clusParams.charge[cl], (int32_t)ch, alpaka::hierarchy::Threads{}); + ch = alpaka::math::min(acc, ch, pixmx); + if (clusParams.minRow[cl] == x) + alpaka::atomicAdd(acc, &clusParams.q_f_X[cl], (int32_t)ch, alpaka::hierarchy::Threads{}); + if (clusParams.maxRow[cl] == x) + alpaka::atomicAdd(acc, &clusParams.q_l_X[cl], (int32_t)ch, alpaka::hierarchy::Threads{}); + if (clusParams.minCol[cl] == y) + alpaka::atomicAdd(acc, &clusParams.q_f_Y[cl], (int32_t)ch, alpaka::hierarchy::Threads{}); + if (clusParams.maxCol[cl] == y) + alpaka::atomicAdd(acc, &clusParams.q_l_Y[cl], (int32_t)ch, alpaka::hierarchy::Threads{}); + } + + alpaka::syncBlockThreads(acc); + + // next one cluster per thread... 
+ first = clusters[me].clusModuleStart() + startClus; + for (uint32_t ic : cms::alpakatools::independent_group_elements(acc, nClusInIter)) { + auto h = first + ic; // output index in global memory + + assert(h < (uint32_t)hits.metadata().size()); + assert(h < clusters[me + 1].clusModuleStart()); + + pixelCPEforDevice::position( + cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); + + pixelCPEforDevice::errorFromDB( + cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); + + // store it + hits[h].chargeAndStatus().charge = clusParams.charge[ic]; + hits[h].chargeAndStatus().status = clusParams.status[ic]; + hits[h].detectorIndex() = me; + + // local coordinates for computations + float xl, yl; + hits[h].xLocal() = xl = clusParams.xpos[ic]; + hits[h].yLocal() = yl = clusParams.ypos[ic]; + + hits[h].clusterSizeX() = clusParams.xsize[ic]; + hits[h].clusterSizeY() = clusParams.ysize[ic]; + + hits[h].xerrLocal() = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX; + hits[h].yerrLocal() = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY; + + // global coordinates and phi computation + float xg, yg, zg; + cpeParams->detParams(me).frame.toGlobal(xl, yl, xg, yg, zg); + // correct for the beamspot position + xg -= bs->x; + yg -= bs->y; + zg -= bs->z; + + hits[h].xGlobal() = xg; + hits[h].yGlobal() = yg; + hits[h].zGlobal() = zg; + hits[h].rGlobal() = alpaka::math::sqrt(acc, xg * xg + yg * yg); + hits[h].iphi() = unsafe_atan2s<7>(yg, xg); + } + alpaka::syncBlockThreads(acc); + } // end loop on batches + } } }; From 1bd5d4a2afae18dcc46d65e6d86311736e6a8da1 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sun, 11 Feb 2024 08:34:53 +0100 Subject: [PATCH 23/25] Rewrite pixel seeding using alpakatools utilities Rewrite the alpaka pixel seeding code using the uniform and independent kernel utilities from cms::alpakatools. 
--- .../plugins/CAHitNtupletGeneratorKernels.cc | 4 +- .../plugins/CAHitNtupletGeneratorKernels.cu | 2 +- .../PixelSeeding/plugins/alpaka/CAFishbone.h | 32 +--- .../alpaka/CAHitNtupletGeneratorKernelsImpl.h | 148 ++++++++---------- .../plugins/alpaka/CAPixelDoubletsAlgos.h | 44 ++---- 5 files changed, 82 insertions(+), 148 deletions(-) diff --git a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.cc b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.cc index 9ab908a037bd7..7646da18faf17 100644 --- a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.cc @@ -1,7 +1,7 @@ -#include "RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsImpl.h" - #include +#include "CAHitNtupletGeneratorKernelsImpl.h" + namespace { // cuda atomics are NOT atomics on CPU so protect stat update with a mutex // waiting for a more general solution (incuding multiple devices) to be proposed and implemented diff --git a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.cu b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.cu index 6e07126e9e428..e846622b951a8 100644 --- a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,6 +1,6 @@ #include -#include "RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsImpl.h" +#include "CAHitNtupletGeneratorKernelsImpl.h" //#define GPU_DEBUG //#define NTUPLE_DEBUG diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h index 0e04350651aa6..957c86a3669ca 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h @@ -58,25 +58,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caPixelDoublets { uint16_t d[maxCellsPerHit]; uint8_t l[maxCellsPerHit]; - // index Dim::value - 1 runs faster... 
- constexpr uint32_t dimIndexX = 1u; - const uint32_t blockDimensionX(alpaka::getWorkDiv(acc)[dimIndexX]); - const auto [firstElementIdxNoStrideX, endElementIdxNoStrideX] = - cms::alpakatools::element_index_range_in_block(acc, 0u, dimIndexX); - - // Outermost parallel loop on the slower dimension (Y or 0 in a 2D grid) - constexpr uint32_t dimIndexY = 0u; - const uint32_t gridDimensionY(alpaka::getWorkDiv(acc)[dimIndexY]); - const auto [firstElementIdxNoStrideY, endElementIdxNoStrideY] = - cms::alpakatools::element_index_range_in_grid(acc, 0u, dimIndexY); - uint32_t firstElementIdxY = firstElementIdxNoStrideY; - uint32_t endElementIdxY = endElementIdxNoStrideY; - - for (uint32_t idy = firstElementIdxY, nt = nHits - layer2Offset; idy < nt; ++idy) { - if (not cms::alpakatools::next_valid_element_index_strided( - idy, firstElementIdxY, endElementIdxY, gridDimensionY, nt)) - break; - + // outermost parallel loop, using all grid elements along the slower dimension (Y or 0 in a 2D grid) + for (uint32_t idy : cms::alpakatools::uniform_elements_y(acc, nHits - layer2Offset)) { auto const& vc = isOuterHitOfCell[idy]; auto size = vc.size(); if (size < 2) @@ -106,15 +89,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caPixelDoublets { if (sg < 2) continue; - // here we parallelize in X - // Innermost parallel loop on the faster dimension (X or 1 in a 2D grid) - uint32_t firstElementIdxX = firstElementIdxNoStrideX; - uint32_t endElementIdxX = endElementIdxNoStrideX; - for (uint32_t ic = firstElementIdxX; (int)ic < sg - 1; ++ic) { - if (not cms::alpakatools::next_valid_element_index_strided( - ic, firstElementIdxX, endElementIdxX, blockDimensionX, sg - 1)) - break; - + // innermost parallel loop, using the block elements along the faster dimension (X or 1 in a 2D grid) + for (uint32_t ic : cms::alpakatools::independent_group_elements_x(acc, sg - 1)) { auto& ci = cells[cc[ic]]; for (auto jc = ic + 1; (int)jc < sg; ++jc) { auto& cj = cells[cc[jc]]; diff --git 
a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h index 0153c78868519..68b6e597e93c8 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h @@ -112,9 +112,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { ALPAKA_ASSERT_OFFLOAD(tracks_view.hitIndices().size() == apc->get().second); } } - const auto ntNbins = foundNtuplets->nbins(); - for (auto idx : cms::alpakatools::elements_with_stride(acc, ntBins)) { + for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { if (tracks_view.hitIndices().size(idx) > TrackerTraits::maxHitsOnTrack) // current real limit printf("ERROR %d, %d\n", idx, tracks_view.hitIndices().size(idx)); ALPAKA_ASSERT_OFFLOAD(ftracks_view.hitIndices().size(idx) <= TrackerTraits::maxHitsOnTrack); @@ -142,8 +141,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { #endif } - const auto ntNCells = (*nCells); - for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) { + for (auto idx : cms::alpakatools::elements_with_stride(acc, *nCells)) { auto const &thisCell = cells[idx]; if (thisCell.hasFishbone() && !thisCell.isKilled()) alpaka::atomicAdd(acc, &c.nFishCells, 1ull, alpaka::hierarchy::Blocks{}); @@ -159,6 +157,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { alpaka::atomicAdd(acc, &c.nZeroTrackCells, 1ull, alpaka::hierarchy::Blocks{}); } + // FIXME this loop was up to nHits - isOuterHitOfCell.offset in the CUDA version for (auto idx : cms::alpakatools::elements_with_stride(acc, nHits)) if ((*isOuterHitOfCell).container[idx].full()) // ++tooManyOuterHitOfCell; printf("OuterHitOfCell overflow %d\n", idx); @@ -174,9 +173,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { uint32_t const 
*__restrict__ nCells, TkSoAView tracks_view) const { constexpr auto reject = Quality::dup; - const auto ntNCells = (*nCells); - for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) { + for (auto idx : cms::alpakatools::elements_with_stride(acc, *nCells)) { auto const &thisCell = cells[idx]; if (!thisCell.isKilled()) continue; @@ -186,6 +184,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { } } }; + // remove shorter tracks if sharing a cell // It does not seem to affect efficiency in any way! template @@ -200,9 +199,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { // quality to mark rejected constexpr auto reject = Quality::edup; /// cannot be loose ALPAKA_ASSERT_OFFLOAD(nCells); - const auto ntNCells = (*nCells); - - for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) { + for (auto idx : cms::alpakatools::elements_with_stride(acc, *nCells)) { auto const &thisCell = cells[idx]; if (thisCell.tracks().size() < 2) @@ -217,12 +214,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { } // if (maxNl<4) continue; - // quad pass through (leave it her for tests) + // quad pass through (leave it here for tests) // maxNl = std::min(4, maxNl); for (auto it : thisCell.tracks()) { if (tracks_view[it].nLayers() < maxNl) - tracks_view[it].quality() = reject; //no race: simple assignment of the same constant + tracks_view[it].quality() = reject; // no race: simple assignment of the same constant } } } @@ -333,70 +330,54 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { CAParams params) const { using Cell = CACellT; - const uint32_t dimIndexY = 0u; - const uint32_t dimIndexX = 1u; - const uint32_t threadIdxY(alpaka::getIdx(acc)[dimIndexY]); - const uint32_t threadIdxLocalX(alpaka::getIdx(acc)[dimIndexX]); - - if (0 == (threadIdxY + threadIdxLocalX)) { - (*apc1) = 0; - (*apc2) = 0; + if (cms::alpakatools::once_per_grid(acc)) { + *apc1 = 0; + *apc2 
= 0; } // ready for next kernel - constexpr uint32_t last_bpix1_detIndex = TrackerTraits::last_bpix1_detIndex; - constexpr uint32_t last_barrel_detIndex = TrackerTraits::last_barrel_detIndex; - - cms::alpakatools::for_each_element_in_grid_strided( - acc, - (*nCells), - 0u, - [&](uint32_t idx) { - auto cellIndex = idx; - auto &thisCell = cells[idx]; - auto innerHitId = thisCell.inner_hit_id(); - if (int(innerHitId) >= isOuterHitOfCell->offset) { - uint32_t numberOfPossibleNeighbors = (*isOuterHitOfCell)[innerHitId].size(); - auto vi = (*isOuterHitOfCell)[innerHitId].data(); - auto ri = thisCell.inner_r(hh); - auto zi = thisCell.inner_z(hh); - auto ro = thisCell.outer_r(hh); - auto zo = thisCell.outer_z(hh); - auto isBarrel = thisCell.inner_detIndex(hh) < last_barrel_detIndex; - - cms::alpakatools::for_each_element_in_block_strided( - acc, - numberOfPossibleNeighbors, - 0u, - [&](uint32_t j) { - auto otherCell = (vi[j]); - auto &oc = cells[otherCell]; - auto r1 = oc.inner_r(hh); - auto z1 = oc.inner_z(hh); - bool aligned = - Cell::areAlignedRZ(r1, - z1, - ri, - zi, - ro, - zo, - params.ptmin_, - isBarrel ? params.CAThetaCutBarrel_ - : params.CAThetaCutForward_); // 2.f*thetaCut); // FIXME tune cuts - if (aligned && - thisCell.dcaCut(hh, - oc, - oc.inner_detIndex(hh) < last_bpix1_detIndex ? 
params.dcaCutInnerTriplet_ - : params.dcaCutOuterTriplet_, - params.hardCurvCut_)) { // FIXME tune cuts - oc.addOuterNeighbor(acc, cellIndex, *cellNeighbors); - thisCell.setStatusBits(Cell::StatusBit::kUsed); - oc.setStatusBits(Cell::StatusBit::kUsed); - } - }, - dimIndexX); // loop on inner cells - } - }, - dimIndexY); // loop on outer cells + // loop on outer cells + for (uint32_t idx : cms::alpakatools::uniform_elements_y(acc, *nCells)) { + auto cellIndex = idx; + auto &thisCell = cells[idx]; + auto innerHitId = thisCell.inner_hit_id(); + if (int(innerHitId) < isOuterHitOfCell->offset) + continue; + + uint32_t numberOfPossibleNeighbors = (*isOuterHitOfCell)[innerHitId].size(); + auto vi = (*isOuterHitOfCell)[innerHitId].data(); + auto ri = thisCell.inner_r(hh); + auto zi = thisCell.inner_z(hh); + auto ro = thisCell.outer_r(hh); + auto zo = thisCell.outer_z(hh); + auto isBarrel = thisCell.inner_detIndex(hh) < TrackerTraits::last_barrel_detIndex; + + // loop on inner cells + for (uint32_t j : cms::alpakatools::independent_group_elements_x(acc, numberOfPossibleNeighbors)) { + auto otherCell = (vi[j]); + auto &oc = cells[otherCell]; + auto r1 = oc.inner_r(hh); + auto z1 = oc.inner_z(hh); + bool aligned = Cell::areAlignedRZ( + r1, + z1, + ri, + zi, + ro, + zo, + params.ptmin_, + isBarrel ? params.CAThetaCutBarrel_ : params.CAThetaCutForward_); // 2.f*thetaCut); // FIXME tune cuts + if (aligned && + thisCell.dcaCut(hh, + oc, + oc.inner_detIndex(hh) < TrackerTraits::last_bpix1_detIndex ? 
params.dcaCutInnerTriplet_ + : params.dcaCutOuterTriplet_, + params.hardCurvCut_)) { // FIXME tune cuts + oc.addOuterNeighbor(acc, cellIndex, *cellNeighbors); + thisCell.setStatusBits(Cell::StatusBit::kUsed); + oc.setStatusBits(Cell::StatusBit::kUsed); + } + } // loop on inner cells + } // loop on outer cells } }; template @@ -423,11 +404,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { for (auto idx : cms::alpakatools::elements_with_stride(acc, (*nCells))) { auto const &thisCell = cells[idx]; + // cut by earlyFishbone if (thisCell.isKilled()) - continue; // cut by earlyFishbone - - // we require at least three hits... + continue; + // we require at least three hits if (thisCell.outerNeighbors().empty()) continue; @@ -966,7 +947,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { int iev) const { constexpr auto loose = Quality::loose; - for (auto i : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nbins())) { + for (auto i : cms::alpakatools::elements_with_stride( + acc, firstPrint, std::min(lastPrint, tracks_view.hitIndices().nbins()))) { auto nh = tracks_view.hitIndices().size(i); if (nh < 3) continue; @@ -1001,13 +983,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { ALPAKA_FN_ACC void operator()(TAcc const &acc, Counters const *counters) const { auto const &c = *counters; printf( - "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks | nLooseTracks | nGoodTracks | " - "nUsedHits " - "| " - "nDupHits | " - "nFishCells | " - "nKilledCells | " - "nUsedCells | nZeroTrackCells ||\n"); + "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks | nLooseTracks | nGoodTracks | nUsedHits | " + "nDupHits | nFishCells | nKilledCells | nUsedCells | nZeroTrackCells ||\n"); printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n", c.nEvents, c.nHits, @@ -1023,8 +1000,7 @@ namespace 
ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { c.nEmptyCells, c.nZeroTrackCells); printf( - "Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| " - "%.3f| " + "Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| %.3f| " "%.3f||\n", c.nEvents, c.nHits / double(c.nEvents), diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoubletsAlgos.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoubletsAlgos.h index ddeb853a7ec93..048aaf2058d27 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoubletsAlgos.h +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoubletsAlgos.h @@ -179,29 +179,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caPixelDoublets { } alpaka::syncBlockThreads(acc); - // x runs faster - const uint32_t blockDimensionX(alpaka::getWorkDiv(acc)[dimIndexX]); - const auto& [firstElementIdxNoStrideX, endElementIdxNoStrideX] = - cms::alpakatools::element_index_range_in_block(acc, 0u, dimIndexX); - - uint32_t pairLayerId = 0; // cannot go backward - - // Outermost loop on Y - const uint32_t gridDimensionY(alpaka::getWorkDiv(acc)[dimIndexY]); - const auto& [firstElementIdxNoStrideY, endElementIdxNoStrideY] = - cms::alpakatools::element_index_range_in_grid(acc, 0u, dimIndexY); - uint32_t firstElementIdxY = firstElementIdxNoStrideY; - uint32_t endElementIdxY = endElementIdxNoStrideY; - - //const uint32_t incY = cms::alpakatools::requires_single_thread_per_block_v ? 1 : gridDimensionY; - for (uint32_t j = firstElementIdxY; j < ntot; j++) { - if (not cms::alpakatools::next_valid_element_index_strided( - j, firstElementIdxY, endElementIdxY, gridDimensionY, ntot)) - break; + // declared outside the loop, as it cannot go backward + uint32_t pairLayerId = 0; + // outermost parallel loop, using all grid elements along the slower dimension (Y or 0 in a 2D grid) + for (uint32_t j : cms::alpakatools::uniform_elements_y(acc, ntot)) { + // move to lower_bound ? 
while (j >= innerLayerCumulativeSize[pairLayerId++]) ; - --pairLayerId; // move to lower_bound ?? + --pairLayerId; ALPAKA_ASSERT_OFFLOAD(pairLayerId < nPairs); ALPAKA_ASSERT_OFFLOAD(j < innerLayerCumulativeSize[pairLayerId]); @@ -218,7 +204,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caPixelDoublets { ALPAKA_ASSERT_OFFLOAD(i >= offsets[inner]); ALPAKA_ASSERT_OFFLOAD(i < offsets[inner + 1]); - // found hit corresponding to our cuda thread, now do the job + // found hit corresponding to our worker thread, now do the job if (hh[i].detectorIndex() > pixelClustering::maxNumModules) continue; // invalid @@ -277,21 +263,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caPixelDoublets { auto const* __restrict__ e = phiBinner.end(kk + hoff); auto const maxpIndex = e - p; - // Here we parallelize in X - uint32_t firstElementIdxX = firstElementIdxNoStrideX; - uint32_t endElementIdxX = endElementIdxNoStrideX; - - for (uint32_t pIndex = firstElementIdxX; pIndex < maxpIndex; ++pIndex) { - if (not cms::alpakatools::next_valid_element_index_strided( - pIndex, firstElementIdxX, endElementIdxX, blockDimensionX, maxpIndex)) - break; - auto oi = p[pIndex]; // auto oi = __ldg(p); is not allowed since __ldg is device-only + // innermost parallel loop, using the block elements along the faster dimension (X or 1 in a 2D grid) + for (uint32_t pIndex : cms::alpakatools::independent_group_elements_x(acc, maxpIndex)) { + // FIXME implement alpaka::ldg and use it here? or is it const* __restrict__ enough? 
+ auto oi = p[pIndex]; ALPAKA_ASSERT_OFFLOAD(oi >= offsets[outer]); ALPAKA_ASSERT_OFFLOAD(oi < offsets[outer + 1]); auto mo = hh[oi].detectorIndex(); + // invalid if (mo > pixelClustering::maxNumModules) - continue; // invalid + continue; if (doZ0Cut && z0cutoff(oi)) continue; From 008ca51385f4eb38c2b9eb2ce117f2d3a93a1cba Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sat, 10 Feb 2024 12:49:38 +0100 Subject: [PATCH 24/25] Rename elements_with_stride to uniform_elements in user code --- .../test/alpaka/Clusters_test.dev.cc | 4 +- .../alpaka/TrackSoAHeterogeneous_test.dev.cc | 4 +- .../test/alpaka/Hits_test.dev.cc | 2 +- .../interface/HistoContainer.h | 4 +- .../test/alpaka/testAtomicPairCounter.dev.cc | 2 +- .../test/alpaka/testOneHistoContainer.dev.cc | 16 +++--- .../test/alpaka/testOneRadixSort.dev.cc | 6 +-- .../test/alpaka/testOneToManyAssoc.dev.cc | 12 ++--- .../test/alpaka/testPrefixScan.dev.cc | 8 +-- .../test/alpaka/testSimpleVector.dev.cc | 4 +- .../plugins/alpaka/CalibPixel.h | 8 +-- .../plugins/alpaka/PixelClustering.h | 2 +- .../alpaka/SiPixelRawToClusterKernel.dev.cc | 2 +- .../plugins/alpaka/PixelRecHitKernels.dev.cc | 2 +- .../plugins/alpaka/BrokenLineFit.dev.cc | 4 +- .../alpaka/CAHitNtupletGeneratorKernelsImpl.h | 52 +++++++++---------- .../plugins/alpaka/CAPixelDoublets.h | 2 +- .../plugins/alpaka/RiemannFit.dev.cc | 6 +-- .../plugins/alpaka/clusterTracksByDensity.h | 24 ++++----- .../plugins/alpaka/clusterTracksDBSCAN.h | 26 +++++----- .../plugins/alpaka/clusterTracksIterative.h | 20 +++---- .../plugins/alpaka/fitVertices.h | 10 ++-- .../plugins/alpaka/sortByPt2.h | 6 +-- .../plugins/alpaka/splitVertices.h | 6 +-- .../plugins/alpaka/vertexFinder.dev.cc | 2 +- 25 files changed, 117 insertions(+), 117 deletions(-) diff --git a/DataFormats/SiPixelClusterSoA/test/alpaka/Clusters_test.dev.cc b/DataFormats/SiPixelClusterSoA/test/alpaka/Clusters_test.dev.cc index 684380dcbdfbc..56cacacc92c37 100644 --- 
a/DataFormats/SiPixelClusterSoA/test/alpaka/Clusters_test.dev.cc +++ b/DataFormats/SiPixelClusterSoA/test/alpaka/Clusters_test.dev.cc @@ -15,7 +15,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { public: template >> ALPAKA_FN_ACC void operator()(TAcc const& acc, SiPixelClustersSoAView clust_view) const { - for (int32_t j : elements_with_stride(acc, clust_view.metadata().size())) { + for (int32_t j : uniform_elements(acc, clust_view.metadata().size())) { clust_view[j].moduleStart() = j; clust_view[j].clusInModule() = j * 2; clust_view[j].moduleId() = j * 3; @@ -28,7 +28,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { public: template >> ALPAKA_FN_ACC void operator()(TAcc const& acc, SiPixelClustersSoAConstView clust_view) const { - for (uint32_t j : elements_with_stride(acc, clust_view.metadata().size())) { + for (uint32_t j : uniform_elements(acc, clust_view.metadata().size())) { assert(clust_view[j].moduleStart() == j); assert(clust_view[j].clusInModule() == j * 2); assert(clust_view[j].moduleId() == j * 3); diff --git a/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.dev.cc b/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.dev.cc index accf175bccfe6..566d4fd7ac92c 100644 --- a/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.dev.cc +++ b/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.dev.cc @@ -27,7 +27,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tracks_view.nTracks() = nTracks; } - for (int32_t j : elements_with_stride(acc, nTracks)) { + for (int32_t j : uniform_elements(acc, nTracks)) { tracks_view[j].pt() = (float)j; tracks_view[j].eta() = (float)j; tracks_view[j].chi2() = (float)j; @@ -50,7 +50,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { if (cms::alpakatools::once_per_grid(acc)) { ALPAKA_ASSERT(tracks_view.nTracks() == nTracks); } - for (int32_t j : elements_with_stride(acc, tracks_view.nTracks())) { + for (int32_t j : uniform_elements(acc, tracks_view.nTracks())) { ALPAKA_ASSERT(abs(tracks_view[j].pt() - 
(float)j) < .0001); ALPAKA_ASSERT(abs(tracks_view[j].eta() - (float)j) < .0001); ALPAKA_ASSERT(abs(tracks_view[j].chi2() - (float)j) < .0001); diff --git a/DataFormats/TrackingRecHitSoA/test/alpaka/Hits_test.dev.cc b/DataFormats/TrackingRecHitSoA/test/alpaka/Hits_test.dev.cc index b987b0ee82a63..d490ba540211b 100644 --- a/DataFormats/TrackingRecHitSoA/test/alpaka/Hits_test.dev.cc +++ b/DataFormats/TrackingRecHitSoA/test/alpaka/Hits_test.dev.cc @@ -45,7 +45,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // can be increased to soa.nHits() for debugging - for (uint32_t i : cms::alpakatools::elements_with_stride(acc, 10)) { + for (uint32_t i : cms::alpakatools::uniform_elements(acc, 10)) { printf("iPhi %d -> %d\n", i, soa[i].iphi()); } } diff --git a/HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h b/HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h index 304d01ff9fd08..9535abad90c01 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h +++ b/HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h @@ -25,7 +25,7 @@ namespace cms::alpakatools { T const *__restrict__ v, uint32_t const *__restrict__ offsets) const { const uint32_t nt = offsets[nh]; - for (uint32_t i : elements_with_stride(acc, nt)) { + for (uint32_t i : uniform_elements(acc, nt)) { auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); ALPAKA_ASSERT_OFFLOAD((*off) > 0); int32_t ih = off - offsets - 1; @@ -44,7 +44,7 @@ namespace cms::alpakatools { T const *__restrict__ v, uint32_t const *__restrict__ offsets) const { const uint32_t nt = offsets[nh]; - for (uint32_t i : elements_with_stride(acc, nt)) { + for (uint32_t i : uniform_elements(acc, nt)) { auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); ALPAKA_ASSERT_OFFLOAD((*off) > 0); int32_t ih = off - offsets - 1; diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc 
index 1687feb8c1bab..e059a668e1480 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc @@ -18,7 +18,7 @@ struct update { template ALPAKA_FN_ACC void operator()( const TAcc &acc, AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) const { - for (auto i : elements_with_stride(acc, n)) { + for (auto i : uniform_elements(acc, n)) { auto m = i % 11; m = m % 6 + 1; // max 6, no 0 auto c = dc->inc_add(acc, m); diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc index 4ce11cc7facdd..b032939f9870b 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc @@ -29,19 +29,19 @@ struct mykernel { auto& ws = alpaka::declareSharedVar(acc); // set off zero - for (auto j : elements_with_stride(acc, Hist::totbins())) { + for (auto j : uniform_elements(acc, Hist::totbins())) { hist.off[j] = 0; } alpaka::syncBlockThreads(acc); // set bins zero - for (auto j : elements_with_stride(acc, Hist::totbins())) { + for (auto j : uniform_elements(acc, Hist::totbins())) { hist.content[j] = 0; } alpaka::syncBlockThreads(acc); // count - for (auto j : elements_with_stride(acc, N)) { + for (auto j : uniform_elements(acc, N)) { hist.count(acc, v[j]); } alpaka::syncBlockThreads(acc); @@ -56,18 +56,18 @@ struct mykernel { ALPAKA_ASSERT_OFFLOAD(N == hist.size()); // verify - for ([[maybe_unused]] auto j : elements_with_stride(acc, Hist::nbins())) { + for ([[maybe_unused]] auto j : uniform_elements(acc, Hist::nbins())) { ALPAKA_ASSERT_OFFLOAD(hist.off[j] <= hist.off[j + 1]); } alpaka::syncBlockThreads(acc); - for (auto j : elements_with_stride(acc, 32)) { + for (auto j : uniform_elements(acc, 32)) { ws[j] = 0; // used by prefix scan... 
} alpaka::syncBlockThreads(acc); // fill - for (auto j : elements_with_stride(acc, N)) { + for (auto j : uniform_elements(acc, N)) { hist.fill(acc, v[j], j); } alpaka::syncBlockThreads(acc); @@ -77,7 +77,7 @@ struct mykernel { // bin #ifndef NDEBUG - for (auto j : elements_with_stride(acc, hist.size() - 1)) { + for (auto j : uniform_elements(acc, hist.size() - 1)) { auto p = hist.begin() + j; ALPAKA_ASSERT_OFFLOAD((*p) < N); [[maybe_unused]] auto k1 = Hist::bin(v[*p]); @@ -87,7 +87,7 @@ struct mykernel { #endif // forEachInWindow - for (auto i : elements_with_stride(acc, hist.size())) { + for (auto i : uniform_elements(acc, hist.size())) { auto p = hist.begin() + i; auto j = *p; #ifndef NDEBUG diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc index b1cb735b55194..a8d9240d47183 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc @@ -85,7 +85,7 @@ namespace { // __shared__ uint16_t sws[2048]; // __shared__ float z[2048]; // __shared__ int iz[2048]; - for (auto itrack : elements_with_stride(acc, elements)) { + for (auto itrack : uniform_elements(acc, elements)) { z[itrack] = gpu_input[itrack]; iz[itrack] = 10000 * gpu_input[itrack]; // order[itrack] = itrack; @@ -95,7 +95,7 @@ namespace { alpaka::syncBlockThreads(acc); //verify - for (auto itrack : elements_with_stride(acc, elements - 1)) { + for (auto itrack : uniform_elements(acc, elements - 1)) { auto ntrack = order[itrack]; auto mtrack = order[itrack + 1]; assert(truncate<2>(z[ntrack]) <= truncate<2>(z[mtrack])); @@ -123,7 +123,7 @@ namespace { radixSort(acc, iz, order, sws, elements); alpaka::syncBlockThreads(acc); - for (auto itrack : elements_with_stride(acc, elements - 1)) { + for (auto itrack : uniform_elements(acc, elements - 1)) { auto ntrack = order[itrack]; auto mtrack = order[itrack + 1]; 
assert(iz[ntrack] <= iz[mtrack]); diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneToManyAssoc.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneToManyAssoc.dev.cc index d1de1f1c17cca..492911e6b1a57 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneToManyAssoc.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneToManyAssoc.dev.cc @@ -36,7 +36,7 @@ struct countMultiLocal { TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, uint32_t n) const { - for (auto i : elements_with_stride(acc, n)) { + for (auto i : uniform_elements(acc, n)) { auto& local = alpaka::declareSharedVar(acc); const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); const bool oncePerSharedMemoryAccess = (threadIdxLocal == 0); @@ -59,7 +59,7 @@ struct countMulti { TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, uint32_t n) const { - for (auto i : elements_with_stride(acc, n)) { + for (auto i : uniform_elements(acc, n)) { assoc->count(acc, 2 + i % 4); } } @@ -68,7 +68,7 @@ struct countMulti { struct verifyMulti { template ALPAKA_FN_ACC void operator()(const TAcc& acc, Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) const { - for ([[maybe_unused]] auto i : elements_with_stride(acc, Multiplicity{}.totOnes())) { + for ([[maybe_unused]] auto i : uniform_elements(acc, Multiplicity{}.totOnes())) { ALPAKA_ASSERT_OFFLOAD(m1->off[i] == m2->off[i]); } } @@ -80,7 +80,7 @@ struct count { TK const* __restrict__ tk, AssocRandomAccess* __restrict__ assoc, uint32_t n) const { - for (auto i : elements_with_stride(acc, 4 * n)) { + for (auto i : uniform_elements(acc, 4 * n)) { auto k = i / 4; auto j = i - 4 * k; ALPAKA_ASSERT_OFFLOAD(j < 4); @@ -100,7 +100,7 @@ struct fill { TK const* __restrict__ tk, AssocRandomAccess* __restrict__ assoc, uint32_t n) const { - for (auto i : elements_with_stride(acc, 4 * n)) { + for (auto i : uniform_elements(acc, 4 * n)) { auto k = i / 4; auto j = i - 4 * k; ALPAKA_ASSERT_OFFLOAD(j < 4); @@ 
-125,7 +125,7 @@ struct fillBulk { template ALPAKA_FN_ACC void operator()( const TAcc& acc, AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const { - for (auto k : elements_with_stride(acc, n)) { + for (auto k : uniform_elements(acc, n)) { auto m = tk[k][3] < MaxElem ? 4 : 3; assoc->bulkFill(acc, *apc, &tk[k][0], m); } diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc index bffee8f1f533d..5e8f4ee3b8e9a 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc @@ -34,7 +34,7 @@ struct testPrefixScan { auto& c = alpaka::declareSharedVar(acc); auto& co = alpaka::declareSharedVar(acc); - for (auto i : elements_with_stride(acc, size)) { + for (auto i : uniform_elements(acc, size)) { c[i] = 1; }; @@ -49,7 +49,7 @@ struct testPrefixScan { // TODO: not needed? 
Not in multi kernel version, not in CUDA version alpaka::syncBlockThreads(acc); - for (auto i : elements_with_stride(acc, size)) { + for (auto i : uniform_elements(acc, size)) { if (0 == i) continue; if constexpr (!std::is_floating_point_v) { @@ -109,7 +109,7 @@ struct testWarpPrefixScan { struct init { template ALPAKA_FN_ACC void operator()(const TAcc& acc, uint32_t* v, uint32_t val, uint32_t n) const { - for (auto index : elements_with_stride(acc, n)) { + for (auto index : uniform_elements(acc, n)) { v[index] = val; if (index == 0) @@ -121,7 +121,7 @@ struct init { struct verify { template ALPAKA_FN_ACC void operator()(const TAcc& acc, uint32_t const* v, uint32_t n) const { - for (auto index : elements_with_stride(acc, n)) { + for (auto index : uniform_elements(acc, n)) { ALPAKA_ASSERT_OFFLOAD(v[index] == index + 1); if (index == 0) diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testSimpleVector.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testSimpleVector.dev.cc index c29b571c6d356..6f60679c79d64 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testSimpleVector.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testSimpleVector.dev.cc @@ -15,7 +15,7 @@ using namespace ALPAKA_ACCELERATOR_NAMESPACE; struct vector_pushback { template ALPAKA_FN_ACC void operator()(const TAcc& acc, SimpleVector* foo) const { - for (auto index : elements_with_stride(acc)) + for (auto index : uniform_elements(acc)) foo->push_back(acc, index); } }; @@ -30,7 +30,7 @@ struct vector_reset { struct vector_emplace_back { template ALPAKA_FN_ACC void operator()(const TAcc& acc, SimpleVector* foo) const { - for (auto index : elements_with_stride(acc)) + for (auto index : uniform_elements(acc)) foo->emplace_back(acc, index); } }; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h index e34df782db4dc..1ab3bdde439ab 100644 --- 
a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h @@ -41,11 +41,11 @@ namespace calibPixel { clus_view[0].clusModuleStart() = 0; clus_view[0].moduleStart() = 0; } - for (auto i : cms::alpakatools::elements_with_stride(acc, phase1PixelTopology::numberOfModules)) { + for (auto i : cms::alpakatools::uniform_elements(acc, phase1PixelTopology::numberOfModules)) { clus_view[i].clusInModule() = 0; } - for (auto i : cms::alpakatools::elements_with_stride(acc, numElements)) { + for (auto i : cms::alpakatools::uniform_elements(acc, numElements)) { auto dvgi = view[i]; if (dvgi.moduleId() == ::pixelClustering::invalidModuleId) continue; @@ -106,11 +106,11 @@ namespace calibPixel { clus_view[0].clusModuleStart() = clus_view[0].moduleStart() = 0; } - for (uint32_t i : cms::alpakatools::elements_with_stride(acc, phase2PixelTopology::numberOfModules)) { + for (uint32_t i : cms::alpakatools::uniform_elements(acc, phase2PixelTopology::numberOfModules)) { clus_view[i].clusInModule() = 0; } - for (uint32_t i : cms::alpakatools::elements_with_stride(acc, numElements)) { + for (uint32_t i : cms::alpakatools::uniform_elements(acc, numElements)) { auto dvgi = view[i]; if (pixelClustering::invalidModuleId != dvgi.moduleId()) { const int mode = (Phase2ReadoutMode < -1 ? 
-1 : Phase2ReadoutMode); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h index 37afda9847a99..b2fcca94e1d24 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h @@ -104,7 +104,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { printf("Starting to count modules to set module starts:"); } #endif - for (int32_t i : cms::alpakatools::elements_with_stride(acc, numElements)) { + for (int32_t i : cms::alpakatools::uniform_elements(acc, numElements)) { digi_view[i].clus() = i; if (::pixelClustering::invalidModuleId == digi_view[i].moduleId()) continue; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc index 1cb55b0a27955..6a28f0cd0504a 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc @@ -304,7 +304,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { if (cms::alpakatools::once_per_grid(acc)) err.size() = 0; - for (auto gIndex : cms::alpakatools::elements_with_stride(acc, wordCounter)) { + for (auto gIndex : cms::alpakatools::uniform_elements(acc, wordCounter)) { auto dvgi = digisView[gIndex]; dvgi.xx() = 0; dvgi.yy() = 0; diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHitKernels.dev.cc b/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHitKernels.dev.cc index 63e269cc79453..5b6d1133a77bb 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHitKernels.dev.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHitKernels.dev.cc @@ -36,7 +36,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { uint32_t* __restrict__ hitsLayerStart) const { assert(0 == 
hitsModuleStart[0]); - for (int32_t i : cms::alpakatools::elements_with_stride(acc, TrackerTraits::numberOfLayers + 1)) { + for (int32_t i : cms::alpakatools::uniform_elements(acc, TrackerTraits::numberOfLayers + 1)) { hitsLayerStart[i] = hitsModuleStart[cpeParams->layerGeometry().layerStart[i]]; #ifdef GPU_DEBUG int old = i == 0 ? 0 : hitsModuleStart[cpeParams->layerGeometry().layerStart[i - 1]]; diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/BrokenLineFit.dev.cc b/RecoTracker/PixelSeeding/plugins/alpaka/BrokenLineFit.dev.cc index 9882c5c47b43e..ae6739cfb72df 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/BrokenLineFit.dev.cc +++ b/RecoTracker/PixelSeeding/plugins/alpaka/BrokenLineFit.dev.cc @@ -62,7 +62,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } #endif const auto nt = riemannFit::maxNumberOfConcurrentFits; - for (auto local_idx : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto local_idx : cms::alpakatools::uniform_elements(acc, nt)) { auto tuple_idx = local_idx + offset; if ((int)tuple_idx >= totTK) { ptkids[local_idx] = invalidTkId; @@ -190,7 +190,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // same as above... 
// look in bin for this hit multiplicity const auto nt = riemannFit::maxNumberOfConcurrentFits; - for (auto local_idx : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto local_idx : cms::alpakatools::uniform_elements(acc, nt)) { if (invalidTkId == ptkids[local_idx]) break; auto tkid = ptkids[local_idx]; diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h index 68b6e597e93c8..7b296324ba3eb 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h @@ -113,7 +113,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { } } - for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + for (auto idx : cms::alpakatools::uniform_elements(acc, tracks_view.hitIndices().nOnes())) { if (tracks_view.hitIndices().size(idx) > TrackerTraits::maxHitsOnTrack) // current real limit printf("ERROR %d, %d\n", idx, tracks_view.hitIndices().size(idx)); ALPAKA_ASSERT_OFFLOAD(ftracks_view.hitIndices().size(idx) <= TrackerTraits::maxHitsOnTrack); @@ -141,7 +141,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { #endif } - for (auto idx : cms::alpakatools::elements_with_stride(acc, *nCells)) { + for (auto idx : cms::alpakatools::uniform_elements(acc, *nCells)) { auto const &thisCell = cells[idx]; if (thisCell.hasFishbone() && !thisCell.isKilled()) alpaka::atomicAdd(acc, &c.nFishCells, 1ull, alpaka::hierarchy::Blocks{}); @@ -158,7 +158,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { } // FIXME this loop was up to nHits - isOuterHitOfCell.offset in the CUDA version - for (auto idx : cms::alpakatools::elements_with_stride(acc, nHits)) + for (auto idx : cms::alpakatools::uniform_elements(acc, nHits)) if ((*isOuterHitOfCell).container[idx].full()) // 
++tooManyOuterHitOfCell; printf("OuterHitOfCell overflow %d\n", idx); } @@ -174,7 +174,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { TkSoAView tracks_view) const { constexpr auto reject = Quality::dup; - for (auto idx : cms::alpakatools::elements_with_stride(acc, *nCells)) { + for (auto idx : cms::alpakatools::uniform_elements(acc, *nCells)) { auto const &thisCell = cells[idx]; if (!thisCell.isKilled()) continue; @@ -199,7 +199,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { // quality to mark rejected constexpr auto reject = Quality::edup; /// cannot be loose ALPAKA_ASSERT_OFFLOAD(nCells); - for (auto idx : cms::alpakatools::elements_with_stride(acc, *nCells)) { + for (auto idx : cms::alpakatools::uniform_elements(acc, *nCells)) { auto const &thisCell = cells[idx]; if (thisCell.tracks().size() < 2) @@ -242,7 +242,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { ALPAKA_ASSERT_OFFLOAD(nCells); const auto ntNCells = (*nCells); - for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) { + for (auto idx : cms::alpakatools::uniform_elements(acc, ntNCells)) { auto const &thisCell = cells[idx]; if (thisCell.tracks().size() < 2) continue; @@ -401,7 +401,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { printf("starting producing ntuplets from %d cells \n", *nCells); #endif - for (auto idx : cms::alpakatools::elements_with_stride(acc, (*nCells))) { + for (auto idx : cms::alpakatools::uniform_elements(acc, (*nCells))) { auto const &thisCell = cells[idx]; // cut by earlyFishbone @@ -445,7 +445,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { CACellT *__restrict__ cells, uint32_t const *nCells) const { using Cell = CACellT; - for (auto idx : cms::alpakatools::elements_with_stride(acc, (*nCells))) { + for (auto idx : cms::alpakatools::uniform_elements(acc, (*nCells))) { auto &thisCell = cells[idx]; if 
(!thisCell.tracks().empty()) thisCell.setStatusBits(Cell::StatusBit::kInTrack); @@ -460,7 +460,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { ALPAKA_FN_ACC void operator()(TAcc const &acc, TkSoAView tracks_view, TupleMultiplicity *tupleMultiplicity) const { - for (auto it : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + for (auto it : cms::alpakatools::uniform_elements(acc, tracks_view.hitIndices().nOnes())) { auto nhits = tracks_view.hitIndices().size(it); if (nhits < 3) continue; @@ -482,7 +482,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { ALPAKA_FN_ACC void operator()(TAcc const &acc, TkSoAView tracks_view, TupleMultiplicity *tupleMultiplicity) const { - for (auto it : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + for (auto it : cms::alpakatools::uniform_elements(acc, tracks_view.hitIndices().nOnes())) { auto nhits = tracks_view.hitIndices().size(it); if (nhits < 3) continue; @@ -504,7 +504,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { ALPAKA_FN_ACC void operator()(TAcc const &acc, TkSoAView tracks_view, QualityCuts cuts) const { - for (auto it : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + for (auto it : cms::alpakatools::uniform_elements(acc, tracks_view.hitIndices().nOnes())) { auto nhits = tracks_view.hitIndices().size(it); if (nhits == 0) break; // guard @@ -549,7 +549,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { public: template >> ALPAKA_FN_ACC void operator()(TAcc const &acc, TkSoAView tracks_view, Counters *counters) const { - for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + for (auto idx : cms::alpakatools::uniform_elements(acc, tracks_view.hitIndices().nOnes())) { if (tracks_view.hitIndices().size(idx) == 0) break; //guard if (tracks_view[idx].quality() < Quality::loose) @@ 
-569,7 +569,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { ALPAKA_FN_ACC void operator()(TAcc const &acc, TkSoAView tracks_view, HitToTuple *hitToTuple) const { - for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + for (auto idx : cms::alpakatools::uniform_elements(acc, tracks_view.hitIndices().nOnes())) { if (tracks_view.hitIndices().size(idx) == 0) break; // guard for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) @@ -585,7 +585,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { ALPAKA_FN_ACC void operator()(TAcc const &acc, TkSoAView tracks_view, HitToTuple *hitToTuple) const { - for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + for (auto idx : cms::alpakatools::uniform_elements(acc, tracks_view.hitIndices().nOnes())) { if (tracks_view.hitIndices().size(idx) == 0) break; // guard for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) @@ -602,11 +602,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { TkSoAView tracks_view, HitsConstView hh) const { // copy offsets - for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + for (auto idx : cms::alpakatools::uniform_elements(acc, tracks_view.hitIndices().nOnes())) { tracks_view.detIndices().off[idx] = tracks_view.hitIndices().off[idx]; } // fill hit indices - for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().size())) { + for (auto idx : cms::alpakatools::uniform_elements(acc, tracks_view.hitIndices().size())) { ALPAKA_ASSERT_OFFLOAD(tracks_view.hitIndices().content[idx] < (uint32_t)hh.metadata().size()); tracks_view.detIndices().content[idx] = hh[tracks_view.hitIndices().content[idx]].detectorIndex(); } @@ -625,7 +625,7 @@ namespace 
ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { if (cms::alpakatools::once_per_grid(acc)) tracks_view.nTracks() = ntracks; - for (auto idx : cms::alpakatools::elements_with_stride(acc, ntracks)) { + for (auto idx : cms::alpakatools::uniform_elements(acc, ntracks)) { ALPAKA_ASSERT_OFFLOAD(TracksUtilities::nHits(tracks_view, idx) >= 3); tracks_view[idx].nLayers() = TracksUtilities::computeNumberOfLayers(tracks_view, idx); } @@ -640,7 +640,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { HitToTuple const *__restrict__ hitToTuple, Counters *counters) const { auto &c = *counters; - for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple->nOnes())) { + for (auto idx : cms::alpakatools::uniform_elements(acc, hitToTuple->nOnes())) { if (hitToTuple->size(idx) == 0) continue; // SHALL NOT BE break alpaka::atomicAdd(acc, &c.nUsedHits, 1ull, alpaka::hierarchy::Blocks{}); @@ -663,7 +663,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { auto &hitToTuple = *phitToTuple; auto const &foundNtuplets = *ptuples; - for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple->nbins())) { + for (auto idx : cms::alpakatools::uniform_elements(acc, hitToTuple->nbins())) { if (hitToTuple.size(idx) < 2) continue; @@ -705,7 +705,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { // quality to mark rejected auto const reject = dupPassThrough ? 
loose : dup; - for (auto idx : cms::alpakatools::elements_with_stride(acc, tuples->nbins())) { + for (auto idx : cms::alpakatools::uniform_elements(acc, tuples->nbins())) { if (tuples->size(idx) == 0) break; //guard if (quality[idx] <= reject) @@ -731,7 +731,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { auto &hitToTuple = *phitToTuple; - for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) { + for (auto idx : cms::alpakatools::uniform_elements(acc, hitToTuple.nOnes())) { if (hitToTuple.size(idx) < 2) continue; @@ -793,7 +793,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { uint32_t l1end = hh.hitsLayerStart()[1]; - for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) { + for (auto idx : cms::alpakatools::uniform_elements(acc, hitToTuple.nOnes())) { if (hitToTuple.size(idx) < 2) continue; @@ -844,7 +844,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { auto &hitToTuple = *phitToTuple; - for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) { + for (auto idx : cms::alpakatools::uniform_elements(acc, hitToTuple.nOnes())) { if (hitToTuple.size(idx) < 2) continue; @@ -904,7 +904,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { auto &hitToTuple = *phitToTuple; - for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) { + for (auto idx : cms::alpakatools::uniform_elements(acc, hitToTuple.nOnes())) { if (hitToTuple.size(idx) < 2) continue; @@ -947,8 +947,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::caHitNtupletGeneratorKernels { int iev) const { constexpr auto loose = Quality::loose; - for (auto i : cms::alpakatools::elements_with_stride( - acc, firstPrint, std::min(lastPrint, tracks_view.hitIndices().nbins()))) { + for (auto i : + cms::alpakatools::uniform_elements(acc, firstPrint, std::min(lastPrint, tracks_view.hitIndices().nbins()))) { auto nh = 
tracks_view.hitIndices().size(i); if (nh < 3) continue; diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h index 518a55c318402..580198772034d 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h @@ -29,7 +29,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { CellTracks* cellTracksContainer) const { ALPAKA_ASSERT_OFFLOAD((*isOuterHitOfCell).container); - for (auto i : cms::alpakatools::elements_with_stride(acc, nHits - isOuterHitOfCell->offset)) + for (auto i : cms::alpakatools::uniform_elements(acc, nHits - isOuterHitOfCell->offset)) (*isOuterHitOfCell).container[i].reset(); if (cms::alpakatools::once_per_grid(acc)) { diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/RiemannFit.dev.cc b/RecoTracker/PixelSeeding/plugins/alpaka/RiemannFit.dev.cc index 3a1d5dacd8435..9ab7d1fdf1e78 100644 --- a/RecoTracker/PixelSeeding/plugins/alpaka/RiemannFit.dev.cc +++ b/RecoTracker/PixelSeeding/plugins/alpaka/RiemannFit.dev.cc @@ -55,7 +55,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #endif const auto nt = riemannFit::maxNumberOfConcurrentFits; - for (auto local_idx : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto local_idx : cms::alpakatools::uniform_elements(acc, nt)) { auto tuple_idx = local_idx + offset; if (tuple_idx >= tupleMultiplicity->size(nHits)) break; @@ -111,7 +111,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // look in bin for this hit multiplicity const auto nt = riemannFit::maxNumberOfConcurrentFits; - for (auto local_idx : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto local_idx : cms::alpakatools::uniform_elements(acc, nt)) { auto tuple_idx = local_idx + offset; if (tuple_idx >= tupleMultiplicity->size(nHits)) break; @@ -158,7 +158,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // look in bin for this hit multiplicity const auto nt = riemannFit::maxNumberOfConcurrentFits; - 
for (auto local_idx : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto local_idx : cms::alpakatools::uniform_elements(acc, nt)) { auto tuple_idx = local_idx + offset; if (tuple_idx >= tupleMultiplicity->size(nHits)) break; diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h index 447a3d6c89c07..cb772a7e653b4 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h @@ -67,7 +67,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { auto& hist = alpaka::declareSharedVar(acc); auto& hws = alpaka::declareSharedVar(acc); - for (auto j : cms::alpakatools::elements_with_stride(acc, Hist::totbins())) { + for (auto j : cms::alpakatools::uniform_elements(acc, Hist::totbins())) { hist.off[j] = 0; } alpaka::syncBlockThreads(acc); @@ -79,7 +79,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { ALPAKA_ASSERT_OFFLOAD(static_cast(nt) <= hist.capacity()); // fill hist (bin shall be wider than "eps") - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { ALPAKA_ASSERT_OFFLOAD(i < ::zVertex::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only @@ -98,12 +98,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { hist.finalize(acc, hws); alpaka::syncBlockThreads(acc); ALPAKA_ASSERT_OFFLOAD(hist.size() == nt); - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { hist.fill(acc, izt[i], uint16_t(i)); } alpaka::syncBlockThreads(acc); // count neighbours - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (ezt2[i] > er2mx) continue; auto loop = 
[&](uint32_t j) { @@ -122,7 +122,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { alpaka::syncBlockThreads(acc); // find closest above me .... (we ignore the possibility of two j at same distance from i) - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { float mdist = eps; auto loop = [&](uint32_t j) { if (nn[j] < nn[i]) @@ -143,7 +143,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { #ifdef GPU_DEBUG // mini verification - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (iv[i] != int(i)) ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); } @@ -151,7 +151,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { #endif // consolidate graph (percolate index of seed) - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { auto m = iv[i]; while (m != iv[m]) m = iv[m]; @@ -161,7 +161,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { #ifdef GPU_DEBUG alpaka::syncBlockThreads(acc); // mini verification - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (iv[i] != int(i)) ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); } @@ -169,7 +169,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { #ifdef GPU_DEBUG // and verify that we did not spit any cluster... - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { auto minJ = i; auto mdist = eps; auto loop = [&](uint32_t j) { @@ -199,7 +199,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { // find the number of different clusters, identified by a tracks with clus[i] == i and density larger than threshold; // mark these tracks with a negative id. 
- for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (iv[i] == int(i)) { if (nn[i] >= minT) { auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{}); @@ -214,7 +214,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { ALPAKA_ASSERT_OFFLOAD(foundClusters < ::zVertex::MAXVTX); // propagate the negative id to all the tracks in the cluster. - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (iv[i] >= 0) { // mark each track in a cluster with the same id as the first one iv[i] = iv[iv[i]]; @@ -223,7 +223,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { alpaka::syncBlockThreads(acc); // adjust the cluster id to be a positive value starting from 0 - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { iv[i] = -iv[i] - 1; } diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h index 769896aa97252..38cfb0bec2289 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h @@ -62,7 +62,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { auto& hist = alpaka::declareSharedVar(acc); auto& hws = alpaka::declareSharedVar(acc); - for (auto j : cms::alpakatools::elements_with_stride(acc, Hist::totbins())) { + for (auto j : cms::alpakatools::uniform_elements(acc, Hist::totbins())) { hist.off[j] = 0; } alpaka::syncBlockThreads(acc); @@ -75,7 +75,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { ALPAKA_ASSERT_OFFLOAD(static_cast(nt) <= hist.capacity()); // fill hist (bin shall be wider than "eps") - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : 
cms::alpakatools::uniform_elements(acc, nt)) { ALPAKA_ASSERT_OFFLOAD(i < ::zVertex::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 iz = std::clamp(iz, INT8_MIN, INT8_MAX); @@ -93,13 +93,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { hist.finalize(acc, hws); alpaka::syncBlockThreads(acc); ALPAKA_ASSERT_OFFLOAD(hist.size() == nt); - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { hist.fill(acc, izt[i], uint32_t(i)); } alpaka::syncBlockThreads(acc); // count neighbours - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (ezt2[i] > er2mx) continue; auto loop = [&](uint32_t j) { @@ -118,7 +118,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { alpaka::syncBlockThreads(acc); // find NN with smaller z... - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (nn[i] < minT) continue; // DBSCAN core rule float mz = zt[i]; @@ -141,7 +141,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { #ifdef GPU_DEBUG // mini verification - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (iv[i] != int(i)) ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); } @@ -149,7 +149,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { #endif // consolidate graph (percolate index of seed) - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { auto m = iv[i]; while (m != iv[m]) m = iv[m]; @@ -160,7 +160,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { #ifdef GPU_DEBUG // mini verification - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (iv[i] != int(i)) 
ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); } @@ -169,7 +169,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { #ifdef GPU_DEBUG // and verify that we did not spit any cluster... - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (nn[i] < minT) continue; // DBSCAN core rule ALPAKA_ASSERT_OFFLOAD(zt[iv[i]] <= zt[i]); @@ -194,7 +194,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { #endif // collect edges (assign to closest cluster of closest point??? here to closest point) - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule if (nn[i] >= minT) continue; // DBSCAN edge rule @@ -219,7 +219,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { // find the number of different clusters, identified by a tracks with clus[i] == i; // mark these tracks with a negative id. - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (iv[i] == int(i)) { if (nn[i] >= minT) { auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{}); @@ -234,7 +234,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { ALPAKA_ASSERT_OFFLOAD(foundClusters < ::zVertex::MAXVTX); // propagate the negative id to all the tracks in the cluster. 
- for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (iv[i] >= 0) { // mark each track in a cluster with the same id as the first one iv[i] = iv[iv[i]]; @@ -243,7 +243,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { alpaka::syncBlockThreads(acc); // adjust the cluster id to be a positive value starting from 0 - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { iv[i] = -iv[i] - 1; } diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h index 6468fb9e185c4..100b4b6d42d84 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h @@ -61,7 +61,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto& hist = alpaka::declareSharedVar(acc); auto& hws = alpaka::declareSharedVar(acc); - for (auto j : cms::alpakatools::elements_with_stride(acc, Hist::totbins())) { + for (auto j : cms::alpakatools::uniform_elements(acc, Hist::totbins())) { hist.off[j] = 0; } alpaka::syncBlockThreads(acc); @@ -74,7 +74,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(static_cast(nt) <= hist.capacity()); // fill hist (bin shall be wider than "eps") - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { ALPAKA_ASSERT_OFFLOAD(i < ::zVertex::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 iz = std::clamp(iz, INT8_MIN, INT8_MAX); @@ -95,13 +95,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::syncBlockThreads(acc); ALPAKA_ASSERT_OFFLOAD(hist.size() == nt); - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { hist.fill(acc, izt[i], uint16_t(i)); } 
alpaka::syncBlockThreads(acc); // count neighbours - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (ezt2[i] > er2mx) continue; auto loop = [&](uint32_t j) { @@ -127,7 +127,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { bool more = true; while (alpaka::syncBlockThreadsPredicate(acc, more)) { if (1 == nloops % 2) { - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { auto m = iv[i]; while (m != iv[m]) m = iv[m]; @@ -135,7 +135,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } } else { more = false; - for (auto k : cms::alpakatools::elements_with_stride(acc, hist.size())) { + for (auto k : cms::alpakatools::uniform_elements(acc, hist.size())) { auto p = hist.begin() + k; auto i = (*p); auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1)); @@ -167,7 +167,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // while // collect edges (assign to closest cluster of closest point??? here to closest point) - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule if (nn[i] >= minT) continue; // DBSCAN edge rule @@ -192,7 +192,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // find the number of different clusters, identified by a tracks with clus[i] == i; // mark these tracks with a negative id. - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (iv[i] == int(i)) { if (nn[i] >= minT) { auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{}); @@ -207,7 +207,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(foundClusters < ::zVertex::MAXVTX); // propagate the negative id to all the tracks in the cluster. 
- for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (iv[i] >= 0) { // mark each track in a cluster with the same id as the first one iv[i] = iv[iv[i]]; @@ -216,7 +216,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::syncBlockThreads(acc); // adjust the cluster id to be a positive value starting from 0 - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { iv[i] = -iv[i] - 1; } diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h index 5ee24f610c1aa..caba60c826823 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h @@ -42,7 +42,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { auto foundClusters = nvFinal; // zero - for (auto i : cms::alpakatools::elements_with_stride(acc, foundClusters)) { + for (auto i : cms::alpakatools::uniform_elements(acc, foundClusters)) { zv[i] = 0; wv[i] = 0; chi2[i] = 0; @@ -58,7 +58,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { alpaka::syncBlockThreads(acc); // compute cluster location - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (iv[i] > 9990) { if constexpr (verbose) alpaka::atomicAdd(acc, &noise, 1, alpaka::hierarchy::Threads{}); @@ -73,7 +73,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { alpaka::syncBlockThreads(acc); // reuse nn - for (auto i : cms::alpakatools::elements_with_stride(acc, foundClusters)) { + for (auto i : cms::alpakatools::uniform_elements(acc, foundClusters)) { ALPAKA_ASSERT_OFFLOAD(wv[i] > 0.f); zv[i] /= wv[i]; nn[i] = -1; // ndof @@ -81,7 +81,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { alpaka::syncBlockThreads(acc); // compute chi2 - for (auto i : 
cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (iv[i] > 9990) continue; @@ -96,7 +96,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { } alpaka::syncBlockThreads(acc); - for (auto i : cms::alpakatools::elements_with_stride(acc, foundClusters)) { + for (auto i : cms::alpakatools::uniform_elements(acc, foundClusters)) { if (nn[i] > 0) { wv[i] *= float(nn[i]) / chi2[i]; } diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h index 5d5765ed3d4b8..ff8fab8ab635f 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h @@ -36,17 +36,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { return; // fill indexing - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { data.idv()[ws.itrk()[i]] = iv[i]; }; // can be done asynchronously at the end of previous event - for (auto i : cms::alpakatools::elements_with_stride(acc, nvFinal)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nvFinal)) { ptv2[i] = 0; }; alpaka::syncBlockThreads(acc); - for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (iv[i] <= 9990) { alpaka::atomicAdd(acc, &ptv2[iv[i]], ptt2[i], alpaka::hierarchy::Blocks{}); } diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h index 5a16d9c57a20d..7ba0f905e260b 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h @@ -100,7 +100,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { } alpaka::syncBlockThreads(acc); - for (auto k : cms::alpakatools::elements_with_stride(acc, nq)) { + for (auto 
k : cms::alpakatools::uniform_elements(acc, nq)) { auto i = newV[k]; alpaka::atomicAdd(acc, &znew[i], zz[k] * ww[k], alpaka::hierarchy::Threads{}); alpaka::atomicAdd(acc, &wnew[i], ww[k], alpaka::hierarchy::Threads{}); @@ -113,7 +113,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { } alpaka::syncBlockThreads(acc); - for (auto k : cms::alpakatools::elements_with_stride(acc, nq)) { + for (auto k : cms::alpakatools::uniform_elements(acc, nq)) { auto d0 = fabs(zz[k] - znew[0]); auto d1 = fabs(zz[k] - znew[1]); auto newer = d0 < d1 ? 0 : 1; @@ -145,7 +145,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { if (0 == threadIdxLocal) igv = alpaka::atomicAdd(acc, &ws.nvIntermediate(), 1u, alpaka::hierarchy::Blocks{}); alpaka::syncBlockThreads(acc); - for (auto k : cms::alpakatools::elements_with_stride(acc, nq)) { + for (auto k : cms::alpakatools::uniform_elements(acc, nq)) { if (1 == newV[k]) iv[it[k]] = igv; } diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc index 89a8ee676e35b..2d33fee32752c 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc @@ -39,7 +39,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto const* quality = tracks_view.quality(); using helper = TracksUtilities; - for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.nTracks())) { + for (auto idx : cms::alpakatools::uniform_elements(acc, tracks_view.nTracks())) { [[maybe_unused]] auto nHits = helper::nHits(tracks_view, idx); ALPAKA_ASSERT_OFFLOAD(nHits >= 3); From c7d364154aaa3e38596d58710c4240d791b3e7ce Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sun, 11 Feb 2024 08:37:03 +0100 Subject: [PATCH 25/25] Remove obsolete alpakatools utilities --- .../AlpakaInterface/interface/workdivision.h | 204 ------------------ 1 file changed, 204 deletions(-) diff --git 
a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h index 6927e202d0954..fe02f9646605a 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h +++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h @@ -1333,210 +1333,6 @@ namespace cms::alpakatools { return alpaka::getIdx(acc) == Vec>::zeros(); } - /********************************************* - * RANGE COMPUTATION - ********************************************/ - - /* - * Computes the range of the elements indexes, local to the block. - * Warning: the max index is not truncated by the max number of elements of interest. - */ - template - ALPAKA_FN_ACC std::pair element_index_range_in_block(const TAcc& acc, - const Idx elementIdxShift, - const unsigned int dimIndex = 0u) { - // Take into account the thread index in block. - const Idx threadIdxLocal(alpaka::getIdx(acc)[dimIndex]); - const Idx threadDimension(alpaka::getWorkDiv(acc)[dimIndex]); - - // Compute the elements indexes in block. - // Obviously relevant for CPU only. - // For GPU, threadDimension == 1, and elementIdx == firstElementIdx == threadIdx + elementIdxShift. - const Idx firstElementIdxLocal = threadIdxLocal * threadDimension; - const Idx firstElementIdx = firstElementIdxLocal + elementIdxShift; // Add the shift! - const Idx endElementIdxUncut = firstElementIdx + threadDimension; - - // Return element indexes, shifted by elementIdxShift. - return {firstElementIdx, endElementIdxUncut}; - } - - /* - * Computes the range of the elements indexes, local to the block. - * Truncated by the max number of elements of interest. 
- */ - template - ALPAKA_FN_ACC std::pair element_index_range_in_block_truncated(const TAcc& acc, - const Idx maxNumberOfElements, - const Idx elementIdxShift, - const unsigned int dimIndex = 0u) { - // Check dimension - //static_assert(alpaka::Dim::value == Dim1::value, - // "Accelerator and maxNumberOfElements need to have same dimension."); - auto [firstElementIdxLocal, endElementIdxLocal] = element_index_range_in_block(acc, elementIdxShift, dimIndex); - - // Truncate - endElementIdxLocal = std::min(endElementIdxLocal, maxNumberOfElements); - - // Return element indexes, shifted by elementIdxShift, and truncated by maxNumberOfElements. - return {firstElementIdxLocal, endElementIdxLocal}; - } - - /* - * Computes the range of the elements indexes in grid. - * Warning: the max index is not truncated by the max number of elements of interest. - */ - template - ALPAKA_FN_ACC std::pair element_index_range_in_grid(const TAcc& acc, - Idx elementIdxShift, - const unsigned int dimIndex = 0u) { - // Take into account the block index in grid. - const Idx blockIdxInGrid(alpaka::getIdx(acc)[dimIndex]); - const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); - - // Shift to get global indices in grid (instead of local to the block) - elementIdxShift += blockIdxInGrid * blockDimension; - - // Return element indexes, shifted by elementIdxShift. - return element_index_range_in_block(acc, elementIdxShift, dimIndex); - } - - /* - * Loop on all (CPU) elements. - * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. - * Indexes are local to the BLOCK. 
- */ - template - ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc, - const Idx maxNumberOfElements, - const Idx elementIdxShift, - const Func func, - const unsigned int dimIndex = 0) { - const auto& [firstElementIdx, endElementIdx] = - element_index_range_in_block_truncated(acc, maxNumberOfElements, elementIdxShift, dimIndex); - - for (Idx elementIdx = firstElementIdx; elementIdx < endElementIdx; ++elementIdx) { - func(elementIdx); - } - } - - /* - * Overload for elementIdxShift = 0 - */ - template - ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc, - const Idx maxNumberOfElements, - const Func func, - const unsigned int dimIndex = 0) { - const Idx elementIdxShift = 0; - for_each_element_in_block(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); - } - - /************************************************************** - * LOOP ON ALL ELEMENTS WITH ONE LOOP - **************************************************************/ - - /* - * Case where the input index i has reached the end of threadDimension: strides the input index. - * Otherwise: do nothing. - * NB 1: This helper function is used as a trick to only have one loop (like in legacy), instead of 2 loops - * (like in all the other Alpaka helpers, 'for_each_element_in_block_strided' for example, - * because of the additional loop over elements in Alpaka model). - * This allows to keep the 'continue' and 'break' statements as-is from legacy code, - * and hence avoids a lot of legacy code reshuffling. - * NB 2: Modifies i, firstElementIdx and endElementIdx. 
- */ - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool next_valid_element_index_strided( - Idx& i, Idx& firstElementIdx, Idx& endElementIdx, const Idx stride, const Idx maxNumberOfElements) { - bool isNextStrideElementValid = true; - if (i == endElementIdx) { - firstElementIdx += stride; - endElementIdx += stride; - i = firstElementIdx; - if (i >= maxNumberOfElements) { - isNextStrideElementValid = false; - } - } - return isNextStrideElementValid; - } - - template - ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc, - const Idx maxNumberOfElements, - const Idx elementIdxShift, - const Func func, - const unsigned int dimIndex = 0) { - // Get thread / element indices in block. - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - element_index_range_in_block(acc, elementIdxShift, dimIndex); - - // Stride = block size. - const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); - - // Strided access. - for (Idx threadIdx = firstElementIdxNoStride, endElementIdx = endElementIdxNoStride; - threadIdx < maxNumberOfElements; - threadIdx += blockDimension, endElementIdx += blockDimension) { - // (CPU) Loop on all elements. - if (endElementIdx > maxNumberOfElements) { - endElementIdx = maxNumberOfElements; - } - for (Idx i = threadIdx; i < endElementIdx; ++i) { - func(i); - } - } - } - - /* - * Overload for elementIdxShift = 0 - */ - template - ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc, - const Idx maxNumberOfElements, - const Func func, - const unsigned int dimIndex = 0) { - const Idx elementIdxShift = 0; - for_each_element_in_block_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); - } - - template - ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc, - const Idx maxNumberOfElements, - const Idx elementIdxShift, - const Func func, - const unsigned int dimIndex = 0) { - // Get thread / element indices in block. 
- const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - element_index_range_in_grid(acc, elementIdxShift, dimIndex); - - // Stride = grid size. - const Idx gridDimension(alpaka::getWorkDiv(acc)[dimIndex]); - - // Strided access. - for (Idx threadIdx = firstElementIdxNoStride, endElementIdx = endElementIdxNoStride; - threadIdx < maxNumberOfElements; - threadIdx += gridDimension, endElementIdx += gridDimension) { - // (CPU) Loop on all elements. - if (endElementIdx > maxNumberOfElements) { - endElementIdx = maxNumberOfElements; - } - for (Idx i = threadIdx; i < endElementIdx; ++i) { - func(i); - } - } - } - - /* - * Overload for elementIdxShift = 0 - */ - template - ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc, - const Idx maxNumberOfElements, - const Func func, - const unsigned int dimIndex = 0) { - const Idx elementIdxShift = 0; - for_each_element_in_grid_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); - } - } // namespace cms::alpakatools #endif // HeterogeneousCore_AlpakaInterface_interface_workdivision_h