diff --git a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h index cd1b3461c..f0ae89494 100644 --- a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h +++ b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h @@ -232,7 +232,10 @@ namespace cms { threadIdx < maxNumberOfElements; threadIdx += blockDimension, endElementIdx += blockDimension) { // (CPU) Loop on all elements. - for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { + if (endElementIdx > maxNumberOfElements) { + endElementIdx = maxNumberOfElements; + } + for (uint32_t i = threadIdx; i < endElementIdx; ++i) { func(i); } } @@ -274,7 +277,10 @@ namespace cms { threadIdx < maxNumberOfElements; threadIdx += gridDimension, endElementIdx += gridDimension) { // (CPU) Loop on all elements. - for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { + if (endElementIdx > maxNumberOfElements) { + endElementIdx = maxNumberOfElements; + } + for (uint32_t i = threadIdx; i < endElementIdx; ++i) { func(i); } } @@ -291,6 +297,27 @@ namespace cms { cms::alpakatools::for_each_element_1D_grid_stride(acc, maxNumberOfElements, elementIdxShift, func); } + /* + * Case where the input index has reached the end of threadDimension: strides the input index. + * Otherwise: do nothing. + */ + ALPAKA_FN_ACC bool get_next_element_1D_index_stride(uint32_t& i, + uint32_t& firstElementIdx, + uint32_t& endElementIdx, + const uint32_t stride, + const uint32_t maxNumberOfElements) { + bool isNextStrideElementValid = true; + if (i == endElementIdx) { + firstElementIdx += stride; + endElementIdx += stride; + i = firstElementIdx; + if (i >= maxNumberOfElements) { + isNextStrideElementValid = false; + } + } + return isNextStrideElementValid; + } + } // namespace alpakatools } // namespace cms diff --git a/src/alpaka/Geometry/phase1PixelTopology.h b/src/alpaka/Geometry/phase1PixelTopology.h new file mode 100644 index 000000000..409ebec3c --- /dev/null +++ b/src/alpaka/Geometry/phase1PixelTopology.h @@ -0,0 +1,174 @@ +#ifndef Geometry_TrackerGeometryBuilder_phase1PixelTopology_h +#define Geometry_TrackerGeometryBuilder_phase1PixelTopology_h + +#include +#include + +namespace phase1PixelTopology { + + constexpr uint16_t numRowsInRoc = 80; + constexpr uint16_t numColsInRoc = 52; + constexpr uint16_t lastRowInRoc = numRowsInRoc - 1; + constexpr uint16_t lastColInRoc = numColsInRoc - 1; + + constexpr uint16_t numRowsInModule = 2 * numRowsInRoc; + constexpr uint16_t numColsInModule = 8 * numColsInRoc; + constexpr uint16_t lastRowInModule = numRowsInModule - 1; + constexpr uint16_t lastColInModule = numColsInModule - 1; + + constexpr int16_t xOffset = -81; + constexpr int16_t yOffset = -54 * 4; + + constexpr uint32_t numPixsInModule = uint32_t(numRowsInModule) * uint32_t(numColsInModule); + + constexpr uint32_t numberOfModules = 1856; + constexpr uint32_t numberOfLayers = 10; + constexpr uint32_t layerStart[numberOfLayers + 1] = {0, + 96, + 320, + 672, // barrel + 1184, + 1296, + 1408, // positive endcap + 1520, + 1632, + 1744, // negative endcap + numberOfModules}; + constexpr char const* layerName[numberOfLayers] = { + "BL1", + "BL2", + "BL3", + "BL4", // barrel + "E+1", + "E+2", + "E+3", // positive endcap + "E-1", + "E-2", + "E-3" // negative endcap + }; + + constexpr uint32_t numberOfModulesInBarrel = 1184; + constexpr uint32_t numberOfLaddersInBarrel = numberOfModulesInBarrel / 8; + + template + constexpr auto map_to_array_helper(Function f, std::index_sequence) + -> std::array::type, 
sizeof...(Indices)> { + return {{f(Indices)...}}; + } + + template + constexpr auto map_to_array(Function f) -> std::array::type, N> { + return map_to_array_helper(f, std::make_index_sequence{}); + } + + constexpr uint32_t findMaxModuleStride() { + bool go = true; + int n = 2; + while (go) { + for (uint8_t i = 1; i < 11; ++i) { + if (layerStart[i] % n != 0) { + go = false; + break; + } + } + if (!go) + break; + n *= 2; + } + return n / 2; + } + + constexpr uint32_t maxModuleStride = findMaxModuleStride(); + + constexpr uint8_t findLayer(uint32_t detId) { + for (uint8_t i = 0; i < 11; ++i) + if (detId < layerStart[i + 1]) + return i; + return 11; + } + + constexpr uint8_t findLayerFromCompact(uint32_t detId) { + detId *= maxModuleStride; + for (uint8_t i = 0; i < 11; ++i) + if (detId < layerStart[i + 1]) + return i; + return 11; + } + + constexpr uint32_t layerIndexSize = numberOfModules / maxModuleStride; + constexpr std::array layer = map_to_array(findLayerFromCompact); + + constexpr bool validateLayerIndex() { + bool res = true; + for (auto i = 0U; i < numberOfModules; ++i) { + auto j = i / maxModuleStride; + res &= (layer[j] < 10); + res &= (i >= layerStart[layer[j]]); + res &= (i < layerStart[layer[j] + 1]); + } + return res; + } + + static_assert(validateLayerIndex(), "layer from detIndex algo is buggy"); + + // this is for the ROC n<512 (upgrade 1024) + constexpr inline uint16_t divu52(uint16_t n) { + n = n >> 2; + uint16_t q = (n >> 1) + (n >> 4); + q = q + (q >> 4) + (q >> 5); + q = q >> 3; + uint16_t r = n - q * 13; + return q + ((r + 3) >> 4); + } + + constexpr inline bool isEdgeX(uint16_t px) { return (px == 0) | (px == lastRowInModule); } + + constexpr inline bool isEdgeY(uint16_t py) { return (py == 0) | (py == lastColInModule); } + + constexpr inline uint16_t toRocX(uint16_t px) { return (px < numRowsInRoc) ? px : px - numRowsInRoc; } + + constexpr inline uint16_t toRocY(uint16_t py) { + auto roc = divu52(py); + return py - 52 * roc; + } + + constexpr inline bool isBigPixX(uint16_t px) { return (px == 79) | (px == 80); } + + constexpr inline bool isBigPixY(uint16_t py) { + auto ly = toRocY(py); + return (ly == 0) | (ly == lastColInRoc); + } + + constexpr inline uint16_t localX(uint16_t px) { + auto shift = 0; + if (px > lastRowInRoc) + shift += 1; + if (px > numRowsInRoc) + shift += 1; + return px + shift; + } + + constexpr inline uint16_t localY(uint16_t py) { + auto roc = divu52(py); + auto shift = 2 * roc; + auto yInRoc = py - 52 * roc; + if (yInRoc > 0) + shift += 1; + return py + shift; + } + + //FIXME move it elsewhere? 
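+  // Worked examples for the coordinate helpers above (illustrative only, not used by the code):
+  //   divu52(51) == 0, divu52(52) == 1, divu52(104) == 2   (integer division by 52, valid for n < 512)
+  //   localX(79) == 79, localX(80) == 81, localX(81) == 83  (one empty slot is left after each big-pixel row, 79 and 80)
+  //   localY(0) == 0, localY(51) == 52, localY(52) == 54    (shift of 2 per preceding ROC, plus 1 past the first column of the ROC)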
+ struct AverageGeometry { + static constexpr auto numberOfLaddersInBarrel = phase1PixelTopology::numberOfLaddersInBarrel; + float ladderZ[numberOfLaddersInBarrel]; + float ladderX[numberOfLaddersInBarrel]; + float ladderY[numberOfLaddersInBarrel]; + float ladderR[numberOfLaddersInBarrel]; + float ladderMinZ[numberOfLaddersInBarrel]; + float ladderMaxZ[numberOfLaddersInBarrel]; + float endCapZ[2]; // just for pos and neg Layer1 + }; + +} // namespace phase1PixelTopology + +#endif // Geometry_TrackerGeometryBuilder_phase1PixelTopology_h diff --git a/src/alpaka/plugin-SiPixelClusterizer/clustering_alpaka.h b/src/alpaka/plugin-SiPixelClusterizer/clustering_alpaka.h deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/alpaka/plugin-SiPixelClusterizer/gpuClusterChargeCut.h b/src/alpaka/plugin-SiPixelClusterizer/gpuClusterChargeCut.h new file mode 100644 index 000000000..456a25fa8 --- /dev/null +++ b/src/alpaka/plugin-SiPixelClusterizer/gpuClusterChargeCut.h @@ -0,0 +1,147 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h +#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h + +#include +#include + +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/alpakaWorkDivHelper.h" +#include "AlpakaCore/prefixScan.h" +#include "AlpakaDataFormats/gpuClusteringConstants.h" + +namespace gpuClustering { + + struct clusterChargeCut { + template + ALPAKA_FN_ACC void operator()( + const T_Acc& acc, + uint16_t* __restrict__ id, // module id of each pixel (modified if bad cluster) + uint16_t const* __restrict__ adc, // charge of each pixel + uint32_t const* __restrict__ moduleStart, // index of the first pixel of each module + uint32_t* __restrict__ nClustersInModule, // modified: number of clusters found in each module + uint32_t const* __restrict__ moduleId, // module id of each module + int32_t* __restrict__ clusterId, // modified: cluster id of each pixel + const uint32_t numElements) const { + const uint32_t blockIdx(alpaka::idx::getIdx(acc)[0u]); + if (blockIdx >= moduleStart[0]) + return; + + auto firstPixel = moduleStart[1 + blockIdx]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < MaxNumModules); + assert(thisModuleId == moduleId[blockIdx]); + + auto nclus = nClustersInModule[thisModuleId]; + if (nclus == 0) + return; + + const uint32_t threadIdxLocal(alpaka::idx::getIdx(acc)[0u]); + if (threadIdxLocal == 0 && nclus > MaxNumClustersPerModules) + printf("Warning too many clusters in module %d in block %d: %d > %d\n", + thisModuleId, + blockIdx, + nclus, + MaxNumClustersPerModules); + + // Stride = block size. + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + + // Get thread / CPU element indices in block. + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_index_range_in_block(acc, Vec1::all(firstPixel)); + + if (nclus > MaxNumClustersPerModules) { + uint32_t firstElementIdx = firstElementIdxNoStride[0u]; + uint32_t endElementIdx = endElementIdxNoStride[0u]; + // remove excess FIXME find a way to cut charge first.... 
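+        // The loop below walks this thread's contiguous element range [firstElementIdx, endElementIdx);
+        // when i reaches endElementIdx, get_next_element_1D_index_stride() shifts the range by blockDimension
+        // and returns false once the shifted index passes numElements. On the GPU backend (one element per
+        // thread) this reduces to the usual "for (i = first; i < n; i += blockDim)" block-stride loop, while
+        // still allowing the "continue" / "break" statements of the legacy CUDA kernel.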
+ for (uint32_t i = firstElementIdx; i < numElements; ++i) { + if (!cms::alpakatools::get_next_element_1D_index_stride( + i, firstElementIdx, endElementIdx, blockDimension, numElements)) + break; + if (id[i] == InvId) + continue; // not valid + if (id[i] != thisModuleId) + break; // end of module + if (clusterId[i] >= MaxNumClustersPerModules) { + id[i] = InvId; + clusterId[i] = InvId; + } + } + nclus = MaxNumClustersPerModules; + } + +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (threadIdxLocal == 0) + printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx); +#endif + + auto&& charge = alpaka::block::shared::st::allocVar(acc); + auto&& ok = alpaka::block::shared::st::allocVar(acc); + auto&& newclusId = alpaka::block::shared::st::allocVar(acc); + + assert(nclus <= MaxNumClustersPerModules); + cms::alpakatools::for_each_element_1D_block_stride(acc, nclus, [&](uint32_t i) { charge[i] = 0; }); + alpaka::block::sync::syncBlockThreads(acc); + + uint32_t firstElementIdx = firstElementIdxNoStride[0u]; + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t i = firstElementIdx; i < numElements; ++i) { + if (!cms::alpakatools::get_next_element_1D_index_stride( + i, firstElementIdx, endElementIdx, blockDimension, numElements)) + break; + if (id[i] == InvId) + continue; // not valid + if (id[i] != thisModuleId) + break; // end of module + alpaka::atomic::atomicOp(acc, &charge[clusterId[i]], static_cast(adc[i])); + } + alpaka::block::sync::syncBlockThreads(acc); + + auto chargeCut = thisModuleId < 96 ? 2000 : 4000; // move in constants (calib?) + cms::alpakatools::for_each_element_1D_block_stride( + acc, nclus, [&](uint32_t i) { newclusId[i] = ok[i] = charge[i] > chargeCut ? 1 : 0; }); + alpaka::block::sync::syncBlockThreads(acc); + + // renumber + auto&& ws = alpaka::block::shared::st::allocVar(acc); + cms::alpakatools::blockPrefixScan(acc, newclusId, nclus, ws); + + assert(nclus >= newclusId[nclus - 1]); + + if (nclus == newclusId[nclus - 1]) + return; + + nClustersInModule[thisModuleId] = newclusId[nclus - 1]; + alpaka::block::sync::syncBlockThreads(acc); + + // mark bad cluster again + cms::alpakatools::for_each_element_1D_block_stride(acc, nclus, [&](uint32_t i) { + if (0 == ok[i]) + newclusId[i] = InvId + 1; + }); + alpaka::block::sync::syncBlockThreads(acc); + + // reassign id + firstElementIdx = firstElementIdxNoStride[0u]; + endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t i = firstElementIdx; i < numElements; ++i) { + if (!cms::alpakatools::get_next_element_1D_index_stride( + i, firstElementIdx, endElementIdx, blockDimension, numElements)) + break; + if (id[i] == InvId) + continue; // not valid + if (id[i] != thisModuleId) + break; // end of module + clusterId[i] = newclusId[clusterId[i]] - 1; + if (clusterId[i] == InvId) + id[i] = InvId; + } + + //done + } + }; + +} // namespace gpuClustering + +#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h diff --git a/src/alpaka/plugin-SiPixelClusterizer/gpuClustering.h b/src/alpaka/plugin-SiPixelClusterizer/gpuClustering.h new file mode 100644 index 000000000..73eaae60d --- /dev/null +++ b/src/alpaka/plugin-SiPixelClusterizer/gpuClustering.h @@ -0,0 +1,350 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h +#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h + +#include +#include + +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/alpakaWorkDivHelper.h" +#include "AlpakaCore/HistoContainer.h" +#include 
"AlpakaDataFormats/gpuClusteringConstants.h" +#include "Geometry/phase1PixelTopology.h" + +namespace gpuClustering { + +#ifdef GPU_DEBUG + // move to ALPAKA_ACCELERATOR_NAMESPACE ? + ALPAKA_STATIC_ACC_MEM_GLOBAL uint32_t gMaxHit = 0; +#endif + + struct countModules { + template + ALPAKA_FN_ACC void operator()(const T_Acc& acc, + uint16_t const* __restrict__ id, + uint32_t* __restrict__ moduleStart, + int32_t* __restrict__ clusterId, + const unsigned int numElements) const { + cms::alpakatools::for_each_element_1D_grid_stride(acc, numElements, [&](uint32_t i) { + clusterId[i] = i; + if (InvId != id[i]) { + int j = i - 1; + while (j >= 0 and id[j] == InvId) + --j; + if (j < 0 or id[j] != id[i]) { + // boundary... + constexpr auto max = MaxNumModules; + auto loc = alpaka::atomic::atomicOp(acc, moduleStart, max); + + moduleStart[loc + 1] = i; + } + } + }); + } + }; + + // __launch_bounds__(256,4) + struct findClus { + template + ALPAKA_FN_ACC void operator()( + const T_Acc& acc, + uint16_t const* __restrict__ id, // module id of each pixel + uint16_t const* __restrict__ x, // local coordinates of each pixel + uint16_t const* __restrict__ y, // + uint32_t const* __restrict__ moduleStart, // index of the first pixel of each module + uint32_t* __restrict__ nClustersInModule, // output: number of clusters found in each module + uint32_t* __restrict__ moduleId, // output: module id of each module + int32_t* __restrict__ clusterId, // output: cluster id of each pixel + const unsigned int numElements) const { + const uint32_t blockIdx(alpaka::idx::getIdx(acc)[0u]); + if (blockIdx >= moduleStart[0]) + return; + + auto firstPixel = moduleStart[1 + blockIdx]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < MaxNumModules); + + const uint32_t threadIdxLocal(alpaka::idx::getIdx(acc)[0u]); + +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (threadIdxLocal == 0) + printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx); +#endif + + // find the index of the first pixel not belonging to this module (or invalid) + auto&& msize = alpaka::block::shared::st::allocVar(acc); + msize = numElements; + alpaka::block::sync::syncBlockThreads(acc); + + // Stride = block size. + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + + // Get thread / CPU element indices in block. 
+ const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_index_range_in_block(acc, Vec1::all(firstPixel)); + uint32_t firstElementIdx = firstElementIdxNoStride[0u]; + uint32_t endElementIdx = endElementIdxNoStride[0u]; + + // skip threads not associated to an existing pixel + for (uint32_t i = firstElementIdx; i < numElements; ++i) { + if (!cms::alpakatools::get_next_element_1D_index_stride( + i, firstElementIdx, endElementIdx, blockDimension, numElements)) + break; + if (id[i] == InvId) // skip invalid pixels + continue; + if (id[i] != thisModuleId) { // find the first pixel in a different module + alpaka::atomic::atomicOp(acc, &msize, i); + break; + } + } + + //init hist (ymax=416 < 512 : 9bits) + constexpr uint32_t maxPixInModule = 4000; + constexpr auto nbins = phase1PixelTopology::numColsInModule + 2; //2+2; + using Hist = cms::alpakatools::HistoContainer; + auto&& hist = alpaka::block::shared::st::allocVar(acc); + auto&& ws = alpaka::block::shared::st::allocVar(acc); + + cms::alpakatools::for_each_element_1D_block_stride(acc, Hist::totbins(), [&](uint32_t j) { hist.off[j] = 0; }); + alpaka::block::sync::syncBlockThreads(acc); + + assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId))); + + // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer) + if (0 == threadIdxLocal) { + if (msize - firstPixel > maxPixInModule) { + printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule); + msize = maxPixInModule + firstPixel; + } + } + + alpaka::block::sync::syncBlockThreads(acc); + assert(msize - firstPixel <= maxPixInModule); + +#ifdef GPU_DEBUG + auto&& totGood = alpaka::block::shared::st::allocVar(acc); + totGood = 0; + alpaka::block::sync::syncBlockThreads(acc); +#endif + + // fill histo + cms::alpakatools::for_each_element_1D_block_stride(acc, msize, firstPixel, [&](uint32_t i) { + if (id[i] != InvId) { // skip invalid pixels + hist.count(acc, y[i]); +#ifdef GPU_DEBUG + alpaka::atomic::atomicOp(acc, &totGood, 1u); +#endif + } + }); + alpaka::block::sync::syncBlockThreads(acc); + cms::alpakatools::for_each_element_in_thread_1D_index_in_block(acc, 32u, [&](uint32_t i) { + ws[i] = 0; // used by prefix scan... + }); + alpaka::block::sync::syncBlockThreads(acc); + hist.finalize(acc, ws); + alpaka::block::sync::syncBlockThreads(acc); +#ifdef GPU_DEBUG + assert(hist.size() == totGood); + if (thisModuleId % 100 == 1) + if (threadIdxLocal == 0) + printf("histo size %d\n", hist.size()); +#endif + cms::alpakatools::for_each_element_1D_block_stride(acc, msize, firstPixel, [&](uint32_t i) { + if (id[i] != InvId) { // skip invalid pixels + hist.fill(acc, y[i], i - firstPixel); + } + }); + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + // assume that we can cover the whole module with up to 16 blockDimension-wide iterations + constexpr unsigned int maxiter = 16; +#else + const unsigned int maxiter = + hist.size(); // After change of threadDimension, should be changed to hist.size() / blockDimension. +#endif + + // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event + constexpr int maxNeighbours = 10; + assert((hist.size() / blockDimension) <= maxiter); + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + constexpr uint32_t threadDimension = 1; +#else + // For now, match legacy, for perf comparison purposes. After fixes in perf, this should be tuned. 
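+      // threadDimension is the compile-time upper bound on the number of elements handled per thread;
+      // it only sizes the nn / nnn arrays below, and the actual per-thread element count of the work
+      // division is checked against it at run time (runTimeThreadDimension <= threadDimension).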
+ constexpr uint32_t threadDimension = 1; +#endif + const uint32_t runTimeThreadDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + assert(runTimeThreadDimension <= threadDimension); + + // nearest neighbour + uint16_t nn[maxiter][threadDimension][maxNeighbours]; + uint8_t nnn[maxiter][threadDimension]; // number of nn + for (uint32_t elementIdx = 0; elementIdx < threadDimension; ++elementIdx) { + for (uint32_t k = 0; k < maxiter; ++k) { + nnn[k][elementIdx] = 0; + } + } + + alpaka::block::sync::syncBlockThreads(acc); // for hit filling! + +#ifdef GPU_DEBUG + // look for anomalous high occupancy + auto&& n40 = alpaka::block::shared::st::allocVar(acc); + auto&& n60 = alpaka::block::shared::st::allocVar(acc); + n40 = n60 = 0; + alpaka::block::sync::syncBlockThreads(acc); + cms::alpakatools::for_each_element_1D_block_stride(acc, Hist::nbins(), [&](uint32_t j) { + if (hist.size(j) > 60) + alpaka::atomic::atomicOp(acc, &n60, 1u); + if (hist.size(j) > 40) + alpaka::atomic::atomicOp(acc, &n40, 1u); + }); + alpaka::block::sync::syncBlockThreads(acc); + if (0 == threadIdxLocal) { + if (n60 > 0) + printf("columns with more than 60 px %d in %d\n", n60, thisModuleId); + else if (n40 > 0) + printf("columns with more than 40 px %d in %d\n", n40, thisModuleId); + } + alpaka::block::sync::syncBlockThreads(acc); +#endif + + // fill NN + uint32_t k = 0u; + cms::alpakatools::for_each_element_1D_block_stride(acc, hist.size(), [&](uint32_t j) { + const uint32_t jEquivalentClass = j % threadDimension; + k = j / blockDimension; + assert(k < maxiter); + auto p = hist.begin() + j; + auto i = *p + firstPixel; + assert(id[i] != InvId); + assert(id[i] == thisModuleId); // same module + int be = Hist::bin(y[i] + 1); + auto e = hist.end(be); + ++p; + assert(0 == nnn[k][jEquivalentClass]); + for (; p < e; ++p) { + auto m = (*p) + firstPixel; + assert(m != i); + assert(int(y[m]) - int(y[i]) >= 0); + assert(int(y[m]) - int(y[i]) <= 1); + if (std::abs(int(x[m]) - int(x[i])) <= 1) { + auto l = nnn[k][jEquivalentClass]++; + assert(l < maxNeighbours); + nn[k][jEquivalentClass][l] = *p; + } + } + }); + + // for each pixel, look at all the pixels until the end of the module; + // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum; + // after the loop, all the pixel in each cluster should have the id equeal to the lowest + // pixel in the cluster ( clus[i] == i ). 
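+      // The while loop alternates two phases until a full pass makes no change in any thread (checked
+      // with syncBlockThreadsPredicate over "more"): on even passes each pixel's cluster id is atomically
+      // lowered to the minimum over its precomputed neighbours (nn); on odd passes the id chains are
+      // compressed by following clusterId[] up to its fixed point, as in a union-find.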
+ bool more = true; + int nloops = 0; + while (alpaka::block::sync::syncBlockThreadsPredicate(acc, more)) { + if (1 == nloops % 2) { + cms::alpakatools::for_each_element_1D_block_stride(acc, hist.size(), [&](uint32_t j) { + auto p = hist.begin() + j; + auto i = *p + firstPixel; + auto m = clusterId[i]; + while (m != clusterId[m]) + m = clusterId[m]; + clusterId[i] = m; + }); + } else { + more = false; + uint32_t k = 0u; + cms::alpakatools::for_each_element_1D_block_stride(acc, hist.size(), [&](uint32_t j) { + k = j / blockDimension; + const uint32_t jEquivalentClass = j % threadDimension; + auto p = hist.begin() + j; + auto i = *p + firstPixel; + for (int kk = 0; kk < nnn[k][jEquivalentClass]; ++kk) { + auto l = nn[k][jEquivalentClass][kk]; + auto m = l + firstPixel; + assert(m != i); + auto old = alpaka::atomic::atomicOp(acc, &clusterId[m], clusterId[i]); + if (old != clusterId[i]) { + // end the loop only if no changes were applied + more = true; + } + alpaka::atomic::atomicOp(acc, &clusterId[i], old); + } // nnloop + }); // pixel loop + } + ++nloops; + } // end while + +#ifdef GPU_DEBUG + { + auto&& n0 = alpaka::block::shared::st::allocVar(acc); + if (threadIdxLocal == 0) + n0 = nloops; + alpaka::block::sync::syncBlockThreads(acc); + auto ok = n0 == nloops; + assert(alpaka::block::sync::syncBlockThreadsPredicate(acc, ok)); + if (thisModuleId % 100 == 1) + if (threadIdxLocal == 0) + printf("# loops %d\n", nloops); + } +#endif + + auto&& foundClusters = alpaka::block::shared::st::allocVar(acc); + foundClusters = 0; + alpaka::block::sync::syncBlockThreads(acc); + + // find the number of different clusters, identified by a pixels with clus[i] == i; + // mark these pixels with a negative id. + cms::alpakatools::for_each_element_1D_block_stride(acc, msize, firstPixel, [&](uint32_t i) { + if (id[i] != InvId) { // skip invalid pixels + if (clusterId[i] == static_cast(i)) { + auto old = alpaka::atomic::atomicOp(acc, &foundClusters, 0xffffffff); + clusterId[i] = -(old + 1); + } + } + }); + alpaka::block::sync::syncBlockThreads(acc); + + // propagate the negative id to all the pixels in the cluster. 
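+      // Numbering scheme: the previous loop tagged each cluster seed (clusterId[i] == i) with a unique
+      // negative id -(k+1); here every member inherits its seed's negative id, and the last loop below
+      // remaps -(k+1) to k, so valid pixels end up with cluster ids 0 .. foundClusters-1 and invalid
+      // pixels are set to -9999.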
+ cms::alpakatools::for_each_element_1D_block_stride(acc, msize, firstPixel, [&](uint32_t i) { + if (id[i] != InvId) { // skip invalid pixels + if (clusterId[i] >= 0) { + // mark each pixel in a cluster with the same id as the first one + clusterId[i] = clusterId[clusterId[i]]; + } + } + }); + alpaka::block::sync::syncBlockThreads(acc); + + // adjust the cluster id to be a positive value starting from 0 + cms::alpakatools::for_each_element_1D_block_stride(acc, msize, firstPixel, [&](uint32_t i) { + if (id[i] == InvId) { // skip invalid pixels + clusterId[i] = -9999; + } else { + clusterId[i] = -clusterId[i] - 1; + } + }); + alpaka::block::sync::syncBlockThreads(acc); + + if (threadIdxLocal == 0) { + nClustersInModule[thisModuleId] = foundClusters; + moduleId[blockIdx] = thisModuleId; +#ifdef GPU_DEBUG + if (foundClusters > gMaxHit) { + gMaxHit = foundClusters; + if (foundClusters > 8) + printf("max hit %d in %d\n", foundClusters, thisModuleId); + } +#endif +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + printf("%d clusters in module %d\n", foundClusters, thisModuleId); +#endif + } + } + }; + +} // namespace gpuClustering + +#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h diff --git a/src/alpaka/test/alpaka/clustering_alpaka.cc b/src/alpaka/test/alpaka/clustering_alpaka.cc deleted file mode 100644 index 4f4982afa..000000000 --- a/src/alpaka/test/alpaka/clustering_alpaka.cc +++ /dev/null @@ -1,8 +0,0 @@ -#include -#include "AlpakaCore/alpakaConfig.h" - -int main() { - std::cout << "Exit success" << std::endl; - - return 0; -} diff --git a/src/alpaka/test/alpaka/clustering_t.cc b/src/alpaka/test/alpaka/clustering_t.cc new file mode 100644 index 000000000..295ecccb3 --- /dev/null +++ b/src/alpaka/test/alpaka/clustering_t.cc @@ -0,0 +1,387 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/alpakaWorkDivHelper.h" + +// dirty, but works +#include "plugin-SiPixelClusterizer/gpuClustering.h" +#include "plugin-SiPixelClusterizer/gpuClusterChargeCut.h" + +int main(void) { + using namespace gpuClustering; + + const DevHost host(alpaka::pltf::getDevByIdx(0u)); + const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 device( + alpaka::pltf::getDevByIdx(0u)); + ALPAKA_ACCELERATOR_NAMESPACE::Queue queue(device); + + constexpr unsigned int numElements = 256 * 2000; + // these in reality are already on GPU + auto h_id_buf = alpaka::mem::buf::alloc(host, numElements); + auto h_id = alpaka::mem::view::getPtrNative(h_id_buf); + auto h_x_buf = alpaka::mem::buf::alloc(host, numElements); + auto h_x = alpaka::mem::view::getPtrNative(h_x_buf); + auto h_y_buf = alpaka::mem::buf::alloc(host, numElements); + auto h_y = alpaka::mem::view::getPtrNative(h_y_buf); + auto h_adc_buf = alpaka::mem::buf::alloc(host, numElements); + auto h_adc = alpaka::mem::view::getPtrNative(h_adc_buf); + + auto h_clus_buf = alpaka::mem::buf::alloc(host, numElements); + auto h_clus = alpaka::mem::view::getPtrNative(h_clus_buf); + + auto d_id_buf = alpaka::mem::buf::alloc(device, numElements); + auto d_x_buf = alpaka::mem::buf::alloc(device, numElements); + auto d_y_buf = alpaka::mem::buf::alloc(device, numElements); + auto d_adc_buf = alpaka::mem::buf::alloc(device, numElements); + auto d_clus_buf = alpaka::mem::buf::alloc(device, numElements); + + auto d_moduleStart_buf = alpaka::mem::buf::alloc(device, MaxNumModules + 1); + auto d_clusInModule_buf = alpaka::mem::buf::alloc(device, MaxNumModules); + auto 
d_moduleId_buf = alpaka::mem::buf::alloc(device, MaxNumModules); + + // later random number + unsigned int n = 0; + int ncl = 0; + int y[10] = {5, 7, 9, 1, 3, 0, 4, 8, 2, 6}; + + auto generateClusters = [&](int kn) { + auto addBigNoise = 1 == kn % 2; + if (addBigNoise) { + constexpr int MaxPixels = 1000; + int id = 666; + for (int x = 0; x < 140; x += 3) { + for (int yy = 0; yy < 400; yy += 3) { + h_id[n] = id; + h_x[n] = x; + h_y[n] = yy; + h_adc[n] = 1000; + ++n; + ++ncl; + if (MaxPixels <= ncl) + break; + } + if (MaxPixels <= ncl) + break; + } + } + + { + // isolated + int id = 42; + int x = 10; + ++ncl; + h_id[n] = id; + h_x[n] = x; + h_y[n] = x; + h_adc[n] = kn == 0 ? 100 : 5000; + ++n; + + // first column + ++ncl; + h_id[n] = id; + h_x[n] = x; + h_y[n] = 0; + h_adc[n] = 5000; + ++n; + // first columns + ++ncl; + h_id[n] = id; + h_x[n] = x + 80; + h_y[n] = 2; + h_adc[n] = 5000; + ++n; + h_id[n] = id; + h_x[n] = x + 80; + h_y[n] = 1; + h_adc[n] = 5000; + ++n; + + // last column + ++ncl; + h_id[n] = id; + h_x[n] = x; + h_y[n] = 415; + h_adc[n] = 5000; + ++n; + // last columns + ++ncl; + h_id[n] = id; + h_x[n] = x + 80; + h_y[n] = 415; + h_adc[n] = 2500; + ++n; + h_id[n] = id; + h_x[n] = x + 80; + h_y[n] = 414; + h_adc[n] = 2500; + ++n; + + // diagonal + ++ncl; + for (int x = 20; x < 25; ++x) { + h_id[n] = id; + h_x[n] = x; + h_y[n] = x; + h_adc[n] = 1000; + ++n; + } + ++ncl; + // reversed + for (int x = 45; x > 40; --x) { + h_id[n] = id; + h_x[n] = x; + h_y[n] = x; + h_adc[n] = 1000; + ++n; + } + ++ncl; + h_id[n++] = InvId; // error + // messy + int xx[5] = {21, 25, 23, 24, 22}; + for (int k = 0; k < 5; ++k) { + h_id[n] = id; + h_x[n] = xx[k]; + h_y[n] = 20 + xx[k]; + h_adc[n] = 1000; + ++n; + } + // holes + ++ncl; + for (int k = 0; k < 5; ++k) { + h_id[n] = id; + h_x[n] = xx[k]; + h_y[n] = 100; + h_adc[n] = kn == 2 ? 100 : 1000; + ++n; + if (xx[k] % 2 == 0) { + h_id[n] = id; + h_x[n] = xx[k]; + h_y[n] = 101; + h_adc[n] = 1000; + ++n; + } + } + } + { + // id == 0 (make sure it works! + int id = 0; + int x = 10; + ++ncl; + h_id[n] = id; + h_x[n] = x; + h_y[n] = x; + h_adc[n] = 5000; + ++n; + } + // all odd id + for (int id = 11; id <= 1800; id += 2) { + if ((id / 20) % 2) + h_id[n++] = InvId; // error + for (int x = 0; x < 40; x += 4) { + ++ncl; + if ((id / 10) % 2) { + for (int k = 0; k < 10; ++k) { + h_id[n] = id; + h_x[n] = x; + h_y[n] = x + y[k]; + h_adc[n] = 100; + ++n; + h_id[n] = id; + h_x[n] = x + 1; + h_y[n] = x + y[k] + 2; + h_adc[n] = 1000; + ++n; + } + } else { + for (int k = 0; k < 10; ++k) { + h_id[n] = id; + h_x[n] = x; + h_y[n] = x + y[9 - k]; + h_adc[n] = kn == 2 ? 10 : 1000; + ++n; + if (y[k] == 3) + continue; // hole + if (id == 51) { + h_id[n++] = InvId; + h_id[n++] = InvId; + } // error + h_id[n] = id; + h_x[n] = x + 1; + h_y[n] = x + y[k] + 2; + h_adc[n] = kn == 2 ? 
10 : 1000; + ++n; + } + } + } + } + }; // end lambda + for (auto kkk = 0; kkk < 5; ++kkk) { + n = 0; + ncl = 0; + generateClusters(kkk); + + std::cout << "created " << n << " digis in " << ncl << " clusters" << std::endl; + assert(n <= numElements); + + auto h_nModules_buf = alpaka::mem::buf::alloc(host, 1u); + auto nModules = alpaka::mem::view::getPtrNative(h_nModules_buf); + nModules[0] = 0; + alpaka::mem::view::copy(queue, d_moduleStart_buf, h_nModules_buf, 1u); + + alpaka::mem::view::copy(queue, d_id_buf, h_id_buf, n); + alpaka::mem::view::copy(queue, d_x_buf, h_x_buf, n); + alpaka::mem::view::copy(queue, d_y_buf, h_y_buf, n); + alpaka::mem::view::copy(queue, d_adc_buf, h_adc_buf, n); + + // Launch CUDA Kernels +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + const int threadsPerBlockOrElementsPerThread = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256); +#else + // For now, match legacy, for perf comparison purposes. After fixes in perf, this should be tuned. + const int threadsPerBlockOrElementsPerThread = 1; +#endif + + // COUNT MODULES +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + const int blocksPerGridCountModules = + (numElements + threadsPerBlockOrElementsPerThread - 1) / threadsPerBlockOrElementsPerThread; +#else + const int blocksPerGridCountModules = 1; +#endif + const WorkDiv1& workDivCountModules = cms::alpakatools::make_workdiv(Vec1::all(blocksPerGridCountModules), + Vec1::all(threadsPerBlockOrElementsPerThread)); + std::cout << "CUDA countModules kernel launch with " << blocksPerGridCountModules << " blocks of " + << threadsPerBlockOrElementsPerThread << " threads (GPU) or elements (CPU). \n"; + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel( + workDivCountModules, + countModules(), + alpaka::mem::view::getPtrNative(d_id_buf), + alpaka::mem::view::getPtrNative(d_moduleStart_buf), + alpaka::mem::view::getPtrNative(d_clus_buf), + n)); + + // FIND CLUSTER + const WorkDiv1& workDivMaxNumModules = + cms::alpakatools::make_workdiv(Vec1::all(MaxNumModules), Vec1::all(threadsPerBlockOrElementsPerThread)); + std::cout << "CUDA findModules kernel launch with " << MaxNumModules << " blocks of " + << threadsPerBlockOrElementsPerThread << " threads (GPU) or elements (CPU). 
\n"; + + alpaka::mem::view::set(queue, d_clusInModule_buf, 0, MaxNumModules); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel( + workDivMaxNumModules, + findClus(), + alpaka::mem::view::getPtrNative(d_id_buf), + alpaka::mem::view::getPtrNative(d_x_buf), + alpaka::mem::view::getPtrNative(d_y_buf), + alpaka::mem::view::getPtrNative(d_moduleStart_buf), + alpaka::mem::view::getPtrNative(d_clusInModule_buf), + alpaka::mem::view::getPtrNative(d_moduleId_buf), + alpaka::mem::view::getPtrNative(d_clus_buf), + n)); + alpaka::mem::view::copy(queue, h_nModules_buf, d_moduleStart_buf, 1u); + + auto h_nclus_buf = alpaka::mem::buf::alloc(host, MaxNumModules); + auto nclus = alpaka::mem::view::getPtrNative(h_nclus_buf); + alpaka::mem::view::copy(queue, h_nclus_buf, d_clusInModule_buf, MaxNumModules); + + // Wait for memory transfers to be completed + alpaka::wait::wait(queue); + + auto h_moduleId_buf = alpaka::mem::buf::alloc(host, nModules[0]); + //auto moduleId = alpaka::mem::view::getPtrNative(h_moduleId_buf); + + std::cout << "before charge cut found " << std::accumulate(nclus, nclus + MaxNumModules, 0) << " clusters" + << std::endl; + for (auto i = MaxNumModules; i > 0; i--) + if (nclus[i - 1] > 0) { + std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl; + break; + } + if (ncl != std::accumulate(nclus, nclus + MaxNumModules, 0)) + std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl; + + // CLUSTER CHARGE CUT + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel( + workDivMaxNumModules, + clusterChargeCut(), + alpaka::mem::view::getPtrNative(d_id_buf), + alpaka::mem::view::getPtrNative(d_adc_buf), + alpaka::mem::view::getPtrNative(d_moduleStart_buf), + alpaka::mem::view::getPtrNative(d_clusInModule_buf), + alpaka::mem::view::getPtrNative(d_moduleId_buf), + alpaka::mem::view::getPtrNative(d_clus_buf), + n)); + alpaka::mem::view::copy(queue, h_id_buf, d_id_buf, n); + alpaka::mem::view::copy(queue, h_clus_buf, d_clus_buf, n); + alpaka::mem::view::copy(queue, h_nclus_buf, d_clusInModule_buf, MaxNumModules); + alpaka::mem::view::copy(queue, h_moduleId_buf, d_moduleId_buf, nModules[0]); + + // Wait for memory transfers to be completed + alpaka::wait::wait(queue); + std::cout << "found " << nModules[0] << " Modules active" << std::endl; + + // CROSS-CHECK + std::set clids; + for (unsigned int i = 0; i < n; ++i) { + assert(h_id[i] != 666); // only noise + if (h_id[i] == InvId) + continue; + assert(h_clus[i] >= 0); + assert(h_clus[i] < int(nclus[h_id[i]])); + clids.insert(h_id[i] * 1000 + h_clus[i]); + // clids.insert(h_clus[i]); + } + + // verify no hole in numbering + auto p = clids.begin(); + auto cmid = (*p) / 1000; + assert(0 == (*p) % 1000); + auto c = p; + ++c; + std::cout << "first clusters " << *p << ' ' << *c << ' ' << nclus[cmid] << ' ' << nclus[(*c) / 1000] << std::endl; + std::cout << "last cluster " << *clids.rbegin() << ' ' << nclus[(*clids.rbegin()) / 1000] << std::endl; + for (; c != clids.end(); ++c) { + auto cc = *c; + auto pp = *p; + auto mid = cc / 1000; + auto pnc = pp % 1000; + auto nc = cc % 1000; + if (mid != cmid) { + assert(0 == cc % 1000); + assert(nclus[cmid] - 1 == pp % 1000); + // if (nclus[cmid]-1 != pp%1000) std::cout << "error size " << mid << ": " << nclus[mid] << ' ' << pp << std::endl; + cmid = mid; + p = c; + continue; + } + p = c; + // assert(nc==pnc+1); + if (nc != pnc + 1) + std::cout << "error " << mid << ": " << nc << ' ' << pnc << std::endl; + } + + std::cout << "found " << 
std::accumulate(nclus, nclus + MaxNumModules, 0) << ' ' << clids.size() << " clusters" + << std::endl; + for (auto i = MaxNumModules; i > 0; i--) + if (nclus[i - 1] > 0) { + std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl; + break; + } + // << " and " << seeds.size() << " seeds" << std::endl; + } /// end loop kkk + return 0; +}
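For reference, a minimal host-side sketch (not part of the patch) of the strided element iteration that the new cms::alpakatools::get_next_element_1D_index_stride helper implements and that the kernels above rely on; the work-division numbers (threadsPerBlock, elementsPerThread) are made-up values for illustration, and next_index simply mirrors the helper's logic outside of Alpaka.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Same logic as cms::alpakatools::get_next_element_1D_index_stride.
bool next_index(uint32_t& i, uint32_t& first, uint32_t& end, uint32_t stride, uint32_t max) {
  if (i == end) {   // this thread exhausted its current contiguous chunk
    first += stride;
    end += stride;
    i = first;
    if (i >= max)
      return false;  // the strided chunk starts past the end of the data
  }
  return true;
}

int main() {
  constexpr uint32_t numElements = 1000;
  constexpr uint32_t threadsPerBlock = 32;   // hypothetical work division
  constexpr uint32_t elementsPerThread = 4;  // CPU backend: several elements per "thread"
  constexpr uint32_t stride = threadsPerBlock * elementsPerThread;

  std::vector<int> visits(numElements, 0);
  for (uint32_t t = 0; t < threadsPerBlock; ++t) {
    uint32_t first = t * elementsPerThread;
    uint32_t end = first + elementsPerThread;
    // Same loop shape as in clusterChargeCut / findClus.
    for (uint32_t i = first; i < numElements; ++i) {
      if (!next_index(i, first, end, stride, numElements))
        break;
      ++visits[i];  // kernel body would go here
    }
  }
  for (auto v : visits)
    assert(v == 1);  // every element is visited exactly once, with no overlap between threads
  std::printf("all %u elements visited exactly once\n", numElements);
  return 0;
}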