diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h index 1c076e8705aa0..100766324a424 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h @@ -5,6 +5,7 @@
 #include
 #include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h"
+#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
@@ -42,8 +43,26 @@ class TrackSoAHeterogeneousT {
   // this is chi2/ndof as not necessarily all hits are used in the fit
   eigenSoA::ScalarSoA chi2;
+  eigenSoA::ScalarSoA nLayers;
+  constexpr int nHits(int i) const { return detIndices.size(i); }
+  constexpr bool isTriplet(int i) const { return nLayers(i) == 3; }
+
+  constexpr int computeNumberOfLayers(int32_t i) const {
+    // layers are in order and we assume tracks are either forward or backward
+    auto pdet = detIndices.begin(i);
+    int nl = 1;
+    auto ol = phase1PixelTopology::getLayer(*pdet);
+    for (; pdet < detIndices.end(i); ++pdet) {
+      auto il = phase1PixelTopology::getLayer(*pdet);
+      if (il != ol)
+        ++nl;
+      ol = il;
+    }
+    return nl;
+  }
+
   // State at the Beam spot
   // phi,tip,1/pt,cotan(theta),zip
   TrajectoryStateSoAT stateAtBS;
diff --git a/DataFormats/Math/interface/choleskyInversion.h b/DataFormats/Math/interface/choleskyInversion.h index b84e7cc80a120..90d9262957376 100644 --- a/DataFormats/Math/interface/choleskyInversion.h +++ b/DataFormats/Math/interface/choleskyInversion.h @@ -5,7 +5,78 @@
 #include
-/**
+namespace math {
+  namespace cholesky {
+
+    template <typename M1, typename M2, int N = M2::ColsAtCompileTime>
+// without this: either does not compile or compiles and then fails silently at runtime
+#ifdef __CUDACC__
+    __host__ __device__
+#endif
+        inline constexpr void
+        invertNN(M1 const& src, M2& dst) {
+
+      // origin: CERNLIB
+
+      using T = typename M2::Scalar;
+
+      T a[N][N];
+      for (int i = 0; i < N; ++i) {
+        a[i][i] = src(i, i);
+        for (int j = i + 1; j < N; ++j)
+          a[j][i] = src(i, j);
+      }
+
+      for (int j = 0; j < N; ++j) {
+        a[j][j] = T(1.) / a[j][j];
+        int jp1 = j + 1;
+        for (int l = jp1; l < N; ++l) {
+          a[j][l] = a[j][j] * a[l][j];
+          T s1 = -a[l][jp1];
+          for (int i = 0; i < jp1; ++i)
+            s1 += a[l][i] * a[i][jp1];
+          a[l][jp1] = -s1;
+        }
+      }
+
+      if constexpr (N == 1) {
+        dst(0, 0) = a[0][0];
+        return;
+      }
+      a[0][1] = -a[0][1];
+      a[1][0] = a[0][1] * a[1][1];
+      for (int j = 2; j < N; ++j) {
+        int jm1 = j - 1;
+        for (int k = 0; k < jm1; ++k) {
+          T s31 = a[k][j];
+          for (int i = k; i < jm1; ++i)
+            s31 += a[k][i + 1] * a[i + 1][j];
+          a[k][j] = -s31;
+          a[j][k] = -s31 * a[j][j];
+        }
+        a[jm1][j] = -a[jm1][j];
+        a[j][jm1] = a[jm1][j] * a[j][j];
+      }
+
+      int j = 0;
+      while (j < N - 1) {
+        T s33 = a[j][j];
+        for (int i = j + 1; i < N; ++i)
+          s33 += a[j][i] * a[i][j];
+        dst(j, j) = s33;
+
+        ++j;
+        for (int k = 0; k < j; ++k) {
+          T s32 = 0;
+          for (int i = j; i < N; ++i)
+            s32 += a[k][i] * a[i][j];
+          dst(k, j) = dst(j, k) = s32;
+        }
+      }
+      dst(j, j) = a[j][j];
+    }
+
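// --- illustrative aside (not part of the patch) ------------------------------
// Minimal host-side sketch of how the generic invertNN path above can be
// exercised for N > 6, the regime the fully inlined Inverter specializations
// do not cover. The matrix construction here is an assumption, chosen only to
// make the input symmetric positive definite; the test files further down do
// the real validation.
#include "DataFormats/Math/interface/choleskyInversion.h"

#include <Eigen/Core>
#include <iostream>

void exampleInvertNN() {
  constexpr int N = 10;
  using M = Eigen::Matrix<double, N, N>;
  M m = M::Identity();
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < i; ++j)
      m(i, j) = m(j, i) = 0.1 / double(1 + i + j);  // small symmetric off-diagonals keep it positive definite
  M inv;
  math::cholesky::invertNN(m, inv);  // N > 6: takes the CERNLIB-style generic path
  // the residual against the identity should be at machine precision
  std::cout << (m * inv - M::Identity()).cwiseAbs().maxCoeff() << std::endl;
}
// -----------------------------------------------------------------------------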
+
+    /**
 * fully inlined specialized code to perform the inversion of a
 * positive definite matrix of rank up to 6.
 *
@@ -16,8 +87,6 @@
 *
 *
 */
-namespace math {
-  namespace cholesky {
    template <typename M1, typename M2>
    inline constexpr void __attribute__((always_inline)) invert11(M1 const& src, M2& dst) {
@@ -336,12 +405,12 @@ namespace math {
    };
    // Eigen interface
-    template
-    inline constexpr void __attribute__((always_inline))
-    invert(Eigen::DenseBase const& src, Eigen::DenseBase& dst) {
-      using M1 = Eigen::DenseBase;
-      using M2 = Eigen::DenseBase;
-      Inverter::eval(src, dst);
+    template <typename M1, typename M2>
+    inline constexpr void __attribute__((always_inline)) invert(M1 const& src, M2& dst) {
+      if constexpr (M2::ColsAtCompileTime < 7)
+        Inverter<M1, M2, M2::ColsAtCompileTime>::eval(src, dst);
+      else
+        invertNN(src, dst);
    }
  }  // namespace cholesky
diff --git a/DataFormats/Math/test/BuildFile.xml b/DataFormats/Math/test/BuildFile.xml index faa85fb2224ad..ad0058fe0dcb9 100644 --- a/DataFormats/Math/test/BuildFile.xml +++ b/DataFormats/Math/test/BuildFile.xml @@ -127,4 +127,10 @@
+
+
+
+
+
+
diff --git a/DataFormats/Math/test/CholeskyInvert_t.cpp b/DataFormats/Math/test/CholeskyInvert_t.cpp index de048e3d4455b..7794e1d1d0113 100644 --- a/DataFormats/Math/test/CholeskyInvert_t.cpp +++ b/DataFormats/Math/test/CholeskyInvert_t.cpp @@ -138,6 +138,7 @@ int main() {
   go<5>(true);
   go<6>(true);
-  // go<10>();
+  go<10>(false);
+  go<10>(true);
   return 0;
 }
diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu index 558f9296150c7..7faf30ee3d54c 100644 --- a/DataFormats/Math/test/CholeskyInvert_t.cu +++ b/DataFormats/Math/test/CholeskyInvert_t.cu @@ -85,7 +85,8 @@
 template <int N>
 void go(bool soa) {
   constexpr unsigned int DIM = N;
   using MX = MXN;
-  std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl;
+  std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << " in " << (soa ? "SOA" : "AOS")
+            << " mode" << std::endl;
   auto start = std::chrono::high_resolution_clock::now();
   auto delta = start - start;
@@ -197,13 +198,15 @@ int main() {
   go<4>(false);
   go<5>(false);
   go<6>(false);
+  go<7>(false);
+  go<10>(false);
   go<2>(true);
   go<3>(true);
   go<4>(true);
   go<5>(true);
   go<6>(true);
-
-  // go<10>();
+  go<7>(true);
+  go<10>(true);
   return 0;
 }
diff --git a/DataFormats/Math/test/simpleCholeskyTest.cu b/DataFormats/Math/test/simpleCholeskyTest.cu new file mode 100644 index 0000000000000..04a8438524276 --- /dev/null +++ b/DataFormats/Math/test/simpleCholeskyTest.cu @@ -0,0 +1,103 @@
+#include "DataFormats/Math/interface/choleskyInversion.h"
+
+using namespace math::cholesky;
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+
+template <typename M, int N = M::ColsAtCompileTime>
+__global__ void invert(M* mm, int n) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= n)
+    return;
+
+  auto& m = mm[i];
+
+  printf("before %d %f %f %f\n", N, m(0, 0), m(1, 0), m(1, 1));
+
+  invertNN(m, m);
+
+  printf("after %d %f %f %f\n", N, m(0, 0), m(1, 0), m(1, 1));
+}
+
+template <typename M, int N = M::ColsAtCompileTime>
+__global__ void invertE(M* mm, int n) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= n)
+    return;
+
+  auto& m = mm[i];
+
+  printf("before %d %f %f %f\n", N, m(0, 0), m(1, 0), m(1, 1));
+
+  m = m.inverse();
+
+  printf("after %d %f %f %f\n", N, m(0, 0), m(1, 0), m(1, 1));
+}
+
+// generate matrices
+template <typename M>
+void genMatrix(M& m) {
+  using T = typename std::remove_reference<decltype(m(0, 0))>::type;
+  int n = M::ColsAtCompileTime;
+  std::mt19937 eng;
+  // std::mt19937 eng2;
+  std::uniform_real_distribution<T> rgen(0., 1.);
+
+  // generate first diagonal elements
+  for (int i = 0; i < n; ++i) {
+    double maxVal = i * 10000 / (n - 1) + 1;  // max condition is 10^4
+    m(i, i) = maxVal * rgen(eng);
+  }
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < i; ++j) {
+      double v = 0.3 * std::sqrt(m(i, i) * m(j, j));  // this makes the matrix positive definite
+      m(i, j) = v * rgen(eng);
+      m(j, i) = m(i, j);
+    }
+  }
+}
+
+template <typename T, int N>
+using MXN = Eigen::Matrix<T, N, N>;
+
+int main() {
+  cms::cudatest::requireDevices();
+
+  constexpr int DIM = 6;
+
+  using M = MXN<double, DIM>;
+
+  M m;
+
+  genMatrix(m);
+
+  printf("on CPU before %d %f %f %f\n", DIM, m(0, 0), m(1, 0), m(1, 1));
+
+  invertNN(m, m);
+
+  printf("on CPU after %d %f %f %f\n", DIM, m(0, 0), m(1, 0), m(1, 1));
+
+  double* d;
+  cudaMalloc(&d, sizeof(M));
+  cudaMemcpy(d, &m, sizeof(M), cudaMemcpyHostToDevice);
+  invert<<<1, 1>>>((M*)d, 1);
+  cudaCheck(cudaDeviceSynchronize());
+  invertE<<<1, 1>>>((M*)d, 1);
+  cudaCheck(cudaDeviceSynchronize());
+
+  return 0;
+}
diff --git a/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h b/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h index 4e70cd86e1524..27f5b8b573ea7 100644 --- a/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h +++ b/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h @@ -27,17 +27,20 @@ namespace phase1PixelTopology {
   constexpr uint32_t numberOfModules = 1856;
   constexpr uint32_t numberOfLayers = 10;
-  constexpr uint32_t layerStart[numberOfLayers + 1] = {0,
-                                                       96,
-                                                       320,
-                                                       672,  // barrel
-                                                       1184,
-                                                       1296,
-                                                       1408,  // positive endcap
-                                                       1520,
-                                                       1632,
-                                                       1744,  // negative endcap
-                                                       numberOfModules};
+#ifdef __CUDA_ARCH__
+  __device__
+#endif
+      constexpr uint32_t layerStart[numberOfLayers + 1] = {0,
+                                                           96,
+                                                           320,
+                                                           672,  // barrel
+                                                           1184,
+                                                           1296,
+
1408, // positive endcap + 1520, + 1632, + 1744, // negative endcap + numberOfModules}; constexpr char const* layerName[numberOfLayers] = { "BL1", "BL2", @@ -84,8 +87,8 @@ namespace phase1PixelTopology { constexpr uint32_t maxModuleStride = findMaxModuleStride(); - constexpr uint8_t findLayer(uint32_t detId) { - for (uint8_t i = 0; i < std::size(layerStart); ++i) + constexpr uint8_t findLayer(uint32_t detId, uint8_t sl = 0) { + for (uint8_t i = sl; i < std::size(layerStart); ++i) if (detId < layerStart[i + 1]) return i; return std::size(layerStart); @@ -100,7 +103,15 @@ namespace phase1PixelTopology { } constexpr uint32_t layerIndexSize = numberOfModules / maxModuleStride; - constexpr std::array layer = map_to_array(findLayerFromCompact); +#ifdef __CUDA_ARCH__ + __device__ +#endif + constexpr std::array + layer = map_to_array(findLayerFromCompact); + + constexpr uint8_t getLayer(uint32_t detId) { + return phase1PixelTopology::layer[detId / phase1PixelTopology::maxModuleStride]; + } constexpr bool validateLayerIndex() { bool res = true; diff --git a/Geometry/TrackerGeometryBuilder/test/BuildFile.xml b/Geometry/TrackerGeometryBuilder/test/BuildFile.xml index d9efc604f4627..238ae24f0f170 100644 --- a/Geometry/TrackerGeometryBuilder/test/BuildFile.xml +++ b/Geometry/TrackerGeometryBuilder/test/BuildFile.xml @@ -40,7 +40,11 @@ - + + + + + diff --git a/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cpp b/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cu similarity index 86% rename from Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cpp rename to Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cu index 2dda8ed41500b..7344382de7293 100644 --- a/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cpp +++ b/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cu @@ -4,6 +4,9 @@ #include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + namespace { // original code from CMSSW_4_4 @@ -123,7 +126,23 @@ namespace { } // namespace +constexpr void testLayer() { + for (auto i = 0U; i < phase1PixelTopology::numberOfModules; ++i) { + uint32_t layer = phase1PixelTopology::getLayer(i); + uint32_t tLayer = phase1PixelTopology::findLayer(i); + assert(tLayer == layer); + //std::cout << "module " << i << ": " << "layer " << layer << ", \"" << phase1PixelTopology::layerName[layer] << "\", [" << phase1PixelTopology::layerStart[layer] << ", " << phase1PixelTopology::layerStart[layer+1] << ")" << std::endl; + assert(layer < phase1PixelTopology::numberOfLayers); + assert(i >= phase1PixelTopology::layerStart[layer]); + assert(i < phase1PixelTopology::layerStart[layer + 1]); + } +} + +__global__ void kernel_testLayer() { testLayer(); } + int main() { + cms::cudatest::requireDevices(); + for (uint16_t ix = 0; ix < 80 * 2; ++ix) { auto ori = localXori(ix); auto xl = phase1PixelTopology::localX(ix); @@ -144,17 +163,17 @@ int main() { for (auto i = 0U; i < phase1PixelTopology::numberOfLayers; ++i) { std::cout << "layer " << i << ", \"" << phase1PixelTopology::layerName[i] << "\", [" - << phase1PixelTopology::layerStart[i] << ", " << phase1PixelTopology::layerStart[i + 1] << ")" - << std::endl; + << phase1PixelTopology::layerStart[i] << ", " << phase1PixelTopology::layerStart[i + 1] << ") " + << phase1PixelTopology::layerStart[i + 1] - phase1PixelTopology::layerStart[i] << std::endl; } - for (auto i = 0U; i < 
phase1PixelTopology::numberOfModules; ++i) {
-    auto layer = static_cast(phase1PixelTopology::layer[i / phase1PixelTopology::maxModuleStride]);
-    //std::cout << "module " << i << ": " << "layer " << layer << ", \"" << phase1PixelTopology::layerName[layer] << "\", [" << phase1PixelTopology::layerStart[layer] << ", " << phase1PixelTopology::layerStart[layer+1] << ")" << std::endl;
-    assert(layer < phase1PixelTopology::numberOfLayers);
-    assert(i >= phase1PixelTopology::layerStart[layer]);
-    assert(i < phase1PixelTopology::layerStart[layer + 1]);
-  }
+  std::cout << "maxModuleStride layerIndexSize " << phase1PixelTopology::maxModuleStride << ' '
+            << phase1PixelTopology::layerIndexSize << std::endl;
+
+  testLayer();
+
+  kernel_testLayer<<<1, 1>>>();
+  cudaCheck(cudaDeviceSynchronize());
   return 0;
 }
diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h index 4b661bad26681..fbcac4c07836f 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h @@ -154,8 +154,17 @@ namespace brokenline {
     riemannFit::Vector2d dVec;
     riemannFit::Vector2d eVec;
-    dVec = hits.block(0, 1, 2, 1) - hits.block(0, 0, 2, 1);
-    eVec = hits.block(0, n - 1, 2, 1) - hits.block(0, n - 2, 2, 1);
+    int mId = 1;
+
+    if constexpr (n > 3) {
+      riemannFit::Vector2d middle = 0.5 * (hits.block(0, n - 1, 2, 1) + hits.block(0, 0, 2, 1));
+      auto d1 = (hits.block(0, n / 2, 2, 1) - middle).squaredNorm();
+      auto d2 = (hits.block(0, n / 2 - 1, 2, 1) - middle).squaredNorm();
+      mId = d1 < d2 ? n / 2 : n / 2 - 1;
+    }
+
+    dVec = hits.block(0, mId, 2, 1) - hits.block(0, 0, 2, 1);
+    eVec = hits.block(0, n - 1, 2, 1) - hits.block(0, mId, 2, 1);
     results.qCharge = riemannFit::cross2D(dVec, eVec) > 0 ? -1 : 1;
     const double slope = -results.qCharge / fast_fit(3);
@@ -249,8 +258,17 @@ namespace brokenline {
   __host__ __device__ inline void fastFit(const M3xN& hits, V4& result) {
     constexpr uint32_t n = M3xN::ColsAtCompileTime;
-    const riemannFit::Vector2d a = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1);
-    const riemannFit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, n / 2, 2, 1);
+    int mId = 1;
+
+    if constexpr (n > 3) {
+      riemannFit::Vector2d middle = 0.5 * (hits.block(0, n - 1, 2, 1) + hits.block(0, 0, 2, 1));
+      auto d1 = (hits.block(0, n / 2, 2, 1) - middle).squaredNorm();
+      auto d2 = (hits.block(0, n / 2 - 1, 2, 1) - middle).squaredNorm();
+      mId = d1 < d2 ?
n / 2 : n / 2 - 1; + } + + const riemannFit::Vector2d a = hits.block(0, mId, 2, 1) - hits.block(0, 0, 2, 1); + const riemannFit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, mId, 2, 1); const riemannFit::Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); auto tmp = 0.5 / riemannFit::cross2D(c, a); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc index bebfe0e08008e..1523640e2ef8f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc @@ -3,67 +3,118 @@ void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { assert(tuples_); +#ifdef BROKENLINE_DEBUG + setlinebuf(stdout); +#endif + // Fit internals - auto hitsGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = + auto tkidGPU = std::make_unique(maxNumberOfConcurrentFits_); + auto hitsGPU = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double)); + auto hits_geGPU = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<6>) / sizeof(float)); + auto fast_fit_resultsGPU = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double)); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets - kernel_BLFastFit<3>( - tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); + kernel_BLFastFit<3>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 3, + 3, + offset); kernel_BLFit<3>(tupleMultiplicity_, bField_, outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); - - // fit quads - kernel_BLFastFit<4>( - tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); - - kernel_BLFit<4>(tupleMultiplicity_, - bField_, - outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); - if (fit5as4_) { - // fit penta (only first 4) - kernel_BLFastFit<4>( - tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + if (fitNas4_) { + // fit all as 4 + kernel_BLFastFit<4>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 4, + 8, + offset); kernel_BLFit<4>(tupleMultiplicity_, bField_, outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); } else { + // fit quads + kernel_BLFastFit<4>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 4, + 4, + offset); + + kernel_BLFit<4>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); // fit penta (all 5) - kernel_BLFastFit<5>( - tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), 
fast_fit_resultsGPU_.get(), 5, offset); + kernel_BLFastFit<5>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 5, + 5, + offset); kernel_BLFit<5>(tupleMultiplicity_, bField_, outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); + // fit sexta and above (as 6) + kernel_BLFastFit<6>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 6, + 8, + offset); + + kernel_BLFit<6>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); } } // loop on concurrent fits diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index d2ca583e86bd0..d99a96b705451 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -11,73 +11,120 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; // Fit internals - auto hitsGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique( + auto tkidGPU = cms::cuda::make_device_unique(maxNumberOfConcurrentFits_, stream); + auto hitsGPU = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double), stream); + auto hits_geGPU = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<6>) / sizeof(float), stream); + auto fast_fit_resultsGPU = cms::cuda::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets - kernel_BLFastFit<3><<>>( - tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); + kernel_BLFastFit<3><<>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 3, + 3, + offset); cudaCheck(cudaGetLastError()); kernel_BLFit<3><<>>(tupleMultiplicity_, bField_, outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); - cudaCheck(cudaGetLastError()); - - // fit quads - kernel_BLFastFit<4><<>>( - tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); - cudaCheck(cudaGetLastError()); - - kernel_BLFit<4><<>>(tupleMultiplicity_, - bField_, - outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); cudaCheck(cudaGetLastError()); - if (fit5as4_) { - // fit penta (only first 4) - kernel_BLFastFit<4><<>>( - tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + if (fitNas4_) { + // fit all as 4 + kernel_BLFastFit<4><<>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + 
hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 4, + 8, + offset); cudaCheck(cudaGetLastError()); kernel_BLFit<4><<>>(tupleMultiplicity_, bField_, outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - cudaCheck(cudaGetLastError()); + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); } else { - // fit penta (all 5) - kernel_BLFastFit<5><<>>( - tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + // fit quads + kernel_BLFastFit<4><<>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 4, + 4, + offset); cudaCheck(cudaGetLastError()); - kernel_BLFit<5><<>>(tupleMultiplicity_, + kernel_BLFit<4><<>>(tupleMultiplicity_, bField_, outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); + // fit penta (all 5) + kernel_BLFastFit<5><<>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 5, + 5, + offset); + cudaCheck(cudaGetLastError()); + + kernel_BLFit<5><<<8, blockSize, 0, stream>>>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); + cudaCheck(cudaGetLastError()); + // fit sexta and above (as 6) + kernel_BLFastFit<6><<<4, blockSize, 0, stream>>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 6, + 8, + offset); + cudaCheck(cudaGetLastError()); + + kernel_BLFit<6><<<4, blockSize, 0, stream>>>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); cudaCheck(cudaGetLastError()); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index d572c8010cdfb..6ec6afb83cba1 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -19,6 +19,8 @@ using HitsOnGPU = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; using OutputSoA = pixelTrack::TrackSoA; +using tindex_type = caConstants::tindex_type; +constexpr auto invalidTkId = std::numeric_limits::max(); // #define BL_DUMP_HITS @@ -26,41 +28,53 @@ template __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, HitsOnGPU const *__restrict__ hhp, + tindex_type *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit, - uint32_t nHits, - uint32_t offset) { + uint32_t nHitsL, + uint32_t nHitsH, + int32_t offset) { constexpr uint32_t hitsInFit = N; - assert(hitsInFit <= nHits); - + assert(hitsInFit <= nHitsL); + assert(nHitsL <= nHitsH); assert(hhp); + assert(phits); assert(pfast_fit); assert(foundNtuplets); assert(tupleMultiplicity); // look in bin for this hit multiplicity auto local_start = blockIdx.x * blockDim.x + threadIdx.x; + int totTK = tupleMultiplicity->end(nHitsH) - tupleMultiplicity->begin(nHitsL); + assert(totTK <= int(tupleMultiplicity->size())); + assert(totTK >= 0); #ifdef BROKENLINE_DEBUG if (0 == local_start) { - printf("%d total Ntuple\n", foundNtuplets->nOnes()); - printf("%d 
Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); + printf("%d total Ntuple\n", tupleMultiplicity->size()); + printf("%d Ntuple of size %d/%d for %d hits to fit\n", totTK, nHitsL, nHitsH, hitsInFit); } #endif for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { - auto tuple_idx = local_idx + offset; - if (tuple_idx >= tupleMultiplicity->size(nHits)) + int tuple_idx = local_idx + offset; + if (tuple_idx >= totTK) { + ptkids[local_idx] = invalidTkId; break; - + } // get it from the ntuple container (one to one to helix) - auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + auto tkid = *(tupleMultiplicity->begin(nHitsL) + tuple_idx); assert(tkid < foundNtuplets->nOnes()); - assert(foundNtuplets->size(tkid) == nHits); + ptkids[local_idx] = tkid; + + auto nHits = foundNtuplets->size(tkid); + + assert(nHits >= nHitsL); + assert(nHits <= nHitsH); riemannFit::Map3xNd hits(phits + local_idx); riemannFit::Map4d fast_fit(pfast_fit + local_idx); @@ -84,9 +98,18 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, auto dz = hhp->zGlobal(hitId[hitsInFit - 1]) - hhp->zGlobal(hitId[0]); float ux, uy, uz; #endif - for (unsigned int i = 0; i < hitsInFit; ++i) { - auto hit = hitId[i]; + + float incr = std::max(1.f, float(nHits) / float(hitsInFit)); + float n = 0; + for (uint32_t i = 0; i < hitsInFit; ++i) { + int j = int(n + 0.5f); // round + if (hitsInFit - 1 == i) + j = nHits - 1; // force last hit to ensure max lever arm. + assert(j < int(nHits)); + n += incr; + auto hit = hitId[j]; float ge[6]; + #ifdef YERR_FROM_DC auto const &dp = hhp->cpeParams().detParams(hhp->detectorIndex(hit)); auto status = hhp->status(hit); @@ -115,26 +138,21 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, #endif #ifdef BL_DUMP_HITS + bool dump = foundNtuplets->size(tkid) == 5; if (dump) { - printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", + printf("Track id %d %d Hit %d on %d\nGlobal: hits.col(%d) << %f,%f,%f\n", + local_idx, tkid, + hit, hhp->detectorIndex(hit), i, hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit)); - printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", - tkid, - hhp->detetectorIndex(hit), - i, - ge[0], - ge[1], - ge[2], - ge[3], - ge[4], - ge[5]); + printf("Error: hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", i, ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]); } #endif + hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } @@ -152,13 +170,10 @@ template __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, double bField, OutputSoA *results, + tindex_type const *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, - double *__restrict__ pfast_fit, - uint32_t nHits, - uint32_t offset) { - assert(N <= nHits); - + double *__restrict__ pfast_fit) { assert(results); assert(pfast_fit); @@ -168,12 +183,12 @@ __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ auto local_start = blockIdx.x * blockDim.x + threadIdx.x; for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { - auto tuple_idx = local_idx + offset; - if (tuple_idx >= tupleMultiplicity->size(nHits)) + if (invalidTkId == ptkids[local_idx]) break; - // get it for the ntuple container (one to one to helix) - auto 
tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + auto tkid = ptkids[local_idx]; + + assert(tkid < caConstants::maxTuples); riemannFit::Map3xNd hits(phits + local_idx); riemannFit::Map4d fast_fit(pfast_fit + local_idx); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index c1428a6daaf55..127831e0e2eb7 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -44,6 +44,7 @@ namespace caConstants { constexpr uint32_t maxNumberOfLayerPairs = 20; constexpr uint32_t maxNumberOfLayers = 10; constexpr uint32_t maxTuples = maxNumberOfTuples; + constexpr int32_t maxHitsOnTrack = 10; // Modules constants constexpr uint32_t max_ladder_bpx0 = 12; @@ -76,7 +77,7 @@ namespace caConstants { using OuterHitOfCellContainer = cms::cuda::VecArray; using TuplesContainer = cms::cuda::OneToManyAssoc; using HitToTuple = cms::cuda::OneToManyAssoc; // 3.5 should be enough - using TupleMultiplicity = cms::cuda::OneToManyAssoc; + using TupleMultiplicity = cms::cuda::OneToManyAssoc; struct OuterHitOfCell { OuterHitOfCellContainer* container; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index c36e6787ef871..bd8151a2a816a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -13,11 +13,6 @@ void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) { kernel_printCounters(counters); } -template <> -void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t) { - kernel_fillHitDetIndices(&tracks_d->hitIndices, hv, &tracks_d->detIndices); -} - template <> void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { auto nhits = hh.nHits(); @@ -85,6 +80,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr template <> void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { auto *tuples_d = &tracks_d->hitIndices; + auto *detId_d = &tracks_d->detIndices; auto *quality_d = tracks_d->qualityData(); assert(tuples_d && quality_d); @@ -128,12 +124,15 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * quality_d, params_.minHitsPerNtuplet_); if (params_.doStats_) - kernel_mark_used(hh.view(), device_theCells_.get(), device_nCells_); + kernel_mark_used(device_theCells_.get(), device_nCells_); cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d); + kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d); + kernel_fillNLayers(tracks_d); + // remove duplicates (tracks that share a doublet) - kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, quality_d, params_.dupPassThrough_); + kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); @@ -160,7 +159,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA } // remove duplicates (tracks that share a doublet) - kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, tracks_d, 
params_.dupPassThrough_); + kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, params_.dupPassThrough_); // fill hit->track "map" if (params_.doSharedHitCut_ || params_.doStats_) { @@ -171,37 +170,21 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // remove duplicates (tracks that share at least one hit) if (params_.doSharedHitCut_) { - kernel_rejectDuplicate(hh.view(), - tuples_d, - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_rejectDuplicate( + tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); kernel_sharedHitCleaner(hh.view(), - tuples_d, tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); if (params_.useSimpleTripletCleaner_) { - kernel_simpleTripletCleaner(hh.view(), - tuples_d, - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_simpleTripletCleaner( + tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } else { - kernel_tripletCleaner(hh.view(), - tuples_d, - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_tripletCleaner( + tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index ae770cd77e740..689cc0afd052b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,23 +1,10 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" -template <> -void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto blockSize = 128; - auto numberOfBlocks = (HitContainer::ctCapacity() + blockSize - 1) / blockSize; - - kernel_fillHitDetIndices<<>>( - &tracks_d->hitIndices, hv, &tracks_d->detIndices); - cudaCheck(cudaGetLastError()); -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif -} - template <> void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! 
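// --- illustrative aside (not part of the patch) ------------------------------
// Both the CPU path above and the GPU path below now call kernel_fillNLayers
// right after the tuples are finalized; it caches computeNumberOfLayers() per
// track. The counting simply walks the layer-ordered detector indices and
// counts transitions, so duplicated hits on one layer (e.g. a fishbone pair)
// do not inflate the count. Standalone host-side sketch; getLayer is assumed
// to map a module index to its layer, as phase1PixelTopology::getLayer does:
#include <cstdint>

template <typename F>
int countLayers(uint16_t const* detIndices, int nHits, F getLayer) {
  int nl = 1;  // at least one layer: the first hit's
  auto ol = getLayer(detIndices[0]);
  for (int i = 1; i < nHits; ++i) {
    auto il = getLayer(detIndices[i]);
    if (il != ol)
      ++nl;  // hits are ordered, so each change is a new layer
    ol = il;
  }
  return nl;
}
// e.g. layers {0, 0, 1, 2} give 3 layers for a 4-hit track with a same-layer pair
// -----------------------------------------------------------------------------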
auto *tuples_d = &tracks_d->hitIndices; + auto *detId_d = &tracks_d->detIndices; auto *quality_d = tracks_d->qualityData(); // zero tuples @@ -89,7 +76,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * cudaCheck(cudaGetLastError()); if (params_.doStats_) - kernel_mark_used<<>>(hh.view(), device_theCells_.get(), device_nCells_); + kernel_mark_used<<>>(device_theCells_.get(), device_nCells_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -101,10 +88,15 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize; cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); + kernel_fillHitDetIndices<<>>(tuples_d, hh.view(), detId_d); + cudaCheck(cudaGetLastError()); + kernel_fillNLayers<<>>(tracks_d); + cudaCheck(cudaGetLastError()); + // remove duplicates (tracks that share a doublet) numberOfBlocks = nDoubletBlocks(blockSize); kernel_earlyDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tuples_d, quality_d, params_.dupPassThrough_); + device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); cudaCheck(cudaGetLastError()); blockSize = 128; @@ -254,7 +246,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // mark duplicates (tracks that share a doublet) numberOfBlocks = nDoubletBlocks(blockSize); kernel_fastDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tuples_d, tracks_d, params_.dupPassThrough_); + device_theCells_.get(), device_nCells_, tracks_d, params_.dupPassThrough_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -282,16 +274,10 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // mark duplicates (tracks that share at least one hit) numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; - kernel_rejectDuplicate<<>>(hh.view(), - tuples_d, - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_rejectDuplicate<<>>( + tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); kernel_sharedHitCleaner<<>>(hh.view(), - tuples_d, tracks_d, quality_d, params_.minHitsForSharingCut_, @@ -299,21 +285,11 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA device_hitToTuple_.get()); if (params_.useSimpleTripletCleaner_) { - kernel_simpleTripletCleaner<<>>(hh.view(), - tuples_d, - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_simpleTripletCleaner<<>>( + tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } else { - kernel_tripletCleaner<<>>(hh.view(), - tuples_d, - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_tripletCleaner<<>>( + tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 684a54f5d2ed4..10a02309185a3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -59,7 
+59,7 @@ namespace cAHitNtupletGenerator { uint32_t maxNumberOfDoublets, uint16_t minHitsForSharingCuts, bool useRiemannFit, - bool fit5as4, + bool fitNas4, bool includeJumpingForwardDoublets, bool earlyFishbone, bool lateFishbone, @@ -84,7 +84,7 @@ namespace cAHitNtupletGenerator { maxNumberOfDoublets_(maxNumberOfDoublets), minHitsForSharingCut_(minHitsForSharingCuts), useRiemannFit_(useRiemannFit), - fit5as4_(fit5as4), + fitNas4_(fitNas4), includeJumpingForwardDoublets_(includeJumpingForwardDoublets), earlyFishbone_(earlyFishbone), lateFishbone_(lateFishbone), @@ -109,7 +109,7 @@ namespace cAHitNtupletGenerator { const uint32_t maxNumberOfDoublets_; const uint16_t minHitsForSharingCut_; const bool useRiemannFit_; - const bool fit5as4_; + const bool fitNas4_; const bool includeJumpingForwardDoublets_; const bool earlyFishbone_; const bool lateFishbone_; @@ -185,8 +185,6 @@ class CAHitNtupletGeneratorKernels { void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); - void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream); - void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); void allocateOnGPU(int32_t nHits, cudaStream_t stream); void cleanup(cudaStream_t cudaStream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 418fff8d8d67e..86d7ab8b5a1a6 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -78,9 +78,9 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, } for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { - if (foundNtuplets->size(idx) > 5) + if (foundNtuplets->size(idx) > 7) // current real limit printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); - assert(foundNtuplets->size(idx) < 6); + assert(foundNtuplets->size(idx) <= caConstants::maxHitsOnTrack); for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) assert(int(*ih) < nHits); } @@ -137,12 +137,14 @@ __global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *_ // It does not seem to affect efficiency in any way! 
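// --- illustrative aside (not part of the patch) ------------------------------
// The duplicate removers below now rank the candidates sharing a doublet by
// nLayers instead of raw hit count: with fishbone hits, a 5-hit track may span
// only 3 layers and must not outrank a genuine 4-layer track. A host-side
// analogue of the "keep only the longest" marking; inputs are hypothetical and
// assumed non-empty:
#include <algorithm>
#include <cstdint>
#include <vector>

// nLayers[i]: layer count of candidate i sharing this doublet.
std::vector<bool> markShorterAsDuplicates(std::vector<int8_t> const& nLayers) {
  int8_t maxNl = *std::max_element(nLayers.begin(), nLayers.end());
  std::vector<bool> reject(nLayers.size());
  for (std::size_t i = 0; i < nLayers.size(); ++i)
    reject[i] = nLayers[i] < maxNl;  // same constant written everywhere: no race on GPU
  return reject;
}
// -----------------------------------------------------------------------------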
__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, uint32_t const *__restrict__ nCells, - HitContainer *foundNtuplets, + TkSoA const *__restrict__ ptracks, Quality *quality, bool dupPassThrough) { // quality to mark rejected constexpr auto reject = pixelTrack::Quality::edup; /// cannot be loose + auto const &tracks = *ptracks; + assert(nCells); auto first = threadIdx.x + blockIdx.x * blockDim.x; for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { @@ -150,22 +152,21 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, if (thisCell.tracks().size() < 2) continue; - //if (0==thisCell.theUsed) continue; - // if (thisCell.theDoubletId < 0) continue; - uint32_t maxNh = 0; + int8_t maxNl = 0; - // find maxNh + // find maxNl for (auto it : thisCell.tracks()) { - auto nh = foundNtuplets->size(it); - maxNh = std::max(nh, maxNh); + auto nl = tracks.nLayers(it); + maxNl = std::max(nl, maxNl); } + // if (maxNl<4) continue; // quad pass through (leave it her for tests) - // maxNh = std::min(4U, maxNh); + // maxNl = std::min(4, maxNl); for (auto it : thisCell.tracks()) { - if (foundNtuplets->size(it) < maxNh) + if (tracks.nLayers(it) < maxNl) quality[it] = reject; //no race: simple assignment of the same constant } } @@ -174,7 +175,6 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, // assume the above (so, short tracks already removed) __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, uint32_t const *__restrict__ nCells, - HitContainer const *__restrict__ foundNtuplets, TkSoA *__restrict__ tracks, bool dupPassThrough) { // quality to mark rejected @@ -188,14 +188,13 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, auto const &thisCell = cells[idx]; if (thisCell.tracks().size() < 2) continue; - // if (thisCell.theDoubletId < 0) continue; float mc = maxScore; uint16_t im = tkNotFound; /* chi2 penalize higher-pt tracks (try rescale it?) auto score = [&](auto it) { - return foundNtuplets->size(it) < 4 ? + return tracks->nHits(it) < 4 ? 
std::abs(tracks->tip(it)) : // tip for triplets tracks->chi2(it); //chi2 for quads }; @@ -329,8 +328,8 @@ __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, : dcaCutOuterTriplet, hardCurvCut)) { // FIXME tune cuts oc.addOuterNeighbor(cellIndex, *cellNeighbors); - thisCell.setUsedBit(1); - oc.setUsedBit(1); + thisCell.setStatusBits(GPUCACell::StatusBit::kUsed); + oc.setStatusBits(GPUCACell::StatusBit::kUsed); } } // loop on inner cells } // loop on outer cells @@ -368,14 +367,12 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, } } -__global__ void kernel_mark_used(GPUCACell::Hits const *__restrict__ hhp, - GPUCACell *__restrict__ cells, - uint32_t const *nCells) { +__global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const *nCells) { auto first = threadIdx.x + blockIdx.x * blockDim.x; for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { auto &thisCell = cells[idx]; if (!thisCell.tracks().empty()) - thisCell.setUsedBit(2); + thisCell.setStatusBits(GPUCACell::StatusBit::kInTrack); } } @@ -390,9 +387,9 @@ __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundN if (quality[it] == pixelTrack::Quality::edup) continue; assert(quality[it] == pixelTrack::Quality::bad); - if (nhits > 5) + if (nhits > 7) // current limit printf("wrong mult %d %d\n", it, nhits); - assert(nhits < 8); + assert(nhits <= caConstants::maxHitsOnTrack); tupleMultiplicity->count(nhits); } } @@ -408,9 +405,9 @@ __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNt if (quality[it] == pixelTrack::Quality::edup) continue; assert(quality[it] == pixelTrack::Quality::bad); - if (nhits > 5) + if (nhits > 7) printf("wrong mult %d %d\n", it, nhits); - assert(nhits < 8); + assert(nhits <= caConstants::maxHitsOnTrack); tupleMultiplicity->fill(nhits, it); } } @@ -560,6 +557,17 @@ __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples } } +__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks) { + auto &tracks = *ptracks; + auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) { + auto nHits = tracks.nHits(idx); + if (nHits == 0) + break; // this is a guard: maybe we need to move to nTracks... + tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx); + } +} + __global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ hitToTuple, CAHitNtupletGeneratorKernelsGPU::Counters *counters) { auto &c = *counters; @@ -633,9 +641,7 @@ __global__ void kernel_markSharedHit(int const *__restrict__ nshared, } // mostly for very forward triplets..... -__global__ void kernel_rejectDuplicate(TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, - TkSoA const *__restrict__ ptracks, +__global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks, Quality *__restrict__ quality, uint16_t nmin, bool dupPassThrough, @@ -644,7 +650,6 @@ __global__ void kernel_rejectDuplicate(TrackingRecHit2DSOAView const *__restrict auto const reject = dupPassThrough ? 
pixelTrack::Quality::loose : pixelTrack::Quality::dup;
   auto &hitToTuple = *phitToTuple;
-  auto const &foundNtuplets = *ptuples;
   auto const &tracks = *ptracks;
   int first = blockDim.x * blockIdx.x + threadIdx.x;
@@ -653,12 +658,12 @@
       continue;
     /* chi2 is bad for large pt
-    auto score = [&](auto it, auto nh) {
-      return nh < 4 ? std::abs(tracks.tip(it)) :  // tip for triplets
+    auto score = [&](auto it, auto nl) {
+      return nl < 4 ? std::abs(tracks.tip(it)) :  // tip for triplets
                       tracks.chi2(it);            //chi2
    };
    */
-    auto score = [&](auto it, auto nh) { return std::abs(tracks.tip(it)); };
+    auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); };
     // full combinatorics
     for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
@@ -670,7 +675,7 @@
       auto e2opi = tracks.stateAtBS.covariance(it)(9);
       auto cti = tracks.stateAtBS.state(it)(3);
       auto e2cti = tracks.stateAtBS.covariance(it)(12);
-      auto nhi = foundNtuplets.size(it);
+      auto nli = tracks.nLayers(it);
       for (auto jp = ip + 1; jp != hitToTuple.end(idx); ++jp) {
         auto const jt = *jp;
         auto qj = quality[jt];
@@ -684,8 +689,8 @@
         auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi);
         if ((opi - opj) * (opi - opj) > dop)
           continue;
-        auto nhj = foundNtuplets.size(jt);
-        if (nhj < nhi || (nhj == nhi && (qj < qi || (qj == qi && score(it, nhi) < score(jt, nhj)))))
+        auto nlj = tracks.nLayers(jt);
+        if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj)))))
           quality[jt] = reject;
         else {
           quality[it] = reject;
@@ -697,10 +702,9 @@
 }
 __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp,
-                                        HitContainer const *__restrict__ ptuples,
                                         TkSoA const *__restrict__ ptracks,
                                         Quality *__restrict__ quality,
-                                        uint16_t nmin,
+                                        int nmin,
                                         bool dupPassThrough,
                                         CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
   // quality to mark rejected
@@ -709,8 +713,7 @@
   auto const longTqual = pixelTrack::Quality::highPurity;
   auto &hitToTuple = *phitToTuple;
-  auto const &foundNtuplets = *ptuples;
-  // auto const &tracks = *ptracks;
+  auto const &tracks = *ptracks;
   auto const &hh = *hhp;
   int l1end = hh.hitsLayerStart()[1];
@@ -720,39 +723,38 @@
     if (hitToTuple.size(idx) < 2)
       continue;
-    uint32_t maxNh = 0;
+    int8_t maxNl = 0;
-    // find maxNh
+    // find maxNl
     for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
       if (quality[*it] < longTqual)
         continue;
-      uint32_t nh = foundNtuplets.size(*it);
-      maxNh = std::max(nh, maxNh);
+      // if (tracks.nHits(*it)==3) continue;
+      auto nl = tracks.nLayers(*it);
+      maxNl = std::max(nl, maxNl);
     }
-    if (maxNh < 4)
+    if (maxNl < 4)
       continue;
     // quad pass through (leave for tests)
-    // maxNh = std::min(4U, maxNh);
+    // maxNl = std::min(4, maxNl);
-    // kill all tracks shorter than maxHn (only triplets???
+    // kill all tracks shorter than maxNl (only triplets???
for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - uint32_t nh = foundNtuplets.size(*it); + auto nl = tracks.nLayers(*it); //checking if shared hit is on bpix1 and if the tuple is short enough - if (idx < l1end and nh > nmin) + if (idx < l1end and nl > nmin) continue; - if (nh < maxNh && quality[*it] > reject) + if (nl < maxNl && quality[*it] > reject) quality[*it] = reject; } } } -__global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, - TkSoA const *__restrict__ ptracks, +__global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, Quality *__restrict__ quality, uint16_t nmin, bool dupPassThrough, @@ -763,7 +765,6 @@ __global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict_ auto const good = pixelTrack::Quality::strict; auto &hitToTuple = *phitToTuple; - auto const &foundNtuplets = *ptuples; auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; @@ -773,18 +774,19 @@ __global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict_ float mc = maxScore; uint16_t im = tkNotFound; - uint32_t maxNh = 0; + bool onlyTriplets = true; - // find maxNh + // check if only triplets for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { if (quality[*it] <= good) continue; - uint32_t nh = foundNtuplets.size(*it); - maxNh = std::max(nh, maxNh); + onlyTriplets &= tracks.isTriplet(*it); + if (!onlyTriplets) + break; } // only triplets - if (maxNh != 3) + if (!onlyTriplets) continue; // for triplets choose best tip! (should we first find best quality???) @@ -810,8 +812,6 @@ __global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict_ } __global__ void kernel_simpleTripletCleaner( - TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, TkSoA const *__restrict__ ptracks, Quality *__restrict__ quality, uint16_t nmin, @@ -823,7 +823,6 @@ __global__ void kernel_simpleTripletCleaner( auto const good = pixelTrack::Quality::loose; auto &hitToTuple = *phitToTuple; - auto const &foundNtuplets = *ptuples; auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; @@ -849,7 +848,7 @@ __global__ void kernel_simpleTripletCleaner( // mark worse ambiguities for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] > reject && foundNtuplets.size(it) == 3 && it != im) + if (quality[it] > reject && tracks.isTriplet(it) && it != im) quality[it] = reject; //no race: simple assignment of the same constant } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index bf16e63b27163..d03e5e88b5699 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -63,7 +63,7 @@ CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg.getParameter("maxNumberOfDoublets"), cfg.getParameter("minHitsForSharingCut"), cfg.getParameter("useRiemannFit"), - cfg.getParameter("fit5as4"), + cfg.getParameter("fitNas4"), cfg.getParameter("includeJumpingForwardDoublets"), cfg.getParameter("earlyFishbone"), cfg.getParameter("lateFishbone"), @@ -149,17 +149,17 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc.add("fillStatistics", false); 
desc.add("minHitsPerNtuplet", 4); desc.add("maxNumberOfDoublets", caConstants::maxNumberOfDoublets); - desc.add("minHitsForSharingCut", 5) + desc.add("minHitsForSharingCut", 10) ->setComment("Maximum number of hits in a tuple to clean also if the shared hit is on bpx1"); desc.add("includeJumpingForwardDoublets", false); - desc.add("fit5as4", true); + desc.add("fitNas4", false)->setComment("fit only 4 hits out of N"); desc.add("doClusterCut", true); desc.add("doZ0Cut", true); desc.add("doPtCut", true); desc.add("useRiemannFit", false)->setComment("true for Riemann, false for BrokenLine"); desc.add("doSharedHitCut", true)->setComment("Sharing hit nTuples cleaning"); desc.add("dupPassThrough", false)->setComment("Do not reject duplicate"); - desc.add("useSimpleTripletCleaner", false)->setComment("use alternate implementation"); + desc.add("useSimpleTripletCleaner", true)->setComment("use alternate implementation"); edm::ParameterSetDescription trackQualityCuts; trackQualityCuts.add("chi2MaxPt", 10.)->setComment("max pT used to determine the pT-dependent chi2 cut"); @@ -194,9 +194,8 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH kernels.buildDoublets(hits_d, stream); kernels.launchKernels(hits_d, soa, stream); - kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" - HelixFitOnGPU fitter(bfield, m_params.fit5as4_); + HelixFitOnGPU fitter(bfield, m_params.fitNas4_); fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); @@ -226,13 +225,12 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC kernels.buildDoublets(hits_d, nullptr); kernels.launchKernels(hits_d, soa, nullptr); - kernels.fillHitDetIndices(hits_d.view(), soa, nullptr); // in principle needed only if Hits not "available" if (0 == hits_d.nHits()) return tracks; // now fit - HelixFitOnGPU fitter(bfield, m_params.fit5as4_); + HelixFitOnGPU fitter(bfield, m_params.fitNas4_); fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 50cef5b59cb80..2bcfbf401fb14 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -38,20 +38,21 @@ class GPUCACell { using Quality = pixelTrack::Quality; static constexpr auto bad = pixelTrack::Quality::bad; + enum class StatusBit : uint16_t { kUsed = 1, kInTrack = 2, kKilled = 1 << 15 }; + GPUCACell() = default; __device__ __forceinline__ void init(CellNeighborsVector& cellNeighbors, CellTracksVector& cellTracks, Hits const& hh, int layerPairId, - int doubletId, hindex_type innerHitId, hindex_type outerHitId) { theInnerHitId = innerHitId; theOuterHitId = outerHitId; - theDoubletId_ = doubletId; theLayerPairId_ = layerPairId; - theUsed_ = 0; + theStatus_ = 0; + theFishboneId = std::numeric_limits::max(); // optimization that depends on access pattern theInnerZ = hh.zGlobal(innerHitId); @@ -130,8 +131,7 @@ class GPUCACell { constexpr unsigned int outer_hit_id() const { return theOuterHitId; } __device__ void print_cell() const { - printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: %d \n", - theDoubletId_, + printf("printing cell: on layerPair: %d, innerHitId: %d, 
outerHitId: %d \n", theLayerPairId_, theInnerHitId, theOuterHitId); @@ -287,12 +287,13 @@ class GPUCACell { // the ntuplets is then saved if the number of hits it contains is greater // than a threshold - tmpNtuplet.push_back_unsafe(theDoubletId_); + auto doubletId = this - cells; + tmpNtuplet.push_back_unsafe(doubletId); assert(tmpNtuplet.size() <= 4); bool last = true; for (unsigned int otherCell : outerNeighbors()) { - if (cells[otherCell].theDoubletId_ < 0) + if (cells[otherCell].isKilled()) continue; // killed by earlyFishbone last = false; cells[otherCell].find_ntuplets( @@ -306,13 +307,20 @@ class GPUCACell { ((!startAt0) && hole0(hh, cells[tmpNtuplet[0]]))) #endif { - hindex_type hits[6]; + hindex_type hits[8]; auto nh = 0U; + constexpr int maxFB = 2; // for the time being let's limit this + int nfb = 0; for (auto c : tmpNtuplet) { hits[nh++] = cells[c].theInnerHitId; + if (nfb < maxFB && cells[c].hasFishbone()) { + ++nfb; + hits[nh++] = cells[c].theFishboneId; // fishbone hit is always outer than inner hit + } } + assert(nh < caConstants::maxHitsOnTrack); hits[nh] = theOuterHitId; - auto it = foundNtuplets.bulkFill(apc, hits, tmpNtuplet.size() + 1); + auto it = foundNtuplets.bulkFill(apc, hits, nh + 1); if (it >= 0) { // if negative is overflow.... for (auto c : tmpNtuplet) cells[c].addTrack(it, cellTracks); @@ -326,26 +334,32 @@ class GPUCACell { } // Cell status management - __device__ __forceinline__ void kill() { theDoubletId_ = -1; } - __device__ __forceinline__ bool isKilled() const { return theDoubletId_ < 0; } + __device__ __forceinline__ void kill() { theStatus_ |= uint16_t(StatusBit::kKilled); } + __device__ __forceinline__ bool isKilled() const { return theStatus_ & uint16_t(StatusBit::kKilled); } __device__ __forceinline__ int16_t layerPairId() const { return theLayerPairId_; } - __device__ __forceinline__ bool unused() const { return 0 == theUsed_; } - __device__ __forceinline__ void setUsedBit(uint16_t bit) { theUsed_ |= bit; } + __device__ __forceinline__ bool unused() const { return 0 == (uint16_t(StatusBit::kUsed) & theStatus_); } + __device__ __forceinline__ void setStatusBits(StatusBit mask) { theStatus_ |= uint16_t(mask); } + + __device__ __forceinline__ void setFishbone(hindex_type id) { theFishboneId = id; } + __device__ __forceinline__ auto fishboneId() const { return theFishboneId; } + __device__ __forceinline__ bool hasFishbone() const { + return theFishboneId != std::numeric_limits::max(); + } private: CellNeighbors* theOuterNeighbors; CellTracks* theTracks; - int32_t theDoubletId_; int16_t theLayerPairId_; - uint16_t theUsed_; // tbd + uint16_t theStatus_; // tbd float theInnerZ; float theInnerR; hindex_type theInnerHitId; hindex_type theOuterHitId; + hindex_type theFishboneId; }; template <> diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 938994840f8c0..9a9c85970af33 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -40,7 +40,7 @@ class HelixFitOnGPU { using TupleMultiplicity = caConstants::TupleMultiplicity; - explicit HelixFitOnGPU(float bf, bool fit5as4) : bField_(bf), fit5as4_(fit5as4) {} + explicit HelixFitOnGPU(float bf, bool fitNas4) : bField_(bf), fitNas4_(fitNas4) {} ~HelixFitOnGPU() { deallocateOnGPU(); } void setBField(double bField) { bField_ = bField; } @@ -62,7 +62,7 @@ class HelixFitOnGPU { OutputSoA *outputSoa_; float bField_; - const bool fit5as4_; + const bool 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc
index 491dd0df2004f..13d665b597b13 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc
@@ -61,7 +61,7 @@ void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits
                            circle_fit_resultsGPU,
                            offset);

-    if (fit5as4_) {
+    if (fitNas4_) {
       // penta
       kernel_FastFit<4>(
           tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu
index 90af2ac13730b..d8530bac964c1 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu
@@ -74,7 +74,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv,
           offset);
     cudaCheck(cudaGetLastError());

-    if (fit5as4_) {
+    if (fitNas4_) {
       // penta
       kernel_FastFit<4><<<numberOfBlocks, blockSize, 0, stream>>>(
           tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
index 9a3572df4e313..8eb3faaa677f7 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
@@ -36,7 +36,7 @@ namespace gpuPixelDoublets {
     float x[maxCellsPerHit], y[maxCellsPerHit], z[maxCellsPerHit], n[maxCellsPerHit];
     uint32_t cc[maxCellsPerHit];
     uint16_t d[maxCellsPerHit];
-    // uint8_t l[maxCellsPerHit];
+    uint8_t l[maxCellsPerHit];

     for (int idy = firstY, nt = nHits - offset; idy < nt; idy += gridDim.y * blockDim.y) {
       auto const& vc = isOuterHitOfCell[idy];
@@ -57,6 +57,7 @@ namespace gpuPixelDoublets {
         if (checkTrack && ci.tracks().empty())
           continue;
         cc[sg] = vc[ic];
+        l[sg] = ci.layerPairId();
         d[sg] = ci.inner_detIndex(hh);
         x[sg] = ci.inner_x(hh) - xo;
         y[sg] = ci.inner_y(hh) - yo;
@@ -71,17 +72,29 @@ namespace gpuPixelDoublets {
         auto& ci = cells[cc[ic]];
         for (auto jc = ic + 1; jc < sg; ++jc) {
           auto& cj = cells[cc[jc]];
-          // must be different detectors (in the same layer)
+          // must be different detectors
           // if (d[ic]==d[jc]) continue;
-          //  || l[ic]!=l[jc]) continue;
           auto cos12 = x[ic] * x[jc] + y[ic] * y[jc] + z[ic] * z[jc];
           if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * n[ic] * n[jc]) {
-            // alligned: kill farthest (prefer consecutive layers)
+            // aligned: kill farthest (prefer consecutive layers)
+            // if same layer prefer farthest (longer lever arm) and make space for the intermediate hit
+            bool sameLayer = l[ic] == l[jc];
             if (n[ic] > n[jc]) {
-              ci.kill();
-              break;
+              if (sameLayer) {
+                cj.kill();  // closest
+                ci.setFishbone(cj.inner_hit_id());
+              } else {
+                ci.kill();  // farthest
+                break;
+              }
             } else {
-              cj.kill();
+              if (!sameLayer) {
+                cj.kill();  // farthest
+              } else {
+                ci.kill();  // closest
+                cj.setFishbone(ci.inner_hit_id());
+                break;
+              }
             }
           }
         }  // cj
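The new branch distinguishes aligned doublet pairs by layer: across layers the farther inner hit is killed as before, while on the same layer the closer one is killed but remembered as a fishbone hit on the survivor (the longer lever arm wins). A host-side sketch of just that decision; the Cell struct and resolveAligned are illustrative, with n standing for the squared distance of the inner hit from the shared outer hit:

```cpp
#include <cassert>
#include <cstdint>
#include <limits>

struct Cell {
  uint32_t innerHit;
  uint32_t fishbone = std::numeric_limits<uint32_t>::max();
  bool killed = false;
  void kill() { killed = true; }
  void setFishbone(uint32_t h) { fishbone = h; }
};

// ni, nj: squared inner-hit distances from the shared outer hit (larger = farther).
void resolveAligned(Cell& ci, Cell& cj, float ni, float nj, bool sameLayer) {
  if (ni > nj) {             // i has the longer lever arm
    if (sameLayer) {
      cj.kill();             // closest
      ci.setFishbone(cj.innerHit);
    } else {
      ci.kill();             // farthest
    }
  } else {
    if (sameLayer) {
      ci.kill();             // closest
      cj.setFishbone(ci.innerHit);
    } else {
      cj.kill();             // farthest
    }
  }
}

int main() {
  Cell a{10}, b{11};
  resolveAligned(a, b, 2.f, 1.f, /*sameLayer=*/true);
  assert(b.killed && a.fishbone == 11);  // survivor remembers the killed inner hit
}
```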
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h
index 37970048b3168..80316d24c748b 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h
@@ -222,7 +222,7 @@ namespace gpuPixelDoublets {
           break;
         }  // move to SimpleVector??
         // int layerPairId, int doubletId, int innerHitId, int outerHitId)
-        cells[ind].init(*cellNeighbors, *cellTracks, hh, pairLayerId, ind, i, oi);
+        cells[ind].init(*cellNeighbors, *cellTracks, hh, pairLayerId, i, oi);
         isOuterHitOfCell[oi].push_back(ind);
 #ifdef GPU_DEBUG
         if (isOuterHitOfCell[oi].full())
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
index b8568a7bbcfe9..2defd6f5ba2ca 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
@@ -34,7 +34,7 @@ namespace gpuVertexFinder {
       // initialize soa...
       soa->idv[idx] = -1;

-      if (nHits < 4)
+      if (tracks.isTriplet(idx))
         continue;  // no triplets
       if (quality[idx] < pixelTrack::Quality::highPurity)
         continue;
diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py
index 0b3b973da0d53..d77e589dbee89 100644
--- a/Validation/RecoTrack/python/TrackValidation_cff.py
+++ b/Validation/RecoTrack/python/TrackValidation_cff.py
@@ -1000,10 +1000,10 @@ def _uniqueFirstLayers(layerList):
     cut = cms.string("")
 )

-cutstring = "numberOfValidHits == 3"
+cutstring = "hitPattern.trackerLayersWithMeasurement == 3"
 pixelTracks3hits = trackRefSelector.clone( cut = cutstring )

-cutstring = "numberOfValidHits >= 4"
+cutstring = "hitPattern.trackerLayersWithMeasurement >= 4"
 pixelTracks4hits = trackRefSelector.clone( cut = cutstring )

 cutstring = "pt > 0.9"
@@ -1014,7 +1014,7 @@ def _uniqueFirstLayers(layerList):
 #pixelTracksFromPVPt09 = generalTracksPt09.clone(quality = ["loose","tight","highPurity"], vertexTag = "pixelVertices", src = "pixelTracksFromPV")
 pixelTracksFromPVPt09 = pixelTracksFromPV.clone(ptMin = 0.9)

-cutstring = "numberOfValidHits >= 4"
+cutstring = "hitPattern.trackerLayersWithMeasurement >= 4"
 #pixelTracksFromPV4hits = trackRefSelector.clone( cut = cutstring, src = "pixelTracksFromPV" )
 pixelTracksFromPV4hits = pixelTracksFromPV.clone( numberOfValidPixelHits = 4 )

@@ -1124,7 +1124,7 @@ def _uniqueFirstLayers(layerList):
         tracksPreValidationPixelTrackingOnly.add(locals()[label])
         #-----------
         label = "pixelTracks4hits"+key
-        cutstring = "numberOfValidHits == 4 & qualityMask <= 7 & qualityMask >= " + str(qualityBit)
+        cutstring = "hitPattern.trackerLayersWithMeasurement >= 4 & qualityMask <= 7 & qualityMask >= " + str(qualityBit)
         locals()[label] = trackRefSelector.clone( cut = cutstring )
         tracksPreValidationPixelTrackingOnly.add(locals()[label])
@@ -1134,7 +1134,7 @@ def _uniqueFirstLayers(layerList):
         tracksPreValidationPixelTrackingOnly.add(locals()[label])
         #--------
         label = "pixelTracks3hits"+key
-        cutstring = "numberOfValidHits == 3 & qualityMask <= 7 & qualityMask >= " + str(qualityBit)
+        cutstring = "hitPattern.trackerLayersWithMeasurement == 3 & qualityMask <= 7 & qualityMask >= " + str(qualityBit)
         locals()[label] = trackRefSelector.clone( cut = cutstring )
         tracksPreValidationPixelTrackingOnly.add(locals()[label])
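The vertex-finder cut switches from hit counting to layer counting for the same reason as the validation cuts in TrackValidation_cff.py: once fishbone hits are attached, a triplet can carry 4 or more hits, so nHits < 4 no longer identifies triplets. A small illustrative check; Track and its fields are stand-ins for the SoA accessors:

```cpp
#include <cassert>

// Stand-in for the track SoA: only the two counts relevant to the cut.
struct Track {
  int nHits;
  int nLayers;
  bool isTriplet() const { return nLayers == 3; }
};

int main() {
  Track t{4, 3};         // 4 hits, but only 3 layers (one hit is a fishbone hit)
  assert(t.nHits >= 4);  // the old "nHits < 4" cut would have kept this track
  assert(t.isTriplet()); // the new layer-based cut still rejects it for vertexing
}
```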