From e7f5b13d2e6509470c8aeeba71c783ff87d8f2e9 Mon Sep 17 00:00:00 2001 From: AdrianoDee Date: Thu, 24 Nov 2022 18:34:20 +0100 Subject: [PATCH] Tracks and hits portable (#3) * STUFF that make things work * Use DeviceToDevice copy for cpe params Co-authored-by: Dimitris Papagiannis --- .../interface/SiPixelClustersCUDA.h | 46 +---- .../interface/TrackingRecHitSoADevice.h | 55 ++++-- .../interface/TrackingRecHitSoAHost.h | 30 ++-- .../TrackingRecHit/src/classes_def.xml | 18 +- .../plugins/SiPixelDigisSoAFromCUDA.cc | 36 ++-- .../Configuration/python/HLT_2022v15_cff.py | 2 +- .../Configuration/python/HLT_FULL_cff.py | 2 +- .../Configuration/python/HLT_GRun_cff.py | 2 +- .../Configuration/python/HLT_HIon_cff.py | 2 +- .../Configuration/python/HLT_PRef_cff.py | 2 +- .../Configuration/test/OnLine_HLT_2022v15.py | 2 +- .../Configuration/test/OnLine_HLT_FULL.py | 2 +- .../Configuration/test/OnLine_HLT_GRun.py | 2 +- .../Configuration/test/OnLine_HLT_HIon.py | 2 +- .../Configuration/test/OnLine_HLT_PRef.py | 2 +- .../plugins/PixelRecHitGPUKernel.cu | 30 ++-- .../plugins/SiPixelRecHitCUDA.cc | 12 +- .../plugins/SiPixelRecHitFromCUDA.cc | 15 +- .../plugins/SiPixelRecHitSoAFromLegacy.cc | 165 ++++++++++-------- .../SiPixelRecHits/plugins/gpuPixelRecHits.h | 9 +- .../plugins/PixelTrackProducerFromSoA.cc | 3 +- .../plugins/BrokenLineFitOnGPU.cc | 2 +- .../plugins/BrokenLineFitOnGPU.cu | 28 ++- .../plugins/BrokenLineFitOnGPU.h | 44 +++-- .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 13 +- .../plugins/CAHitNtupletGeneratorKernels.cc | 13 +- .../plugins/CAHitNtupletGeneratorKernels.cu | 53 +++--- .../plugins/CAHitNtupletGeneratorKernels.h | 8 +- .../CAHitNtupletGeneratorKernelsImpl.h | 14 +- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 68 ++++++-- .../plugins/CAHitNtupletGeneratorOnGPU.h | 14 +- .../PixelTriplets/plugins/GPUCACell.h | 50 +++--- .../PixelTriplets/plugins/HelixFitOnGPU.h | 9 +- .../PixelTriplets/plugins/RiemannFitOnGPU.cc | 2 +- 
.../PixelTriplets/plugins/RiemannFitOnGPU.cu | 2 +- .../PixelTriplets/plugins/RiemannFitOnGPU.h | 6 +- .../PixelTriplets/plugins/gpuFishbone.h | 2 +- .../PixelTriplets/plugins/gpuPixelDoublets.h | 2 +- .../plugins/gpuPixelDoubletsAlgos.h | 2 +- 39 files changed, 417 insertions(+), 354 deletions(-) diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h index f83953bbc3e15..0deabe8fb5034 100644 --- a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -21,19 +21,11 @@ using SiPixelClustersCUDASoA = SiPixelClustersCUDALayout<>; class SiPixelClustersCUDA : public cms::cuda::PortableDeviceCollection> { public: SiPixelClustersCUDA() = default; - // explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream); - explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream) - : PortableDeviceCollection>(maxModules + 1, stream) {} ~SiPixelClustersCUDA() = default; - // // Restrict view - // using RestrictConstView = - // Layout::ConstViewTemplate; - // - // RestrictConstView restrictConstView() const { return RestrictConstView(layout()); } + explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream) + : PortableDeviceCollection>(maxModules + 1, stream) {} - // SiPixelClustersCUDA(const SiPixelClustersCUDA &) = delete; - // SiPixelClustersCUDA &operator=(const SiPixelClustersCUDA &) = delete; SiPixelClustersCUDA(SiPixelClustersCUDA &&) = default; SiPixelClustersCUDA &operator=(SiPixelClustersCUDA &&) = default; @@ -45,41 +37,7 @@ class SiPixelClustersCUDA : public cms::cuda::PortableDeviceCollection moduleStart_d; // index of the first pixel of each module - // cms::cuda::device::unique_ptr clusInModule_d; // number of clusters found in each module - // cms::cuda::device::unique_ptr moduleId_d; // module id of each module - // - // // originally from rechits - // 
cms::cuda::device::unique_ptr clusModuleStart_d; // index of the first cluster of each module - // - // cms::cuda::device::unique_ptr view_d; // "me" pointer - uint32_t nClusters_h = 0; int32_t offsetBPIX2_h = 0; }; diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h index 56ed23b0582fd..08cf5fa39bd52 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h @@ -15,19 +15,39 @@ namespace trackingRecHit TrackingRecHitSoADevice() = default; // cms::cuda::Product needs this // Constructor which specifies the SoA size - explicit TrackingRecHitSoADevice(uint32_t nHits, bool isPhase2, int32_t offsetBPIX2, pixelCPEforGPU::ParamsOnGPU const* cpeParams, uint32_t const* hitsModuleStart, cudaStream_t stream) - : PortableDeviceCollection>(nHits, stream), nHits_(nHits), cpeParams_(cpeParams), hitsModuleStart_(hitsModuleStart), offsetBPIX2_(offsetBPIX2) - { - nModules_ = isPhase2 ? 
phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; - phiBinner_ = &(view().phiBinner()); - // phiBinner_ = cms::cuda::make_device_unique(stream).get(); - cudaCheck(cudaMemcpyAsync(&(view().nHits()), &nHits, sizeof(uint32_t),cudaMemcpyHostToDevice,stream)); - cudaCheck(cudaMemcpyAsync(&(view().nMaxModules()), &nModules_, sizeof(uint32_t),cudaMemcpyHostToDevice,stream)); - cudaCheck(cudaMemcpyAsync(view().hitsModuleStart().data(), hitsModuleStart, sizeof(uint32_t) * int(nModules_ + 1),cudaMemcpyHostToDevice,stream)); - // cudaCheck(cudaMemcpyAsync(&(view().cpeParams()), cpeParams, int(sizeof(pixelCPEforGPU::ParamsOnGPU)),cudaMemcpyHostToDevice,stream)); - cudaCheck(cudaMemcpyAsync(&(view().offsetBPIX2()), &offsetBPIX2, sizeof(int32_t),cudaMemcpyHostToDevice,stream)); + explicit TrackingRecHitSoADevice(uint32_t nHits, + bool isPhase2, + int32_t offsetBPIX2, + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream) + : PortableDeviceCollection>(nHits, stream), + nHits_(nHits), + cpeParams_(cpeParams), + hitsModuleStart_(hitsModuleStart), + offsetBPIX2_(offsetBPIX2) { + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); - } + nModules_ = isPhase2 ? 
phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + phiBinner_ = &(view().phiBinner()); + // phiBinner_ = cms::cuda::make_device_unique(stream).get(); + cudaCheck(cudaMemcpyAsync(&(view().nHits()), &nHits, sizeof(uint32_t), cudaMemcpyHostToDevice, stream)); + cudaCheck(cudaMemcpyAsync(&(view().nMaxModules()), &nModules_, sizeof(uint32_t), cudaMemcpyHostToDevice, stream)); + cudaCheck(cudaMemcpyAsync(view().hitsModuleStart().data(), + hitsModuleStart, + sizeof(uint32_t) * int(nModules_ + 1), + cudaMemcpyHostToDevice, + stream)); + cudaCheck( + cudaMemcpyAsync(&(view().offsetBPIX2()), &offsetBPIX2, sizeof(int32_t), cudaMemcpyHostToDevice, stream)); + + // cpeParams argument is a pointer to device memory, copy + // its contents into the Layout. + + cudaCheck(cudaMemcpyAsync( + &(view().cpeParams()), cpeParams, int(sizeof(pixelCPEforGPU::ParamsOnGPU)), cudaMemcpyDeviceToDevice, stream)); + } uint32_t nHits() const { return nHits_; } //go to size of view uint32_t nModules() const { return nModules_; } @@ -35,14 +55,14 @@ namespace trackingRecHit cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const { auto ret = cms::cuda::make_host_unique(4 * nHits(), stream); size_t rowSize = sizeof(float) * nHits(); - printf("%d \n",nModules()); - printf("%d \n",nHits()); - cudaCheck(cudaMemcpyAsync(ret.get(), view().xLocal() , rowSize * 4, cudaMemcpyDeviceToHost, stream)); + printf("nModules=%d \n", nModules()); + printf("nHits=%d \n", nHits()); + cudaCheck(cudaMemcpyAsync(ret.get(), view().xLocal(), rowSize * 4, cudaMemcpyDeviceToHost, stream)); // cudaCheck(cudaMemcpyAsync(ret.get() + rowSize , view().yLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); // cudaCheck(cudaMemcpyAsync(ret.get() + size_t(rowSize * 2), view().xerrLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); // cudaCheck(cudaMemcpyAsync(ret.get() + size_t(rowSize * 3) , view().yerrLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); return ret; - } //move to 
utilities + } //move to utilities cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const { // printf("%d \n",nModules()); @@ -58,7 +78,8 @@ namespace trackingRecHit private: uint32_t nHits_; //Needed for the host SoA size - pixelCPEforGPU::ParamsOnGPU const* cpeParams_; //TODO: this is used not that much from the hits (only once in BrokenLineFit), would make sens to remove it from this class. + pixelCPEforGPU::ParamsOnGPU const* + cpeParams_; //TODO: this is used not that much from the hits (only once in BrokenLineFit), would make sens to remove it from this class. uint32_t const* hitsModuleStart_; uint32_t offsetBPIX2_; diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h index 020ddf4d64bf6..8f7381409837a 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h @@ -20,18 +20,24 @@ namespace trackingRecHit explicit TrackingRecHitSoAHost(uint32_t nHits, cudaStream_t stream) : PortableHostCollection>(nHits, stream) {} - explicit TrackingRecHitSoAHost(uint32_t nHits, bool isPhase2, int32_t offsetBPIX2, pixelCPEforGPU::ParamsOnGPU const* cpeParams, uint32_t const* hitsModuleStart, cudaStream_t stream) - : PortableHostCollection>(nHits, stream), nHits_(nHits), cpeParams_(cpeParams), offsetBPIX2_(offsetBPIX2) - { - nModules_ = isPhase2 ? 
phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; - - view().nHits() = nHits; - view().nMaxModules() = nModules_; - std::copy(hitsModuleStart,hitsModuleStart+nModules_+1,view().hitsModuleStart().begin()); - - view().offsetBPIX2() = offsetBPIX2; - - } + explicit TrackingRecHitSoAHost(uint32_t nHits, + bool isPhase2, + int32_t offsetBPIX2, + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream) + : PortableHostCollection>(nHits, stream), + nHits_(nHits), + cpeParams_(cpeParams), + offsetBPIX2_(offsetBPIX2) { + nModules_ = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + std::cout << "PORCA MADONNA!!!!!!!!!!!!!!!!!" << std::endl; + view().nHits() = nHits; + view().nMaxModules() = nModules_; + std::copy(hitsModuleStart, hitsModuleStart + nModules_ + 1, view().hitsModuleStart().begin()); + memcpy(&(view().cpeParams()), cpeParams, sizeof(pixelCPEforGPU::ParamsOnGPU)); + view().offsetBPIX2() = offsetBPIX2; + } uint32_t nHits() const { return nHits_; } uint32_t nModules() const { return nModules_; } diff --git a/CUDADataFormats/TrackingRecHit/src/classes_def.xml b/CUDADataFormats/TrackingRecHit/src/classes_def.xml index b0589a5a46d82..9bb7984550377 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes_def.xml +++ b/CUDADataFormats/TrackingRecHit/src/classes_def.xml @@ -1,15 +1,17 @@ - - - - - - - - + + + + + + + + + + diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc index ab47d396884f3..cda526f513c9e 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -1,4 +1,5 @@ #include "CUDADataFormats/Common/interface/Product.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include 
"DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -27,7 +28,7 @@ class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer> digiGetToken_; edm::EDPutTokenT digiPutToken_; - cms::cuda::host::unique_ptr store_; + cms::cuda::PortableHostCollection> digis_h; int nDigis_; }; @@ -48,31 +49,22 @@ void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, // Do the transfer in a CUDA stream parallel to the computation CUDA stream cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; - const auto& gpuDigis = ctx.get(iEvent, digiGetToken_); + const auto& digis_d = ctx.get(iEvent, digiGetToken_); - nDigis_ = gpuDigis.nDigis(); - store_ = gpuDigis.copyAllToHostAsync(ctx.stream()); + nDigis_ = digis_d.nDigis(); + digis_h = cms::cuda::PortableHostCollection>(digis_d.view().metadata().size(), ctx.stream()); + cudaCheck(cudaMemcpyAsync( + digis_h.buffer().get(), digis_d.const_buffer().get(), digis_d.bufferSize(), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck(cudaGetLastError()); } void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - // The following line copies the data from the pinned host memory to - // regular host memory. In principle that feels unnecessary (why not - // just use the pinned host memory?). There are a few arguments for - // doing it though - // - Now can release the pinned host memory back to the (caching) allocator - // * if we'd like to keep the pinned memory, we'd need to also - // keep the CUDA stream around as long as that, or allow pinned - // host memory to be allocated without a CUDA stream - // - What if a CPU algorithm would produce the same SoA? We can't - // use cudaMallocHost without a GPU... 
- - // auto tmp_view = SiPixelDigisCUDASOAView(store_, nDigis_, SiPixelDigisCUDASOAView::StorageLocationHost::kMAX); - SiPixelDigisCUDASOA tmp_layout(store_.get(), nDigis_); - SiPixelDigisCUDASOA::View tmp_view(tmp_layout); - - iEvent.emplace(digiPutToken_, nDigis_, tmp_view.pdigi(), tmp_view.rawIdArr(), tmp_view.adc(), tmp_view.clus()); - - store_.reset(); + iEvent.emplace(digiPutToken_, + nDigis_, + digis_h.view().pdigi(), + digis_h.view().rawIdArr(), + digis_h.view().adc(), + digis_h.view().clus()); } // define as framework plugin diff --git a/HLTrigger/Configuration/python/HLT_2022v15_cff.py b/HLTrigger/Configuration/python/HLT_2022v15_cff.py index c23f766e062ad..8b64caf12cd86 100644 --- a/HLTrigger/Configuration/python/HLT_2022v15_cff.py +++ b/HLTrigger/Configuration/python/HLT_2022v15_cff.py @@ -84802,7 +84802,7 @@ fragment.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/python/HLT_FULL_cff.py b/HLTrigger/Configuration/python/HLT_FULL_cff.py index a1651875c2977..9ec4c92e62855 100644 --- a/HLTrigger/Configuration/python/HLT_FULL_cff.py +++ b/HLTrigger/Configuration/python/HLT_FULL_cff.py @@ -121310,7 +121310,7 @@ fragment.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/python/HLT_GRun_cff.py b/HLTrigger/Configuration/python/HLT_GRun_cff.py index 05dc2f47e700b..522f518ca31ce 100644 --- a/HLTrigger/Configuration/python/HLT_GRun_cff.py +++ 
b/HLTrigger/Configuration/python/HLT_GRun_cff.py @@ -85517,7 +85517,7 @@ fragment.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/python/HLT_HIon_cff.py b/HLTrigger/Configuration/python/HLT_HIon_cff.py index adec53755ea21..5cfb325352d36 100644 --- a/HLTrigger/Configuration/python/HLT_HIon_cff.py +++ b/HLTrigger/Configuration/python/HLT_HIon_cff.py @@ -25503,7 +25503,7 @@ fragment.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/python/HLT_PRef_cff.py b/HLTrigger/Configuration/python/HLT_PRef_cff.py index 1f2eee46fbd4f..2bf1b5a4837d9 100644 --- a/HLTrigger/Configuration/python/HLT_PRef_cff.py +++ b/HLTrigger/Configuration/python/HLT_PRef_cff.py @@ -9248,7 +9248,7 @@ fragment.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/test/OnLine_HLT_2022v15.py b/HLTrigger/Configuration/test/OnLine_HLT_2022v15.py index 6ad31948f5bfc..06dea8df2ef0a 100644 --- a/HLTrigger/Configuration/test/OnLine_HLT_2022v15.py +++ b/HLTrigger/Configuration/test/OnLine_HLT_2022v15.py @@ -85070,7 +85070,7 @@ process.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = 
cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/test/OnLine_HLT_FULL.py b/HLTrigger/Configuration/test/OnLine_HLT_FULL.py index 6cb2600166b88..681f518b7daed 100644 --- a/HLTrigger/Configuration/test/OnLine_HLT_FULL.py +++ b/HLTrigger/Configuration/test/OnLine_HLT_FULL.py @@ -121947,7 +121947,7 @@ process.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/test/OnLine_HLT_GRun.py b/HLTrigger/Configuration/test/OnLine_HLT_GRun.py index a74141d51598f..9c998d956dadc 100644 --- a/HLTrigger/Configuration/test/OnLine_HLT_GRun.py +++ b/HLTrigger/Configuration/test/OnLine_HLT_GRun.py @@ -85785,7 +85785,7 @@ process.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/test/OnLine_HLT_HIon.py b/HLTrigger/Configuration/test/OnLine_HLT_HIon.py index 6ab093bb88ae7..46c253d18521c 100644 --- a/HLTrigger/Configuration/test/OnLine_HLT_HIon.py +++ b/HLTrigger/Configuration/test/OnLine_HLT_HIon.py @@ -25771,7 +25771,7 @@ process.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), 
+ cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/test/OnLine_HLT_PRef.py b/HLTrigger/Configuration/test/OnLine_HLT_PRef.py index f4bab18c1039b..d827ae488507c 100644 --- a/HLTrigger/Configuration/test/OnLine_HLT_PRef.py +++ b/HLTrigger/Configuration/test/OnLine_HLT_PRef.py @@ -9516,7 +9516,7 @@ process.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index f32a36ab2e58b..fe65229867359 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -35,20 +35,27 @@ namespace { namespace pixelgpudetails { - trackingRecHit::TrackingRecHitSoADevice PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, - SiPixelClustersCUDA const& clusters_d, - BeamSpotCUDA const& bs_d, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, - bool isPhase2, - cudaStream_t stream) const { + trackingRecHit::TrackingRecHitSoADevice PixelRecHitGPUKernel::makeHitsAsync( + SiPixelDigisCUDA const& digis_d, + SiPixelClustersCUDA const& clusters_d, + BeamSpotCUDA const& bs_d, + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + bool isPhase2, + cudaStream_t stream) const { auto nHits = clusters_d.nClusters(); - trackingRecHit::TrackingRecHitSoADevice hits_d(nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d->clusModuleStart(), stream); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + trackingRecHit::TrackingRecHitSoADevice hits_d( + nHits, 
isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d->clusModuleStart(), stream); // TrackingRecHit2DGPU hits_d( // nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); // assert(hits_d.nMaxModules() == isPhase2 ? phase2PixelTopology::numberOfModules // : phase1PixelTopology::numberOfModules); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); int activeModulesWithDigis = digis_d.nModules(); // protect from empty events if (activeModulesWithDigis) { @@ -67,7 +74,8 @@ namespace pixelgpudetails { // assuming full warp of threads is better than a smaller number... if (nHits) { - setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d->clusModuleStart(), cpeParams, hits_d.view().hitsLayerStart().data()); + setHitsLayerStart<<<1, 32, 0, stream>>>( + clusters_d->clusModuleStart(), cpeParams, hits_d.view().hitsLayerStart().data()); cudaCheck(cudaGetLastError()); auto nLayers = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; cms::cuda::fillManyFromVector(hits_d.phiBinner(), @@ -85,10 +93,10 @@ namespace pixelgpudetails { #endif } } - #ifdef GPU_DEBUG +#ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); - std::cout << "DONE" << std::endl; - #endif + std::cout << "DONE" << std::endl; +#endif return hits_d; } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc index 5287718bdac05..342ebafe0e6d5 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc @@ -66,26 +66,26 @@ void SiPixelRecHitCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, cons if (not fcpe) { throw cms::Exception("Configuration") << "SiPixelRecHitSoAFromLegacy can only use a CPE of type PixelCPEFast"; } - + std::cout << "SiPixelRecHitCUDA::produce " << __LINE__ << std::endl; edm::Handle> hclusters; iEvent.getByToken(token_, hclusters); - + 
std::cout << "SiPixelRecHitCUDA::produce " << __LINE__ << std::endl; cms::cuda::ScopedContextProduce ctx{*hclusters}; auto const& clusters = ctx.get(*hclusters); - + std::cout << "SiPixelRecHitCUDA::produce " << __LINE__ << std::endl; edm::Handle> hdigis; iEvent.getByToken(tokenDigi_, hdigis); auto const& digis = ctx.get(*hdigis); - + std::cout << "SiPixelRecHitCUDA::produce " << __LINE__ << std::endl; edm::Handle> hbs; iEvent.getByToken(tBeamSpot, hbs); auto const& bs = ctx.get(*hbs); - + std::cout << "SiPixelRecHitCUDA::produce " << __LINE__ << std::endl; ctx.emplace(iEvent, tokenHit_, gpuAlgo_.makeHitsAsync( digis, clusters, bs, fcpe->getGPUProductAsync(ctx.stream()), fcpe->isPhase2(), ctx.stream())); - std::cout << __LINE__< const& inputDataWrapped = iEvent.get(hitsToken_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); - std::cout << __LINE__<getCPUProduct(); - + std::cout << "GAMIESAI 0" << std::endl; const reco::BeamSpot& bs = iEvent.get(bsGetToken_); BeamSpotPOD bsHost; @@ -105,15 +106,14 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv // allocate a buffer for the indices of the clusters auto hmsp = std::make_unique(nMaxModules + 1); // hitsModuleStart is a non-owning pointer to the buffer - auto hitsModuleStart = hmsp.get(); + // auto hitsModuleStart = hmsp.get(); // wrap the buffer in a HostProduct auto hms = std::make_unique(std::move(hmsp)); // move the HostProduct to the Event, without reallocating the buffer or affecting hitsModuleStart iEvent.put(tokenModuleStart_, std::move(hms)); - + std::cout << "GAMIESAI 1" << std::endl; // legacy output auto legacyOutput = std::make_unique(); - // storage std::vector xx; std::vector yy; @@ -125,19 +125,21 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv constexpr uint32_t maxHitsInModule = gpuClustering::maxHitsInModule(); - HitModuleStart moduleStart_; 
// index of the first pixel of each module - HitModuleStart clusInModule_; - memset(&clusInModule_, 0, sizeof(HitModuleStart)); // needed?? - memset(&moduleStart_, 0, sizeof(HitModuleStart)); - assert(gpuClustering::maxNumModules + 1 == clusInModule_.size()); - assert(0 == clusInModule_[gpuClustering::maxNumModules]); - uint32_t moduleId_; - moduleStart_[1] = 0; // we run sequentially.... - - // SiPixelClustersCUDA::SiPixelClustersCUDASOAView clusterView{ - // moduleStart_.data(), clusInModule_.data(), &moduleId_, hitsModuleStart}; - SiPixelClustersCUDASoA::ConstView clusterView{ - gpuClustering::maxNumModules + 1, moduleStart_.data(), clusInModule_.data(), &moduleId_, hitsModuleStart}; + // HitModuleStart moduleStart_; // index of the first pixel of each module + // HitModuleStart clusInModule_; + // memset(&clusInModule_, 0, sizeof(HitModuleStart)); // needed?? + // memset(&moduleStart_, 0, sizeof(HitModuleStart)); + + std::cout << "GAMIESAI 2" << std::endl; + cms::cuda::PortableHostCollection> clusters_h(gpuClustering::maxNumModules + 1, nullptr); + + memset(clusters_h.view().clusInModule(), 0, sizeof(HitModuleStart)); // needed?? + memset(clusters_h.view().moduleStart(), 0, sizeof(HitModuleStart)); + + assert(0 == clusters_h.view()[gpuClustering::maxNumModules].clusInModule()); + + clusters_h.view()[1].moduleStart() = 0; // we run sequentially.... 
+ // fill cluster arrays int numberOfClusters = 0; for (auto const& dsv : input) { @@ -147,25 +149,32 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv auto gind = genericDet->index(); assert(gind < nMaxModules); auto const nclus = dsv.size(); - clusInModule_[gind] = nclus; + clusters_h.view()[gind].clusInModule() = nclus; numberOfClusters += nclus; } - hitsModuleStart[0] = 0; + clusters_h.view()[0].clusModuleStart() = 0; - for (int i = 1, n = nMaxModules + 1; i < n; ++i) - hitsModuleStart[i] = hitsModuleStart[i - 1] + clusInModule_[i - 1]; + for (int i = 1, n = nMaxModules + 1; i < n; ++i) { + clusters_h.view()[i].clusModuleStart() = + clusters_h.view()[i - 1].clusModuleStart() + clusters_h.view()[i - 1].clusInModule(); + } - assert(numberOfClusters == int(hitsModuleStart[nMaxModules])); + assert((uint32_t)numberOfClusters == clusters_h.view()[nMaxModules].clusModuleStart()); // output SoA // element 96 is the start of BPIX2 (i.e. the number of clusters in BPIX1) - auto output = std::make_unique( - numberOfClusters, isPhase2_, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, nullptr); - assert(output->nModules() == uint32_t(nMaxModules)); + TrackingRecHitSoAHost output = TrackingRecHitSoAHost(numberOfClusters, + isPhase2_, + clusters_h.view()[startBPIX2].clusModuleStart(), + &cpeView, + clusters_h.view().clusModuleStart(), + nullptr); + std::cout << "GAMIESAI 3" << std::endl; + assert(output.nModules() == uint32_t(nMaxModules)); if (0 == numberOfClusters) { - iEvent.put(std::move(output)); + iEvent.emplace(tokenHit_, std::move(output)); if (convert2Legacy_) iEvent.put(std::move(legacyOutput)); return; @@ -173,11 +182,12 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv if (convert2Legacy_) legacyOutput->reserve(nMaxModules, numberOfClusters); - + std::cout << "GAMIESAI 4" << std::endl; int numberOfDetUnits = 0; int numberOfHits = 0; for (auto const& dsv : input) { numberOfDetUnits++; + 
std::cout << "GAMIESAI 4a_" << numberOfDetUnits << std::endl; unsigned int detid = dsv.detId(); DetId detIdObject(detid); const GeomDetUnit* genericDet = geom_->idToDetUnit(detIdObject); @@ -186,12 +196,14 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv const PixelGeomDetUnit* pixDet = dynamic_cast(genericDet); assert(pixDet); auto const nclus = dsv.size(); - assert(clusInModule_[gind] == nclus); + assert(clusters_h.view()[gind].clusInModule() == nclus); if (0 == nclus) continue; // is this really possible? - - auto const fc = hitsModuleStart[gind]; - auto const lc = hitsModuleStart[gind + 1]; + std::cout << "GAMIESAI 4b_" << numberOfDetUnits << std::endl; + auto const fc = clusters_h.view()[gind].clusModuleStart(); + std::cout << "GAMIESAI 4c_" << numberOfDetUnits << std::endl; + auto const lc = clusters_h.view()[gind + 1].clusModuleStart(); + std::cout << "GAMIESAI 4d_" << numberOfDetUnits << std::endl; assert(lc > fc); LogDebug("SiPixelRecHitSoAFromLegacy") << "in det " << gind << ": conv " << nclus << " hits from " << dsv.size() << " legacy clusters" << ' ' << fc << ',' << lc; @@ -201,55 +213,60 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv "WARNING: too many clusters %d in Module %d. 
Only first %d Hits converted\n", nclus, gind, maxHitsInModule); // fill digis - xx.clear(); - yy.clear(); - adc.clear(); - moduleInd.clear(); - clus.clear(); + uint32_t ndigi = 0; + for (auto const& clust : dsv) { + assert(clust.size() > 0); + for (int i = 0, nd = clust.size(); i < nd; ++i) { + ndigi++; + } + } + std::cout << "ndigi=" << ndigi << std::endl; + cms::cuda::PortableHostCollection> digis_h(ndigi, nullptr); + // xx.clear(); + // yy.clear(); + // adc.clear(); + // moduleInd.clear(); + // clus.clear(); clusterRef.clear(); - moduleId_ = gind; + // moduleId_ = gind; + clusters_h.view()[0].moduleId() = gind; uint32_t ic = 0; - uint32_t ndigi = 0; + ndigi = 0; for (auto const& clust : dsv) { assert(clust.size() > 0); for (int i = 0, nd = clust.size(); i < nd; ++i) { auto px = clust.pixel(i); - xx.push_back(px.x); - yy.push_back(px.y); - adc.push_back(px.adc); - moduleInd.push_back(gind); - clus.push_back(ic); + digis_h.view()[ndigi].xx() = px.x; + digis_h.view()[ndigi].yy() = px.y; + digis_h.view()[ndigi].adc() = px.adc; + digis_h.view()[ndigi].moduleId() = gind; + digis_h.view()[ndigi].clus() = ic; ++ndigi; } + std::cout << "GAMIESAI 4g_" << numberOfDetUnits << std::endl; if (convert2Legacy_) clusterRef.emplace_back(edmNew::makeRefTo(hclusters, &clust)); ic++; } assert(nclus == ic); - assert(clus.size() == ndigi); + // assert(clus.size() == ndigi); numberOfHits += nclus; - // filled creates view - // SiPixelDigisCUDASOAView digiView; - // digiView.xx_ = xx.data(); - // digiView.yy_ = yy.data(); - // digiView.adc_ = adc.data(); - // digiView.moduleInd_ = moduleInd.data(); - // digiView.clus_ = clus.data(); - // digiView.pdigi_ = nullptr; - // digiView.rawIdArr_ = nullptr; - // assert(digiView.adc(0) != 0); - SiPixelDigisCUDASOAConstView digiView( - ndigi, clus.data(), nullptr, nullptr, adc.data(), xx.data(), yy.data(), moduleInd.data()); - assert(digiView[0].adc() != 0); + + // SiPixelDigisCUDASOAConstView digiView( + // ndigi, clus.data(), nullptr, nullptr, 
adc.data(), xx.data(), yy.data(), moduleInd.data()); + assert(digis_h.view()[0].adc() != 0); + // we run on blockId.x==0 - gpuPixelRecHits::getHits(&cpeView, &bsHost, digiView, ndigi, clusterView, output->view()); + std::cout << "GAMIESAI 4h_" << numberOfDetUnits << std::endl; + gpuPixelRecHits::getHits(&cpeView, &bsHost, digis_h.view(), ndigi, clusters_h.view(), output.view()); + std::cout << "GAMIESAI 4i_" << numberOfDetUnits << std::endl; // gpuPixelRecHits::getHits(&cpeView, &bsHost, digiView, ndigi, &clusterView, output->view()); for (auto h = fc; h < lc; ++h) if (h - fc < maxHitsInModule) - assert(gind == output->view()[h].detectorIndex()); + assert(gind == output.view()[h].detectorIndex()); else - assert(gpuClustering::invalidModuleId == output->view()[h].detectorIndex()); + assert(gpuClustering::invalidModuleId == output.view()[h].detectorIndex()); if (convert2Legacy_) { SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(*legacyOutput, detid); for (auto h = fc; h < lc; ++h) { @@ -258,38 +275,38 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv if (ih >= maxHitsInModule) break; assert(ih < clusterRef.size()); - LocalPoint lp(output->view()[h].xLocal(), output->view()[h].yLocal()); - LocalError le(output->view()[h].xerrLocal(), 0, output->view()[h].yerrLocal()); + LocalPoint lp(output.view()[h].xLocal(), output.view()[h].yLocal()); + LocalError le(output.view()[h].xerrLocal(), 0, output.view()[h].yerrLocal()); SiPixelRecHitQuality::QualWordType rqw = 0; SiPixelRecHit hit(lp, le, rqw, *genericDet, clusterRef[ih]); recHitsOnDetUnit.push_back(hit); } } } - + std::cout << "GAMIESAI 5" << std::endl; assert(numberOfHits == numberOfClusters); // fill data structure to support CA const auto nLayers = isPhase2_ ? 
phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; for (auto i = 0U; i < nLayers + 1; ++i) { - output->view().hitsLayerStart()[i] = hitsModuleStart[cpeView.layerGeometry().layerStart[i]]; + std::cout << "GAMIESAI 6_" << i << std::endl; + output.view().hitsLayerStart()[i] = clusters_h.view()[cpeView.layerGeometry().layerStart[i]].clusModuleStart(); LogDebug("SiPixelRecHitSoAFromLegacy") << "Layer n." << i << " - starting at module: " << cpeView.layerGeometry().layerStart[i] - << " - starts ad cluster: " << output->view()[i].hitsLayerStart() << "\n"; + << " - starts ad cluster: " << output.view()[i].hitsLayerStart() << "\n"; } - - cms::cuda::fillManyFromVector(&(output->view().phiBinner()), + std::cout << "GAMIESAI 7" << std::endl; + cms::cuda::fillManyFromVector(&(output.view().phiBinner()), nLayers, - output->view().iphi(), - output->view().hitsLayerStart().data(), - output->nHits(), + output.view().iphi(), + output.view().hitsLayerStart().data(), + output.view().nHits(), 256, - output->phiBinnerStorage()); - - + output.view().phiBinnerStorage()); + std::cout << "GAMIESAI 8" << std::endl; LogDebug("SiPixelRecHitSoAFromLegacy") << "created HitSoa for " << numberOfClusters << " clusters in " << numberOfDetUnits << " Dets"; - iEvent.put(std::move(output)); + iEvent.emplace(tokenHit_, std::move(output)); if (convert2Legacy_) iEvent.put(std::move(legacyOutput)); } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index 7c8145cf7ebf1..566fb2629a3af 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -20,10 +20,9 @@ namespace gpuPixelRecHits { __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams, BeamSpotPOD const* __restrict__ bs, // SiPixelDigisCUDASOAView const digis - SiPixelDigisCUDASOAConstView const digis, + SiPixelDigisCUDASOAConstView digis, int 
numElements, - // SiPixelClustersCUDA::SiPixelClustersCUDASOAView const* __restrict__ pclusters, - SiPixelClustersCUDASoA::ConstView const __restrict__ clusters, + SiPixelClustersCUDASoA::ConstView clusters, trackingRecHitSoA::HitSoAView hits) { // FIXME // the compiler seems NOT to optimize loads from views (even in a simple test case) @@ -82,8 +81,8 @@ namespace gpuPixelRecHits { #endif #ifdef GPU_DEBUG // if (me % 100 == 1) - if (threadIdx.x == 0) - printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters.clusModuleStart(me)); + if (threadIdx.x == 0) + printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters.clusModuleStart(me)); #endif for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) { int nClusInIter = std::min(MaxHitsInIter, endClus - startClus); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index 36d3dd8c3dcc7..9db4d09d16f05 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -134,7 +134,7 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, auto const &rcs = rechits.data(); auto nhits = rcs.size(); hitmap.resize(nhits, nullptr); - + std::cout << "nHits = " << nhits << std::endl; auto const *hitsModuleStart = iEvent.get(hmsToken_).get(); auto fc = hitsModuleStart; @@ -144,6 +144,7 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, auto const &clus = thit.firstClusterRef(); assert(clus.isPixel()); auto i = fc[detI] + clus.pixelCluster().originalId(); + std::cout << "i = " << i << std::endl; if (i >= hitmap.size()) hitmap.resize(i + 256, nullptr); // only in case of hit overflow in one module assert(nullptr == hitmap[i]); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc 
b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc index eb4de1485addd..c4ebebc8c6039 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc @@ -1,6 +1,6 @@ #include "BrokenLineFitOnGPU.h" -void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitSoAConstView const &hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { +void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitSoAConstView hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { assert(tuples_); #ifdef BROKENLINE_DEBUG diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index 82bccff64ff7e..e9a3be07f3f8f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -1,7 +1,7 @@ #include "BrokenLineFitOnGPU.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, +void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cudaStream_t stream) { @@ -21,6 +21,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets + std::cout << "BITCH 0_" << offset << std::endl; kernel_BLFastFit<3><<>>(tuples_, tupleMultiplicity_, hv, @@ -32,7 +33,8 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, 3, offset); cudaCheck(cudaGetLastError()); - + cudaCheck(cudaDeviceSynchronize()); + std::cout << "BITCH 1_" << offset << std::endl; kernel_BLFit<3><<>>(tupleMultiplicity_, bField_, outputSoa_, @@ -41,9 +43,11 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, hits_geGPU.get(), fast_fit_resultsGPU.get()); cudaCheck(cudaGetLastError()); - + 
cudaCheck(cudaDeviceSynchronize()); + std::cout << "BITCH 2_" << offset << std::endl; if (fitNas4_) { // fit all as 4 + std::cout << "BITCH 2a_" << offset << std::endl; kernel_BLFastFit<4><<>>(tuples_, tupleMultiplicity_, hv, @@ -55,7 +59,8 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, 8, offset); cudaCheck(cudaGetLastError()); - + cudaCheck(cudaDeviceSynchronize()); + std::cout << "BITCH 3_" << offset << std::endl; kernel_BLFit<4><<>>(tupleMultiplicity_, bField_, outputSoa_, @@ -63,8 +68,10 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get()); + std::cout << "BITCH 4_" << offset << std::endl; } else { // fit quads + std::cout << "BITCH 2b_" << offset << std::endl; kernel_BLFastFit<4><<>>(tuples_, tupleMultiplicity_, hv, @@ -76,7 +83,8 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, 4, offset); cudaCheck(cudaGetLastError()); - + cudaCheck(cudaDeviceSynchronize()); + std::cout << "BITCH 5_" << offset << std::endl; kernel_BLFit<4><<>>(tupleMultiplicity_, bField_, outputSoa_, @@ -84,6 +92,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get()); + std::cout << "BITCH 6_" << offset << std::endl; // fit penta (all 5) kernel_BLFastFit<5><<>>(tuples_, tupleMultiplicity_, @@ -96,7 +105,8 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, 5, offset); cudaCheck(cudaGetLastError()); - + cudaCheck(cudaDeviceSynchronize()); + std::cout << "BITCH 7_" << offset << std::endl; kernel_BLFit<5><<<8, blockSize, 0, stream>>>(tupleMultiplicity_, bField_, outputSoa_, @@ -105,6 +115,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, hits_geGPU.get(), fast_fit_resultsGPU.get()); cudaCheck(cudaGetLastError()); + std::cout << "BITCH 8_" << offset << std::endl; // fit sexta and above (as 6) kernel_BLFastFit<6><<<4, blockSize, 0, 
stream>>>(tuples_, tupleMultiplicity_, @@ -117,7 +128,8 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, 8, offset); cudaCheck(cudaGetLastError()); - + cudaCheck(cudaDeviceSynchronize()); + std::cout << "BITCH 9_" << offset << std::endl; kernel_BLFit<6><<<4, blockSize, 0, stream>>>(tupleMultiplicity_, bField_, outputSoa_, @@ -126,6 +138,8 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, hits_geGPU.get(), fast_fit_resultsGPU.get()); cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + std::cout << "BITCH 10_" << offset << std::endl; } } // loop on concurrent fits diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index a8d4be7ca969d..6030019778f91 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -27,7 +27,7 @@ constexpr auto invalidTkId = std::numeric_limits::max(); template __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - HitSoAConstView const &__restrict__ hh, + HitSoAConstView hh, tindex_type *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, @@ -35,6 +35,8 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, uint32_t nHitsL, uint32_t nHitsH, int32_t offset) { + // look in bin for this hit multiplicity + auto local_start = blockIdx.x * blockDim.x + threadIdx.x; constexpr uint32_t hitsInFit = N; assert(hitsInFit <= nHitsL); @@ -45,18 +47,17 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, assert(foundNtuplets); assert(tupleMultiplicity); - // look in bin for this hit multiplicity - auto local_start = blockIdx.x * blockDim.x + threadIdx.x; int totTK = tupleMultiplicity->end(nHitsH) - tupleMultiplicity->begin(nHitsL); + assert(totTK <= 
int(tupleMultiplicity->size())); assert(totTK >= 0); -#ifdef BROKENLINE_DEBUG + // #ifdef BROKENLINE_DEBUG if (0 == local_start) { printf("%d total Ntuple\n", tupleMultiplicity->size()); printf("%d Ntuple of size %d/%d for %d hits to fit\n", totTK, nHitsL, nHitsH, hitsInFit); } -#endif + // #endif for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { @@ -65,6 +66,7 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, ptkids[local_idx] = invalidTkId; break; } + // get it from the ntuple container (one to one to helix) auto tkid = *(tupleMultiplicity->begin(nHitsL) + tuple_idx); assert(tkid < foundNtuplets->nOnes()); @@ -92,10 +94,11 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, // #define YERR_FROM_DC #ifdef YERR_FROM_DC + // try to compute more precise error in y - auto dx = hh.xGlobal(hitId[hitsInFit - 1]) - hh.xGlobal(hitId[0]); - auto dy = hh.yGlobal(hitId[hitsInFit - 1]) - hh.yGlobal(hitId[0]); - auto dz = hh.zGlobal(hitId[hitsInFit - 1]) - hh.zGlobal(hitId[0]); + auto dx = hh[hitId[hitsInFit - 1]].xGlobal() - hh[hitId[0]].xGlobal(); + auto dy = hh[hitId[hitsInFit - 1]].yGlobal() - hh[hitId[0]].yGlobal(); + auto dz = hh[hitId[hitsInFit - 1]].zGlobal() - hh[hitId[0]].zGlobal(); float ux, uy, uz; #endif @@ -111,12 +114,14 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, float ge[6]; #ifdef YERR_FROM_DC + auto const &dp = hh.cpeParams().detParams(hh.detectorIndex(hit)); - auto status = hh.status(hit); + auto status = hh[hit].status(); int qbin = CPEFastParametrisation::kGenErrorQBins - 1 - status.qBin; assert(qbin >= 0 && qbin < 5); bool nok = (status.isBigY | status.isOneY); // compute cotanbeta and use it to recompute error + dp.frame.rotation().multiply(dx, dy, dz, ux, uy, uz); auto cb = std::abs(uy / uz); int bin = @@ -125,16 +130,17 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ 
foundNtuplets, int high_value = CPEFastParametrisation::kNumErrorBins - 1; // return estimated bin value truncated to [0, 15] bin = std::clamp(bin, low_value, high_value); + float yerr = dp.sigmay[bin] * 1.e-4f; // toCM yerr *= dp.yfact[qbin]; // inflate yerr *= yerr; yerr += dp.apeYY; - yerr = nok ? hh.yerrLocal(hit) : yerr; - dp.frame.toGlobal(hh.xerrLocal(hit), 0, yerr, ge); + yerr = nok ? hh[hit].yerrLocal() : yerr; + dp.frame.toGlobal(hh[hit].xerrLocal(), 0, yerr, ge); #else - hh.cpeParams() - .detParams(hh.detectorIndex(hit)) - .frame.toGlobal(hh.xerrLocal(hit), 0, hh.yerrLocal(hit), ge); + + hh.cpeParams().detParams(hh.detectorIndex(hit)).frame.toGlobal(hh.xerrLocal(hit), 0, hh.yerrLocal(hit), ge); + #endif #ifdef BL_DUMP_HITS @@ -144,16 +150,16 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, local_idx, tkid, hit, - hh.detectorIndex(hit), + hh[hit].detectorIndex(), i, - hh.xGlobal(hit), - hh.yGlobal(hit), - hh.zGlobal(hit)); + hh[hit].xGlobal(), + hh[hit].yGlobal(), + hh[hit].zGlobal()); printf("Error: hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", i, ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]); } #endif - hits.col(i) << hh.xGlobal(hit), hh.yGlobal(hit), hh.zGlobal(hit); + hits.col(i) << hh[hit].xGlobal(), hh[hit].yGlobal(), hh[hit].zGlobal(); hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } brokenline::fastFit(hits, fast_fit); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 9d4851010097a..2dd5e92f7583d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -87,15 +87,16 @@ void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const auto bf = 1. 
/ es.getData(tokenField_).inverseBzAtOriginInGeV(); if (onGPU_) { - auto hHits = iEvent.getHandle(tokenHitGPU_); + edm::Handle> hHits; + iEvent.getByToken(tokenHitGPU_, hHits); + // auto hHits = iEvent.getHandle(tokenHitGPU_); cms::cuda::ScopedContextProduce ctx{*hHits}; - auto const& hits = ctx.get(*hHits); - - ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); + auto& hits_d = ctx.get(*hHits); + ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits_d, bf, ctx.stream())); } else { - auto const& hits = iEvent.get(tokenHitCPU_); - iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf)); + auto& hits_h = iEvent.get(tokenHitCPU_); + iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits_h, bf)); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index cccdaeee50a00..85399ba317b69 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -14,7 +14,7 @@ void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) { } template <> -void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsConstView const&hh, int32_t offsetBPIX2, cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsConstView hh, int32_t offsetBPIX2, cudaStream_t stream) { uint32_t nhits = hh.metadata().size(); #ifdef NTUPLE_DEBUG @@ -82,9 +82,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsConstView const&hh, int3 } template <> -void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsConstView const&hh, - TkSoAView tracks_view, - cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsConstView hh, TkSoAView tracks_view, cudaStream_t cudaStream) { // zero tuples cms::cuda::launchZero(&tracks_view.hitIndices(), cudaStream); @@ -142,9 +140,7 @@ void 
CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsConstView const&hh, } template <> -void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsConstView const&hh, - TkSoAView tracks_view, - cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsConstView hh, TkSoAView tracks_view, cudaStream_t cudaStream) { int32_t nhits = hh.metadata().size(); auto *quality_d = pixelTrack::utilities::qualityData(tracks_view); @@ -165,12 +161,10 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsConstView const&hh, cms::cuda::launchFinalize(hitToTupleView_, cudaStream); kernel_fillHitInTracks(tracks_view, device_hitToTuple_.get()); } - // remove duplicates (tracks that share at least one hit) if (params_.doSharedHitCut_) { kernel_rejectDuplicate( tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - kernel_sharedHitCleaner( hh, tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); if (params_.useSimpleTripletCleaner_) { @@ -197,7 +191,6 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsConstView const&hh, params_.maxNumberOfDoublets_, counters_); } - if (params_.doStats_) { // counters (add flag???) std::lock_guard guard(lock_stat); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 8561f703e527e..8bd2d62bd13ef 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -2,9 +2,7 @@ #include template <> -void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsConstView const&hh, - TkSoAView tracks_view, - cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsConstView hh, TkSoAView tracks_view, cudaStream_t cudaStream) { // these are pointer on GPU! 
auto *quality_d = pixelTrack::utilities::qualityData(tracks_view); @@ -130,7 +128,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsConstView const&hh, } template <> -void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsConstView const&hh, int32_t offsetBPIX2, cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsConstView hh, int32_t offsetBPIX2, cudaStream_t stream) { int32_t nhits = hh.metadata().size(); isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), offsetBPIX2}; @@ -145,8 +143,8 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsConstView const&hh, int3 #endif std::cout << __LINE__ << std::endl; // in principle we can use "nhits" to heuristically dimension the workspace... - device_isOuterHitOfCell_ = cms::cuda::make_device_unique( - std::max(1, nhits - offsetBPIX2), stream); + device_isOuterHitOfCell_ = + cms::cuda::make_device_unique(std::max(1, nhits - offsetBPIX2), stream); assert(device_isOuterHitOfCell_.get()); std::cout << __LINE__ << std::endl; isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), offsetBPIX2}; @@ -220,20 +218,18 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsConstView const&hh, int3 } template <> -void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsConstView const&hh, - TkSoAView tracks_view, - cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsConstView hh, TkSoAView tracks_view, cudaStream_t cudaStream) { // these are pointer on GPU! 
auto *quality_d = pixelTrack::utilities::qualityData(tracks_view); int32_t nhits = hh.metadata().size(); - + std::cout << "PSOFOS 1" << std::endl; auto blockSize = 64; // classify tracks based on kinematics auto numberOfBlocks = nQuadrupletBlocks(blockSize); kernel_classifyTracks<<>>(tracks_view, quality_d, params_.cuts_); - + std::cout << "PSOFOS 2" << std::endl; cudaCheck(cudaGetLastError()); if (params_.lateFishbone_) { @@ -242,57 +238,70 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsConstView const&hh, kernel_fishboneCleaner<<>>( device_theCells_.get(), device_nCells_, quality_d); cudaCheck(cudaGetLastError()); + std::cout << "PSOFOS 2.1" << std::endl; } - + std::cout << "PSOFOS 2.2" << std::endl; // mark duplicates (tracks that share a doublet) numberOfBlocks = nDoubletBlocks(blockSize); + std::cout << "PSOFOS 2.3" << std::endl; kernel_fastDuplicateRemover<<>>( device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_); + std::cout << "PSOFOS 2.4" << std::endl; cudaCheck(cudaGetLastError()); + std::cout << "PSOFOS 3" << std::endl; + #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); #endif - + std::cout << "PSOFOS 3 new" << std::endl; if (params_.doSharedHitCut_ || params_.doStats_) { + std::cout << "PSOFOS 3.1" << std::endl; // fill hit->track "map" assert(hitToTupleView_.offSize > nhits); + std::cout << "PSOFOS 3.2" << std::endl; numberOfBlocks = nQuadrupletBlocks(blockSize); + std::cout << "PSOFOS 3.3" << std::endl; kernel_countHitInTracks<<>>(tracks_view, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); + std::cout << "PSOFOS 3.4" << std::endl; assert((hitToTupleView_.assoc == device_hitToTuple_.get()) && (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0)); cms::cuda::launchFinalize(hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); + std::cout << "PSOFOS 3.5" << std::endl; kernel_fillHitInTracks<<>>(tracks_view, device_hitToTuple_.get()); 
cudaCheck(cudaGetLastError()); + std::cout << "PSOFOS 3.6" << std::endl; #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); #endif } - + std::cout << "PSOFOS 4" << std::endl; if (params_.doSharedHitCut_) { // mark duplicates (tracks that share at least one hit) numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; kernel_rejectDuplicate<<>>( tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - + std::cout << "PSOFOS 4.1" << std::endl; kernel_sharedHitCleaner<<>>( hh, tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - + std::cout << "PSOFOS 4.2" << std::endl; if (params_.useSimpleTripletCleaner_) { kernel_simpleTripletCleaner<<>>( tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + std::cout << "PSOFOS 4.3" << std::endl; } else { kernel_tripletCleaner<<>>( tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + std::cout << "PSOFOS 4.4" << std::endl; } cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); #endif } - + std::cout << "PSOFOS 5" << std::endl; if (params_.doStats_) { numberOfBlocks = (std::max(nhits, int(params_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize; kernel_checkOverflows<<>>(tracks_view, @@ -308,16 +317,19 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsConstView const&hh, params_.maxNumberOfDoublets_, counters_); cudaCheck(cudaGetLastError()); + std::cout << "PSOFOS 5.1" << std::endl; } - + std::cout << "PSOFOS 6" << std::endl; if (params_.doStats_) { // counters (add flag???) 
numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); cudaCheck(cudaGetLastError()); + std::cout << "PSOFOS 6.1" << std::endl; numberOfBlocks = (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; kernel_doStatsForTracks<<>>(tracks_view, quality_d, counters_); cudaCheck(cudaGetLastError()); + std::cout << "PSOFOS 6.2" << std::endl; } #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -331,13 +343,14 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsConstView const&hh, std::lock_guard guard(lock); ++iev; for (int k = 0; k < 20000; k += 500) { - kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh, tracks_view, device_hitToTuple_.get(), k, k + 500, iev); + kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(hh, tracks_view, device_hitToTuple_.get(), k, k + 500, iev); cudaDeviceSynchronize(); + std::cout << "PSOFOS 7" << std::endl; } kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( hh, tracks_view, device_hitToTuple_.get(), 20000, 1000000, iev); cudaDeviceSynchronize(); + std::cout << "PSOFOS 8" << std::endl; // cudaStreamSynchronize(cudaStream); } #endif diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index d2b2c4b1c77a0..2dce3909adfa5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -32,6 +32,7 @@ namespace cAHitNtupletGenerator { unsigned long long nZeroTrackCells; }; + using HitsView = trackingRecHitSoA::HitSoAView; using HitsConstView = trackingRecHitSoA::HitSoAConstView; // using HitsConstView = TrackingRecHit2DSOAView; @@ -173,6 +174,7 @@ class CAHitNtupletGeneratorKernels { template using unique_ptr = typename Traits::template unique_ptr; + using HitsView = trackingRecHitSoA::HitSoAView; using HitsConstView = 
trackingRecHitSoA::HitSoAConstView; // using HitsOnGPU = TrackingRecHit2DSOAView; // using HitsOnCPU = TrackingRecHit2DHeterogeneous; @@ -191,11 +193,11 @@ class CAHitNtupletGeneratorKernels { TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } - void launchKernels(HitsConstView const& hh, TkSoAView tracks_view, cudaStream_t cudaStream); + void launchKernels(HitsConstView hh, TkSoAView tracks_view, cudaStream_t cudaStream); - void classifyTuples(HitsConstView const& hh, TkSoAView tracks_view, cudaStream_t cudaStream); + void classifyTuples(HitsConstView hh, TkSoAView tracks_view, cudaStream_t cudaStream); - void buildDoublets(HitsConstView const& hh, int32_t offsetBPIX2, cudaStream_t stream); + void buildDoublets(HitsConstView hh, int32_t offsetBPIX2, cudaStream_t stream); void allocateOnGPU(int32_t nHits, cudaStream_t stream); void cleanup(cudaStream_t cudaStream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 49cac0dd0026d..8a00aba6b80bb 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -26,7 +26,8 @@ // using HitsOnGPU = TrackingRecHit2DSOAView; // using HitsOnCPU = TrackingRecHit2DGPU; -using HitSoAView = trackingRecHitSoA::HitSoAConstView; +using HitSoAView = trackingRecHitSoA::HitSoAView; +using HitSoAConstView = trackingRecHitSoA::HitSoAConstView; using HitToTuple = caConstants::HitToTuple; using TupleMultiplicity = caConstants::TupleMultiplicity; @@ -269,7 +270,7 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, cms::cuda::AtomicPairCounter *apc2, // just to zero them, - GPUCACell::HitsConstView const& __restrict__ hh, + GPUCACell::HitsConstView hh, GPUCACell *cells, 
uint32_t const *__restrict__ nCells, gpuPixelDoublets::CellNeighborsVector *cellNeighbors, @@ -334,7 +335,7 @@ __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, } // loop on outer cells } -__global__ void kernel_find_ntuplets(GPUCACell::HitsConstView const& __restrict__ hh, +__global__ void kernel_find_ntuplets(GPUCACell::HitsConstView hh, GPUCACell *__restrict__ cells, uint32_t const *nCells, gpuPixelDoublets::CellTracksVector *cellTracks, @@ -547,14 +548,13 @@ __global__ void kernel_fillHitInTracks(TkSoAView tracks_view, // TODO: Make Con } } -__global__ void kernel_fillHitDetIndices(TkSoAView tracks_view, HitSoAView const& __restrict__ hh) { +__global__ void kernel_fillHitDetIndices(TkSoAView tracks_view, HitSoAConstView hh) { int first = blockDim.x * blockIdx.x + threadIdx.x; // copy offsets for (int idx = first, ntot = tracks_view.hitIndices().totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { tracks_view.detIndices().off[idx] = tracks_view.hitIndices().off[idx]; } // fill hit indices - // auto const &hh = *hhp; auto nhits = hh.nHits(); for (int idx = first, ntot = tracks_view.hitIndices().size(); idx < ntot; idx += gridDim.x * blockDim.x) { assert(tracks_view.hitIndices().content[idx] < nhits); @@ -707,7 +707,7 @@ __global__ void kernel_rejectDuplicate(TkSoAView tracks_view, } } -__global__ void kernel_sharedHitCleaner(HitSoAView const& __restrict__ hh, +__global__ void kernel_sharedHitCleaner(HitSoAConstView hh, TkSoAView tracks_view, int nmin, bool dupPassThrough, @@ -855,7 +855,7 @@ __global__ void kernel_simpleTripletCleaner( } // loop over hits } -__global__ void kernel_print_found_ntuplets(HitSoAView& __restrict__ hh, +__global__ void kernel_print_found_ntuplets(HitSoAView hh, TkSoAView tracks_view, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple, int32_t firstPrint, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 0246d0ecb608e..7fa3f339260fc 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -191,27 +191,58 @@ void CAHitNtupletGeneratorOnGPU::endJob() { } } -pixelTrack::TrackSoADevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync(HitsOnGPU const& hits_d, - float bfield, - cudaStream_t stream) const { +pixelTrack::TrackSoADevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync( + trackingRecHit::TrackingRecHitSoADevice const& hits_d, float bfield, cudaStream_t stream) const { pixelTrack::TrackSoADevice tracks(stream); - auto* soa = &tracks; + std::cout << "!!!!" << hits_d.offsetBPIX2() << ", " << hits_d.nHits() << std::endl; CAHitNtupletGeneratorKernelsGPU kernels(m_params); kernels.setCounters(m_counters); + std::cout << "GAMW THN PANAGIA 3" << std::endl; + kernels.allocateOnGPU(hits_d.nHits(), stream); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + std::cout << "GAMW THN PANAGIA 4" << std::endl; + + kernels.buildDoublets(hits_d.view(), hits_d.offsetBPIX2(), stream); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + std::cout << "GAMW THN PANAGIA 5" << std::endl; - kernels.buildDoublets(hits_d.const_view(), hits_d.offsetBPIX2(), stream); - kernels.launchKernels(hits_d.const_view(), soa->view(), stream); + kernels.launchKernels(hits_d.view(), tracks.view(), stream); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + std::cout << "GAMW THN PANAGIA 6" << std::endl; HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - fitter.allocateOnGPU(kernels.tupleMultiplicity(), soa->view()); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + std::cout << "GAMW THN PANAGIA 7" << std::endl; + + fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks.view()); + cudaCheck(cudaGetLastError()); + 
cudaCheck(cudaDeviceSynchronize()); + std::cout << "makeTuplesAsync checkpoint 8" << std::endl; + if (m_params.useRiemannFit_) { - fitter.launchRiemannKernels(hits_d.const_view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + std::cout << "makeTuplesAsync checkpoint 8.1a" << std::endl; + fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + std::cout << "makeTuplesAsync checkpoint 8.1" << std::endl; } else { - fitter.launchBrokenLineKernels(hits_d.const_view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + std::cout << "makeTuplesAsync checkpoint 8.2a" << std::endl; + fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + std::cout << "makeTuplesAsync checkpoint 8.2" << std::endl; } - kernels.classifyTuples(hits_d.const_view(), soa->view(), stream); + + kernels.classifyTuples(hits_d.view(), tracks.view(), stream); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + std::cout << "makeTuplesAsync checkpoint 9" << std::endl; #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -222,17 +253,18 @@ pixelTrack::TrackSoADevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync(HitsOnGPU return tracks; } -pixelTrack::TrackSoAHost CAHitNtupletGeneratorOnGPU::makeTuples(HitsOnCPU const& hits_d, float bfield) const { +pixelTrack::TrackSoAHost CAHitNtupletGeneratorOnGPU::makeTuples(trackingRecHit::TrackingRecHitSoAHost const& hits_h, + float bfield) const { pixelTrack::TrackSoAHost tracks(nullptr); CAHitNtupletGeneratorKernelsCPU kernels(m_params); kernels.setCounters(m_counters); - kernels.allocateOnGPU(hits_d.nHits(), nullptr); + kernels.allocateOnGPU(hits_h.nHits(), nullptr); - kernels.buildDoublets(hits_d.const_view(), hits_d.offsetBPIX2(), nullptr); - kernels.launchKernels(hits_d.const_view(), tracks.view(), nullptr); + 
kernels.buildDoublets(hits_h.view(), hits_h.offsetBPIX2(), nullptr); + kernels.launchKernels(hits_h.view(), tracks.view(), nullptr); - if (0 == hits_d.nHits()) + if (0 == hits_h.nHits()) return tracks; // now fit @@ -240,12 +272,12 @@ pixelTrack::TrackSoAHost CAHitNtupletGeneratorOnGPU::makeTuples(HitsOnCPU const& fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks.view()); if (m_params.useRiemannFit_) { - fitter.launchRiemannKernelsOnCPU(hits_d.const_view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); + fitter.launchRiemannKernelsOnCPU(hits_h.view(), hits_h.nHits(), caConstants::maxNumberOfQuadruplets); } else { - fitter.launchBrokenLineKernelsOnCPU(hits_d.const_view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); + fitter.launchBrokenLineKernelsOnCPU(hits_h.view(), hits_h.nHits(), caConstants::maxNumberOfQuadruplets); } - kernels.classifyTuples(hits_d.const_view(), tracks.view(), nullptr); + kernels.classifyTuples(hits_h.view(), tracks.view(), nullptr); #ifdef GPU_DEBUG std::cout << "finished building pixel tracks on CPU" << std::endl; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index 3447e56354ebb..2d46bd09d1e0a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -6,9 +6,9 @@ #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" #include 
"FWCore/ParameterSet/interface/ParameterSet.h" @@ -28,6 +28,8 @@ namespace edm { class CAHitNtupletGeneratorOnGPU { public: + using HitsView = trackingRecHitSoA::HitSoAView; + using HitsConstView = trackingRecHitSoA::HitSoAConstView; using HitsOnGPU = trackingRecHit::TrackingRecHitSoADevice; using HitsOnCPU = trackingRecHit::TrackingRecHitSoAHost; // using hindex_type = TrackingRecHit2DSOAView::hindex_type; @@ -54,19 +56,19 @@ class CAHitNtupletGeneratorOnGPU { void endJob(); // On GPU - pixelTrack::TrackSoADevice makeTuplesAsync(HitsOnGPU const& hits_d, + pixelTrack::TrackSoADevice makeTuplesAsync(trackingRecHit::TrackingRecHitSoADevice const& hits_d, float bfield, cudaStream_t stream) const; // On CPU - pixelTrack::TrackSoAHost makeTuples(HitsOnCPU const& hits_d, float bfield) const; + pixelTrack::TrackSoAHost makeTuples(trackingRecHit::TrackingRecHitSoAHost const& hits_h, float bfield) const; private: - void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const; + void buildDoublets(HitsConstView hh, cudaStream_t stream) const; - void hitNtuplets(HitsOnCPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream); + void hitNtuplets(HitsConstView hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream); - void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cudaStream_t cudaStream) const; + void launchKernels(HitsConstView hh, bool useRiemannFit, cudaStream_t cudaStream) const; Params m_params; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index a1f0c49b6fa8d..4267d674e1ca8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -47,7 +47,7 @@ class GPUCACell { __device__ __forceinline__ void init(CellNeighborsVector& cellNeighbors, CellTracksVector& cellTracks, - HitsConstView const& hh, + HitsConstView hh, int layerPairId, hindex_type innerHitId, hindex_type 
outerHitId) { @@ -113,22 +113,22 @@ class GPUCACell { __device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; } __device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; } __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; } - __device__ __forceinline__ float inner_x(HitsConstView const& hh) const { return hh[theInnerHitId].xGlobal(); } - __device__ __forceinline__ float outer_x(HitsConstView const& hh) const { return hh[theOuterHitId].xGlobal(); } - __device__ __forceinline__ float inner_y(HitsConstView const& hh) const { return hh[theInnerHitId].yGlobal(); } - __device__ __forceinline__ float outer_y(HitsConstView const& hh) const { return hh[theOuterHitId].yGlobal(); } - __device__ __forceinline__ float inner_z(HitsConstView const& hh) const { return theInnerZ; } + __device__ __forceinline__ float inner_x(HitsConstView hh) const { return hh[theInnerHitId].xGlobal(); } + __device__ __forceinline__ float outer_x(HitsConstView hh) const { return hh[theOuterHitId].xGlobal(); } + __device__ __forceinline__ float inner_y(HitsConstView hh) const { return hh[theInnerHitId].yGlobal(); } + __device__ __forceinline__ float outer_y(HitsConstView hh) const { return hh[theOuterHitId].yGlobal(); } + __device__ __forceinline__ float inner_z(HitsConstView hh) const { return theInnerZ; } // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } - __device__ __forceinline__ float outer_z(HitsConstView const& hh) const { return hh[theOuterHitId].zGlobal(); } - __device__ __forceinline__ float inner_r(HitsConstView const& hh) const { return theInnerR; } + __device__ __forceinline__ float outer_z(HitsConstView hh) const { return hh[theOuterHitId].zGlobal(); } + __device__ __forceinline__ float inner_r(HitsConstView hh) const { return theInnerR; } // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } - __device__ __forceinline__ float outer_r(HitsConstView const& 
hh) const { return hh[theOuterHitId].rGlobal(); } + __device__ __forceinline__ float outer_r(HitsConstView hh) const { return hh[theOuterHitId].rGlobal(); } - __device__ __forceinline__ auto inner_iphi(HitsConstView const& hh) const { return hh[theInnerHitId].iphi(); } - __device__ __forceinline__ auto outer_iphi(HitsConstView const& hh) const { return hh[theOuterHitId].iphi(); } + __device__ __forceinline__ auto inner_iphi(HitsConstView hh) const { return hh[theInnerHitId].iphi(); } + __device__ __forceinline__ auto outer_iphi(HitsConstView hh) const { return hh[theOuterHitId].iphi(); } - __device__ __forceinline__ float inner_detIndex(HitsConstView const& hh) const { return hh[theInnerHitId].detectorIndex(); } - __device__ __forceinline__ float outer_detIndex(HitsConstView const& hh) const { return hh[theOuterHitId].detectorIndex(); } + __device__ __forceinline__ float inner_detIndex(HitsConstView hh) const { return hh[theInnerHitId].detectorIndex(); } + __device__ __forceinline__ float outer_detIndex(HitsConstView hh) const { return hh[theOuterHitId].detectorIndex(); } constexpr unsigned int inner_hit_id() const { return theInnerHitId; } constexpr unsigned int outer_hit_id() const { return theOuterHitId; } @@ -140,7 +140,7 @@ class GPUCACell { theOuterHitId); } - __device__ bool check_alignment(HitsConstView const& hh, + __device__ bool check_alignment(HitsConstView hh, GPUCACell const& otherCell, const float ptmin, const float hardCurvCut, @@ -187,7 +187,7 @@ class GPUCACell { return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff; } - __device__ inline bool dcaCut(HitsConstView const& hh, + __device__ inline bool dcaCut(HitsConstView hh, GPUCACell const& otherCell, const float region_origin_radius_plus_tolerance, const float maxCurv) const { @@ -224,7 +224,7 @@ class GPUCACell { return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); } - __device__ inline bool 
hole0(HitsConstView const& hh, GPUCACell const& innerCell) const { + __device__ inline bool hole0(HitsConstView hh, GPUCACell const& innerCell) const { using caConstants::first_ladder_bpx0; using caConstants::max_ladder_bpx0; using caConstants::module_length_bpx0; @@ -247,7 +247,7 @@ class GPUCACell { return gap; } - __device__ inline bool hole4(HitsConstView const& hh, GPUCACell const& innerCell) const { + __device__ inline bool hole4(HitsConstView hh, GPUCACell const& innerCell) const { using caConstants::first_ladder_bpx4; using caConstants::max_ladder_bpx4; using caConstants::module_length_bpx4; @@ -275,7 +275,7 @@ class GPUCACell { // trying to free the track building process from hardcoded layers, leaving // the visit of the graph based on the neighborhood connections between cells. template - __device__ inline void find_ntuplets(HitsConstView const& hh, + __device__ inline void find_ntuplets(HitsConstView hh, GPUCACell* __restrict__ cells, CellTracksVector& cellTracks, HitContainer& foundNtuplets, @@ -345,14 +345,14 @@ class GPUCACell { __device__ __forceinline__ bool unused() const { return 0 == (uint16_t(StatusBit::kUsed) & theStatus_); } __device__ __forceinline__ void setStatusBits(StatusBit mask) { theStatus_ |= uint16_t(mask); } - __device__ __forceinline__ void setFishbone(hindex_type id, float z, HitsConstView const& hh) { + __device__ __forceinline__ void setFishbone(hindex_type id, float z, HitsConstView hh) { // make it deterministic: use the farther apart (in z) auto old = theFishboneId; - while ( - old != - atomicCAS(&theFishboneId, - old, - (invalidHitId == old || std::abs(z - theInnerZ) > std::abs(hh[old].zGlobal() - theInnerZ)) ? id : old)) + while (old != + atomicCAS( + &theFishboneId, + old, + (invalidHitId == old || std::abs(z - theInnerZ) > std::abs(hh[old].zGlobal() - theInnerZ)) ? 
id : old)) old = theFishboneId; } __device__ __forceinline__ auto fishboneId() const { return theFishboneId; } @@ -373,7 +373,7 @@ class GPUCACell { }; template <> -__device__ inline void GPUCACell::find_ntuplets<0>(HitsConstView const& hh, +__device__ inline void GPUCACell::find_ntuplets<0>(HitsConstView hh, GPUCACell* __restrict__ cells, CellTracksVector& cellTracks, HitContainer& foundNtuplets, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index fe618c035f38f..1fdef06b56db1 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -33,6 +33,7 @@ namespace riemannFit { class HelixFitOnGPU { public: + using HitSoAView = trackingRecHitSoA::HitSoAView; using HitSoAConstView = trackingRecHitSoA::HitSoAConstView; using Tuples = pixelTrack::HitContainer; @@ -44,11 +45,11 @@ class HelixFitOnGPU { ~HelixFitOnGPU() { deallocateOnGPU(); } void setBField(double bField) { bField_ = bField; } - void launchRiemannKernels(HitSoAConstView const &hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchBrokenLineKernels(HitSoAConstView const &hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchRiemannKernels(HitSoAConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchBrokenLineKernels(HitSoAConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchRiemannKernelsOnCPU(HitSoAConstView const &hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void launchBrokenLineKernelsOnCPU(HitSoAConstView const &hv, uint32_t nhits, uint32_t maxNumberOfTuples); + void launchRiemannKernelsOnCPU(HitSoAConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples); + void launchBrokenLineKernelsOnCPU(HitSoAConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples); void allocateOnGPU(TupleMultiplicity 
const *tupleMultiplicity, OutputSoAView outputSoA); void deallocateOnGPU(); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc index c5dcd9f32f0e9..1b2e08767efa2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc @@ -1,6 +1,6 @@ #include "RiemannFitOnGPU.h" -void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitSoAConstView const &hv, uint32_t nhits, uint32_t maxNumberOfTuples) { +void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitSoAConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples) { assert(tuples_); // Fit internals diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index a62cc64cb1fb5..87db9d8f0e71d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -1,7 +1,7 @@ #include "RiemannFitOnGPU.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchRiemannKernels(HitSoAConstView const &hv, +void HelixFitOnGPU::launchRiemannKernels(HitSoAConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t stream) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index 1f91172cb6105..f203acb8047ff 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -22,7 +22,7 @@ template __global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, - HitSoAConstView const &__restrict__ hh, + HitSoAConstView hh, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit, @@ -65,9 +65,7 @@ __global__ 
void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, auto hit = hitId[i]; // printf("Hit global: %f,%f,%f\n", hh.xg_d[hit],hh.yg_d[hit],hh.zg_d[hit]); float ge[6]; - hh.cpeParams() - .detParams(hh.detectorIndex(hit)) - .frame.toGlobal(hh.xerrLocal(hit), 0, hh.yerrLocal(hit), ge); + hh.cpeParams().detParams(hh.detectorIndex(hit)).frame.toGlobal(hh.xerrLocal(hit), 0, hh.yerrLocal(hit), ge); // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hh.detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); hits.col(i) << hh.xGlobal(hit), hh.yGlobal(hit), hh.zGlobal(hit); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index 6432289fc6cc4..b2b6592e78a74 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -16,7 +16,7 @@ namespace gpuPixelDoublets { - __global__ void fishbone(GPUCACell::HitsConstView const& __restrict__ hh, + __global__ void fishbone(GPUCACell::HitsConstView hh, GPUCACell* cells, uint32_t const* __restrict__ nCells, GPUCACell::OuterHitOfCell const isOuterHitOfCellWrap, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index c51559b530711..e12476d9a4b0d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -99,7 +99,7 @@ namespace gpuPixelDoublets { uint32_t* nCells, CellNeighborsVector* cellNeighbors, CellTracksVector* cellTracks, - HitsConstView const& __restrict__ hh, + HitsConstView hh, GPUCACell::OuterHitOfCell isOuterHitOfCell, int nActualPairs, bool ideal_cond, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index b16e08ba5e22a..8bf5eec7a00ce 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -29,7 +29,7 @@ namespace gpuPixelDoublets { uint32_t* nCells, CellNeighborsVector* cellNeighbors, CellTracksVector* cellTracks, - HitsConstView const& __restrict__ hh, + HitsConstView hh, GPUCACell::OuterHitOfCell isOuterHitOfCell, int16_t const* __restrict__ phicuts, float const* __restrict__ minz,