From d3e5e681c597a3724331da4f8566c9e6b18503fa Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Wed, 28 Sep 2022 12:31:41 +0200
Subject: [PATCH 001/110] Adding PixelTrack CUDADataFormats header and dummy

---
 .../interface/TrackSoAHeterogeneousT_test.h   | 114 ++++++++++++++++++
 .../Track/src/TrackSoAHeterogeneous_t_test.cc |   1 +
 2 files changed, 115 insertions(+)
 create mode 100644 CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
 create mode 100644 CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc
diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
new file mode 100644
index 0000000000000..b6eb21b0835dc
--- /dev/null
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -0,0 +1,114 @@
+#ifndef CUDADataFormats_Track_TrackHeterogeneousT_H
+#define CUDADataFormats_Track_TrackHeterogeneousT_H
+
+#include <string>
+#include <algorithm>
+
+#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h"
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+
+#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+#include "DataFormats/SoATemplate/interface/SoALayout.h"
+
+//#include "DataFormats/Portable/interface/PortableCUDADeviceCollection.h"
+#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
+
+namespace pixelTrack {
+  enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality };
+  constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)};
+  const std::string qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"};
+  inline Quality qualityByName(std::string const &name) {
+    auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName;
+    return static_cast<Quality>(qp);
+  }
+}  // namespace pixelTrack
+
+GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test,
+                    SOA_COLUMN(uint8_t, quality),
+                    SOA_COLUMN(float, chi2), // this is chi2/ndof as not necessarely all hits are used in the fit
+                    SOA_COLUMN(int8_t, nLayers),
+                    SOA_COLUMN(float, eta),
+                    SOA_COLUMN(float, pt))
+                    // TODO: maybe add stateAtBS
+
+template <int32_t S>
+class TrackSoAHeterogeneousT  : public cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>> {
+
+public:
+
+  static constexpr int32_t stride() { return S; }
+
+  using Quality = pixelTrack::Quality;
+  using hindex_type = uint32_t;
+  using HitContainer = cms::cuda::OneToManyAssoc<hindex_type, S + 1, 5 * S>;
+
+  // Always check quality is at least loose!
+  // CUDA does not support enums  in __lgc ...
+private:
+
+public:
+  constexpr Quality quality(int32_t i) const { return static_cast<Quality>(view()[i].quality()); }
+  constexpr Quality &quality(int32_t i) { return static_cast<Quality &>(view()[i].quality()); }
+  // TODO: static did not work; using reinterpret_cast
+  constexpr Quality const *qualityData() const { return reinterpret_cast <Quality const *>(view().quality()); }
+  constexpr Quality *qualityData() { return reinterpret_cast< Quality *>(view().quality()); }
+
+  constexpr int nTracks() const { return nTracks_; }
+  constexpr void setNTracks(int n) { nTracks_ = n; }
+
+  constexpr int nHits(int i) const { return detIndices.size(i); }
+
+  constexpr bool isTriplet(int i) const { return view()[i].nLayers() == 3; }
+
+  constexpr int computeNumberOfLayers(int32_t i) const {
+    // layers are in order and we assume tracks are either forward or backward
+    auto pdet = detIndices.begin(i);
+    int nl = 1;
+    auto ol = phase1PixelTopology::getLayer(*pdet);
+    for (; pdet < detIndices.end(i); ++pdet) {
+      auto il = phase1PixelTopology::getLayer(*pdet);
+      if (il != ol)
+        ++nl;
+      ol = il;
+    }
+    return nl;
+  }
+
+  // State at the Beam spot
+  // phi,tip,1/pt,cotan(theta),zip
+  TrajectoryStateSoAT<S> stateAtBS;
+  constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); }
+  constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); }
+  constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); }
+  constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); }
+
+  // state at the detector of the outermost hit
+  // representation to be decided...
+  // not yet filled on GPU
+  // TrajectoryStateSoA<S> stateAtOuterDet;
+
+  HitContainer hitIndices;
+  HitContainer detIndices;
+
+private:
+  int nTracks_;
+};
+
+namespace pixelTrack {
+
+#ifdef GPU_SMALL_EVENTS
+  // kept for testing and debugging
+  constexpr uint32_t maxNumber() { return 2 * 1024; }
+#else
+  // tested on MC events with 55-75 pileup events
+  constexpr uint32_t maxNumber() { return 32 * 1024; }
+#endif
+
+  using TrackSoA = TrackSoAHeterogeneousT<maxNumber()>;
+  using TrajectoryState = TrajectoryStateSoAT<maxNumber()>;
+  using HitContainer = TrackSoA::HitContainer;
+
+}  // namespace pixelTrack
+
+#endif  // CUDADataFormats_Track_TrackHeterogeneousT_H
diff --git a/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc b/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc
new file mode 100644
index 0000000000000..b15debe3cb72b
--- /dev/null
+++ b/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc
@@ -0,0 +1 @@
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"

From cee33d5c2f89edf396136d13ccfc0b4101876d7c Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Wed, 28 Sep 2022 15:10:25 +0200
Subject: [PATCH 002/110] Adding methods for pt, eta and chi2

---
 .../Track/interface/PixelTrackHeterogeneous.h        |  5 +++--
 .../Track/interface/TrackSoAHeterogeneousT_test.h    | 12 ++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
index 3ee5af80353dd..c0e5c99b6fd28 100644
--- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
+++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
@@ -2,8 +2,9 @@
 #define CUDADataFormats_Track_PixelTrackHeterogeneous_h
 
 #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
+//#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 
 using PixelTrackHeterogeneous = HeterogeneousSoA<pixelTrack::TrackSoA>;
 
-#endif  // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
\ No newline at end of file
+#endif  // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index b6eb21b0835dc..1cf34f14b30a1 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -37,6 +37,9 @@ class TrackSoAHeterogeneousT  : public cms::cuda::PortableDeviceCollection<Track
 
 public:
 
+  // using cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::PortableDeviceCollection;
+  TrackSoAHeterogeneousT() = default;
+
   static constexpr int32_t stride() { return S; }
 
   using Quality = pixelTrack::Quality;
@@ -54,6 +57,15 @@ class TrackSoAHeterogeneousT  : public cms::cuda::PortableDeviceCollection<Track
   constexpr Quality const *qualityData() const { return reinterpret_cast <Quality const *>(view().quality()); }
   constexpr Quality *qualityData() { return reinterpret_cast< Quality *>(view().quality()); }
 
+  constexpr float pt(int32_t i) const { return view()[i].pt(); }
+  constexpr float &pt(int32_t i) { return view()[i].pt(); }
+
+  constexpr float eta(int32_t i) const { return view()[i].eta(); }
+  constexpr float &eta(int32_t i) { return view()[i].eta(); }
+
+  constexpr float chi2(int32_t i) const { return view()[i].chi2(); }
+  constexpr float &chi2(int32_t i) { return view()[i].chi2(); }
+
   constexpr int nTracks() const { return nTracks_; }
   constexpr void setNTracks(int n) { nTracks_ = n; }
 

From f92a10f5398ac9e42c06355c942d5a55371bc820 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Wed, 28 Sep 2022 16:25:53 +0200
Subject: [PATCH 003/110] Test of TrajectoryStateSoAT with macro-generated SoA
 compiles

---
 .../interface/TrajectoryStateSoAT_test.h      | 63 +++++++++++++++++++
 .../Track/src/TrajectoryStateSoAT_test.cpp    |  1 +
 2 files changed, 64 insertions(+)
 create mode 100644 CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h
 create mode 100644 CUDADataFormats/Track/src/TrajectoryStateSoAT_test.cpp

diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h
new file mode 100644
index 0000000000000..1e561d0131d51
--- /dev/null
+++ b/CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h
@@ -0,0 +1,63 @@
+#ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H
+#define CUDADataFormats_Track_TrajectoryStateSOAT_H
+
+#include <Eigen/Dense>
+#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h"
+#include "DataFormats/SoATemplate/interface/SoALayout.h"
+#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
+using Vector5f = Eigen::Matrix<float, 5, 1>;
+using Vector15f = Eigen::Matrix<float, 15, 1>;
+
+using Vector5d = Eigen::Matrix<double, 5, 1>;
+using Matrix5d = Eigen::Matrix<double, 5, 5>;
+GENERATE_SOA_LAYOUT(TrajectoryStateSoAT_test,
+                    SOA_EIGEN_COLUMN(Vector5f, state),
+                    SOA_EIGEN_COLUMN(Vector15f, covariance))
+
+template <int32_t S>
+struct TrajectoryStateSoAT : public cms::cuda::PortableDeviceCollection<TrajectoryStateSoAT_test<>> {
+  static constexpr int32_t stride() { return S; }
+
+  // eigenSoA::MatrixSoA<Vector5f, S> state;
+  // eigenSoA::MatrixSoA<Vector15f, S> covariance;
+
+  template <typename V3, typename M3, typename V2, typename M2>
+  __host__ __device__ inline void copyFromCircle(
+      V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) {
+    view()[i].state() << cp.template cast<float>(), lp.template cast<float>();
+    view()[i].state()(2) *= b;  // TODO?? 2d access??
+    auto cov = view()[i].covariance();
+    cov(0) = ccov(0, 0);
+    cov(1) = ccov(0, 1);
+    cov(2) = b * float(ccov(0, 2));
+    cov(4) = cov(3) = 0;
+    cov(5) = ccov(1, 1);
+    cov(6) = b * float(ccov(1, 2));
+    cov(8) = cov(7) = 0;
+    cov(9) = b * b * float(ccov(2, 2));
+    cov(11) = cov(10) = 0;
+    cov(12) = lcov(0, 0);
+    cov(13) = lcov(0, 1);
+    cov(14) = lcov(1, 1);
+  }
+
+  template <typename V5, typename M5>
+  __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) {
+    view()[i].state() = v.template cast<float>();
+    for (int j = 0, ind = 0; j < 5; ++j)
+      for (auto k = j; k < 5; ++k)
+        view()[i].covariance()(ind++) = cov(j, k);
+  }
+
+  template <typename V5, typename M5>
+  __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const {
+    v = view()[i].state().template cast<typename V5::Scalar>();
+    for (int j = 0, ind = 0; j < 5; ++j) {
+      cov(j, j) = view()[i].covariance()(ind++);
+      for (auto k = j + 1; k < 5; ++k)
+        cov(k, j) = cov(j, k) = view()[i].covariance()(ind++);
+    }
+  }
+};
+
+#endif  // CUDADataFormats_Track_TrajectoryStateSOAT_H
diff --git a/CUDADataFormats/Track/src/TrajectoryStateSoAT_test.cpp b/CUDADataFormats/Track/src/TrajectoryStateSoAT_test.cpp
new file mode 100644
index 0000000000000..f6b9659331603
--- /dev/null
+++ b/CUDADataFormats/Track/src/TrajectoryStateSoAT_test.cpp
@@ -0,0 +1 @@
+#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h"

From b69a948905933d82976e44359f952b4b0cb9ded0 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 29 Sep 2022 16:29:29 +0200
Subject: [PATCH 004/110] Eric's SoA port for Trajectory; might be dumped

---
 .../Track/interface/TrajectoryStateSoAT.h     | 47 ++++++++-----
 .../plugins/CAHitNtupletGeneratorKernels.cc   | 13 ++--
 .../plugins/CAHitNtupletGeneratorKernels.cu   | 13 ++--
 .../CAHitNtupletGeneratorKernelsImpl.h        | 67 ++++++++++---------
 4 files changed, 86 insertions(+), 54 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h
index 64fcd573a6991..7f710ca67c7b6 100644
--- a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h
+++ b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h
@@ -3,26 +3,41 @@
 
 #include <Eigen/Dense>
 #include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h"
+#include "DataFormats/SoATemplate/interface/SoALayout.h"
+#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
+using Vector5f = Eigen::Matrix<float, 5, 1>;
+using Vector15f = Eigen::Matrix<float, 15, 1>;
+
+using Vector5d = Eigen::Matrix<double, 5, 1>;
+using Matrix5d = Eigen::Matrix<double, 5, 5>;
+GENERATE_SOA_LAYOUT(TrajectoryStateSoAT_test,
+                    SOA_EIGEN_COLUMN(Vector5f, state),
+                    SOA_EIGEN_COLUMN(Vector15f, covariance))
 
 template <int32_t S>
-struct TrajectoryStateSoAT {
-  using Vector5f = Eigen::Matrix<float, 5, 1>;
-  using Vector15f = Eigen::Matrix<float, 15, 1>;
+struct TrajectoryStateSoAT : public cms::cuda::PortableDeviceCollection<TrajectoryStateSoAT_test<>> {
+  static constexpr int32_t stride() { return S; }
 
-  using Vector5d = Eigen::Matrix<double, 5, 1>;
-  using Matrix5d = Eigen::Matrix<double, 5, 5>;
+  // eigenSoA::MatrixSoA<Vector5f, S> state;
+  // eigenSoA::MatrixSoA<Vector15f, S> covariance;
 
-  static constexpr int32_t stride() { return S; }
+  // Vector5f state(const int32_t i) const { return view()[i].state(); }
+  // float* state() const { return view().state(); }  // TODO: Return Vector5f* ?
+  // Vector15f covariance(const int32_t i) const { return view()[i].covariance(); }
+  // float* covariance() const { return view().covariance(); }  // TODO: Return Vector15f* ?
+
+  // Restrict view
+  // using RestrictConstView =
+  //     Layout::ConstViewTemplate<cms::soa::RestrictQualify::enabled, cms::soa::RangeChecking::disabled>;
 
-  eigenSoA::MatrixSoA<Vector5f, S> state;
-  eigenSoA::MatrixSoA<Vector15f, S> covariance;
+  // RestrictConstView restrictConstView() const { return RestrictConstView(layout()); }
 
   template <typename V3, typename M3, typename V2, typename M2>
   __host__ __device__ inline void copyFromCircle(
       V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) {
-    state(i) << cp.template cast<float>(), lp.template cast<float>();
-    state(i)(2) *= b;
-    auto cov = covariance(i);
+    view()[i].state() << cp.template cast<float>(), lp.template cast<float>();
+    view()[i].state()(2) *= b;
+    auto cov = view()[i].covariance();
     cov(0) = ccov(0, 0);
     cov(1) = ccov(0, 1);
     cov(2) = b * float(ccov(0, 2));
@@ -39,19 +54,19 @@ struct TrajectoryStateSoAT {
 
   template <typename V5, typename M5>
   __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) {
-    state(i) = v.template cast<float>();
+    view()[i].state() = v.template cast<float>();
     for (int j = 0, ind = 0; j < 5; ++j)
       for (auto k = j; k < 5; ++k)
-        covariance(i)(ind++) = cov(j, k);
+        view()[i].covariance()(ind++) = cov(j, k);
   }
 
   template <typename V5, typename M5>
   __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const {
-    v = state(i).template cast<typename V5::Scalar>();
+    v = view()[i].state().template cast<typename V5::Scalar>();
     for (int j = 0, ind = 0; j < 5; ++j) {
-      cov(j, j) = covariance(i)(ind++);
+      cov(j, j) = view()[i].covariance()(ind++);
       for (auto k = j + 1; k < 5; ++k)
-        cov(k, j) = cov(j, k) = covariance(i)(ind++);
+        cov(k, j) = cov(j, k) = view()[i].covariance()(ind++);
     }
   }
 };
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
index 66208debdc98d..bc745817d4e4a 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -151,7 +151,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
   auto *quality_d = tracks_d->qualityData();
 
   // classify tracks based on kinematics
-  kernel_classifyTracks(tuples_d, tracks_d, params_.cuts_, quality_d);
+  kernel_classifyTracks(tuples_d, tracks_d, tracks_d->stateAtBS.view(), params_.cuts_, quality_d);
 
   if (params_.lateFishbone_) {
     // apply fishbone cleaning to good tracks
@@ -159,7 +159,8 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
   }
 
   // remove duplicates (tracks that share a doublet)
-  kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, params_.dupPassThrough_);
+  kernel_fastDuplicateRemover(
+      device_theCells_.get(), device_nCells_, tracks_d, tracks_d->stateAtBS.view(), params_.dupPassThrough_);
 
   // fill hit->track "map"
   if (params_.doSharedHitCut_ || params_.doStats_) {
@@ -170,8 +171,12 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
 
   // remove duplicates (tracks that share at least one hit)
   if (params_.doSharedHitCut_) {
-    kernel_rejectDuplicate(
-        tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+    kernel_rejectDuplicate(tracks_d,
+                           tracks_d->stateAtBS.view(),  // stateAtBS SoA view
+                           quality_d,
+                           params_.minHitsForSharingCut_,
+                           params_.dupPassThrough_,
+                           device_hitToTuple_.get());
 
     kernel_sharedHitCleaner(hh.view(),
                             tracks_d,
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
index 913b6d5a32d28..712d995a6a6cf 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
@@ -233,7 +233,8 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
 
   // classify tracks based on kinematics
   auto numberOfBlocks = nQuadrupletBlocks(blockSize);
-  kernel_classifyTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, tracks_d, params_.cuts_, quality_d);
+  kernel_classifyTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+      tuples_d, tracks_d, tracks_d->stateAtBS.view(), params_.cuts_, quality_d);
   cudaCheck(cudaGetLastError());
 
   if (params_.lateFishbone_) {
@@ -247,7 +248,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
   // mark duplicates (tracks that share a doublet)
   numberOfBlocks = nDoubletBlocks(blockSize);
   kernel_fastDuplicateRemover<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-      device_theCells_.get(), device_nCells_, tracks_d, params_.dupPassThrough_);
+      device_theCells_.get(), device_nCells_, tracks_d, tracks_d->stateAtBS.view(), params_.dupPassThrough_);
   cudaCheck(cudaGetLastError());
 #ifdef GPU_DEBUG
   cudaCheck(cudaDeviceSynchronize());
@@ -275,8 +276,12 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
     // mark duplicates (tracks that share at least one hit)
     numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize;
 
-    kernel_rejectDuplicate<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-        tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+    kernel_rejectDuplicate<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d,
+                                                                         tracks_d->stateAtBS.view(),
+                                                                         quality_d,
+                                                                         params_.minHitsForSharingCut_,
+                                                                         params_.dupPassThrough_,
+                                                                         device_hitToTuple_.get());
 
     kernel_sharedHitCleaner<<<numberOfBlocks, blockSize, 0, cudaStream>>>(hh.view(),
                                                                           tracks_d,
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index bbe5df891a735..5806f6e6844e2 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -14,6 +14,7 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
 #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
+#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h"
 
 #include "CAConstants.h"
 #include "CAHitNtupletGeneratorKernels.h"
@@ -175,10 +176,12 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
 }
 
 // assume the above (so, short tracks already removed)
-__global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,
-                                            uint32_t const *__restrict__ nCells,
-                                            TkSoA *__restrict__ tracks,
-                                            bool dupPassThrough) {
+__global__ void kernel_fastDuplicateRemover(
+    GPUCACell const *__restrict__ cells,
+    uint32_t const *__restrict__ nCells,
+    TkSoA *__restrict__ tracks,
+    cms::cuda::PortableDeviceCollection<TrajectoryStateSoAT_test<>>::ConstView stateAtBS_view,
+    bool dupPassThrough) {
   // quality to mark rejected
   auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
   constexpr auto loose = pixelTrack::Quality::loose;
@@ -211,21 +214,21 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,
       auto qi = tracks->quality(it);
       if (qi <= reject)
         continue;
-      auto opi = tracks->stateAtBS.state(it)(2);
-      auto e2opi = tracks->stateAtBS.covariance(it)(9);
-      auto cti = tracks->stateAtBS.state(it)(3);
-      auto e2cti = tracks->stateAtBS.covariance(it)(12);
+      auto opi = stateAtBS_view[it].state()(2);
+      auto e2opi = stateAtBS_view[it].covariance()(9);
+      auto cti = stateAtBS_view[it].state()(3);
+      auto e2cti = stateAtBS_view[it].covariance()(12);
       for (auto j = i + 1; j < ntr; ++j) {
         auto jt = thisCell.tracks()[j];
         auto qj = tracks->quality(jt);
         if (qj <= reject)
           continue;
-        auto opj = tracks->stateAtBS.state(jt)(2);
-        auto ctj = tracks->stateAtBS.state(jt)(3);
-        auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti);
+        auto opj = stateAtBS_view[jt].state()(2);
+        auto ctj = stateAtBS_view[jt].state()(3);
+        auto dct = nSigma2 * (stateAtBS_view[jt].covariance()(12) + e2cti);
         if ((cti - ctj) * (cti - ctj) > dct)
           continue;
-        auto dop = nSigma2 * (tracks->stateAtBS.covariance(jt)(9) + e2opi);
+        auto dop = nSigma2 * (stateAtBS_view[jt].covariance()(9) + e2opi);
         if ((opi - opj) * (opi - opj) > dop)
           continue;
         if ((qj < qi) || (qj == qi && score(it) < score(jt)))
@@ -410,10 +413,12 @@ __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNt
   }
 }
 
-__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
-                                      TkSoA const *__restrict__ tracks,
-                                      CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts,
-                                      Quality *__restrict__ quality) {
+__global__ void kernel_classifyTracks(
+    HitContainer const *__restrict__ tuples,
+    TkSoA const *__restrict__ tracks,
+    cms::cuda::PortableDeviceCollection<TrajectoryStateSoAT_test<>>::ConstView stateAtBS_view,
+    CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts,
+    Quality *__restrict__ quality) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
   for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
     auto nhits = tuples->size(it);
@@ -433,7 +438,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
     // if the fit has any invalid parameters, mark it as bad
     bool isNaN = false;
     for (int i = 0; i < 5; ++i) {
-      isNaN |= std::isnan(tracks->stateAtBS.state(it)(i));
+      isNaN |= std::isnan(stateAtBS_view[it].state()(i));
     }
     if (isNaN) {
 #ifdef NTUPLE_DEBUG
@@ -642,11 +647,13 @@ __global__ void kernel_markSharedHit(int const *__restrict__ nshared,
 }
 
 // mostly for very forward triplets.....
-__global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks,
-                                       Quality *__restrict__ quality,
-                                       uint16_t nmin,
-                                       bool dupPassThrough,
-                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
+__global__ void kernel_rejectDuplicate(
+    TkSoA const *__restrict__ ptracks,  // TODO: Change to Constview
+    cms::cuda::PortableDeviceCollection<TrajectoryStateSoAT_test<>>::ConstView stateAtBS_view,
+    Quality *__restrict__ quality,
+    uint16_t nmin,
+    bool dupPassThrough,
+    CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
   // quality to mark rejected
   auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
 
@@ -672,22 +679,22 @@ __global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks,
       auto qi = quality[it];
       if (qi <= reject)
         continue;
-      auto opi = tracks.stateAtBS.state(it)(2);
-      auto e2opi = tracks.stateAtBS.covariance(it)(9);
-      auto cti = tracks.stateAtBS.state(it)(3);
-      auto e2cti = tracks.stateAtBS.covariance(it)(12);
+      auto opi = stateAtBS_view[it].state()(2);
+      auto e2opi = stateAtBS_view[it].covariance()(9);
+      auto cti = stateAtBS_view[it].state()(3);
+      auto e2cti = stateAtBS_view[it].covariance()(12);
       auto nli = tracks.nLayers(it);
       for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) {
         auto const jt = *jp;
         auto qj = quality[jt];
         if (qj <= reject)
           continue;
-        auto opj = tracks.stateAtBS.state(jt)(2);
-        auto ctj = tracks.stateAtBS.state(jt)(3);
-        auto dct = nSigma2 * (tracks.stateAtBS.covariance(jt)(12) + e2cti);
+        auto opj = stateAtBS_view[jt].state()(2);
+        auto ctj = stateAtBS_view[jt].state()(3);
+        auto dct = nSigma2 * (stateAtBS_view[jt].covariance()(12) + e2cti);
         if ((cti - ctj) * (cti - ctj) > dct)
           continue;
-        auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi);
+        auto dop = nSigma2 * (stateAtBS_view[jt].covariance()(9) + e2opi);
         if ((opi - opj) * (opi - opj) > dop)
           continue;
         auto nlj = tracks.nLayers(jt);

From e8d296eb1a5cf883ec05bc54391b1d416427e31c Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Mon, 10 Oct 2022 15:53:21 +0200
Subject: [PATCH 005/110] Merged TrajectoryStateSoAT into TrackSoAHeterogeneousT

---
 .../interface/TrackSoAHeterogeneousT_test.h   | 52 +++++++------
 .../Track/interface/TrajectoryStateSoAT.h     |  1 -
 .../Track/src/TrajectoryStateSoAT_test.cpp    |  1 -
 CUDADataFormats/Track/test/BuildFile.xml      | 12 ---
 .../Track/test/TrajectoryStateSOA_t.cpp       |  1 -
 .../Track/test/TrajectoryStateSOA_t.cu        |  1 -
 .../Track/test/TrajectoryStateSOA_t.h         | 75 -------------------
 .../PixelTriplets/test/BuildFile.xml          |  1 +
 8 files changed, 29 insertions(+), 115 deletions(-)
 delete mode 100644 CUDADataFormats/Track/src/TrajectoryStateSoAT_test.cpp
 delete mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp
 delete mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu
 delete mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.h

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index 1cf34f14b30a1..aa22332dd0cb1 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -4,7 +4,7 @@
 #include <string>
 #include <algorithm>
 
-#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h"
+#include <Eigen/Dense>
 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 
@@ -24,19 +24,37 @@ namespace pixelTrack {
   }
 }  // namespace pixelTrack
 
+using Vector5f = Eigen::Matrix<float, 5, 1>;
+using Vector15f = Eigen::Matrix<float, 15, 1>;
+
+using Vector5d = Eigen::Matrix<double, 5, 1>;
+using Matrix5d = Eigen::Matrix<double, 5, 5>;
+
 GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test,
                     SOA_COLUMN(uint8_t, quality),
-                    SOA_COLUMN(float, chi2), // this is chi2/ndof as not necessarely all hits are used in the fit
+                    SOA_COLUMN(float, chi2),  // this is chi2/ndof, as not necessarily all hits are used in the fit
                     SOA_COLUMN(int8_t, nLayers),
                     SOA_COLUMN(float, eta),
-                    SOA_COLUMN(float, pt))
-                    // TODO: maybe add stateAtBS
+                    SOA_COLUMN(float, pt),
+                    SOA_EIGEN_COLUMN(Vector5f, state),  // state at the beam spot: phi, tip, 1/pt, cotan(theta), zip
+                    SOA_EIGEN_COLUMN(Vector15f, covariance))  // 15 packed terms; presumably covariance of `state`
 
-template <int32_t S>
-class TrackSoAHeterogeneousT  : public cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>> {
+// Free-function replacements for the former stateAtBS accessors; `inline` because they are defined in a header.
+namespace pixelTrack {
+  namespace utilities {
+    using TrackSoAView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::ConstView;
+    // State at the beam spot is stored as (phi, tip, 1/pt, cotan(theta), zip); `i` is the track index.
+    // charge() returns +1 or -1 according to the sign of the 1/pt component (index 2).
+    inline float charge(TrackSoAView tracks, int32_t i) { return std::copysign(1.f, tracks[i].state()(2)); }
+    inline float phi(TrackSoAView tracks, int32_t i) { return tracks[i].state()(0); }
+    inline float tip(TrackSoAView tracks, int32_t i) { return tracks[i].state()(1); }
+    inline float zip(TrackSoAView tracks, int32_t i) { return tracks[i].state()(4); }
+  }  // namespace utilities
+}  // namespace pixelTrack
 
+template <int32_t S>
+class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>> {
 public:
-
   // using cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::PortableDeviceCollection;
   TrackSoAHeterogeneousT() = default;
 
@@ -49,13 +67,12 @@ class TrackSoAHeterogeneousT  : public cms::cuda::PortableDeviceCollection<Track
   // Always check quality is at least loose!
   // CUDA does not support enums  in __lgc ...
 private:
-
 public:
   constexpr Quality quality(int32_t i) const { return static_cast<Quality>(view()[i].quality()); }
   constexpr Quality &quality(int32_t i) { return static_cast<Quality &>(view()[i].quality()); }
   // TODO: static did not work; using reinterpret_cast
-  constexpr Quality const *qualityData() const { return reinterpret_cast <Quality const *>(view().quality()); }
-  constexpr Quality *qualityData() { return reinterpret_cast< Quality *>(view().quality()); }
+  constexpr Quality const *qualityData() const { return reinterpret_cast<Quality const *>(view().quality()); }
+  constexpr Quality *qualityData() { return reinterpret_cast<Quality *>(view().quality()); }
 
   constexpr float pt(int32_t i) const { return view()[i].pt(); }
   constexpr float &pt(int32_t i) { return view()[i].pt(); }
@@ -87,19 +104,6 @@ class TrackSoAHeterogeneousT  : public cms::cuda::PortableDeviceCollection<Track
     return nl;
   }
 
-  // State at the Beam spot
-  // phi,tip,1/pt,cotan(theta),zip
-  TrajectoryStateSoAT<S> stateAtBS;
-  constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); }
-  constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); }
-  constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); }
-  constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); }
-
-  // state at the detector of the outermost hit
-  // representation to be decided...
-  // not yet filled on GPU
-  // TrajectoryStateSoA<S> stateAtOuterDet;
-
   HitContainer hitIndices;
   HitContainer detIndices;
 
@@ -118,7 +122,7 @@ namespace pixelTrack {
 #endif
 
   using TrackSoA = TrackSoAHeterogeneousT<maxNumber()>;
-  using TrajectoryState = TrajectoryStateSoAT<maxNumber()>;
+
   using HitContainer = TrackSoA::HitContainer;
 
 }  // namespace pixelTrack
diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h
index 7f710ca67c7b6..23ff2ce2b1986 100644
--- a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h
+++ b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h
@@ -1,7 +1,6 @@
 #ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H
 #define CUDADataFormats_Track_TrajectoryStateSOAT_H
 
-#include <Eigen/Dense>
 #include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h"
 #include "DataFormats/SoATemplate/interface/SoALayout.h"
 #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
diff --git a/CUDADataFormats/Track/src/TrajectoryStateSoAT_test.cpp b/CUDADataFormats/Track/src/TrajectoryStateSoAT_test.cpp
deleted file mode 100644
index f6b9659331603..0000000000000
--- a/CUDADataFormats/Track/src/TrajectoryStateSoAT_test.cpp
+++ /dev/null
@@ -1 +0,0 @@
-#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h"
diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml
index fc78783db473b..985445f1e1b2a 100644
--- a/CUDADataFormats/Track/test/BuildFile.xml
+++ b/CUDADataFormats/Track/test/BuildFile.xml
@@ -5,15 +5,3 @@
   <flags CXXFLAGS="-g -DGPU_DEBUG"/>
 </bin>
 
-<bin file="TrajectoryStateSOA_t.cpp" name="cpuTrajectoryStateSOA_t">
-  <use name="eigen"/>
-  <flags CXXFLAGS="-g -DGPU_DEBUG"/>
-</bin>
-
-<iftool name="cuda-gcc-support">
-<bin file="TrajectoryStateSOA_t.cu" name="gpuTrajectoryStateSOA_t">
-  <use name="eigen"/>
-  <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
-  <flags CXXFLAGS="-g -DGPU_DEBUG"/>
-</bin>
-</iftool>
diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp
deleted file mode 100644
index d6ff539a642b0..0000000000000
--- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp
+++ /dev/null
@@ -1 +0,0 @@
-#include "TrajectoryStateSOA_t.h"
diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu
deleted file mode 100644
index d6ff539a642b0..0000000000000
--- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu
+++ /dev/null
@@ -1 +0,0 @@
-#include "TrajectoryStateSOA_t.h"
diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h
deleted file mode 100644
index 97b88873c2613..0000000000000
--- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h"
-
-using Vector5d = Eigen::Matrix<double, 5, 1>;
-using Matrix5d = Eigen::Matrix<double, 5, 5>;
-
-__host__ __device__ Matrix5d loadCov(Vector5d const& e) {
-  Matrix5d cov;
-  for (int i = 0; i < 5; ++i)
-    cov(i, i) = e(i) * e(i);
-  for (int i = 0; i < 5; ++i) {
-    for (int j = 0; j < i; ++j) {
-      double v = 0.3 * std::sqrt(cov(i, i) * cov(j, j));  // this makes the matrix pos defined
-      cov(i, j) = (i + j) % 2 ? -0.4 * v : 0.1 * v;
-      cov(j, i) = cov(i, j);
-    }
-  }
-  return cov;
-}
-
-using TS = TrajectoryStateSoAT<128>;
-
-__global__ void testTSSoA(TS* pts, int n) {
-  assert(n <= 128);
-
-  Vector5d par0;
-  par0 << 0.2, 0.1, 3.5, 0.8, 0.1;
-  Vector5d e0;
-  e0 << 0.01, 0.01, 0.035, -0.03, -0.01;
-  auto cov0 = loadCov(e0);
-
-  TS& ts = *pts;
-
-  int first = threadIdx.x + blockIdx.x * blockDim.x;
-
-  for (int i = first; i < n; i += blockDim.x * gridDim.x) {
-    ts.copyFromDense(par0, cov0, i);
-    Vector5d par1;
-    Matrix5d cov1;
-    ts.copyToDense(par1, cov1, i);
-    Vector5d delV = par1 - par0;
-    Matrix5d delM = cov1 - cov0;
-    for (int j = 0; j < 5; ++j) {
-      assert(std::abs(delV(j)) < 1.e-5);
-      for (auto k = j; k < 5; ++k) {
-        assert(cov0(k, j) == cov0(j, k));
-        assert(cov1(k, j) == cov1(j, k));
-        assert(std::abs(delM(k, j)) < 1.e-5);
-      }
-    }
-  }
-}
-
-#ifdef __CUDACC__
-#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#endif
-
-int main() {
-#ifdef __CUDACC__
-  cms::cudatest::requireDevices();
-#endif
-
-  TS ts;
-
-#ifdef __CUDACC__
-  TS* ts_d;
-  cudaCheck(cudaMalloc(&ts_d, sizeof(TS)));
-  testTSSoA<<<1, 64>>>(ts_d, 128);
-  cudaCheck(cudaGetLastError());
-  cudaCheck(cudaMemcpy(&ts, ts_d, sizeof(TS), cudaMemcpyDefault));
-  cudaCheck(cudaDeviceSynchronize());
-#else
-  testTSSoA(&ts, 128);
-#endif
-}
diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml
index d480d7408b9e2..522b186f3351b 100644
--- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml
+++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml
@@ -26,4 +26,5 @@
 <bin file="CAsizes_t.cpp">
   <use name="cuda"/>
   <use name="eigen"/>
+  <use name="boost"/>
 </bin>

From 88e7966e52d018e5b3f67851c0ad53b89b07085c Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Mon, 10 Oct 2022 16:08:38 +0200
Subject: [PATCH 006/110] Moving stuff until it breaks

---
 .../interface/TrackSoAHeterogeneousT_test.h   | 42 +++++++++++++++++++
 .../plugins/BrokenLineFitOnGPU.h              |  3 +-
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index aa22332dd0cb1..cdbb3bcba7cc1 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -49,6 +49,45 @@ namespace pixelTrack {
     float phi(TrackSoAView tracks, int32_t i) { return tracks[i].state()(0); }
     float tip(TrackSoAView tracks, int32_t i) { return tracks[i].state()(1); }
     float zip(TrackSoAView tracks, int32_t i) { return tracks[i].state()(4); }
+
+    template <typename V3, typename M3, typename V2, typename M2>
+    __host__ __device__ inline void copyFromCircle(
+        TrackSoAView tracks, V3 const &cp, M3 const &ccov, V2 const &lp, M2 const &lcov, float b, int32_t i) {
+      tracks[i].state() << cp.template cast<float>(), lp.template cast<float>();
+
+      tracks[i].state()(2) = tracks[i].state()(2) * b;
+      auto cov = tracks[i].covariance();
+      cov(0) = ccov(0, 0);
+      cov(1) = ccov(0, 1);
+      cov(2) = b * float(ccov(0, 2));
+      cov(4) = cov(3) = 0;
+      cov(5) = ccov(1, 1);
+      cov(6) = b * float(ccov(1, 2));
+      cov(8) = cov(7) = 0;
+      cov(9) = b * b * float(ccov(2, 2));
+      cov(11) = cov(10) = 0;
+      cov(12) = lcov(0, 0);
+      cov(13) = lcov(0, 1);
+      cov(14) = lcov(1, 1);
+    }
+
+    template <typename V5, typename M5>
+    __host__ __device__ inline void copyFromDense(TrackSoAView tracks, V5 const &v, M5 const &cov, int32_t i) {
+      tracks[i].state() = v.template cast<float>();
+      for (int j = 0, ind = 0; j < 5; ++j)
+        for (auto k = j; k < 5; ++k)
+          tracks[i].covariance()(ind++) = cov(j, k);
+    }
+
+    template <typename V5, typename M5>
+    __host__ __device__ inline void copyToDense(TrackSoAView tracks, V5 &v, M5 &cov, int32_t i) {
+      v = tracks[i].state().template cast<typename V5::Scalar>();
+      for (int j = 0, ind = 0; j < 5; ++j) {
+        cov(j, j) = tracks[i].covariance()(ind++);
+        for (auto k = j + 1; k < 5; ++k)
+          cov(k, j) = cov(j, k) = tracks[i].covariance()(ind++);
+      }
+    }
   }  // namespace utilities
 }  // namespace pixelTrack
 
@@ -58,6 +97,9 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
   // using cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::PortableDeviceCollection;
   TrackSoAHeterogeneousT() = default;
 
+  explicit TrackSoAHeterogeneousT(size_t maxModules, cudaStream_t stream)
+      : PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>(maxModules, stream) {}
+
   static constexpr int32_t stride() { return S; }
 
   using Quality = pixelTrack::Quality;
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
index 6ec6afb83cba1..b4d6da45e42f9 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
@@ -203,7 +203,8 @@ __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__
     brokenline::lineFit(hits_ge, fast_fit, bField, data, line);
     brokenline::circleFit(hits, hits_ge, fast_fit, bField, data, circle);
 
-    results->stateAtBS.copyFromCircle(circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid);
+    pixelTrack::utilities::copyFromCircle(
+        results, circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid);
     results->pt(tkid) = float(bField) / float(std::abs(circle.par(2)));
     results->eta(tkid) = asinhf(line.par(0));
     results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5);

From 5604d0b0024ee4442ec016fcd69a040c2b69a1fb Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Mon, 10 Oct 2022 16:10:56 +0200
Subject: [PATCH 007/110] Cleanup

---
 .../Track/interface/TrackSoAHeterogeneousT_test.h            | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index cdbb3bcba7cc1..d671c6c3f22c5 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -7,11 +7,8 @@
 #include <Eigen/Dense>
 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
-
-#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
 #include "DataFormats/SoATemplate/interface/SoALayout.h"
-
-//#include "DataFormats/Portable/interface/PortableCUDADeviceCollection.h"
+#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
 #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
 
 namespace pixelTrack {

From a68c296619c86150a486af8e90c8d1c82070e1a4 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Mon, 10 Oct 2022 16:20:42 +0200
Subject: [PATCH 008/110] Fixing some TkSoA method calls

---
 .../PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc  | 6 +++---
 .../PixelTriplets/plugins/BrokenLineFitOnGPU.h              | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
index 59ba877e9e626..12899be2c4156 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
@@ -155,7 +155,7 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
   const auto &tsoa = *iEvent.get(tokenTrack_);
 
   auto const *quality = tsoa.qualityData();
-  auto const &fit = tsoa.stateAtBS;
+  // auto const &fit = tsoa.stateAtBS;
   auto const &hitIndices = tsoa.hitIndices;
   auto nTracks = tsoa.nTracks();
 
@@ -190,11 +190,11 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
     // mind: this values are respect the beamspot!
 
     float chi2 = tsoa.chi2(it);
-    float phi = tsoa.phi(it);
+    float phi = pixelTrack::utilities::phi(tsoa.view(), it);
 
     riemannFit::Vector5d ipar, opar;
     riemannFit::Matrix5d icov, ocov;
-    fit.copyToDense(ipar, icov, it);
+    pixelTrack::utilities::copyToDense(tsoa.view(), ipar, icov, it);
     riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov);
 
     LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
index b4d6da45e42f9..c0046d2888256 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
@@ -204,7 +204,7 @@ __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__
     brokenline::circleFit(hits, hits_ge, fast_fit, bField, data, circle);
 
     pixelTrack::utilities::copyFromCircle(
-        results, circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid);
+        results->view(), circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid);
     results->pt(tkid) = float(bField) / float(std::abs(circle.par(2)));
     results->eta(tkid) = asinhf(line.par(0));
     results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5);

From 1fc040be1355d9b628dd04d6ef893266fb7c37b4 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Mon, 10 Oct 2022 17:23:50 +0200
Subject: [PATCH 009/110] Switched to View instead of ConstView

---
 CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index d671c6c3f22c5..fb1f248621ebe 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -39,7 +39,7 @@ GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test,
 // Previous TrajectoryStateSoAT class methods
 namespace pixelTrack {
   namespace utilities {
-    using TrackSoAView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::ConstView;
+    using TrackSoAView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::View;
     // State at the Beam spot
     // phi,tip,1/pt,cotan(theta),zip
     float charge(TrackSoAView tracks, int32_t i) { return std::copysign(1.f, tracks[i].state()(2)); }

From 47ec5428ffcf862484b1711650e89b5ffea9864e Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 12:03:08 +0200
Subject: [PATCH 010/110] Moving methods to static functions, using View

---
 .../interface/TrackSoAHeterogeneousT_test.h   | 36 ++++++++------
 .../plugins/CAHitNtupletGeneratorKernels.cc   |  7 ++-
 .../plugins/CAHitNtupletGeneratorKernels.h    |  4 ++
 .../CAHitNtupletGeneratorKernelsImpl.h        | 49 +++++++++++--------
 4 files changed, 56 insertions(+), 40 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index fb1f248621ebe..695931671c1a5 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -40,12 +40,27 @@ GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test,
 namespace pixelTrack {
   namespace utilities {
     using TrackSoAView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::View;
+    using TrackSoAConstView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::ConstView;
     // State at the Beam spot
     // phi,tip,1/pt,cotan(theta),zip
-    float charge(TrackSoAView tracks, int32_t i) { return std::copysign(1.f, tracks[i].state()(2)); }
-    float phi(TrackSoAView tracks, int32_t i) { return tracks[i].state()(0); }
-    float tip(TrackSoAView tracks, int32_t i) { return tracks[i].state()(1); }
-    float zip(TrackSoAView tracks, int32_t i) { return tracks[i].state()(4); }
+    float charge(TrackSoAConstView tracks, int32_t i) { return std::copysign(1.f, tracks[i].state()(2)); }
+
+    float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); }
+
+    float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); }
+
+    float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); }
+
+    float pt(TrackSoAConstView tracks, int32_t i) { return tracks[i].pt(); }
+    float &pt(TrackSoAConstView tracks, int32_t i) { return tracks[i].pt(); }
+
+    float eta(TrackSoAConstView tracks, int32_t i) { return tracks[i].eta(); }
+    float &eta(TrackSoAConstView tracks, int32_t i) { return tracks[i].eta(); }
+
+    float chi2(TrackSoAConstView tracks, int32_t i) { return tracks[i].chi2(); }
+    float &chi2(TrackSoAConstView tracks, int32_t i) { return tracks[i].chi2(); }
+
+    bool isTriplet(TrackSoAConstView tracks, int i) { return view[i].nLayers() == 3; }
 
     template <typename V3, typename M3, typename V2, typename M2>
     __host__ __device__ inline void copyFromCircle(
@@ -113,22 +128,11 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
   constexpr Quality const *qualityData() const { return reinterpret_cast<Quality const *>(view().quality()); }
   constexpr Quality *qualityData() { return reinterpret_cast<Quality *>(view().quality()); }
 
-  constexpr float pt(int32_t i) const { return view()[i].pt(); }
-  constexpr float &pt(int32_t i) { return view()[i].pt(); }
-
-  constexpr float eta(int32_t i) const { return view()[i].eta(); }
-  constexpr float &eta(int32_t i) { return view()[i].eta(); }
-
-  constexpr float chi2(int32_t i) const { return view()[i].chi2(); }
-  constexpr float &chi2(int32_t i) { return view()[i].chi2(); }
-
   constexpr int nTracks() const { return nTracks_; }
   constexpr void setNTracks(int n) { nTracks_ = n; }
 
   constexpr int nHits(int i) const { return detIndices.size(i); }
 
-  constexpr bool isTriplet(int i) const { return view()[i].nLayers() == 3; }
-
   constexpr int computeNumberOfLayers(int32_t i) const {
     // layers are in order and we assume tracks are either forward or backward
     auto pdet = detIndices.begin(i);
@@ -161,6 +165,8 @@ namespace pixelTrack {
 #endif
 
   using TrackSoA = TrackSoAHeterogeneousT<maxNumber()>;
+  using TrackSoAView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::View;
+  using TrackSoAConstView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::ConstView;
 
   using HitContainer = TrackSoA::HitContainer;
 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
index bc745817d4e4a..85c9c539593ca 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -148,10 +148,9 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
   int32_t nhits = hh.nHits();
 
   auto const *tuples_d = &tracks_d->hitIndices;
-  auto *quality_d = tracks_d->qualityData();
 
   // classify tracks based on kinematics
-  kernel_classifyTracks(tuples_d, tracks_d, tracks_d->stateAtBS.view(), params_.cuts_, quality_d);
+  kernel_classifyTracks(tuples_d, tracks_d, tracks_d->view(), params_.cuts_);
 
   if (params_.lateFishbone_) {
     // apply fishbone cleaning to good tracks
@@ -160,7 +159,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
 
   // remove duplicates (tracks that share a doublet)
   kernel_fastDuplicateRemover(
-      device_theCells_.get(), device_nCells_, tracks_d, tracks_d->stateAtBS.view(), params_.dupPassThrough_);
+      device_theCells_.get(), device_nCells_, tracks_d, tracks_d->view(), params_.dupPassThrough_);
 
   // fill hit->track "map"
   if (params_.doSharedHitCut_ || params_.doStats_) {
@@ -172,7 +171,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
   // remove duplicates (tracks that share at least one hit)
   if (params_.doSharedHitCut_) {
     kernel_rejectDuplicate(tracks_d,
-                           tracks_d->stateAtBS.view(),  // stateAtBS SoA view
+                           tracks_d->view(),  // stateAtBS SoA view
                            quality_d,
                            params_.minHitsForSharingCut_,
                            params_.dupPassThrough_,
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
index 8af1176fe92c6..fcab52e96d210 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
@@ -35,6 +35,8 @@ namespace cAHitNtupletGenerator {
 
   using Quality = pixelTrack::Quality;
   using TkSoA = pixelTrack::TrackSoA;
+  using TkSoAView = pixelTrack::TrackSoAView;
+  using TkSoAConstView = pixelTrack::TrackSoAConstView;
   using HitContainer = pixelTrack::HitContainer;
 
   struct QualityCuts {
@@ -174,6 +176,8 @@ class CAHitNtupletGeneratorKernels {
 
   using Quality = pixelTrack::Quality;
   using TkSoA = pixelTrack::TrackSoA;
+  using TkSoAView = pixelTrack::TrackSoAView;
+  using TkSoAConstView = pixelTrack::TrackSoAConstView;
   using HitContainer = pixelTrack::HitContainer;
 
   CAHitNtupletGeneratorKernels(Params const& params)
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 5806f6e6844e2..71a823ba23212 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -30,6 +30,8 @@ using TupleMultiplicity = caConstants::TupleMultiplicity;
 
 using Quality = pixelTrack::Quality;
 using TkSoA = pixelTrack::TrackSoA;
+using TkSoAView = pixelTrack::TrackSoAView;
+using TkSoAConstView = pixelTrack::TrackSoAConstView;
 using HitContainer = pixelTrack::HitContainer;
 
 namespace {
@@ -413,23 +415,26 @@ __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNt
   }
 }
 
-__global__ void kernel_classifyTracks(
-    HitContainer const *__restrict__ tuples,
-    TkSoA const *__restrict__ tracks,
-    cms::cuda::PortableDeviceCollection<TrajectoryStateSoAT_test<>>::ConstView stateAtBS_view,
-    CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts,
-    Quality *__restrict__ quality) {
+/*
+  Supply both the original TkSoA and the TkSoAView which contains
+the SoA Data
+ */
+__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
+                                      TkSoA const *__restrict__ tracks,
+                                      TkSoAView tracks_view,
+                                      CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
+
   for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
     auto nhits = tuples->size(it);
     if (nhits == 0)
       break;  // guard
 
     // if duplicate: not even fit
-    if (quality[it] == pixelTrack::Quality::edup)
+    if (tracks_view[it].quality() == pixelTrack::Quality::edup)
       continue;
 
-    assert(quality[it] == pixelTrack::Quality::bad);
+    assert(tracks_view[it].quality() == pixelTrack::Quality::bad);
 
     // mark doublets as bad
     if (nhits < 3)
@@ -438,16 +443,16 @@ __global__ void kernel_classifyTracks(
     // if the fit has any invalid parameters, mark it as bad
     bool isNaN = false;
     for (int i = 0; i < 5; ++i) {
-      isNaN |= std::isnan(stateAtBS_view[it].state()(i));
+      isNaN |= std::isnan(tracks_view[it].state()(i));
     }
     if (isNaN) {
 #ifdef NTUPLE_DEBUG
-      printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it));
+      printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), pixelTrack::utilities::chi2(tracks_view, it));
 #endif
       continue;
     }
 
-    quality[it] = pixelTrack::Quality::strict;
+    tracks[it].quality() = pixelTrack::Quality::strict;
 
     // compute a pT-dependent chi2 cut
 
@@ -473,21 +478,21 @@ __global__ void kernel_classifyTracks(
     };
 
     // (see CAHitNtupletGeneratorGPU.cc)
-    float pt = std::min<float>(tracks->pt(it), cuts.chi2MaxPt);
+    float pt = std::min<float>(pixelTrack::utilities::pt(tracks_view, it), cuts.chi2MaxPt);
     float chi2Cut = cuts.chi2Scale * (cuts.chi2Coeff[0] + roughLog(pt) * cuts.chi2Coeff[1]);
-    if (tracks->chi2(it) >= chi2Cut) {
+    if (pixelTrack::utilities::chi2(tracks_view, it) >= chi2Cut) {
 #ifdef NTUPLE_FIT_DEBUG
       printf("Bad chi2 %d size %d pt %f eta %f chi2 %f\n",
              it,
              tuples->size(it),
-             tracks->pt(it),
-             tracks->eta(it),
-             tracks->chi2(it));
+             pixelTrack::utilities::pt(tracks_view, it),
+             pixelTrack::utilities::eta(tracks_view, it),
+             pixelTrack::utilities::chi2(tracks_view, it));
 #endif
       continue;
     }
 
-    quality[it] = pixelTrack::Quality::tight;
+    tracks[it].quality() = pixelTrack::Quality::tight;
 
     // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip)
     // default cuts:
@@ -495,11 +500,13 @@ __global__ void kernel_classifyTracks(
     //   - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm
     // (see CAHitNtupletGeneratorGPU.cc)
     auto const &region = (nhits > 3) ? cuts.quadruplet : cuts.triplet;
-    bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and
-                (std::abs(tracks->zip(it)) < region.maxZip);
+    bool isOk = (std::abs(pixelTrack::utilities::tip(tracks_view, it)) < region.maxTip) and
+                (pixelTrack::utilities::pt(tracks_view, it) > region.minPt) and
+                (std::abs(pixelTrack::utilities::zip(tracks_view, it)) < region.maxZip);
 
-    if (isOk)
-      quality[it] = pixelTrack::Quality::highPurity;
+    if (isOk) {
+      tracks[it].quality() = pixelTrack::Quality::highPurity;
+    }
   }
 }
 

From 61fbb0efed60a69f4795d0eb1846bdd1eca44ba5 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 12:30:03 +0200
Subject: [PATCH 011/110] working on Broken line

---
 .../Track/interface/TrackSoAHeterogeneousT_test.h  | 14 +++++++-------
 .../PixelTriplets/plugins/BrokenLineFitOnGPU.h     | 14 ++++++++------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index 695931671c1a5..d67eef16e9927 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -51,16 +51,16 @@ namespace pixelTrack {
 
     float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); }
 
-    float pt(TrackSoAConstView tracks, int32_t i) { return tracks[i].pt(); }
-    float &pt(TrackSoAConstView tracks, int32_t i) { return tracks[i].pt(); }
+    // float pt(TrackSoAConstView tracks, int32_t i) { return tracks[i].pt(); }
+    // // float &pt(TrackSoAConstView tracks, int32_t i) { return tracks[i].pt(); }
 
-    float eta(TrackSoAConstView tracks, int32_t i) { return tracks[i].eta(); }
-    float &eta(TrackSoAConstView tracks, int32_t i) { return tracks[i].eta(); }
+    // float eta(TrackSoAConstView tracks, int32_t i) { return tracks[i].eta(); }
+    // // float &eta(TrackSoAConstView tracks, int32_t i) { return tracks[i].eta(); }
 
-    float chi2(TrackSoAConstView tracks, int32_t i) { return tracks[i].chi2(); }
-    float &chi2(TrackSoAConstView tracks, int32_t i) { return tracks[i].chi2(); }
+    // float chi2(TrackSoAConstView tracks, int32_t i) { return tracks[i].chi2(); }
+    // float &chi2(TrackSoAConstView tracks, int32_t i) { return tracks[i].chi2(); }
 
-    bool isTriplet(TrackSoAConstView tracks, int i) { return view[i].nLayers() == 3; }
+    bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; }
 
     template <typename V3, typename M3, typename V2, typename M2>
     __host__ __device__ inline void copyFromCircle(
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
index c0046d2888256..eda536640af5a 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
@@ -8,6 +8,7 @@
 
 #include <cuda_runtime.h>
 
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
@@ -19,6 +20,7 @@
 using HitsOnGPU = TrackingRecHit2DSOAView;
 using Tuples = pixelTrack::HitContainer;
 using OutputSoA = pixelTrack::TrackSoA;
+using OutputSoAView = pixelTrack::TrackSoAView;
 using tindex_type = caConstants::tindex_type;
 constexpr auto invalidTkId = std::numeric_limits<tindex_type>::max();
 
@@ -169,12 +171,12 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets,
 template <int N>
 __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity,
                              double bField,
-                             OutputSoA *results,
+                             OutputSoAView results_view,
                              tindex_type const *__restrict__ ptkids,
                              double *__restrict__ phits,
                              float *__restrict__ phits_ge,
                              double *__restrict__ pfast_fit) {
-  assert(results);
+  // assert(results_view); // Need to be replaced with something that works
   assert(pfast_fit);
 
   // same as above...
@@ -204,10 +206,10 @@ __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__
     brokenline::circleFit(hits, hits_ge, fast_fit, bField, data, circle);
 
     pixelTrack::utilities::copyFromCircle(
-        results->view(), circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid);
-    results->pt(tkid) = float(bField) / float(std::abs(circle.par(2)));
-    results->eta(tkid) = asinhf(line.par(0));
-    results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5);
+        results_view, circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid);
+    results_view[tkid].pt() = float(bField) / float(std::abs(circle.par(2)));
+    results_view[tkid].eta() = asinhf(line.par(0));
+    results_view[tkid].chi2() = (circle.chi2 + line.chi2) / (2 * N - 5);
 
 #ifdef BROKENLINE_DEBUG
     if (!(circle.chi2 >= 0) || !(line.chi2 >= 0))

From 78a500cb4d57db7124bd3fa20a7d424b7d19cca6 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 12:41:38 +0200
Subject: [PATCH 012/110] Working on HelixFit

---
 .../PixelTriplets/plugins/BrokenLineFitOnGPU.h              | 3 +--
 RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc   | 4 ++--
 RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h    | 6 +++---
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
index eda536640af5a..aefde7ac602b1 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
@@ -19,7 +19,6 @@
 
 using HitsOnGPU = TrackingRecHit2DSOAView;
 using Tuples = pixelTrack::HitContainer;
-using OutputSoA = pixelTrack::TrackSoA;
 using OutputSoAView = pixelTrack::TrackSoAView;
 using tindex_type = caConstants::tindex_type;
 constexpr auto invalidTkId = std::numeric_limits<tindex_type>::max();
@@ -176,7 +175,7 @@ __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__
                              double *__restrict__ phits,
                              float *__restrict__ phits_ge,
                              double *__restrict__ pfast_fit) {
-  // assert(results_view); // Need to be replaced with something that works
+  // assert(results_view); // TODO Find equivalent assertion for View
   assert(pfast_fit);
 
   // same as above...
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc
index 880bdb47dfb5c..624934645338b 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc
@@ -3,14 +3,14 @@
 
 void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples,
                                   TupleMultiplicity const *tupleMultiplicity,
-                                  OutputSoA *helix_fit_results) {
+                                  OutputSoAView helix_fit_results) {
   tuples_ = tuples;
   tupleMultiplicity_ = tupleMultiplicity;
   outputSoa_ = helix_fit_results;
 
   assert(tuples_);
   assert(tupleMultiplicity_);
-  assert(outputSoa_);
+  // assert(outputSoa_); // TODO find equivalent assertion for View
 }
 
 void HelixFitOnGPU::deallocateOnGPU() {}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
index 9a9c85970af33..031325e2e13d9 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
@@ -36,7 +36,7 @@ class HelixFitOnGPU {
   using HitsView = TrackingRecHit2DSOAView;
 
   using Tuples = pixelTrack::HitContainer;
-  using OutputSoA = pixelTrack::TrackSoA;
+  using OutputSoAView = pixelTrack::TrackSoAView;
 
   using TupleMultiplicity = caConstants::TupleMultiplicity;
 
@@ -47,7 +47,7 @@ class HelixFitOnGPU {
   void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream);
   void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream);
 
-  void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
+  void launchiRemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
   void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
 
   void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA);
@@ -59,7 +59,7 @@ class HelixFitOnGPU {
   // fowarded
   Tuples const *tuples_ = nullptr;
   TupleMultiplicity const *tupleMultiplicity_ = nullptr;
-  OutputSoA *outputSoa_;
+  OutputSoAView outputSoa_;
   float bField_;
 
   const bool fitNas4_;

From bc3f4ae825c825f370afa86cb69197908b35dd65 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 12:56:26 +0200
Subject: [PATCH 013/110] Replacing methods with utilities

---
 .../CAHitNtupletGeneratorKernelsImpl.h        | 38 +++++++++----------
 .../PixelTriplets/plugins/HelixFitOnGPU.h     |  2 +-
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 71a823ba23212..58b371932b933 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -162,7 +162,7 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
 
     // find maxNl
     for (auto it : thisCell.tracks()) {
-      auto nl = tracks.nLayers(it);
+      auto nl = tracks[it].nLayers();
       maxNl = std::max(nl, maxNl);
     }
 
@@ -171,7 +171,7 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
     //  maxNl = std::min(4, maxNl);
 
     for (auto it : thisCell.tracks()) {
-      if (tracks.nLayers(it) < maxNl)
+      if (tracks[it].nLayers() < maxNl)
         quality[it] = reject;  //no race:  simple assignment of the same constant
     }
   }
@@ -201,13 +201,13 @@ __global__ void kernel_fastDuplicateRemover(
 
     /* chi2 penalize higher-pt tracks  (try rescale it?)
     auto score = [&](auto it) {
-      return tracks->nLayers(it) < 4 ? 
-              std::abs(tracks->tip(it)) :  // tip for triplets
-              tracks->chi2(it);            //chi2 for quads
+      return tracks[it].nLayers() < 4 ? 
+              std::abs(pixelTrack::utilities::tip(tracks,it)) :  // tip for triplets
+              tracks[it].chi2(it);            //chi2 for quads
     };
     */
 
-    auto score = [&](auto it) { return std::abs(tracks->tip(it)); };
+    auto score = [&](auto it) { return std::abs(pixelTrack::utilities::tip(tracks, it)); };
 
     // full crazy combinatorics
     int ntr = thisCell.tracks().size();
@@ -577,7 +577,7 @@ __global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::Atomi
   for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) {
     auto nHits = tracks.nHits(idx);
     assert(nHits >= 3);
-    tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx);
+    tracks[idx].nLayers() = tracks.computeNumberOfLayers(idx);
   }
 }
 
@@ -690,7 +690,7 @@ __global__ void kernel_rejectDuplicate(
       auto e2opi = stateAtBS_view[it].covariance()(9);
       auto cti = stateAtBS_view[it].state()(3);
       auto e2cti = stateAtBS_view[it].covariance()(12);
-      auto nli = tracks.nLayers(it);
+      auto nli = tracks[it].nLayers();
       for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) {
         auto const jt = *jp;
         auto qj = quality[jt];
@@ -704,7 +704,7 @@ __global__ void kernel_rejectDuplicate(
         auto dop = nSigma2 * (stateAtBS_view[jt].covariance()(9) + e2opi);
         if ((opi - opj) * (opi - opj) > dop)
           continue;
-        auto nlj = tracks.nLayers(jt);
+        auto nlj = tracks[jt].nLayers();
         if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj)))))
           quality[jt] = reject;
         else {
@@ -745,7 +745,7 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric
       if (quality[*it] < longTqual)
         continue;
       // if (tracks.nHits(*it)==3) continue;
-      auto nl = tracks.nLayers(*it);
+      auto nl = tracks[*it].nLayers();
       maxNl = std::max(nl, maxNl);
     }
 
@@ -757,7 +757,7 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric
 
     // kill all tracks shorter than maxHl (only triplets???
     for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
-      auto nl = tracks.nLayers(*it);
+      auto nl = tracks[*it].nLayers();
 
       //checking if shared hit is on bpix1 and if the tuple is short enough
       if (idx < l1end and nl > nmin)
@@ -893,15 +893,15 @@ __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__res
            10000 * iev + i,
            int(quality[i]),
            nh,
-           tracks.nLayers(i),
-           tracks.charge(i),
-           tracks.pt(i),
-           tracks.eta(i),
-           tracks.phi(i),
-           tracks.tip(i),
-           tracks.zip(i),
+           tracks[i].nLayers(),
+           pixelTrack::utilities::charge(tracks, i),
+           tracks[i].pt(),
+           tracks[i].eta(),
+           pixelTrack::utilities::phi(tracks, i),
+           pixelTrack::utilities::tip(tracks, i),
+           pixelTrack::utilities::zip(tracks, i),
            //           asinhf(fit_results[i].par(3)),
-           tracks.chi2(i),
+           tracks[i].chi2(),
            hh.zGlobal(*foundNtuplets.begin(i)),
            hh.zGlobal(*(foundNtuplets.begin(i) + 1)),
            hh.zGlobal(*(foundNtuplets.begin(i) + 2)),
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
index 031325e2e13d9..d47e4c5f8ece9 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
@@ -50,7 +50,7 @@ class HelixFitOnGPU {
   void launchiRemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
   void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
 
-  void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA);
+  void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoAView *outputSoA);
   void deallocateOnGPU();
 
 private:

From 4e7d9fa4acddebeac6793edecde5df50e8d76ad8 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 13:06:59 +0200
Subject: [PATCH 014/110] rejectDuplicate kernel

---
 .../plugins/CAHitNtupletGeneratorKernels.cc   | 11 +--
 .../CAHitNtupletGeneratorKernelsImpl.h        | 92 +++++++++----------
 2 files changed, 46 insertions(+), 57 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
index 85c9c539593ca..1271f3f6dcd1a 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -158,8 +158,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
   }
 
   // remove duplicates (tracks that share a doublet)
-  kernel_fastDuplicateRemover(
-      device_theCells_.get(), device_nCells_, tracks_d, tracks_d->view(), params_.dupPassThrough_);
+  kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_);
 
   // fill hit->track "map"
   if (params_.doSharedHitCut_ || params_.doStats_) {
@@ -170,12 +169,8 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
 
   // remove duplicates (tracks that share at least one hit)
   if (params_.doSharedHitCut_) {
-    kernel_rejectDuplicate(tracks_d,
-                           tracks_d->view(),  // stateAtBS SoA view
-                           quality_d,
-                           params_.minHitsForSharingCut_,
-                           params_.dupPassThrough_,
-                           device_hitToTuple_.get());
+    kernel_rejectDuplicate(
+        tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
 
     kernel_sharedHitCleaner(hh.view(),
                             tracks_d,
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 58b371932b933..343610993e674 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -178,12 +178,10 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
 }
 
 // assume the above (so, short tracks already removed)
-__global__ void kernel_fastDuplicateRemover(
-    GPUCACell const *__restrict__ cells,
-    uint32_t const *__restrict__ nCells,
-    TkSoA *__restrict__ tracks,
-    cms::cuda::PortableDeviceCollection<TrajectoryStateSoAT_test<>>::ConstView stateAtBS_view,
-    bool dupPassThrough) {
+__global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,
+                                            uint32_t const *__restrict__ nCells,
+                                            TkSoAView tracks_view,
+                                            bool dupPassThrough) {
   // quality to mark rejected
   auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
   constexpr auto loose = pixelTrack::Quality::loose;
@@ -201,42 +199,42 @@ __global__ void kernel_fastDuplicateRemover(
 
     /* chi2 penalize higher-pt tracks  (try rescale it?)
     auto score = [&](auto it) {
-      return tracks[it].nLayers() < 4 ? 
-              std::abs(pixelTrack::utilities::tip(tracks,it)) :  // tip for triplets
-              tracks[it].chi2(it);            //chi2 for quads
+      return tracks_view[it].nLayers() < 4 ? 
+              std::abs(pixelTrack::utilities::tip(tracks_view, it)) :  // tip for triplets
+              tracks_view[it].chi2(it);            //chi2 for quads
     };
     */
 
-    auto score = [&](auto it) { return std::abs(pixelTrack::utilities::tip(tracks, it)); };
+    auto score = [&](auto it) { return std::abs(pixelTrack::utilities::tip(tracks_view, it)); };
 
     // full crazy combinatorics
     int ntr = thisCell.tracks().size();
     for (int i = 0; i < ntr - 1; ++i) {
       auto it = thisCell.tracks()[i];
-      auto qi = tracks->quality(it);
+      auto qi = tracks_view[it].quality();
       if (qi <= reject)
         continue;
-      auto opi = stateAtBS_view[it].state()(2);
-      auto e2opi = stateAtBS_view[it].covariance()(9);
-      auto cti = stateAtBS_view[it].state()(3);
-      auto e2cti = stateAtBS_view[it].covariance()(12);
+      auto opi = tracks_view[it].state()(2);
+      auto e2opi = tracks_view[it].covariance()(9);
+      auto cti = tracks_view[it].state()(3);
+      auto e2cti = tracks_view[it].covariance()(12);
       for (auto j = i + 1; j < ntr; ++j) {
         auto jt = thisCell.tracks()[j];
-        auto qj = tracks->quality(jt);
+        auto qj = tracks_view[jt].quality();
         if (qj <= reject)
           continue;
-        auto opj = stateAtBS_view[jt].state()(2);
-        auto ctj = stateAtBS_view[jt].state()(3);
-        auto dct = nSigma2 * (stateAtBS_view[jt].covariance()(12) + e2cti);
+        auto opj = tracks_view[jt].state()(2);
+        auto ctj = tracks_view[jt].state()(3);
+        auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti);
         if ((cti - ctj) * (cti - ctj) > dct)
           continue;
-        auto dop = nSigma2 * (stateAtBS_view[jt].covariance()(9) + e2opi);
+        auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi);
         if ((opi - opj) * (opi - opj) > dop)
           continue;
         if ((qj < qi) || (qj == qi && score(it) < score(jt)))
-          tracks->quality(jt) = reject;
+          tracks_view[jt].quality() = reject;
         else {
-          tracks->quality(it) = reject;
+          tracks_view[it].quality() = reject;
           break;
         }
       }
@@ -245,8 +243,8 @@ __global__ void kernel_fastDuplicateRemover(
     // find maxQual
     auto maxQual = reject;  // no duplicate!
     for (auto it : thisCell.tracks()) {
-      if (tracks->quality(it) > maxQual)
-        maxQual = tracks->quality(it);
+      if (tracks_view[it].quality() > maxQual)
+        maxQual = tracks_view[it].quality();
     }
 
     if (maxQual <= loose)
@@ -254,7 +252,7 @@ __global__ void kernel_fastDuplicateRemover(
 
     // find min score
     for (auto it : thisCell.tracks()) {
-      if (tracks->quality(it) == maxQual && score(it) < mc) {
+      if (tracks_view[it].quality() == maxQual && score(it) < mc) {
         mc = score(it);
         im = it;
       }
@@ -265,8 +263,8 @@ __global__ void kernel_fastDuplicateRemover(
 
     // mark all other duplicates  (not yet, keep it loose)
     for (auto it : thisCell.tracks()) {
-      if (tracks->quality(it) > loose && it != im)
-        tracks->quality(it) = loose;  //no race:  simple assignment of the same constant
+      if (tracks_view[it].quality() > loose && it != im)
+        tracks_view[it].quality() = loose;  //no race:  simple assignment of the same constant
     }
   }
 }
@@ -654,18 +652,14 @@ __global__ void kernel_markSharedHit(int const *__restrict__ nshared,
 }
 
 // mostly for very forward triplets.....
-__global__ void kernel_rejectDuplicate(
-    TkSoA const *__restrict__ ptracks,  // TODO: Change to Constview
-    cms::cuda::PortableDeviceCollection<TrajectoryStateSoAT_test<>>::ConstView stateAtBS_view,
-    Quality *__restrict__ quality,
-    uint16_t nmin,
-    bool dupPassThrough,
-    CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
+__global__ void kernel_rejectDuplicate(TkSoAView tracks_view,
+                                       uint16_t nmin,
+                                       bool dupPassThrough,
+                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
   // quality to mark rejected
   auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
 
   auto &hitToTuple = *phitToTuple;
-  auto const &tracks = *ptracks;
 
   int first = blockDim.x * blockIdx.x + threadIdx.x;
   for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
@@ -678,37 +672,37 @@ __global__ void kernel_rejectDuplicate(
                  tracks.chi2(it);                 //chi2
     };
     */
-    auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); };
+    auto score = [&](auto it, auto nl) { return std::abs(pixelTrack::utilities::tip(tracks_view, it)); };
 
     // full combinatorics
     for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) {
       auto const it = *ip;
-      auto qi = quality[it];
+      auto qi = tracks_view[it].quality();
       if (qi <= reject)
         continue;
-      auto opi = stateAtBS_view[it].state()(2);
-      auto e2opi = stateAtBS_view[it].covariance()(9);
-      auto cti = stateAtBS_view[it].state()(3);
-      auto e2cti = stateAtBS_view[it].covariance()(12);
+      auto opi = tracks_view[it].state()(2);
+      auto e2opi = tracks_view[it].covariance()(9);
+      auto cti = tracks_view[it].state()(3);
+      auto e2cti = tracks_view[it].covariance()(12);
       auto nli = tracks[it].nLayers();
       for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) {
         auto const jt = *jp;
-        auto qj = quality[jt];
+        auto qj = tracks_view[jt].quality();
         if (qj <= reject)
           continue;
-        auto opj = stateAtBS_view[jt].state()(2);
-        auto ctj = stateAtBS_view[jt].state()(3);
-        auto dct = nSigma2 * (stateAtBS_view[jt].covariance()(12) + e2cti);
+        auto opj = tracks_view[jt].state()(2);
+        auto ctj = tracks_view[jt].state()(3);
+        auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti);
         if ((cti - ctj) * (cti - ctj) > dct)
           continue;
-        auto dop = nSigma2 * (stateAtBS_view[jt].covariance()(9) + e2opi);
+        auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi);
         if ((opi - opj) * (opi - opj) > dop)
           continue;
-        auto nlj = tracks[jt].nLayers();
+        auto nlj = tracks_view[jt].nLayers();
         if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj)))))
-          quality[jt] = reject;
+          tracks_view[jt].quality() = reject;
         else {
-          quality[it] = reject;
+          tracks_view[it].quality() = reject;
           break;
         }
       }

From 6a489bfa37689e6a993469b0943ff2bb1424b01a Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 13:08:59 +0200
Subject: [PATCH 015/110] sharedHitCleaner kernel

---
 .../plugins/CAHitNtupletGeneratorKernelsImpl.h     | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 343610993e674..a2bedc3b46ebc 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -711,8 +711,7 @@ __global__ void kernel_rejectDuplicate(TkSoAView tracks_view,
 }
 
 __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp,
-                                        TkSoA const *__restrict__ ptracks,
-                                        Quality *__restrict__ quality,
+                                        TkSoAView tracks_view,
                                         int nmin,
                                         bool dupPassThrough,
                                         CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
@@ -722,7 +721,6 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric
   auto const longTqual = pixelTrack::Quality::highPurity;
 
   auto &hitToTuple = *phitToTuple;
-  auto const &tracks = *ptracks;
 
   auto const &hh = *hhp;
   int l1end = hh.hitsLayerStart()[1];
@@ -736,10 +734,10 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric
 
     // find maxNl
     for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
-      if (quality[*it] < longTqual)
+      if (tracks_view[*it].quality() < longTqual)
         continue;
       // if (tracks.nHits(*it)==3) continue;
-      auto nl = tracks[*it].nLayers();
+      auto nl = tracks_view[*it].nLayers();
       maxNl = std::max(nl, maxNl);
     }
 
@@ -751,14 +749,14 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric
 
     // kill all tracks shorter than maxHl (only triplets???
     for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
-      auto nl = tracks[*it].nLayers();
+      auto nl = tracks_view[*it].nLayers();
 
       //checking if shared hit is on bpix1 and if the tuple is short enough
       if (idx < l1end and nl > nmin)
         continue;
 
-      if (nl < maxNl && quality[*it] > reject)
-        quality[*it] = reject;
+      if (nl < maxNl && tracks_view[*it].quality() > reject)
+        tracks_view[*it].quality() = reject;
     }
   }
 }

From bc0392de78e657d5ba86e3c32d439258bb9ffea1 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 14:02:24 +0200
Subject: [PATCH 016/110] Triplet cleaner

---
 .../plugins/CAHitNtupletGeneratorKernels.cc      |  4 ++--
 .../plugins/CAHitNtupletGeneratorKernelsImpl.h   | 16 +++++++---------
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
index 1271f3f6dcd1a..2b950caec5c6f 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -183,7 +183,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
           tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
     } else {
       kernel_tripletCleaner(
-          tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+          tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
     }
   }
 
@@ -216,7 +216,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
   {
     std::lock_guard<std::mutex> guard(lock);
     ++iev;
-    kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 0, 1000000, iev);
+    kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d->view(), device_hitToTuple_.get(), 0, 1000000, iev);
   }
 #endif
 }
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index a2bedc3b46ebc..c6961dd11d571 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -761,8 +761,7 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric
   }
 }
 
-__global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks,
-                                      Quality *__restrict__ quality,
+__global__ void kernel_tripletCleaner(TkSoAView tracks_view,
                                       uint16_t nmin,
                                       bool dupPassThrough,
                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
@@ -772,7 +771,6 @@ __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks,
   auto const good = pixelTrack::Quality::strict;
 
   auto &hitToTuple = *phitToTuple;
-  auto const &tracks = *ptracks;
 
   int first = blockDim.x * blockIdx.x + threadIdx.x;
   for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
@@ -785,9 +783,9 @@ __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks,
 
     // check if only triplets
     for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
-      if (quality[*it] <= good)
+      if (track_view[*it].quality() <= good)
         continue;
-      onlyTriplets &= tracks.isTriplet(*it);
+      onlyTriplets &= pixelTrack::utilities::isTriplet(tracks_view, *it);
       if (!onlyTriplets)
         break;
     }
@@ -799,8 +797,8 @@ __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks,
     // for triplets choose best tip!  (should we first find best quality???)
     for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
       auto const it = *ip;
-      if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) {
-        mc = std::abs(tracks.tip(it));
+      if (tracks_view[it].quality() >= good && std::abs(pixelTrack::utilities::tip(tracks_view, it)) < mc) {
+        mc = std::abs(pixelTrack::utilities::tip(tracks_view, it));
         im = it;
       }
     }
@@ -811,8 +809,8 @@ __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks,
     // mark worse ambiguities
     for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
       auto const it = *ip;
-      if (quality[it] > reject && it != im)
-        quality[it] = reject;  //no race:  simple assignment of the same constant
+      if (tracks_view[it].quality() > reject && it != im)
+        tracks_view[it].quality() = reject;  //no race:  simple assignment of the same constant
     }
 
   }  // loop over hits

From 13e04d709759ad2ed8512e3e372d646034cb2202 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 14:04:27 +0200
Subject: [PATCH 017/110] simpleTripletCleaner

---
 .../plugins/CAHitNtupletGeneratorKernelsImpl.h       | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index c6961dd11d571..938f29101c781 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -817,8 +817,7 @@ __global__ void kernel_tripletCleaner(TkSoAView tracks_view,
 }
 
 __global__ void kernel_simpleTripletCleaner(
-    TkSoA const *__restrict__ ptracks,
-    Quality *__restrict__ quality,
+    TkSoAView tracks_view,
     uint16_t nmin,
     bool dupPassThrough,
     CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
@@ -828,7 +827,6 @@ __global__ void kernel_simpleTripletCleaner(
   auto const good = pixelTrack::Quality::loose;
 
   auto &hitToTuple = *phitToTuple;
-  auto const &tracks = *ptracks;
 
   int first = blockDim.x * blockIdx.x + threadIdx.x;
   for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
@@ -841,8 +839,8 @@ __global__ void kernel_simpleTripletCleaner(
     // choose best tip!  (should we first find best quality???)
     for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
       auto const it = *ip;
-      if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) {
-        mc = std::abs(tracks.tip(it));
+      if (tracks_view[it].quality() >= good && std::abs(pixelTrack::utilities::tip(tracks_view, it)) < mc) {
+        mc = std::abs(pixelTrack::utilities::tip(tracks_view, it));
         im = it;
       }
     }
@@ -853,8 +851,8 @@ __global__ void kernel_simpleTripletCleaner(
     // mark worse ambiguities
     for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
       auto const it = *ip;
-      if (quality[it] > reject && tracks.isTriplet(it) && it != im)
-        quality[it] = reject;  //no race:  simple assignment of the same constant
+      if (tracks_view[it].quality() > reject && pixelTracks::utilities::isTriplet(tracks_view, it) && it != im)
+        tracks_view[it].quality() = reject;  //no race:  simple assignment of the same constant
     }
 
   }  // loop over hits

From 9833fb637fa61ee10c39fba319538da97df2dcd3 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 14:10:45 +0200
Subject: [PATCH 018/110] fillNLayers

---
 .../Track/interface/TrackSoAHeterogeneousT_test.h    |  3 ++-
 .../plugins/CAHitNtupletGeneratorKernels.cc          | 12 ++++--------
 .../plugins/CAHitNtupletGeneratorKernelsImpl.h       |  6 ++++--
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index d67eef16e9927..f4075b0b385d7 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -34,7 +34,8 @@ GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test,
                     SOA_COLUMN(float, eta),
                     SOA_COLUMN(float, pt),
                     SOA_EIGEN_COLUMN(Vector5f, state),
-                    SOA_EIGEN_COLUMN(Vector15f, covariance))
+                    SOA_EIGEN_COLUMN(Vector15f, covariance),
+                    SOA_SCALAR(int, nTracks))
 
 // Previous TrajectoryStateSoAT class methods
 namespace pixelTrack {
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
index 2b950caec5c6f..4143701fb9cc2 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -129,7 +129,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d);
 
   kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d);
-  kernel_fillNLayers(tracks_d, device_hitTuple_apc_);
+  kernel_fillNLayers(tracks_d, tracks_d->view(), device_hitTuple_apc_);
 
   // remove duplicates (tracks that share a doublet)
   kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_);
@@ -172,15 +172,11 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
     kernel_rejectDuplicate(
         tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
 
-    kernel_sharedHitCleaner(hh.view(),
-                            tracks_d,
-                            quality_d,
-                            params_.minHitsForSharingCut_,
-                            params_.dupPassThrough_,
-                            device_hitToTuple_.get());
+    kernel_sharedHitCleaner(
+        hh.view(), tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
     if (params_.useSimpleTripletCleaner_) {
       kernel_simpleTripletCleaner(
-          tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+          tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
     } else {
       kernel_tripletCleaner(
           tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 938f29101c781..503f4847f78a3 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -565,13 +565,15 @@ __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples
   }
 }
 
-__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::AtomicPairCounter *apc) {
+__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks,
+                                   TkSoAView tracks_view,
+                                   cms::cuda::AtomicPairCounter *apc) {
   auto &tracks = *ptracks;
   auto first = blockIdx.x * blockDim.x + threadIdx.x;
   // clamp the number of tracks to the capacity of the SoA
   auto ntracks = std::min<int>(apc->get().m, tracks.stride() - 1);
   if (0 == first)
-    tracks.setNTracks(ntracks);
+    tracks_view.nTracks() = ntracks;
   for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) {
     auto nHits = tracks.nHits(idx);
     assert(nHits >= 3);

From a683d86cee7b0e1bd8c7945881c89daefc9c5c9b Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 14:14:14 +0200
Subject: [PATCH 019/110] earlyDuplicateRemover

---
 .../PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc | 3 ++-
 .../plugins/CAHitNtupletGeneratorKernelsImpl.h            | 8 +++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
index 4143701fb9cc2..2bbf9edab4e99 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -132,7 +132,8 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   kernel_fillNLayers(tracks_d, tracks_d->view(), device_hitTuple_apc_);
 
   // remove duplicates (tracks that share a doublet)
-  kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_);
+  kernel_earlyDuplicateRemover(
+      device_theCells_.get(), device_nCells_, tracks_d->view(), quality_d, params_.dupPassThrough_);
 
   kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());
   cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 503f4847f78a3..2f83a18c6127d 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -142,14 +142,12 @@ __global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *_
 // It does not seem to affect efficiency in any way!
 __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
                                              uint32_t const *__restrict__ nCells,
-                                             TkSoA const *__restrict__ ptracks,
+                                             TkSoAConstView tracks_view,
                                              Quality *quality,
                                              bool dupPassThrough) {
   // quality to mark rejected
   constexpr auto reject = pixelTrack::Quality::edup;  /// cannot be loose
 
-  auto const &tracks = *ptracks;
-
   assert(nCells);
   auto first = threadIdx.x + blockIdx.x * blockDim.x;
   for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
@@ -162,7 +160,7 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
 
     // find maxNl
     for (auto it : thisCell.tracks()) {
-      auto nl = tracks[it].nLayers();
+      auto nl = tracks_view[it].nLayers();
       maxNl = std::max(nl, maxNl);
     }
 
@@ -171,7 +169,7 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
     //  maxNl = std::min(4, maxNl);
 
     for (auto it : thisCell.tracks()) {
-      if (tracks[it].nLayers() < maxNl)
+      if (tracks_view[it].nLayers() < maxNl)
         quality[it] = reject;  //no race:  simple assignment of the same constant
     }
   }

From 293151b3db77330a3e0d9e10817573aa5a206ab3 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 14:15:08 +0200
Subject: [PATCH 020/110] earlyDuplicateRemover

---
 .../plugins/#CAHitNtupletGeneratorKernels.cc# | 219 ++++++++++++++++++
 .../CAHitNtupletGeneratorKernelsImpl.h        |   5 +-
 2 files changed, 221 insertions(+), 3 deletions(-)
 create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/#CAHitNtupletGeneratorKernels.cc#

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/#CAHitNtupletGeneratorKernels.cc# b/RecoPixelVertexing/PixelTriplets/plugins/#CAHitNtupletGeneratorKernels.cc#
new file mode 100644
index 0000000000000..9a8cc40d24985
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/#CAHitNtupletGeneratorKernels.cc#
@@ -0,0 +1,219 @@
+#include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h"
+
+#include <mutex>
+
+namespace {
+  // cuda atomics are NOT atomics on CPU so protect stat update with a mutex
+  // waiting for a more general solution (incuding multiple devices) to be proposed and implemented
+  std::mutex lock_stat;
+}  // namespace
+
+template <>
+void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) {
+  kernel_printCounters(counters);
+}
+
+template <>
+void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) {
+  auto nhits = hh.nHits();
+
+#ifdef NTUPLE_DEBUG
+  std::cout << "building Doublets out of " << nhits << " Hits. BPIX2 offset is " << hh.offsetBPIX2() << std::endl;
+#endif
+
+  // use "nhits" to heuristically dimension the workspace
+
+  // no need to use the Traits allocations, since we know this is being compiled for the CPU
+  //device_isOuterHitOfCell_ = Traits::template make_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits), stream);
+  device_isOuterHitOfCell_ = std::make_unique<GPUCACell::OuterHitOfCellContainer[]>(std::max(1U, nhits));
+  assert(device_isOuterHitOfCell_.get());
+  isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()};
+
+  auto cellStorageSize = caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) +
+                         caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks);
+  // no need to use the Traits allocations, since we know this is being compiled for the CPU
+  //cellStorage_ = Traits::template make_unique<unsigned char[]>(cellStorageSize, stream);
+  cellStorage_ = std::make_unique<unsigned char[]>(cellStorageSize);
+  device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get();
+  device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets *
+                                                                                      sizeof(GPUCACell::CellNeighbors));
+
+  gpuPixelDoublets::initDoublets(isOuterHitOfCell_,
+                                 nhits,
+                                 device_theCellNeighbors_.get(),
+                                 device_theCellNeighborsContainer_,
+                                 device_theCellTracks_.get(),
+                                 device_theCellTracksContainer_);
+
+  // no need to use the Traits allocations, since we know this is being compiled for the CPU
+  //device_theCells_ = Traits::template make_unique<GPUCACell[]>(params_.maxNumberOfDoublets_, stream);
+  device_theCells_ = std::make_unique<GPUCACell[]>(params_.maxNumberOfDoublets_);
+  if (0 == nhits)
+    return;  // protect against empty events
+
+  // take all layer pairs into account
+  auto nActualPairs = gpuPixelDoublets::nPairs;
+  if (not params_.includeJumpingForwardDoublets_) {
+    // exclude forward "jumping" layer pairs
+    nActualPairs = gpuPixelDoublets::nPairsForTriplets;
+  }
+  if (params_.minHitsPerNtuplet_ > 3) {
+    // for quadruplets, exclude all "jumping" layer pairs
+    nActualPairs = gpuPixelDoublets::nPairsForQuadruplets;
+  }
+
+  assert(nActualPairs <= gpuPixelDoublets::nPairs);
+  gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(),
+                                         device_nCells_,
+                                         device_theCellNeighbors_.get(),
+                                         device_theCellTracks_.get(),
+                                         hh.view(),
+                                         isOuterHitOfCell_,
+                                         nActualPairs,
+                                         params_.idealConditions_,
+                                         params_.doClusterCut_,
+                                         params_.doZ0Cut_,
+                                         params_.doPtCut_,
+                                         params_.maxNumberOfDoublets_);
+}
+
+template <>
+void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
+  auto *tuples_d = &tracks_d->hitIndices;
+  auto *detId_d = &tracks_d->detIndices;
+  auto *quality_d = tracks_d->qualityData();
+
+  assert(tuples_d && quality_d);
+
+  // zero tuples
+  cms::cuda::launchZero(tuples_d, cudaStream);
+
+  auto nhits = hh.nHits();
+
+  // std::cout << "N hits " << nhits << std::endl;
+  // if (nhits<2) std::cout << "too few hits " << nhits << std::endl;
+
+  //
+  // applying conbinatoric cleaning such as fishbone at this stage is too expensive
+  //
+
+  kernel_connect(device_hitTuple_apc_,
+                 device_hitToTuple_apc_,  // needed only to be reset, ready for next kernel
+                 hh.view(),
+                 device_theCells_.get(),
+                 device_nCells_,
+                 device_theCellNeighbors_.get(),
+                 isOuterHitOfCell_,
+                 params_.hardCurvCut_,
+                 params_.ptmin_,
+                 params_.CAThetaCutBarrel_,
+                 params_.CAThetaCutForward_,
+                 params_.dcaCutInnerTriplet_,
+                 params_.dcaCutOuterTriplet_);
+
+  if (nhits > 1 && params_.earlyFishbone_) {
+    gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, false);
+  }
+
+  kernel_find_ntuplets(hh.view(),
+                       device_theCells_.get(),
+                       device_nCells_,
+                       device_theCellTracks_.get(),
+                       tuples_d,
+                       device_hitTuple_apc_,
+                       quality_d,
+                       params_.minHitsPerNtuplet_);
+  if (params_.doStats_)
+    kernel_mark_used(device_theCells_.get(), device_nCells_);
+
+  cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d);
+
+  kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d);
+  kernel_fillNLayers(tracks_d, tracks_d->view(), device_hitTuple_apc_);
+
+  // remove duplicates (tracks that share a doublet)
+  kernel_earlyDuplicateRemover(
+      device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_);
+
+  kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());
+  cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream);
+  kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());
+
+  if (nhits > 1 && params_.lateFishbone_) {
+    gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true);
+  }
+}
+
+template <>
+void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
+  int32_t nhits = hh.nHits();
+
+  auto const *tuples_d = &tracks_d->hitIndices;
+
+  // classify tracks based on kinematics
+  kernel_classifyTracks(tuples_d, tracks_d, tracks_d->view(), params_.cuts_);
+
+  if (params_.lateFishbone_) {
+    // apply fishbone cleaning to good tracks
+    kernel_fishboneCleaner(device_theCells_.get(), device_nCells_, quality_d);
+  }
+
+  // remove duplicates (tracks that share a doublet)
+  kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_);
+
+  // fill hit->track "map"
+  if (params_.doSharedHitCut_ || params_.doStats_) {
+    kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get());
+    cms::cuda::launchFinalize(hitToTupleView_, cudaStream);
+    kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get());
+  }
+
+  // remove duplicates (tracks that share at least one hit)
+  if (params_.doSharedHitCut_) {
+    kernel_rejectDuplicate(
+        tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+
+    kernel_sharedHitCleaner(
+        hh.view(), tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+    if (params_.useSimpleTripletCleaner_) {
+      kernel_simpleTripletCleaner(
+          tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+    } else {
+      kernel_tripletCleaner(
+          tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+    }
+  }
+
+  if (params_.doStats_) {
+    std::lock_guard guard(lock_stat);
+    kernel_checkOverflows(tuples_d,
+                          device_tupleMultiplicity_.get(),
+                          device_hitToTuple_.get(),
+                          device_hitTuple_apc_,
+                          device_theCells_.get(),
+                          device_nCells_,
+                          device_theCellNeighbors_.get(),
+                          device_theCellTracks_.get(),
+                          isOuterHitOfCell_,
+                          nhits,
+                          params_.maxNumberOfDoublets_,
+                          counters_);
+  }
+
+  if (params_.doStats_) {
+    // counters (add flag???)
+    std::lock_guard guard(lock_stat);
+    kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_);
+    kernel_doStatsForTracks(tuples_d, quality_d, counters_);
+  }
+
+#ifdef DUMP_GPU_TK_TUPLES
+  static std::atomic<int> iev(0);
+  static std::mutex lock;
+  {
+    std::lock_guard<std::mutex> guard(lock);
+    ++iev;
+    kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d->view(), device_hitToTuple_.get(), 0, 1000000, iev);
+  }
+#endif
+}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 2f83a18c6127d..b3468cbce9dde 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -142,8 +142,7 @@ __global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *_
 // It does not seem to affect efficiency in any way!
 __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
                                              uint32_t const *__restrict__ nCells,
-                                             TkSoAConstView tracks_view,
-                                             Quality *quality,
+                                             TkSoAView tracks_view,
                                              bool dupPassThrough) {
   // quality to mark rejected
   constexpr auto reject = pixelTrack::Quality::edup;  /// cannot be loose
@@ -170,7 +169,7 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
 
     for (auto it : thisCell.tracks()) {
       if (tracks_view[it].nLayers() < maxNl)
-        quality[it] = reject;  //no race:  simple assignment of the same constant
+        tracks_view[it].quality() = reject;  //no race:  simple assignment of the same constant
     }
   }
 }

From 4312c47b2bcf6eef4a4292928c6a02674fb8da1b Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 14:16:01 +0200
Subject: [PATCH 021/110] earlyDuplicateRemover

---
 .../plugins/#CAHitNtupletGeneratorKernels.cc# | 219 ------------------
 .../plugins/CAHitNtupletGeneratorKernels.cc   |   4 +-
 2 files changed, 1 insertion(+), 222 deletions(-)
 delete mode 100644 RecoPixelVertexing/PixelTriplets/plugins/#CAHitNtupletGeneratorKernels.cc#

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/#CAHitNtupletGeneratorKernels.cc# b/RecoPixelVertexing/PixelTriplets/plugins/#CAHitNtupletGeneratorKernels.cc#
deleted file mode 100644
index 9a8cc40d24985..0000000000000
--- a/RecoPixelVertexing/PixelTriplets/plugins/#CAHitNtupletGeneratorKernels.cc#
+++ /dev/null
@@ -1,219 +0,0 @@
-#include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h"
-
-#include <mutex>
-
-namespace {
-  // cuda atomics are NOT atomics on CPU so protect stat update with a mutex
-  // waiting for a more general solution (incuding multiple devices) to be proposed and implemented
-  std::mutex lock_stat;
-}  // namespace
-
-template <>
-void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) {
-  kernel_printCounters(counters);
-}
-
-template <>
-void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) {
-  auto nhits = hh.nHits();
-
-#ifdef NTUPLE_DEBUG
-  std::cout << "building Doublets out of " << nhits << " Hits. BPIX2 offset is " << hh.offsetBPIX2() << std::endl;
-#endif
-
-  // use "nhits" to heuristically dimension the workspace
-
-  // no need to use the Traits allocations, since we know this is being compiled for the CPU
-  //device_isOuterHitOfCell_ = Traits::template make_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits), stream);
-  device_isOuterHitOfCell_ = std::make_unique<GPUCACell::OuterHitOfCellContainer[]>(std::max(1U, nhits));
-  assert(device_isOuterHitOfCell_.get());
-  isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()};
-
-  auto cellStorageSize = caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) +
-                         caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks);
-  // no need to use the Traits allocations, since we know this is being compiled for the CPU
-  //cellStorage_ = Traits::template make_unique<unsigned char[]>(cellStorageSize, stream);
-  cellStorage_ = std::make_unique<unsigned char[]>(cellStorageSize);
-  device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get();
-  device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets *
-                                                                                      sizeof(GPUCACell::CellNeighbors));
-
-  gpuPixelDoublets::initDoublets(isOuterHitOfCell_,
-                                 nhits,
-                                 device_theCellNeighbors_.get(),
-                                 device_theCellNeighborsContainer_,
-                                 device_theCellTracks_.get(),
-                                 device_theCellTracksContainer_);
-
-  // no need to use the Traits allocations, since we know this is being compiled for the CPU
-  //device_theCells_ = Traits::template make_unique<GPUCACell[]>(params_.maxNumberOfDoublets_, stream);
-  device_theCells_ = std::make_unique<GPUCACell[]>(params_.maxNumberOfDoublets_);
-  if (0 == nhits)
-    return;  // protect against empty events
-
-  // take all layer pairs into account
-  auto nActualPairs = gpuPixelDoublets::nPairs;
-  if (not params_.includeJumpingForwardDoublets_) {
-    // exclude forward "jumping" layer pairs
-    nActualPairs = gpuPixelDoublets::nPairsForTriplets;
-  }
-  if (params_.minHitsPerNtuplet_ > 3) {
-    // for quadruplets, exclude all "jumping" layer pairs
-    nActualPairs = gpuPixelDoublets::nPairsForQuadruplets;
-  }
-
-  assert(nActualPairs <= gpuPixelDoublets::nPairs);
-  gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(),
-                                         device_nCells_,
-                                         device_theCellNeighbors_.get(),
-                                         device_theCellTracks_.get(),
-                                         hh.view(),
-                                         isOuterHitOfCell_,
-                                         nActualPairs,
-                                         params_.idealConditions_,
-                                         params_.doClusterCut_,
-                                         params_.doZ0Cut_,
-                                         params_.doPtCut_,
-                                         params_.maxNumberOfDoublets_);
-}
-
-template <>
-void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
-  auto *tuples_d = &tracks_d->hitIndices;
-  auto *detId_d = &tracks_d->detIndices;
-  auto *quality_d = tracks_d->qualityData();
-
-  assert(tuples_d && quality_d);
-
-  // zero tuples
-  cms::cuda::launchZero(tuples_d, cudaStream);
-
-  auto nhits = hh.nHits();
-
-  // std::cout << "N hits " << nhits << std::endl;
-  // if (nhits<2) std::cout << "too few hits " << nhits << std::endl;
-
-  //
-  // applying conbinatoric cleaning such as fishbone at this stage is too expensive
-  //
-
-  kernel_connect(device_hitTuple_apc_,
-                 device_hitToTuple_apc_,  // needed only to be reset, ready for next kernel
-                 hh.view(),
-                 device_theCells_.get(),
-                 device_nCells_,
-                 device_theCellNeighbors_.get(),
-                 isOuterHitOfCell_,
-                 params_.hardCurvCut_,
-                 params_.ptmin_,
-                 params_.CAThetaCutBarrel_,
-                 params_.CAThetaCutForward_,
-                 params_.dcaCutInnerTriplet_,
-                 params_.dcaCutOuterTriplet_);
-
-  if (nhits > 1 && params_.earlyFishbone_) {
-    gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, false);
-  }
-
-  kernel_find_ntuplets(hh.view(),
-                       device_theCells_.get(),
-                       device_nCells_,
-                       device_theCellTracks_.get(),
-                       tuples_d,
-                       device_hitTuple_apc_,
-                       quality_d,
-                       params_.minHitsPerNtuplet_);
-  if (params_.doStats_)
-    kernel_mark_used(device_theCells_.get(), device_nCells_);
-
-  cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d);
-
-  kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d);
-  kernel_fillNLayers(tracks_d, tracks_d->view(), device_hitTuple_apc_);
-
-  // remove duplicates (tracks that share a doublet)
-  kernel_earlyDuplicateRemover(
-      device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_);
-
-  kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());
-  cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream);
-  kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());
-
-  if (nhits > 1 && params_.lateFishbone_) {
-    gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true);
-  }
-}
-
-template <>
-void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
-  int32_t nhits = hh.nHits();
-
-  auto const *tuples_d = &tracks_d->hitIndices;
-
-  // classify tracks based on kinematics
-  kernel_classifyTracks(tuples_d, tracks_d, tracks_d->view(), params_.cuts_);
-
-  if (params_.lateFishbone_) {
-    // apply fishbone cleaning to good tracks
-    kernel_fishboneCleaner(device_theCells_.get(), device_nCells_, quality_d);
-  }
-
-  // remove duplicates (tracks that share a doublet)
-  kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_);
-
-  // fill hit->track "map"
-  if (params_.doSharedHitCut_ || params_.doStats_) {
-    kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get());
-    cms::cuda::launchFinalize(hitToTupleView_, cudaStream);
-    kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get());
-  }
-
-  // remove duplicates (tracks that share at least one hit)
-  if (params_.doSharedHitCut_) {
-    kernel_rejectDuplicate(
-        tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
-
-    kernel_sharedHitCleaner(
-        hh.view(), tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
-    if (params_.useSimpleTripletCleaner_) {
-      kernel_simpleTripletCleaner(
-          tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
-    } else {
-      kernel_tripletCleaner(
-          tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
-    }
-  }
-
-  if (params_.doStats_) {
-    std::lock_guard guard(lock_stat);
-    kernel_checkOverflows(tuples_d,
-                          device_tupleMultiplicity_.get(),
-                          device_hitToTuple_.get(),
-                          device_hitTuple_apc_,
-                          device_theCells_.get(),
-                          device_nCells_,
-                          device_theCellNeighbors_.get(),
-                          device_theCellTracks_.get(),
-                          isOuterHitOfCell_,
-                          nhits,
-                          params_.maxNumberOfDoublets_,
-                          counters_);
-  }
-
-  if (params_.doStats_) {
-    // counters (add flag???)
-    std::lock_guard guard(lock_stat);
-    kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_);
-    kernel_doStatsForTracks(tuples_d, quality_d, counters_);
-  }
-
-#ifdef DUMP_GPU_TK_TUPLES
-  static std::atomic<int> iev(0);
-  static std::mutex lock;
-  {
-    std::lock_guard<std::mutex> guard(lock);
-    ++iev;
-    kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d->view(), device_hitToTuple_.get(), 0, 1000000, iev);
-  }
-#endif
-}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
index 2bbf9edab4e99..ef6cb4ecc0ea6 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -132,9 +132,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   kernel_fillNLayers(tracks_d, tracks_d->view(), device_hitTuple_apc_);
 
   // remove duplicates (tracks that share a doublet)
-  kernel_earlyDuplicateRemover(
-      device_theCells_.get(), device_nCells_, tracks_d->view(), quality_d, params_.dupPassThrough_);
-
+  kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_);
   kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());
   cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream);
   kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());

From 86bc12aa6eb478f9ee05039124732d4a560794e7 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 14:19:58 +0200
Subject: [PATCH 022/110] Multiplicity kernels

---
 .../plugins/CAHitNtupletGeneratorKernels.cc          |  2 +-
 .../plugins/CAHitNtupletGeneratorKernelsImpl.h       | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
index ef6cb4ecc0ea6..9ebe0dfe44bdb 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -133,7 +133,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
 
   // remove duplicates (tracks that share a doublet)
   kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_);
-  kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());
+  kernel_countMultiplicity(tuples_d, tracks_d->view(), device_tupleMultiplicity_.get());
   cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream);
   kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());
 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index b3468cbce9dde..45ef5009bc44a 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -375,16 +375,16 @@ __global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const *
 }
 
 __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets,
-                                         Quality const *__restrict__ quality,
+                                         TkSoAConstView tracks_view,
                                          caConstants::TupleMultiplicity *tupleMultiplicity) {
   auto first = blockIdx.x * blockDim.x + threadIdx.x;
   for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
     auto nhits = foundNtuplets->size(it);
     if (nhits < 3)
       continue;
-    if (quality[it] == pixelTrack::Quality::edup)
+    if (tracks_view[it].quality() == pixelTrack::Quality::edup)
       continue;
-    assert(quality[it] == pixelTrack::Quality::bad);
+    assert(tracks_view[it].quality() == pixelTrack::Quality::bad);
     if (nhits > 7)  // current limit
       printf("wrong mult %d %d\n", it, nhits);
     assert(nhits <= caConstants::maxHitsOnTrack);
@@ -393,16 +393,16 @@ __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundN
 }
 
 __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets,
-                                        Quality const *__restrict__ quality,
+                                        TkSoAConstView tracks_view,
                                         caConstants::TupleMultiplicity *tupleMultiplicity) {
   auto first = blockIdx.x * blockDim.x + threadIdx.x;
   for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
     auto nhits = foundNtuplets->size(it);
     if (nhits < 3)
       continue;
-    if (quality[it] == pixelTrack::Quality::edup)
+    if (tracks_view[it].quality() == pixelTrack::Quality::edup)
       continue;
-    assert(quality[it] == pixelTrack::Quality::bad);
+    assert(tracks_view[it].quality() == pixelTrack::Quality::bad);
     if (nhits > 7)
       printf("wrong mult %d %d\n", it, nhits);
     assert(nhits <= caConstants::maxHitsOnTrack);

From 9cf0298792504a1f2caa1287fc708ded7d81569b Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 14:22:23 +0200
Subject: [PATCH 023/110] classifyTracks

---
 .../PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc  | 2 +-
 .../plugins/CAHitNtupletGeneratorKernelsImpl.h             | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
index 9ebe0dfe44bdb..e2ec93fbbea86 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -149,7 +149,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
   auto const *tuples_d = &tracks_d->hitIndices;
 
   // classify tracks based on kinematics
-  kernel_classifyTracks(tuples_d, tracks_d, tracks_d->view(), params_.cuts_);
+  kernel_classifyTracks(tuples_d, tracks_d->view(), params_.cuts_);
 
   if (params_.lateFishbone_) {
     // apply fishbone cleaning to good tracks
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 45ef5009bc44a..178ba71532b8f 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -415,7 +415,6 @@ __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNt
 the SoA Data
  */
 __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
-                                      TkSoA const *__restrict__ tracks,
                                       TkSoAView tracks_view,
                                       CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
@@ -447,7 +446,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
       continue;
     }
 
-    tracks[it].quality() = pixelTrack::Quality::strict;
+    tracks_view[it].quality() = pixelTrack::Quality::strict;
 
     // compute a pT-dependent chi2 cut
 
@@ -487,7 +486,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
       continue;
     }
 
-    tracks[it].quality() = pixelTrack::Quality::tight;
+    tracks_view[it].quality() = pixelTrack::Quality::tight;
 
     // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip)
     // default cuts:
@@ -500,7 +499,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
                 (std::abs(pixelTrack::utilities::zip(tracks_view, it)) < region.maxZip);
 
     if (isOk) {
-      tracks[it].quality() = pixelTrack::Quality::highPurity;
+      tracks_view[it].quality() = pixelTrack::Quality::highPurity;
     }
   }
 }

From a988e01255f337bb380caf988b5ad8f5aea7dd68 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 16:14:01 +0200
Subject: [PATCH 024/110] Cleanup, print ntuplets

---
 .../plugins/CAHitNtupletGeneratorKernels.cu   | 33 +++++++----------
 .../CAHitNtupletGeneratorKernelsImpl.h        | 35 +++++++++----------
 2 files changed, 30 insertions(+), 38 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
index 712d995a6a6cf..d293ad00558fe 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
@@ -234,12 +234,13 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
   // classify tracks based on kinematics
   auto numberOfBlocks = nQuadrupletBlocks(blockSize);
   kernel_classifyTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-      tuples_d, tracks_d, tracks_d->stateAtBS.view(), params_.cuts_, quality_d);
+      tuples_d, tracks_d->view(), params_.cuts_, quality_d);
   cudaCheck(cudaGetLastError());
 
   if (params_.lateFishbone_) {
-    // apply fishbone cleaning to good tracks
-    numberOfBlocks = nDoubletBlocks(blockSize);
+    x
+        // apply fishbone cleaning to good tracks
+        numberOfBlocks = nDoubletBlocks(blockSize);
     kernel_fishboneCleaner<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
         device_theCells_.get(), device_nCells_, quality_d);
     cudaCheck(cudaGetLastError());
@@ -248,7 +249,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
   // mark duplicates (tracks that share a doublet)
   numberOfBlocks = nDoubletBlocks(blockSize);
   kernel_fastDuplicateRemover<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-      device_theCells_.get(), device_nCells_, tracks_d, tracks_d->stateAtBS.view(), params_.dupPassThrough_);
+      device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_);
   cudaCheck(cudaGetLastError());
 #ifdef GPU_DEBUG
   cudaCheck(cudaDeviceSynchronize());
@@ -276,26 +277,18 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
     // mark duplicates (tracks that share at least one hit)
     numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize;
 
-    kernel_rejectDuplicate<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d,
-                                                                         tracks_d->stateAtBS.view(),
-                                                                         quality_d,
-                                                                         params_.minHitsForSharingCut_,
-                                                                         params_.dupPassThrough_,
-                                                                         device_hitToTuple_.get());
+    kernel_rejectDuplicate<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+        tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
 
-    kernel_sharedHitCleaner<<<numberOfBlocks, blockSize, 0, cudaStream>>>(hh.view(),
-                                                                          tracks_d,
-                                                                          quality_d,
-                                                                          params_.minHitsForSharingCut_,
-                                                                          params_.dupPassThrough_,
-                                                                          device_hitToTuple_.get());
+    kernel_sharedHitCleaner<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+        hh.view(), tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
 
     if (params_.useSimpleTripletCleaner_) {
       kernel_simpleTripletCleaner<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-          tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+          tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
     } else {
       kernel_tripletCleaner<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-          tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+          tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
     }
     cudaCheck(cudaGetLastError());
 #ifdef GPU_DEBUG
@@ -342,11 +335,11 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
     ++iev;
     for (int k = 0; k < 20000; k += 500) {
       kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(
-          hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), k, k + 500, iev);
+          hh.view(), tuples_d, tracks_d->view(), device_hitToTuple_.get(), k, k + 500, iev);
       cudaDeviceSynchronize();
     }
     kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(
-        hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 20000, 1000000, iev);
+        hh.view(), tuples_d, tracks_d->view(), device_hitToTuple_.get(), 20000, 1000000, iev);
     cudaDeviceSynchronize();
     // cudaStreamSynchronize(cudaStream);
   }
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 178ba71532b8f..7e513856cd9c6 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -666,8 +666,8 @@ __global__ void kernel_rejectDuplicate(TkSoAView tracks_view,
 
     /* chi2 is bad for large pt
     auto score = [&](auto it, auto nl) {
-      return nl < 4 ? std::abs(tracks.tip(it)) :  // tip for triplets
-                 tracks.chi2(it);                 //chi2
+      return nl < 4 ? std::abs(pixelTrack::utilities::tip(tracks_view, it)) :  // tip for triplets
+                 pixelTrack::utilities::chi2(tracks_view, it);                 //chi2
     };
     */
     auto score = [&](auto it, auto nl) { return std::abs(pixelTrack::utilities::tip(tracks_view, it)); };
@@ -682,7 +682,7 @@ __global__ void kernel_rejectDuplicate(TkSoAView tracks_view,
       auto e2opi = tracks_view[it].covariance()(9);
       auto cti = tracks_view[it].state()(3);
       auto e2cti = tracks_view[it].covariance()(12);
-      auto nli = tracks[it].nLayers();
+      auto nli = tracks_view[it].nLayers();
       for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) {
         auto const jt = *jp;
         auto qj = tracks_view[jt].quality();
@@ -734,7 +734,7 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric
     for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
       if (tracks_view[*it].quality() < longTqual)
         continue;
-      // if (tracks.nHits(*it)==3) continue;
+      // if (tracks_view[*it].nHits()==3) continue;
       auto nl = tracks_view[*it].nLayers();
       maxNl = std::max(nl, maxNl);
     }
@@ -781,7 +781,7 @@ __global__ void kernel_tripletCleaner(TkSoAView tracks_view,
 
     // check if only triplets
     for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
-      if (track_view[*it].quality() <= good)
+      if (tracks_view[*it].quality() <= good)
         continue;
       onlyTriplets &= pixelTrack::utilities::isTriplet(tracks_view, *it);
       if (!onlyTriplets)
@@ -858,8 +858,7 @@ __global__ void kernel_simpleTripletCleaner(
 
 __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp,
                                             HitContainer const *__restrict__ ptuples,
-                                            TkSoA const *__restrict__ ptracks,
-                                            Quality const *__restrict__ quality,
+                                            TkSoAConstView tracks_view,
                                             CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple,
                                             int32_t firstPrint,
                                             int32_t lastPrint,
@@ -867,27 +866,27 @@ __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__res
   constexpr auto loose = pixelTrack::Quality::loose;
   auto const &hh = *hhp;
   auto const &foundNtuplets = *ptuples;
-  auto const &tracks = *ptracks;
+
   int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x;
   for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) {
     auto nh = foundNtuplets.size(i);
     if (nh < 3)
       continue;
-    if (quality[i] < loose)
+    if (tracks_view[i].quality() < loose)
       continue;
     printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n",
            10000 * iev + i,
-           int(quality[i]),
+           int(tracks_view[i].quality()),
            nh,
-           tracks[i].nLayers(),
-           pixelTrack::utilities::charge(tracks, i),
-           tracks[i].pt(),
-           tracks[i].eta(),
-           pixelTrack::utilities::phi(tracks, i),
-           pixelTrack::utilities::tip(tracks, i),
-           pixelTrack::utilities::zip(tracks, i),
+           tracks_view[i].nLayers(),
+           pixelTrack::utilities::charge(tracks_view, i),
+           tracks_view[i].pt(),
+           tracks_view[i].eta(),
+           pixelTrack::utilities::phi(tracks_view, i),
+           pixelTrack::utilities::tip(tracks_view, i),
+           pixelTrack::utilities::zip(tracks_view, i),
            //           asinhf(fit_results[i].par(3)),
-           tracks[i].chi2(),
+           tracks_view[i].chi2(),
            hh.zGlobal(*foundNtuplets.begin(i)),
            hh.zGlobal(*(foundNtuplets.begin(i) + 1)),
            hh.zGlobal(*(foundNtuplets.begin(i) + 2)),

From f979906d481f873b43640bf1deaece3250a69f07 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 17:11:26 +0200
Subject: [PATCH 025/110] Adapted for quality, must revert to Quality types

---
 .../plugins/CAHitNtupletGeneratorKernels.cc   | 12 ++--
 .../plugins/CAHitNtupletGeneratorKernels.cu   | 12 ++--
 .../CAHitNtupletGeneratorKernelsImpl.h        | 69 ++++++++++---------
 .../PixelTriplets/plugins/HelixFitOnGPU.h     |  2 +-
 4 files changed, 49 insertions(+), 46 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
index e2ec93fbbea86..a34e0f280dd9d 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -83,7 +83,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   auto *detId_d = &tracks_d->detIndices;
   auto *quality_d = tracks_d->qualityData();
 
-  assert(tuples_d && quality_d);
+  // assert(tuples_d && quality_d); // TODO Find equivalent for View
 
   // zero tuples
   cms::cuda::launchZero(tuples_d, cudaStream);
@@ -135,7 +135,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_);
   kernel_countMultiplicity(tuples_d, tracks_d->view(), device_tupleMultiplicity_.get());
   cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream);
-  kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());
+  kernel_fillMultiplicity(tuples_d, tracks_d->view(), device_tupleMultiplicity_.get());
 
   if (nhits > 1 && params_.lateFishbone_) {
     gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true);
@@ -147,9 +147,9 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
   int32_t nhits = hh.nHits();
 
   auto const *tuples_d = &tracks_d->hitIndices;
-
+  auto *quality_d = tracks_d->qualityData();
   // classify tracks based on kinematics
-  kernel_classifyTracks(tuples_d, tracks_d->view(), params_.cuts_);
+  kernel_classifyTracks(tuples_d, tracks_d->view(), quality_d, params_.cuts_);
 
   if (params_.lateFishbone_) {
     // apply fishbone cleaning to good tracks
@@ -161,9 +161,9 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
 
   // fill hit->track "map"
   if (params_.doSharedHitCut_ || params_.doStats_) {
-    kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get());
+    kernel_countHitInTracks(tuples_d, device_hitToTuple_.get());
     cms::cuda::launchFinalize(hitToTupleView_, cudaStream);
-    kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get());
+    kernel_fillHitInTracks(tuples_d, device_hitToTuple_.get());
   }
 
   // remove duplicates (tracks that share at least one hit)
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
index d293ad00558fe..c0b953f3b5d10 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
@@ -91,13 +91,13 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
 
   kernel_fillHitDetIndices<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, hh.view(), detId_d);
   cudaCheck(cudaGetLastError());
-  kernel_fillNLayers<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d, device_hitTuple_apc_);
+  kernel_fillNLayers<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d, tracks_d->view(), device_hitTuple_apc_);
   cudaCheck(cudaGetLastError());
 
   // remove duplicates (tracks that share a doublet)
   numberOfBlocks = nDoubletBlocks(blockSize);
   kernel_earlyDuplicateRemover<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-      device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_);
+      device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_);
   cudaCheck(cudaGetLastError());
 
   blockSize = 128;
@@ -234,13 +234,13 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
   // classify tracks based on kinematics
   auto numberOfBlocks = nQuadrupletBlocks(blockSize);
   kernel_classifyTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-      tuples_d, tracks_d->view(), params_.cuts_, quality_d);
+      tuples_d, tracks_d->view(), quality_d, params_.cuts_);
+
   cudaCheck(cudaGetLastError());
 
   if (params_.lateFishbone_) {
-    x
-        // apply fishbone cleaning to good tracks
-        numberOfBlocks = nDoubletBlocks(blockSize);
+    // apply fishbone cleaning to good tracks
+    numberOfBlocks = nDoubletBlocks(blockSize);
     kernel_fishboneCleaner<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
         device_theCells_.get(), device_nCells_, quality_d);
     cudaCheck(cudaGetLastError());
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 7e513856cd9c6..f38c042ed15c2 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -9,6 +9,7 @@
 #include <cstdint>
 #include <limits>
 
+#include <bits/stdint-uintn.h>
 #include <cuda_runtime.h>
 
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
@@ -145,7 +146,7 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
                                              TkSoAView tracks_view,
                                              bool dupPassThrough) {
   // quality to mark rejected
-  constexpr auto reject = pixelTrack::Quality::edup;  /// cannot be loose
+  constexpr auto reject = (uint8_t)pixelTrack::Quality::edup;  /// cannot be loose
 
   assert(nCells);
   auto first = threadIdx.x + blockIdx.x * blockDim.x;
@@ -180,8 +181,8 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,
                                             TkSoAView tracks_view,
                                             bool dupPassThrough) {
   // quality to mark rejected
-  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
-  constexpr auto loose = pixelTrack::Quality::loose;
+  auto const reject = dupPassThrough ? (uint8_t)pixelTrack::Quality::loose : (uint8_t)pixelTrack::Quality::dup;
+  constexpr auto loose = (uint8_t)pixelTrack::Quality::loose;
 
   assert(nCells);
 
@@ -382,9 +383,9 @@ __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundN
     auto nhits = foundNtuplets->size(it);
     if (nhits < 3)
       continue;
-    if (tracks_view[it].quality() == pixelTrack::Quality::edup)
+    if (tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::edup)
       continue;
-    assert(tracks_view[it].quality() == pixelTrack::Quality::bad);
+    assert(tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::bad);
     if (nhits > 7)  // current limit
       printf("wrong mult %d %d\n", it, nhits);
     assert(nhits <= caConstants::maxHitsOnTrack);
@@ -400,9 +401,9 @@ __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNt
     auto nhits = foundNtuplets->size(it);
     if (nhits < 3)
       continue;
-    if (tracks_view[it].quality() == pixelTrack::Quality::edup)
+    if (tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::edup)
       continue;
-    assert(tracks_view[it].quality() == pixelTrack::Quality::bad);
+    assert(tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::bad);
     if (nhits > 7)
       printf("wrong mult %d %d\n", it, nhits);
     assert(nhits <= caConstants::maxHitsOnTrack);
@@ -416,6 +417,7 @@ the SoA Data
  */
 __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
                                       TkSoAView tracks_view,
+                                      Quality *__restrict__ quality,
                                       CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
 
@@ -425,10 +427,10 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
       break;  // guard
 
     // if duplicate: not even fit
-    if (tracks_view[it].quality() == pixelTrack::Quality::edup)
+    if (quality[it] == pixelTrack::Quality::edup)
       continue;
 
-    assert(tracks_view[it].quality() == pixelTrack::Quality::bad);
+    assert(quality[it] == pixelTrack::Quality::bad);
 
     // mark doublets as bad
     if (nhits < 3)
@@ -441,12 +443,12 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
     }
     if (isNaN) {
 #ifdef NTUPLE_DEBUG
-      printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), pixelTrack::utilities::chi2(tracks_view, it));
+      printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks_view[it].chi2());
 #endif
       continue;
     }
 
-    tracks_view[it].quality() = pixelTrack::Quality::strict;
+    quality[it] = pixelTrack::Quality::strict;
 
     // compute a pT-dependent chi2 cut
 
@@ -472,21 +474,21 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
     };
 
     // (see CAHitNtupletGeneratorGPU.cc)
-    float pt = std::min<float>(pixelTrack::utilities::pt(tracks_view, it), cuts.chi2MaxPt);
+    float pt = std::min<float>(tracks_view[it].pt(), cuts.chi2MaxPt);
     float chi2Cut = cuts.chi2Scale * (cuts.chi2Coeff[0] + roughLog(pt) * cuts.chi2Coeff[1]);
-    if (pixelTrack::utilities::chi2(tracks_view, it) >= chi2Cut) {
+    if (tracks_view[it].chi2() >= chi2Cut) {
 #ifdef NTUPLE_FIT_DEBUG
       printf("Bad chi2 %d size %d pt %f eta %f chi2 %f\n",
              it,
              tuples->size(it),
-             pixelTrack::utilities::pt(tracks_view, it),
-             pixelTrack::utilities::eta(tracks_view, it),
-             pixelTrack::utilities::chi2(tracks_view, it));
+             tracks_view[it].pt(),
+             tracks_view[it].eta(),
+             tracks_view[it].chi2());
 #endif
       continue;
     }
 
-    tracks_view[it].quality() = pixelTrack::Quality::tight;
+    quality[it] = pixelTrack::Quality::tight;
 
     // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip)
     // default cuts:
@@ -495,11 +497,11 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
     // (see CAHitNtupletGeneratorGPU.cc)
     auto const &region = (nhits > 3) ? cuts.quadruplet : cuts.triplet;
     bool isOk = (std::abs(pixelTrack::utilities::tip(tracks_view, it)) < region.maxTip) and
-                (pixelTrack::utilities::pt(tracks_view, it) > region.minPt) and
+                (tracks_view[it].pt() > region.minPt) and
                 (std::abs(pixelTrack::utilities::zip(tracks_view, it)) < region.maxZip);
 
     if (isOk) {
-      tracks_view[it].quality() = pixelTrack::Quality::highPurity;
+      quality[it] = pixelTrack::Quality::highPurity;
     }
   }
 }
@@ -521,7 +523,6 @@ __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples,
 }
 
 __global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples,
-                                        Quality const *__restrict__ quality,
                                         CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
   for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
@@ -533,7 +534,6 @@ __global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples,
 }
 
 __global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples,
-                                       Quality const *__restrict__ quality,
                                        CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
   for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
@@ -561,6 +561,9 @@ __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples
   }
 }
 
+/*
+  Needs both TkSoA and TkSoAView for accessing SoA, computeNumberOfLayers(), nHits(), stride()
+ */
 __global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks,
                                    TkSoAView tracks_view,
                                    cms::cuda::AtomicPairCounter *apc) {
@@ -573,7 +576,7 @@ __global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks,
   for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) {
     auto nHits = tracks.nHits(idx);
     assert(nHits >= 3);
-    tracks[idx].nLayers() = tracks.computeNumberOfLayers(idx);
+    tracks_view[idx].nLayers() = tracks.computeNumberOfLayers(idx);
   }
 }
 
@@ -630,10 +633,10 @@ __global__ void kernel_markSharedHit(int const *__restrict__ nshared,
                                      HitContainer const *__restrict__ tuples,
                                      Quality *__restrict__ quality,
                                      bool dupPassThrough) {
-  // constexpr auto bad = pixelTrack::Quality::bad;
+  // constexpr auto bad = (uint8_t)pixelTrack::Quality::bad;
   constexpr auto dup = pixelTrack::Quality::dup;
   constexpr auto loose = pixelTrack::Quality::loose;
-  // constexpr auto strict = pixelTrack::Quality::strict;
+  // constexpr auto strict = (uint8_t)pixelTrack::Quality::strict;
 
   // quality to mark rejected
   auto const reject = dupPassThrough ? loose : dup;
@@ -655,7 +658,7 @@ __global__ void kernel_rejectDuplicate(TkSoAView tracks_view,
                                        bool dupPassThrough,
                                        CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
   // quality to mark rejected
-  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
+  auto const reject = dupPassThrough ? (uint8_t)pixelTrack::Quality::loose : (uint8_t)pixelTrack::Quality::dup;
 
   auto &hitToTuple = *phitToTuple;
 
@@ -714,9 +717,9 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric
                                         bool dupPassThrough,
                                         CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
   // quality to mark rejected
-  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
+  auto const reject = dupPassThrough ? (uint8_t)pixelTrack::Quality::loose : (uint8_t)pixelTrack::Quality::dup;
   // quality of longest track
-  auto const longTqual = pixelTrack::Quality::highPurity;
+  auto const longTqual = (uint8_t)pixelTrack::Quality::highPurity;
 
   auto &hitToTuple = *phitToTuple;
 
@@ -764,9 +767,9 @@ __global__ void kernel_tripletCleaner(TkSoAView tracks_view,
                                       bool dupPassThrough,
                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
   // quality to mark rejected
-  auto const reject = pixelTrack::Quality::loose;
+  auto const reject = (uint8_t)pixelTrack::Quality::loose;
   /// min quality of good
-  auto const good = pixelTrack::Quality::strict;
+  auto const good = (uint8_t)pixelTrack::Quality::strict;
 
   auto &hitToTuple = *phitToTuple;
 
@@ -820,9 +823,9 @@ __global__ void kernel_simpleTripletCleaner(
     bool dupPassThrough,
     CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
   // quality to mark rejected
-  auto const reject = pixelTrack::Quality::loose;
+  auto const reject = (uint8_t)pixelTrack::Quality::loose;
   /// min quality of good
-  auto const good = pixelTrack::Quality::loose;
+  auto const good = (uint8_t)pixelTrack::Quality::loose;
 
   auto &hitToTuple = *phitToTuple;
 
@@ -849,7 +852,7 @@ __global__ void kernel_simpleTripletCleaner(
     // mark worse ambiguities
     for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
       auto const it = *ip;
-      if (tracks_view[it].quality() > reject && pixelTracks::utilities::isTriplet(tracks_view, it) && it != im)
+      if (tracks_view[it].quality() > reject && pixelTrack::utilities::isTriplet(tracks_view, it) && it != im)
         tracks_view[it].quality() = reject;  //no race:  simple assignment of the same constant
     }
 
@@ -863,7 +866,7 @@ __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__res
                                             int32_t firstPrint,
                                             int32_t lastPrint,
                                             int iev) {
-  constexpr auto loose = pixelTrack::Quality::loose;
+  constexpr auto loose = (uint8_t)pixelTrack::Quality::loose;
   auto const &hh = *hhp;
   auto const &foundNtuplets = *ptuples;
 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
index d47e4c5f8ece9..78d0c2782f17e 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
@@ -47,7 +47,7 @@ class HelixFitOnGPU {
   void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream);
   void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream);
 
-  void launchiRemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
+  void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
   void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
 
   void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoAView *outputSoA);

From ffcbc08413fb907dbee60d2370bbf4fa916e46bd Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 17:21:22 +0200
Subject: [PATCH 026/110] Fix instantiation of TrackSoA

---
 .../PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc         | 2 +-
 .../PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h          | 2 +-
 RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
index f650ca8ab2a08..f68a315e1d3c3 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -200,7 +200,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH
   kernels.launchKernels(hits_d, soa, stream);
 
   HelixFitOnGPU fitter(bfield, m_params.fitNas4_);
-  fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa);
+  fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa->view());
   if (m_params.useRiemannFit_) {
     fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream);
   } else {
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
index ae4576d883530..36212298aac2f 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
@@ -28,7 +28,7 @@ class CAHitNtupletGeneratorOnGPU {
   using hindex_type = TrackingRecHit2DSOAView::hindex_type;
 
   using Quality = pixelTrack::Quality;
-  using OutputSoA = pixelTrack::TrackSoA;
+  using OutputSoAView = pixelTrack::TrackSoAView;
   using HitContainer = pixelTrack::HitContainer;
   using Tuple = HitContainer;
 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
index 78d0c2782f17e..67a180c53e887 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
@@ -50,7 +50,7 @@ class HelixFitOnGPU {
   void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
   void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
 
-  void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoAView *outputSoA);
+  void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoAView outputSoA);
   void deallocateOnGPU();
 
 private:

From 4e36dbc4e58ee79b559dfa8e8625c3312d96f518 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 17:25:30 +0200
Subject: [PATCH 027/110] Adapt Riemann kernel_LineFit to OutputSoAView

---
 .../PixelTriplets/plugins/RiemannFitOnGPU.h   | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h
index 926002d674b83..e815a8943d520 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h
@@ -128,13 +128,13 @@ template <int N>
 __global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity,
                                uint32_t nHits,
                                double bField,
-                               OutputSoA *results,
+                               OutputSoAView results_view,
                                double *__restrict__ phits,
                                float *__restrict__ phits_ge,
                                double *__restrict__ pfast_fit_input,
                                riemannFit::CircleFit *__restrict__ circle_fit,
                                uint32_t offset) {
-  assert(results);
+  // assert(results); // TODO find equivalent for View
   assert(circle_fit);
   assert(N <= nHits);
 
@@ -159,11 +159,16 @@ __global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict_
 
     riemannFit::fromCircleToPerigee(circle_fit[local_idx]);
 
-    results->stateAtBS.copyFromCircle(
-        circle_fit[local_idx].par, circle_fit[local_idx].cov, line_fit.par, line_fit.cov, 1.f / float(bField), tkid);
-    results->pt(tkid) = bField / std::abs(circle_fit[local_idx].par(2));
-    results->eta(tkid) = asinhf(line_fit.par(0));
-    results->chi2(tkid) = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5);
+    pixelTrack::utilities::copyFromCircle(results_view,
+                                          circle_fit[local_idx].par,
+                                          circle_fit[local_idx].cov,
+                                          line_fit.par,
+                                          line_fit.cov,
+                                          1.f / float(bField),
+                                          tkid);
+    results_view[tkid].pt() = bField / std::abs(circle_fit[local_idx].par(2));
+    results_view[tkid].eta() = asinhf(line_fit.par(0));
+    results_view[tkid].chi2() = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5);
 
 #ifdef RIEMANN_DEBUG
     printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n",

From c4695dad2191eaa1c6be6ff92a5819e757237a77 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 17:36:45 +0200
Subject: [PATCH 028/110] Fixed allocation call

---
 .../PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
index f68a315e1d3c3..a56dcc851c1d5 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -235,7 +235,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC
 
   // now fit
   HelixFitOnGPU fitter(bfield, m_params.fitNas4_);
-  fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa);
+  fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa->view());
 
   if (m_params.useRiemannFit_) {
     fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets);

From 1f04fbfe056df38dfb79f35671205b322e065e76 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 18:45:53 +0200
Subject: [PATCH 029/110] Several fixes, still breaks

---
 .../interface/TrackSoAHeterogeneousT_test.h   | 29 +++++--------------
 .../plugins/PixelTrackProducerFromSoA.cc      | 11 +++----
 .../plugins/PixelTrackSoAFromCUDA.cc          |  2 +-
 .../plugins/CAHitNtupletGeneratorKernels.cu   |  9 +++---
 .../plugins/CAHitNtupletGeneratorOnGPU.cc     |  2 +-
 .../PixelTriplets/plugins/RiemannFitOnGPU.h   |  4 +--
 6 files changed, 22 insertions(+), 35 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index f4075b0b385d7..289c6d2f8c211 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -44,24 +44,17 @@ namespace pixelTrack {
     using TrackSoAConstView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::ConstView;
     // State at the Beam spot
     // phi,tip,1/pt,cotan(theta),zip
-    float charge(TrackSoAConstView tracks, int32_t i) { return std::copysign(1.f, tracks[i].state()(2)); }
-
-    float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); }
-
-    float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); }
-
-    float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); }
+    __host__ __device__ float charge(TrackSoAConstView tracks, int32_t i) {
+      return std::copysign(1.f, tracks[i].state()(2));
+    }
 
-    // float pt(TrackSoAConstView tracks, int32_t i) { return tracks[i].pt(); }
-    // // float &pt(TrackSoAConstView tracks, int32_t i) { return tracks[i].pt(); }
+    __host__ __device__ float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); }
 
-    // float eta(TrackSoAConstView tracks, int32_t i) { return tracks[i].eta(); }
-    // // float &eta(TrackSoAConstView tracks, int32_t i) { return tracks[i].eta(); }
+    __host__ __device__ float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); }
 
-    // float chi2(TrackSoAConstView tracks, int32_t i) { return tracks[i].chi2(); }
-    // float &chi2(TrackSoAConstView tracks, int32_t i) { return tracks[i].chi2(); }
+    __host__ __device__ float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); }
 
-    bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; }
+    __host__ __device__ bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; }
 
     template <typename V3, typename M3, typename V2, typename M2>
     __host__ __device__ inline void copyFromCircle(
@@ -93,7 +86,7 @@ namespace pixelTrack {
     }
 
     template <typename V5, typename M5>
-    __host__ __device__ inline void copyToDense(TrackSoAView tracks, V5 &v, M5 &cov, int32_t i) {
+    __host__ __device__ inline void copyToDense(TrackSoAConstView tracks, V5 &v, M5 &cov, int32_t i) {
       v = tracks[i].state().template cast<typename V5::Scalar>();
       for (int j = 0, ind = 0; j < 5; ++j) {
         cov(j, j) = tracks[i].covariance()(ind++);
@@ -129,9 +122,6 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
   constexpr Quality const *qualityData() const { return reinterpret_cast<Quality const *>(view().quality()); }
   constexpr Quality *qualityData() { return reinterpret_cast<Quality *>(view().quality()); }
 
-  constexpr int nTracks() const { return nTracks_; }
-  constexpr void setNTracks(int n) { nTracks_ = n; }
-
   constexpr int nHits(int i) const { return detIndices.size(i); }
 
   constexpr int computeNumberOfLayers(int32_t i) const {
@@ -150,9 +140,6 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
 
   HitContainer hitIndices;
   HitContainer detIndices;
-
-private:
-  int nTracks_;
 };
 
 namespace pixelTrack {
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
index 12899be2c4156..e6d49cde90d6a 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
@@ -157,7 +157,7 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
   auto const *quality = tsoa.qualityData();
   // auto const &fit = tsoa.stateAtBS;
   auto const &hitIndices = tsoa.hitIndices;
-  auto nTracks = tsoa.nTracks();
+  auto nTracks = tsoa.view().nTracks();
 
   tracks.reserve(nTracks);
 
@@ -166,8 +166,9 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
   //sort index by pt
   std::vector<int32_t> sortIdxs(nTracks);
   std::iota(sortIdxs.begin(), sortIdxs.end(), 0);
-  std::sort(
-      sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) { return tsoa.pt(i1) > tsoa.pt(i2); });
+  std::sort(sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) {
+    return tsoa.view()[i1].pt() > tsoa.view()[i2].pt();
+  });
 
   //store the index of the SoA: indToEdm[index_SoAtrack] -> index_edmTrack (if it exists)
   indToEdm.resize(sortIdxs.size(), -1);
@@ -189,12 +190,12 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
 
     // mind: this values are respect the beamspot!
 
-    float chi2 = tsoa.chi2(it);
+    float chi2 = tsoa.view()[it].chi2();
     float phi = pixelTrack::utilities::phi(tsoa.view(), it);
 
     riemannFit::Vector5d ipar, opar;
     riemannFit::Matrix5d icov, ocov;
-    pixelTrack::utilities::copyToDense(tsoa.view(), ipar, icov, it);
+    pixelTrack::utilities::copyToDense<riemannFit::Vector5d, riemannFit::Matrix5d>(tsoa.view(), ipar, icov, it);
     riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov);
 
     LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
index 5cf4aac491901..57df7ae63b7a0 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
@@ -63,7 +63,7 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i
   // check that the fixed-size SoA does not overflow
   auto const& tsoa = *soa_;
   auto maxTracks = tsoa.stride();
-  auto nTracks = tsoa.nTracks();
+  auto nTracks = tsoa.view().nTracks();
   assert(nTracks < maxTracks);
   if (nTracks == maxTracks - 1) {
     edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
index c0b953f3b5d10..168ba3b0c8144 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
@@ -103,10 +103,10 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   blockSize = 128;
   numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize;
   kernel_countMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-      tuples_d, quality_d, device_tupleMultiplicity_.get());
+      tuples_d, tracks_d->view(), device_tupleMultiplicity_.get());
   cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream);
   kernel_fillMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-      tuples_d, quality_d, device_tupleMultiplicity_.get());
+      tuples_d, tracks_d->view(), device_tupleMultiplicity_.get());
   cudaCheck(cudaGetLastError());
 
   // do not run the fishbone if there are hits only in BPIX1
@@ -259,14 +259,13 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
     // fill hit->track "map"
     assert(hitToTupleView_.offSize > nhits);
     numberOfBlocks = nQuadrupletBlocks(blockSize);
-    kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-        tuples_d, quality_d, device_hitToTuple_.get());
+    kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, device_hitToTuple_.get());
     cudaCheck(cudaGetLastError());
     assert((hitToTupleView_.assoc == device_hitToTuple_.get()) &&
            (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0));
     cms::cuda::launchFinalize(hitToTupleView_, cudaStream);
     cudaCheck(cudaGetLastError());
-    kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, device_hitToTuple_.get());
+    kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, device_hitToTuple_.get());
     cudaCheck(cudaGetLastError());
 #ifdef GPU_DEBUG
     cudaCheck(cudaDeviceSynchronize());
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
index a56dcc851c1d5..4a5689f572e47 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -252,7 +252,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC
   // check that the fixed-size SoA does not overflow
   auto const& tsoa = *soa;
   auto maxTracks = tsoa.stride();
-  auto nTracks = tsoa.nTracks();
+  auto nTracks = tsoa.view().nTracks();
   assert(nTracks < maxTracks);
   if (nTracks == maxTracks - 1) {
     edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h
index e815a8943d520..e511680bf76b7 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h
@@ -16,7 +16,7 @@
 
 using HitsOnGPU = TrackingRecHit2DSOAView;
 using Tuples = pixelTrack::HitContainer;
-using OutputSoA = pixelTrack::TrackSoA;
+using OutputSoAView = pixelTrack::TrackSoAView;
 
 template <int N>
 __global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets,
@@ -149,7 +149,7 @@ __global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict_
       break;
 
     // get it for the ntuple container (one to one to helix)
-    auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx);
+    int32_t tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx);
 
     riemannFit::Map3xNd<N> hits(phits + local_idx);
     riemannFit::Map4d fast_fit(pfast_fit_input + local_idx);

From 345b22e5f5782c8290c3aa4c46fdb00f84c2fe2d Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 11 Oct 2022 18:59:01 +0200
Subject: [PATCH 030/110] Removed unused accessors

---
 CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index 289c6d2f8c211..667317ff8174c 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -116,8 +116,6 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
   // CUDA does not support enums  in __lgc ...
 private:
 public:
-  constexpr Quality quality(int32_t i) const { return static_cast<Quality>(view()[i].quality()); }
-  constexpr Quality &quality(int32_t i) { return static_cast<Quality &>(view()[i].quality()); }
   // TODO: static did not work; using reinterpret_cast
   constexpr Quality const *qualityData() const { return reinterpret_cast<Quality const *>(view().quality()); }
   constexpr Quality *qualityData() { return reinterpret_cast<Quality *>(view().quality()); }

From 70ae74f038764a63ff0085b6895f508e0568c90b Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Wed, 12 Oct 2022 11:11:11 +0200
Subject: [PATCH 031/110] Removed unused file

---
 .../Track/interface/TrajectoryStateSoAT.h     | 73 -------------------
 .../interface/TrajectoryStateSoAT_test.h      | 63 ----------------
 2 files changed, 136 deletions(-)
 delete mode 100644 CUDADataFormats/Track/interface/TrajectoryStateSoAT.h
 delete mode 100644 CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h

diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h
deleted file mode 100644
index 23ff2ce2b1986..0000000000000
--- a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H
-#define CUDADataFormats_Track_TrajectoryStateSOAT_H
-
-#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h"
-#include "DataFormats/SoATemplate/interface/SoALayout.h"
-#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
-using Vector5f = Eigen::Matrix<float, 5, 1>;
-using Vector15f = Eigen::Matrix<float, 15, 1>;
-
-using Vector5d = Eigen::Matrix<double, 5, 1>;
-using Matrix5d = Eigen::Matrix<double, 5, 5>;
-GENERATE_SOA_LAYOUT(TrajectoryStateSoAT_test,
-                    SOA_EIGEN_COLUMN(Vector5f, state),
-                    SOA_EIGEN_COLUMN(Vector15f, covariance))
-
-template <int32_t S>
-struct TrajectoryStateSoAT : public cms::cuda::PortableDeviceCollection<TrajectoryStateSoAT_test<>> {
-  static constexpr int32_t stride() { return S; }
-
-  // eigenSoA::MatrixSoA<Vector5f, S> state;
-  // eigenSoA::MatrixSoA<Vector15f, S> covariance;
-
-  // Vector5f state(const int32_t i) const { return view()[i].state(); }
-  // float* state() const { return view().state(); }  // TODO: Return Vector5f* ?
-  // Vector15f covariance(const int32_t i) const { return view()[i].covariance(); }
-  // float* covariance() const { return view().covariance(); }  // TODO: Return Vector15f* ?
-
-  // Restrict view
-  // using RestrictConstView =
-  //     Layout::ConstViewTemplate<cms::soa::RestrictQualify::enabled, cms::soa::RangeChecking::disabled>;
-
-  // RestrictConstView restrictConstView() const { return RestrictConstView(layout()); }
-
-  template <typename V3, typename M3, typename V2, typename M2>
-  __host__ __device__ inline void copyFromCircle(
-      V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) {
-    view()[i].state() << cp.template cast<float>(), lp.template cast<float>();
-    view()[i].state()(2) *= b;
-    auto cov = view()[i].covariance();
-    cov(0) = ccov(0, 0);
-    cov(1) = ccov(0, 1);
-    cov(2) = b * float(ccov(0, 2));
-    cov(4) = cov(3) = 0;
-    cov(5) = ccov(1, 1);
-    cov(6) = b * float(ccov(1, 2));
-    cov(8) = cov(7) = 0;
-    cov(9) = b * b * float(ccov(2, 2));
-    cov(11) = cov(10) = 0;
-    cov(12) = lcov(0, 0);
-    cov(13) = lcov(0, 1);
-    cov(14) = lcov(1, 1);
-  }
-
-  template <typename V5, typename M5>
-  __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) {
-    view()[i].state() = v.template cast<float>();
-    for (int j = 0, ind = 0; j < 5; ++j)
-      for (auto k = j; k < 5; ++k)
-        view()[i].covariance()(ind++) = cov(j, k);
-  }
-
-  template <typename V5, typename M5>
-  __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const {
-    v = view()[i].state().template cast<typename V5::Scalar>();
-    for (int j = 0, ind = 0; j < 5; ++j) {
-      cov(j, j) = view()[i].covariance()(ind++);
-      for (auto k = j + 1; k < 5; ++k)
-        cov(k, j) = cov(j, k) = view()[i].covariance()(ind++);
-    }
-  }
-};
-
-#endif  // CUDADataFormats_Track_TrajectoryStateSOAT_H
diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h
deleted file mode 100644
index 1e561d0131d51..0000000000000
--- a/CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h
+++ /dev/null
@@ -1,63 +0,0 @@
-#ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H
-#define CUDADataFormats_Track_TrajectoryStateSOAT_H
-
-#include <Eigen/Dense>
-#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h"
-#include "DataFormats/SoATemplate/interface/SoALayout.h"
-#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
-using Vector5f = Eigen::Matrix<float, 5, 1>;
-using Vector15f = Eigen::Matrix<float, 15, 1>;
-
-using Vector5d = Eigen::Matrix<double, 5, 1>;
-using Matrix5d = Eigen::Matrix<double, 5, 5>;
-GENERATE_SOA_LAYOUT(TrajectoryStateSoAT_test,
-                    SOA_EIGEN_COLUMN(Vector5f, state),
-                    SOA_EIGEN_COLUMN(Vector15f, covariance))
-
-template <int32_t S>
-struct TrajectoryStateSoAT : public cms::cuda::PortableDeviceCollection<TrajectoryStateSoAT_test<>> {
-  static constexpr int32_t stride() { return S; }
-
-  // eigenSoA::MatrixSoA<Vector5f, S> state;
-  // eigenSoA::MatrixSoA<Vector15f, S> covariance;
-
-  template <typename V3, typename M3, typename V2, typename M2>
-  __host__ __device__ inline void copyFromCircle(
-      V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) {
-    view()[i].state() << cp.template cast<float>(), lp.template cast<float>();
-    view()[i].state()(2) *= b;  // TODO?? 2d access??
-    auto cov = view()[i].covariance();
-    cov(0) = ccov(0, 0);
-    cov(1) = ccov(0, 1);
-    cov(2) = b * float(ccov(0, 2));
-    cov(4) = cov(3) = 0;
-    cov(5) = ccov(1, 1);
-    cov(6) = b * float(ccov(1, 2));
-    cov(8) = cov(7) = 0;
-    cov(9) = b * b * float(ccov(2, 2));
-    cov(11) = cov(10) = 0;
-    cov(12) = lcov(0, 0);
-    cov(13) = lcov(0, 1);
-    cov(14) = lcov(1, 1);
-  }
-
-  template <typename V5, typename M5>
-  __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) {
-    view()[i].state() = v.template cast<float>();
-    for (int j = 0, ind = 0; j < 5; ++j)
-      for (auto k = j; k < 5; ++k)
-        view()[i].covariance()(ind++) = cov(j, k);
-  }
-
-  template <typename V5, typename M5>
-  __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const {
-    v = view()[i].state().template cast<typename V5::Scalar>();
-    for (int j = 0, ind = 0; j < 5; ++j) {
-      cov(j, j) = view()[i].covariance()(ind++);
-      for (auto k = j + 1; k < 5; ++k)
-        cov(k, j) = cov(j, k) = view()[i].covariance()(ind++);
-    }
-  }
-};
-
-#endif  // CUDADataFormats_Track_TrajectoryStateSOAT_H

From 5e04512520888c845e105451d6a8823c2683bc89 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Wed, 12 Oct 2022 11:19:33 +0200
Subject: [PATCH 032/110] Added a simple test for trivially_constructible
 classes

---
 CUDADataFormats/Track/test/BuildFile.xml       |  3 ++-
 .../Track/test/trivially_constructible.cpp     | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)
 create mode 100644 CUDADataFormats/Track/test/trivially_constructible.cpp

diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml
index 985445f1e1b2a..4da6a1a0bb38d 100644
--- a/CUDADataFormats/Track/test/BuildFile.xml
+++ b/CUDADataFormats/Track/test/BuildFile.xml
@@ -1,5 +1,6 @@
 <use name="HeterogeneousCore/CUDAUtilities"/>
-
+<bin file="trivially_constructible.cpp">
+</bin>
 <bin file="TrackSoAHeterogeneous_t.cpp">
   <use name="eigen"/>
   <flags CXXFLAGS="-g -DGPU_DEBUG"/>
diff --git a/CUDADataFormats/Track/test/trivially_constructible.cpp b/CUDADataFormats/Track/test/trivially_constructible.cpp
new file mode 100644
index 0000000000000..f560f5ce58faa
--- /dev/null
+++ b/CUDADataFormats/Track/test/trivially_constructible.cpp
@@ -0,0 +1,18 @@
+#include <bits/stdc++.h>
+#include <type_traits>
+#include <stdint.h>
+#include <assert.h>
+#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+using namespace std;
+
+int main() {
+  std::cout << "pixelTrack::TrackSoA trivially constructible: "
+            << std::is_trivially_constructible<pixelTrack::TrackSoA>::value << std::endl;
+
+  std::cout << "cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>> trivially constructible: "
+            << std::is_trivially_constructible<cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>>::value
+            << std::endl;
+
+  return 0;
+}

From f72cf64866eab45e47b6657be4aa3d5b1c4fd946 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Wed, 12 Oct 2022 12:01:59 +0200
Subject: [PATCH 033/110] Added more tests

---
 CUDADataFormats/Track/test/BuildFile.xml          |  4 +++-
 ...e.cpp => trivially_constructible_eric_soa.cpp} |  5 +++--
 .../test/trivially_constructible_manual_soa.cpp   | 15 +++++++++++++++
 3 files changed, 21 insertions(+), 3 deletions(-)
 rename CUDADataFormats/Track/test/{trivially_constructible.cpp => trivially_constructible_eric_soa.cpp} (78%)
 create mode 100644 CUDADataFormats/Track/test/trivially_constructible_manual_soa.cpp

diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml
index 4da6a1a0bb38d..3d216c02efa2c 100644
--- a/CUDADataFormats/Track/test/BuildFile.xml
+++ b/CUDADataFormats/Track/test/BuildFile.xml
@@ -1,5 +1,7 @@
 <use name="HeterogeneousCore/CUDAUtilities"/>
-<bin file="trivially_constructible.cpp">
+<bin file="trivially_constructible_manual_soa.cpp">
+</bin>
+<bin file="trivially_constructible_eric_soa.cpp">
 </bin>
 <bin file="TrackSoAHeterogeneous_t.cpp">
   <use name="eigen"/>
diff --git a/CUDADataFormats/Track/test/trivially_constructible.cpp b/CUDADataFormats/Track/test/trivially_constructible_eric_soa.cpp
similarity index 78%
rename from CUDADataFormats/Track/test/trivially_constructible.cpp
rename to CUDADataFormats/Track/test/trivially_constructible_eric_soa.cpp
index f560f5ce58faa..3c9e60df70024 100644
--- a/CUDADataFormats/Track/test/trivially_constructible.cpp
+++ b/CUDADataFormats/Track/test/trivially_constructible_eric_soa.cpp
@@ -3,11 +3,12 @@
 #include <stdint.h>
 #include <assert.h>
 #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+
 using namespace std;
 
 int main() {
-  std::cout << "pixelTrack::TrackSoA trivially constructible: "
+  std::cout << "pixelTrack::TrackSoA with Eric's SoA, trivially constructible: "
             << std::is_trivially_constructible<pixelTrack::TrackSoA>::value << std::endl;
 
   std::cout << "cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>> trivially constructible: "
diff --git a/CUDADataFormats/Track/test/trivially_constructible_manual_soa.cpp b/CUDADataFormats/Track/test/trivially_constructible_manual_soa.cpp
new file mode 100644
index 0000000000000..c275ca6e414f9
--- /dev/null
+++ b/CUDADataFormats/Track/test/trivially_constructible_manual_soa.cpp
@@ -0,0 +1,15 @@
+#include <bits/stdc++.h>
+#include <type_traits>
+#include <stdint.h>
+#include <assert.h>
+#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
+
+using namespace std;
+
+int main() {
+  std::cout << "pixelTrack::TrackSoA with manually defined SoA, trivially constructible: "
+            << std::is_trivially_constructible<pixelTrack::TrackSoA>::value << std::endl;
+
+  return 0;
+}

From 04aa3e3fefd5cd155c571a67429746fb987d55c2 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 13 Oct 2022 12:05:48 +0200
Subject: [PATCH 034/110] Added test for class instantiation

---
 .../interface/TrackSoAHeterogeneousT_test.h   |  5 +-
 .../Track/test/TrackSoAHeterogeneous_test.cpp | 48 +++++++++++++++++++
 .../Track/test/TrackSoAHeterogeneous_test.cu  | 29 +++++++++++
 3 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
 create mode 100644 CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index 1cf34f14b30a1..c9d7e71bd556f 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -30,7 +30,7 @@ GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test,
                     SOA_COLUMN(int8_t, nLayers),
                     SOA_COLUMN(float, eta),
                     SOA_COLUMN(float, pt))
-                    // TODO: maybe add stateAtBS
+	            // TODO: maybe add stateAtBS
 
 template <int32_t S>
 class TrackSoAHeterogeneousT  : public cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>> {
@@ -118,6 +118,9 @@ namespace pixelTrack {
 #endif
 
   using TrackSoA = TrackSoAHeterogeneousT<maxNumber()>;
+  using TrackSoAView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::View;
+  using TrackSoAConstView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::ConstView;
+
   using TrajectoryState = TrajectoryStateSoAT<maxNumber()>;
   using HitContainer = TrackSoA::HitContainer;
 
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
new file mode 100644
index 0000000000000..ac4e9978cc12f
--- /dev/null
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -0,0 +1,48 @@
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+
+namespace testTrackSoAHeterogeneousT {
+
+  void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize);
+
+}
+
+int main() {
+  cms::cudatest::requireDevices();
+
+  cudaStream_t stream;
+  cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+
+  auto soaSize = 200;
+  // inner scope to deallocate memory before destroying the stream
+  {
+    /*TrackingRecHit2DGPU tkhit(nHits, false, 0, nullptr, nullptr, stream);
+    testTrackingRecHit2D::runKernels(tkhit.view());
+
+    TrackingRecHit2DGPU tkhitPhase2(nHits, true, 0, nullptr, nullptr, stream);
+    testTrackingRecHit2D::runKernels(tkhitPhase2.view());
+
+    TrackingRecHit2DHost tkhitH(nHits, false, 0, nullptr, nullptr, stream, &tkhit);
+    cudaStreamSynchronize(stream);
+    assert(tkhitH.view());
+    assert(tkhitH.view()->nHits() == unsigned(nHits));
+    assert(tkhitH.view()->nMaxModules() == phase1PixelTopology::numberOfModules);
+
+    TrackingRecHit2DHost tkhitHPhase2(nHits, true, 0, nullptr, nullptr, stream, &tkhit);
+    cudaStreamSynchronize(stream);
+    assert(tkhitHPhase2.view());
+    assert(tkhitHPhase2.view()->nHits() == unsigned(nHits));
+    assert(tkhitHPhase2.view()->nMaxModules() == phase2PixelTopology::numberOfModules);*/
+
+    pixelTrack::TrackSoA tracks;
+    testTrackSoAHeterogeneousT::runKernels(tracks.view(), soaSize);
+    std::cout << typeid(tracks.view()).name() << std::endl;
+  }
+
+  cudaCheck(cudaStreamDestroy(stream));
+
+  return 0;
+}
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
new file mode 100644
index 0000000000000..2e1d56a278eb4
--- /dev/null
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
@@ -0,0 +1,29 @@
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+
+namespace testTrackSoAHeterogeneousT {
+
+  __global__ void fill(pixelTrack::TrackSoAView tracks, uint32_t soaSize) {
+    assert(tracks);
+
+    int i = threadIdx.x;
+    if (i > soaSize)
+      return;
+    tracks[i].pt() = (float) i;
+  }
+
+  __global__ void verify(pixelTrack::TrackSoAConstView tracks, uint32_t soaSize) {
+    assert(tracks);
+
+    int i = threadIdx.x;
+    if (i > soaSize)
+      return;
+    assert(tracks[i].pt() == (float) i)
+  }
+
+  void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize) {
+    assert(tracks);
+    fill<<<1, 1024>>>(tracks, soaSize);
+    verify<<<1, 1024>>>(tracks, soaSize);
+  }
+
+}  // namespace testTrackingRecHit2D

From 31b4769b0f3d6f65309e7f4e6b7245deab4e74c3 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 13 Oct 2022 13:55:31 +0200
Subject: [PATCH 035/110] Tests build and run, but fail with an illegal memory access

---
 .../interface/TrackSoAHeterogeneousT_test.h   | 16 +++++-----
 CUDADataFormats/Track/test/BuildFile.xml      |  6 ++++
 .../Track/test/TrackSoAHeterogeneous_test.cpp | 26 ++--------------
 .../Track/test/TrackSoAHeterogeneous_test.cu  | 30 ++++++++++---------
 4 files changed, 31 insertions(+), 47 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index c9d7e71bd556f..8a2778e10aaec 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -26,17 +26,15 @@ namespace pixelTrack {
 
 GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test,
                     SOA_COLUMN(uint8_t, quality),
-                    SOA_COLUMN(float, chi2), // this is chi2/ndof as not necessarely all hits are used in the fit
+                    SOA_COLUMN(float, chi2),  // this is chi2/ndof as not necessarely all hits are used in the fit
                     SOA_COLUMN(int8_t, nLayers),
                     SOA_COLUMN(float, eta),
                     SOA_COLUMN(float, pt))
-	            // TODO: maybe add stateAtBS
+// TODO: maybe add stateAtBS
 
 template <int32_t S>
-class TrackSoAHeterogeneousT  : public cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>> {
-
+class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>> {
 public:
-
   // using cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::PortableDeviceCollection;
   TrackSoAHeterogeneousT() = default;
 
@@ -52,19 +50,19 @@ class TrackSoAHeterogeneousT  : public cms::cuda::PortableDeviceCollection<Track
 
 public:
   constexpr Quality quality(int32_t i) const { return static_cast<Quality>(view()[i].quality()); }
-  constexpr Quality &quality(int32_t i) { return static_cast<Quality &>(view()[i].quality()); }
+  // constexpr Quality &quality(int32_t i) { return static_cast<Quality &>(view()[i].quality()); }
   // TODO: static did not work; using reinterpret_cast
   constexpr Quality const *qualityData() const { return reinterpret_cast <Quality const *>(view().quality()); }
   constexpr Quality *qualityData() { return reinterpret_cast< Quality *>(view().quality()); }
 
   constexpr float pt(int32_t i) const { return view()[i].pt(); }
-  constexpr float &pt(int32_t i) { return view()[i].pt(); }
+  // constexpr float &pt(int32_t i) { return view()[i].pt(); }
 
   constexpr float eta(int32_t i) const { return view()[i].eta(); }
-  constexpr float &eta(int32_t i) { return view()[i].eta(); }
+  // constexpr float &eta(int32_t i) { return view()[i].eta(); }
 
   constexpr float chi2(int32_t i) const { return view()[i].chi2(); }
-  constexpr float &chi2(int32_t i) { return view()[i].chi2(); }
+  // constexpr float &chi2(int32_t i) { return view()[i].chi2(); }
 
   constexpr int nTracks() const { return nTracks_; }
   constexpr void setNTracks(int n) { nTracks_ = n; }
diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml
index fc78783db473b..dd2d980859f99 100644
--- a/CUDADataFormats/Track/test/BuildFile.xml
+++ b/CUDADataFormats/Track/test/BuildFile.xml
@@ -17,3 +17,9 @@
   <flags CXXFLAGS="-g -DGPU_DEBUG"/>
 </bin>
 </iftool>
+
+<use name="CUDADataFormats/Track"/>
+<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
+<iftool name="cuda-gcc-support">
+<bin file="TrackSoAHeterogeneous_test.cpp TrackSoAHeterogeneous_test.cu" name="TrackSoAHeterogeneous_test"/>
+</iftool>
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index ac4e9978cc12f..c34bda0806111 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -5,9 +5,7 @@
 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 
 namespace testTrackSoAHeterogeneousT {
-
-  void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize);
-
+  void runKernels(pixelTrack::TrackSoAView tracks, unsigned int soaSize);
 }
 
 int main() {
@@ -16,32 +14,12 @@ int main() {
   cudaStream_t stream;
   cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
-  auto soaSize = 200;
+  const auto soaSize = 256;
   // inner scope to deallocate memory before destroying the stream
   {
-    /*TrackingRecHit2DGPU tkhit(nHits, false, 0, nullptr, nullptr, stream);
-    testTrackingRecHit2D::runKernels(tkhit.view());
-
-    TrackingRecHit2DGPU tkhitPhase2(nHits, true, 0, nullptr, nullptr, stream);
-    testTrackingRecHit2D::runKernels(tkhitPhase2.view());
-
-    TrackingRecHit2DHost tkhitH(nHits, false, 0, nullptr, nullptr, stream, &tkhit);
-    cudaStreamSynchronize(stream);
-    assert(tkhitH.view());
-    assert(tkhitH.view()->nHits() == unsigned(nHits));
-    assert(tkhitH.view()->nMaxModules() == phase1PixelTopology::numberOfModules);
-
-    TrackingRecHit2DHost tkhitHPhase2(nHits, true, 0, nullptr, nullptr, stream, &tkhit);
-    cudaStreamSynchronize(stream);
-    assert(tkhitHPhase2.view());
-    assert(tkhitHPhase2.view()->nHits() == unsigned(nHits));
-    assert(tkhitHPhase2.view()->nMaxModules() == phase2PixelTopology::numberOfModules);*/
-
     pixelTrack::TrackSoA tracks;
     testTrackSoAHeterogeneousT::runKernels(tracks.view(), soaSize);
-    std::cout << typeid(tracks.view()).name() << std::endl;
   }
-
   cudaCheck(cudaStreamDestroy(stream));
 
   return 0;
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
index 2e1d56a278eb4..162fd2448ea6e 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
@@ -1,29 +1,31 @@
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
-
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 namespace testTrackSoAHeterogeneousT {
 
-  __global__ void fill(pixelTrack::TrackSoAView tracks, uint32_t soaSize) {
-    assert(tracks);
-
+  __global__ void fill(pixelTrack::TrackSoAView tracks, unsigned int soaSize) {
     int i = threadIdx.x;
-    if (i > soaSize)
+    if (i >= soaSize)
       return;
-    tracks[i].pt() = (float) i;
+    tracks[i].pt() = (float)i;
   }
 
-  __global__ void verify(pixelTrack::TrackSoAConstView tracks, uint32_t soaSize) {
-    assert(tracks);
-
+  __global__ void verify(pixelTrack::TrackSoAConstView tracks, unsigned int soaSize) {
     int i = threadIdx.x;
-    if (i > soaSize)
+    if (i >= soaSize)
       return;
-    assert(tracks[i].pt() == (float) i)
+    assert(tracks[i].pt() == (float)i);
   }
 
-  void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize) {
-    assert(tracks);
+  void runKernels(pixelTrack::TrackSoAView tracks, unsigned int soaSize) {
     fill<<<1, 1024>>>(tracks, soaSize);
+    cudaError_t cudaerr = cudaDeviceSynchronize();
+    if (cudaerr != cudaSuccess)
+      printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
+
     verify<<<1, 1024>>>(tracks, soaSize);
+    cudaerr = cudaDeviceSynchronize();
+    if (cudaerr != cudaSuccess)
+      printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
   }
 
-}  // namespace testTrackingRecHit2D
+}  // namespace testTrackSoAHeterogeneousT

From 00380b8c4b8f3263265a924bbfae2ed7548bc363 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 13 Oct 2022 17:17:24 +0200
Subject: [PATCH 036/110] Fixed tests, verified SoA works

---
 CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h | 2 ++
 CUDADataFormats/Track/test/BuildFile.xml                      | 3 ++-
 CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp     | 3 ++-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index 8a2778e10aaec..7880bf728a91e 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -37,6 +37,8 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
 public:
   // using cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::PortableDeviceCollection;
   TrackSoAHeterogeneousT() = default;
+  explicit TrackSoAHeterogeneousT(cudaStream_t stream)
+      : PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>(S, stream) {}
 
   static constexpr int32_t stride() { return S; }
 
diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml
index dd2d980859f99..885760aad6a36 100644
--- a/CUDADataFormats/Track/test/BuildFile.xml
+++ b/CUDADataFormats/Track/test/BuildFile.xml
@@ -21,5 +21,6 @@
 <use name="CUDADataFormats/Track"/>
 <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
 <iftool name="cuda-gcc-support">
-<bin file="TrackSoAHeterogeneous_test.cpp TrackSoAHeterogeneous_test.cu" name="TrackSoAHeterogeneous_test"/>
+  <bin file="TrackSoAHeterogeneous_test.cpp TrackSoAHeterogeneous_test.cu" name="TrackSoAHeterogeneous_test">
+  </bin>
 </iftool>
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index c34bda0806111..244d2a35f94d4 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -17,7 +17,8 @@ int main() {
   const auto soaSize = 256;
   // inner scope to deallocate memory before destroying the stream
   {
-    pixelTrack::TrackSoA tracks;
+    // pixelTrack::TrackSoA tracks;
+    TrackSoAHeterogeneousT<soaSize> tracks(stream);
     testTrackSoAHeterogeneousT::runKernels(tracks.view(), soaSize);
   }
   cudaCheck(cudaStreamDestroy(stream));

From f33f1357e1099f247296a28e49864e91707ea48b Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 13 Oct 2022 18:02:44 +0200
Subject: [PATCH 037/110] Re-add TrackSoAHeterogeneous tests; remove trivially_constructible tests

---
 .../interface/TrackSoAHeterogeneousT_test.h   | 24 ++++++++------
 CUDADataFormats/Track/test/BuildFile.xml      | 12 ++++---
 .../Track/test/TrackSoAHeterogeneous_test.cpp | 27 ++++++++++++++++
 .../Track/test/TrackSoAHeterogeneous_test.cu  | 31 +++++++++++++++++++
 .../test/trivially_constructible_eric_soa.cpp | 19 ------------
 .../trivially_constructible_manual_soa.cpp    | 15 ---------
 6 files changed, 81 insertions(+), 47 deletions(-)
 create mode 100644 CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
 create mode 100644 CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
 delete mode 100644 CUDADataFormats/Track/test/trivially_constructible_eric_soa.cpp
 delete mode 100644 CUDADataFormats/Track/test/trivially_constructible_manual_soa.cpp

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index 667317ff8174c..68028c0ed92dc 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -44,17 +44,23 @@ namespace pixelTrack {
     using TrackSoAConstView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::ConstView;
     // State at the Beam spot
     // phi,tip,1/pt,cotan(theta),zip
-    __host__ __device__ float charge(TrackSoAConstView tracks, int32_t i) {
-      return std::copysign(1.f, tracks[i].state()(2));
-    }
+    // __host__ __device__ float charge(TrackSoAConstView tracks, int32_t i) {
+    //   return std::copysign(1.f, tracks[i].state()(2));
+    // }
+
+    // __host__ __device__ float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); }
 
-    __host__ __device__ float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); }
+    // __host__ __device__ float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); }
 
-    __host__ __device__ float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); }
+    // __host__ __device__ float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); }
 
-    __host__ __device__ float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); }
+    // __host__ __device__ bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; }
 
-    __host__ __device__ bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; }
+#define phi(tracks, i) (tracks[i].state()(0))
+#define tip(tracks, i) (tracks[i].state()(1))
+#define charge(tracks, i) (tracks[i].state()(2))
+#define zip(tracks, i) (tracks[i].state()(4))
+#define isTriplet(tracks, i) (tracks[i].nLayers() == 3)
 
     template <typename V3, typename M3, typename V2, typename M2>
     __host__ __device__ inline void copyFromCircle(
@@ -103,8 +109,8 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
   // using cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::PortableDeviceCollection;
   TrackSoAHeterogeneousT() = default;
 
-  explicit TrackSoAHeterogeneousT(size_t maxModules, cudaStream_t stream)
-      : PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>(maxModules, stream) {}
+  explicit TrackSoAHeterogeneousT(cudaStream_t stream)
+      : PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>(S, stream) {}
 
   static constexpr int32_t stride() { return S; }
 
diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml
index 3d216c02efa2c..8c16498888e4e 100644
--- a/CUDADataFormats/Track/test/BuildFile.xml
+++ b/CUDADataFormats/Track/test/BuildFile.xml
@@ -1,10 +1,14 @@
 <use name="HeterogeneousCore/CUDAUtilities"/>
-<bin file="trivially_constructible_manual_soa.cpp">
-</bin>
-<bin file="trivially_constructible_eric_soa.cpp">
-</bin>
 <bin file="TrackSoAHeterogeneous_t.cpp">
   <use name="eigen"/>
   <flags CXXFLAGS="-g -DGPU_DEBUG"/>
 </bin>
 
+<use name="CUDADataFormats/Track"/>
+
+<iftool name="cuda-gcc-support">
+  <bin file="TrackSoAHeterogeneous_test.cpp TrackSoAHeterogeneous_test.cu" name="TrackSoAHeterogeneous_test">
+	<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
+	<flags CXXFLAGS="-g -DGPU_DEBUG"/>	
+  </bin>
+</iftool>
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
new file mode 100644
index 0000000000000..244d2a35f94d4
--- /dev/null
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -0,0 +1,27 @@
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+
+namespace testTrackSoAHeterogeneousT {
+  void runKernels(pixelTrack::TrackSoAView tracks, unsigned int soaSize);
+}
+
+int main() {
+  cms::cudatest::requireDevices();
+
+  cudaStream_t stream;
+  cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+
+  const auto soaSize = 256;
+  // inner scope to deallocate memory before destroying the stream
+  {
+    // pixelTrack::TrackSoA tracks;
+    TrackSoAHeterogeneousT<soaSize> tracks(stream);
+    testTrackSoAHeterogeneousT::runKernels(tracks.view(), soaSize);
+  }
+  cudaCheck(cudaStreamDestroy(stream));
+
+  return 0;
+}
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
new file mode 100644
index 0000000000000..162fd2448ea6e
--- /dev/null
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
@@ -0,0 +1,31 @@
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+namespace testTrackSoAHeterogeneousT {
+
+  __global__ void fill(pixelTrack::TrackSoAView tracks, unsigned int soaSize) {
+    int i = threadIdx.x;
+    if (i >= soaSize)
+      return;
+    tracks[i].pt() = (float)i;
+  }
+
+  __global__ void verify(pixelTrack::TrackSoAConstView tracks, unsigned int soaSize) {
+    int i = threadIdx.x;
+    if (i >= soaSize)
+      return;
+    assert(tracks[i].pt() == (float)i);
+  }
+
+  void runKernels(pixelTrack::TrackSoAView tracks, unsigned int soaSize) {
+    fill<<<1, 1024>>>(tracks, soaSize);
+    cudaError_t cudaerr = cudaDeviceSynchronize();
+    if (cudaerr != cudaSuccess)
+      printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
+
+    verify<<<1, 1024>>>(tracks, soaSize);
+    cudaerr = cudaDeviceSynchronize();
+    if (cudaerr != cudaSuccess)
+      printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
+  }
+
+}  // namespace testTrackSoAHeterogeneousT
diff --git a/CUDADataFormats/Track/test/trivially_constructible_eric_soa.cpp b/CUDADataFormats/Track/test/trivially_constructible_eric_soa.cpp
deleted file mode 100644
index 3c9e60df70024..0000000000000
--- a/CUDADataFormats/Track/test/trivially_constructible_eric_soa.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-#include <bits/stdc++.h>
-#include <type_traits>
-#include <stdint.h>
-#include <assert.h>
-#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
-
-using namespace std;
-
-int main() {
-  std::cout << "pixelTrack::TrackSoA with Eric's SoA, trivially constructible: "
-            << std::is_trivially_constructible<pixelTrack::TrackSoA>::value << std::endl;
-
-  std::cout << "cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>> trivially constructible: "
-            << std::is_trivially_constructible<cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>>::value
-            << std::endl;
-
-  return 0;
-}
diff --git a/CUDADataFormats/Track/test/trivially_constructible_manual_soa.cpp b/CUDADataFormats/Track/test/trivially_constructible_manual_soa.cpp
deleted file mode 100644
index c275ca6e414f9..0000000000000
--- a/CUDADataFormats/Track/test/trivially_constructible_manual_soa.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-#include <bits/stdc++.h>
-#include <type_traits>
-#include <stdint.h>
-#include <assert.h>
-#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
-
-using namespace std;
-
-int main() {
-  std::cout << "pixelTrack::TrackSoA with manually defined SoA, trivially constructible: "
-            << std::is_trivially_constructible<pixelTrack::TrackSoA>::value << std::endl;
-
-  return 0;
-}

From 63991bc93179170113c0321c4810ef6a10013e8f Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Mon, 17 Oct 2022 12:27:42 +0200
Subject: [PATCH 038/110] Replaced helper macros with inline __host__ __device__ functions

---
 .../interface/TrackSoAHeterogeneousT_test.h   | 20 +++++++------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index 68028c0ed92dc..cfb895f0c40b6 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -44,23 +44,17 @@ namespace pixelTrack {
     using TrackSoAConstView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::ConstView;
     // State at the Beam spot
     // phi,tip,1/pt,cotan(theta),zip
-    // __host__ __device__ float charge(TrackSoAConstView tracks, int32_t i) {
-    //   return std::copysign(1.f, tracks[i].state()(2));
-    // }
-
-    // __host__ __device__ float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); }
+    __host__ __device__ inline float charge(TrackSoAConstView tracks, int32_t i) {
+      return std::copysign(1.f, tracks[i].state()(2));
+    }
 
-    // __host__ __device__ float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); }
+    __host__ __device__ inline float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); }
 
-    // __host__ __device__ float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); }
+    __host__ __device__ inline float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); }
 
-    // __host__ __device__ bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; }
+    __host__ __device__ inline float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); }
 
-#define phi(tracks, i) (tracks[i].state()(0))
-#define tip(tracks, i) (tracks[i].state()(1))
-#define charge(tracks, i) (tracks[i].state()(2))
-#define zip(tracks, i) (tracks[i].state()(4))
-#define isTriplet(tracks, i) (tracks[i].nLayers() == 3)
+    __host__ __device__ inline bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; }
 
     template <typename V3, typename M3, typename V2, typename M2>
     __host__ __device__ inline void copyFromCircle(

From b3b82bf22c2c3f7b7b83afed27e0c9ed4a2990ab Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Mon, 17 Oct 2022 15:25:49 +0200
Subject: [PATCH 039/110] Added Breno's tests

---
 .../Track/test/TrackSoAHeterogeneous_test.cpp | 13 ++++++--
 .../Track/test/TrackSoAHeterogeneous_test.cu  | 30 +++++++++++--------
 2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index 244d2a35f94d4..dc525e2259b5c 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -5,7 +5,9 @@
 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 
 namespace testTrackSoAHeterogeneousT {
-  void runKernels(pixelTrack::TrackSoAView tracks, unsigned int soaSize);
+
+  void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize);
+
 }
 
 int main() {
@@ -17,9 +19,16 @@ int main() {
   const auto soaSize = 256;
   // inner scope to deallocate memory before destroying the stream
   {
-    // pixelTrack::TrackSoA tracks;
     TrackSoAHeterogeneousT<soaSize> tracks(stream);
+    auto ret = cms::cuda::make_host_unique<std::byte[]>(tracks.bufferSize(), stream);
     testTrackSoAHeterogeneousT::runKernels(tracks.view(), soaSize);
+    cudaCheck(cudaMemcpy(ret.get(), tracks.buffer().get(),TrackSoAHeterogeneousT_test<>::computeDataSize(soaSize),cudaMemcpyDeviceToHost));
+    TrackSoAHeterogeneousT_test<> tmp_layout(ret.get(),soaSize);
+    TrackSoAHeterogeneousT_test<>::View tmp_view(tmp_layout);
+    std::cout << "pt" << "\t" << "eta" << "\t" <<"chi2" << "\t" << "quality" << "\t" << "nLayers" << std::endl;
+    for(int i = 0; i < soaSize; ++i){
+      std::cout << tmp_view[i].pt() << "\t" << tmp_view[i].eta() << "\t" << tmp_view[i].chi2() << "\t" << (int)tmp_view[i].quality() << "\t" << (int)tmp_view[i].nLayers() << std::endl;
+    }
   }
   cudaCheck(cudaStreamDestroy(stream));
 
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
index 162fd2448ea6e..9c6085dd824a7 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
@@ -1,31 +1,37 @@
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
 namespace testTrackSoAHeterogeneousT {
 
-  __global__ void fill(pixelTrack::TrackSoAView tracks, unsigned int soaSize) {
+  __global__ void fill(pixelTrack::TrackSoAView tracks, uint32_t soaSize) {
+    //assert(tracks);
+
     int i = threadIdx.x;
     if (i >= soaSize)
       return;
     tracks[i].pt() = (float)i;
+    tracks[i].eta() = (float)i;
+    tracks[i].chi2() = (float)i;
+    tracks[i].quality() = (uint8_t)i;
+    tracks[i].nLayers() = i % 128;
   }
 
-  __global__ void verify(pixelTrack::TrackSoAConstView tracks, unsigned int soaSize) {
+  __global__ void verify(pixelTrack::TrackSoAConstView tracks, uint32_t soaSize) {
+    //assert(tracks);
+
     int i = threadIdx.x;
     if (i >= soaSize)
       return;
-    assert(tracks[i].pt() == (float)i);
+    assert(abs(tracks[i].pt() - (float)i) < .0001);
+    assert(abs(tracks[i].eta() - (float)i) < .0001);
+    assert(abs(tracks[i].chi2() - (float)i) < .0001);
+    assert(tracks[i].quality() == i);
+    assert(tracks[i].nLayers() == i % 128);
   }
 
-  void runKernels(pixelTrack::TrackSoAView tracks, unsigned int soaSize) {
+  void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize) {
+    //assert(tracks);
     fill<<<1, 1024>>>(tracks, soaSize);
-    cudaError_t cudaerr = cudaDeviceSynchronize();
-    if (cudaerr != cudaSuccess)
-      printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
-
     verify<<<1, 1024>>>(tracks, soaSize);
-    cudaerr = cudaDeviceSynchronize();
-    if (cudaerr != cudaSuccess)
-      printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
   }
 
 }  // namespace testTrackSoAHeterogeneousT

From 9a6ccb8c2646fa6e06cab6dabe65ae5413bbad96 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Mon, 17 Oct 2022 16:04:17 +0200
Subject: [PATCH 040/110] Fixed merge leftovers

---
 .../Track/interface/TrackSoAHeterogeneousT_test.h        | 3 ---
 CUDADataFormats/Track/test/BuildFile.xml                 | 9 ---------
 2 files changed, 12 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index 5cb1836ee3325..9ce074c0a24d4 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -105,9 +105,6 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
   explicit TrackSoAHeterogeneousT(cudaStream_t stream)
       : PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>(S, stream) {}
 
-  explicit TrackSoAHeterogeneousT(cudaStream_t stream)
-      : PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>(S, stream) {}
-
   static constexpr int32_t stride() { return S; }
 
   using Quality = pixelTrack::Quality;
diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml
index 1e3f6524e7232..e91df3fc785f7 100644
--- a/CUDADataFormats/Track/test/BuildFile.xml
+++ b/CUDADataFormats/Track/test/BuildFile.xml
@@ -4,15 +4,6 @@
   <flags CXXFLAGS="-g -DGPU_DEBUG"/>
 </bin>
 
-<use name="CUDADataFormats/Track"/>
-
-<iftool name="cuda-gcc-support">
-  <bin file="TrackSoAHeterogeneous_test.cpp TrackSoAHeterogeneous_test.cu" name="TrackSoAHeterogeneous_test">
-	<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
-	<flags CXXFLAGS="-g -DGPU_DEBUG"/>	
-  </bin>
-</iftool>
-
 <use name="CUDADataFormats/Track"/>
 <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
 <iftool name="cuda-gcc-support">

From 5f1068128aab8666c75cf80e6c4e323e9bc40f90 Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Mon, 17 Oct 2022 16:04:21 +0200
Subject: [PATCH 041/110] Adding TrackSoAHeterogeneous_test merged with
 TrajectoryState

---
 .../interface/TrackSoAHeterogeneousT_test.h   | 118 +++++++++++-------
 1 file changed, 75 insertions(+), 43 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index 7880bf728a91e..6d2623d715a3b 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -4,14 +4,11 @@
 #include <string>
 #include <algorithm>
 
-#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h"
+#include <Eigen/Dense>
 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
-
-#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
 #include "DataFormats/SoATemplate/interface/SoALayout.h"
-
-//#include "DataFormats/Portable/interface/PortableCUDADeviceCollection.h"
+#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
 #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
 
 namespace pixelTrack {
@@ -24,19 +21,88 @@ namespace pixelTrack {
   }
 }  // namespace pixelTrack
 
+using Vector5f = Eigen::Matrix<float, 5, 1>;
+using Vector15f = Eigen::Matrix<float, 15, 1>;
+
+using Vector5d = Eigen::Matrix<double, 5, 1>;
+using Matrix5d = Eigen::Matrix<double, 5, 5>;
+
 GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test,
                     SOA_COLUMN(uint8_t, quality),
                     SOA_COLUMN(float, chi2),  // this is chi2/ndof as not necessarely all hits are used in the fit
                     SOA_COLUMN(int8_t, nLayers),
                     SOA_COLUMN(float, eta),
-                    SOA_COLUMN(float, pt))
-// TODO: maybe add stateAtBS
+                    SOA_COLUMN(float, pt),
+                    SOA_EIGEN_COLUMN(Vector5f, state),
+                    SOA_EIGEN_COLUMN(Vector15f, covariance),
+                    SOA_SCALAR(int, nTracks))
+
+// Previous TrajectoryStateSoAT class methods
+namespace pixelTrack {
+  namespace utilities {
+    using TrackSoAView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::View;
+    using TrackSoAConstView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::ConstView;
+    // State at the Beam spot
+    // phi,tip,1/pt,cotan(theta),zip
+    __host__ __device__ inline float charge(TrackSoAConstView tracks, int32_t i) {
+      return std::copysign(1.f, tracks[i].state()(2));
+    }
+
+    __host__ __device__ inline float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); }
+
+    __host__ __device__ inline float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); }
+
+    __host__ __device__ inline float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); }
+
+    __host__ __device__ inline bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; }
+
+    template <typename V3, typename M3, typename V2, typename M2>
+    __device__ inline void copyFromCircle(
+        TrackSoAView tracks, V3 const &cp, M3 const &ccov, V2 const &lp, M2 const &lcov, float b, int32_t i) {
+      tracks[i].state() << cp.template cast<float>(), lp.template cast<float>();
+
+      tracks[i].state()(2) = tracks[i].state()(2) * b;
+      auto cov = tracks[i].covariance();
+      cov(0) = ccov(0, 0);
+      cov(1) = ccov(0, 1);
+      cov(2) = b * float(ccov(0, 2));
+      cov(4) = cov(3) = 0;
+      cov(5) = ccov(1, 1);
+      cov(6) = b * float(ccov(1, 2));
+      cov(8) = cov(7) = 0;
+      cov(9) = b * b * float(ccov(2, 2));
+      cov(11) = cov(10) = 0;
+      cov(12) = lcov(0, 0);
+      cov(13) = lcov(0, 1);
+      cov(14) = lcov(1, 1);
+    }
+
+    template <typename V5, typename M5>
+    __device__ inline void copyFromDense(TrackSoAView tracks, V5 const &v, M5 const &cov, int32_t i) {
+      tracks[i].state() = v.template cast<float>();
+      for (int j = 0, ind = 0; j < 5; ++j)
+        for (auto k = j; k < 5; ++k)
+          tracks[i].covariance()(ind++) = cov(j, k);
+    }
+
+    template <typename V5, typename M5>
+    __device__ inline void copyToDense(TrackSoAConstView tracks, V5 &v, M5 &cov, int32_t i) {
+      v = tracks[i].state().template cast<typename V5::Scalar>();
+      for (int j = 0, ind = 0; j < 5; ++j) {
+        cov(j, j) = tracks[i].covariance()(ind++);
+        for (auto k = j + 1; k < 5; ++k)
+          cov(k, j) = cov(j, k) = tracks[i].covariance()(ind++);
+      }
+    }
+  }  // namespace utilities
+}  // namespace pixelTrack
 
 template <int32_t S>
 class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>> {
 public:
   // using cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::PortableDeviceCollection;
   TrackSoAHeterogeneousT() = default;
+
   explicit TrackSoAHeterogeneousT(cudaStream_t stream)
       : PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>(S, stream) {}
 
@@ -49,30 +115,13 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
   // Always check quality is at least loose!
   // CUDA does not support enums  in __lgc ...
 private:
-
 public:
-  constexpr Quality quality(int32_t i) const { return static_cast<Quality>(view()[i].quality()); }
-  // constexpr Quality &quality(int32_t i) { return static_cast<Quality &>(view()[i].quality()); }
   // TODO: static did not work; using reinterpret_cast
-  constexpr Quality const *qualityData() const { return reinterpret_cast <Quality const *>(view().quality()); }
-  constexpr Quality *qualityData() { return reinterpret_cast< Quality *>(view().quality()); }
-
-  constexpr float pt(int32_t i) const { return view()[i].pt(); }
-  // constexpr float &pt(int32_t i) { return view()[i].pt(); }
-
-  constexpr float eta(int32_t i) const { return view()[i].eta(); }
-  // constexpr float &eta(int32_t i) { return view()[i].eta(); }
-
-  constexpr float chi2(int32_t i) const { return view()[i].chi2(); }
-  // constexpr float &chi2(int32_t i) { return view()[i].chi2(); }
-
-  constexpr int nTracks() const { return nTracks_; }
-  constexpr void setNTracks(int n) { nTracks_ = n; }
+  constexpr Quality const *qualityData() const { return reinterpret_cast<Quality const *>(view().quality()); }
+  constexpr Quality *qualityData() { return reinterpret_cast<Quality *>(view().quality()); }
 
   constexpr int nHits(int i) const { return detIndices.size(i); }
 
-  constexpr bool isTriplet(int i) const { return view()[i].nLayers() == 3; }
-
   constexpr int computeNumberOfLayers(int32_t i) const {
     // layers are in order and we assume tracks are either forward or backward
     auto pdet = detIndices.begin(i);
@@ -87,24 +136,8 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
     return nl;
   }
 
-  // State at the Beam spot
-  // phi,tip,1/pt,cotan(theta),zip
-  TrajectoryStateSoAT<S> stateAtBS;
-  constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); }
-  constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); }
-  constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); }
-  constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); }
-
-  // state at the detector of the outermost hit
-  // representation to be decided...
-  // not yet filled on GPU
-  // TrajectoryStateSoA<S> stateAtOuterDet;
-
   HitContainer hitIndices;
   HitContainer detIndices;
-
-private:
-  int nTracks_;
 };
 
 namespace pixelTrack {
@@ -121,7 +154,6 @@ namespace pixelTrack {
   using TrackSoAView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::View;
   using TrackSoAConstView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::ConstView;
 
-  using TrajectoryState = TrajectoryStateSoAT<maxNumber()>;
   using HitContainer = TrackSoA::HitContainer;
 
 }  // namespace pixelTrack

From 1f07ee7a7e36d526ae4751fc156cf12028c061d8 Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Mon, 17 Oct 2022 16:12:30 +0200
Subject: [PATCH 042/110] Adding successful kernel tests

---
 .../Track/test/TrackSoAHeterogeneous_test.cpp | 15 ++++++--
 .../Track/test/TrackSoAHeterogeneous_test.cu  | 34 +++++++++++--------
 2 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index 244d2a35f94d4..8b1504a4a60e7 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -1,11 +1,13 @@
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test2.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 
 namespace testTrackSoAHeterogeneousT {
-  void runKernels(pixelTrack::TrackSoAView tracks, unsigned int soaSize);
+
+  void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize);
+
 }
 
 int main() {
@@ -17,9 +19,16 @@ int main() {
   const auto soaSize = 256;
   // inner scope to deallocate memory before destroying the stream
   {
-    // pixelTrack::TrackSoA tracks;
     TrackSoAHeterogeneousT<soaSize> tracks(stream);
+    auto ret = cms::cuda::make_host_unique<std::byte[]>(tracks.bufferSize(), stream);
     testTrackSoAHeterogeneousT::runKernels(tracks.view(), soaSize);
+    cudaCheck(cudaMemcpy(ret.get(), tracks.buffer().get(),TrackSoAHeterogeneousT_test<>::computeDataSize(soaSize),cudaMemcpyDeviceToHost));
+    TrackSoAHeterogeneousT_test<> tmp_layout(ret.get(),soaSize);
+    TrackSoAHeterogeneousT_test<>::View tmp_view(tmp_layout);
+    std::cout << "pt" << "\t" << "eta" << "\t" <<"chi2" << "\t" << "quality" << "\t" << "nLayers" << std::endl;
+    for(int i = 0; i < soaSize; ++i){
+      std::cout << tmp_view[i].pt() << "\t" << tmp_view[i].eta() << "\t" << tmp_view[i].chi2() << "\t" << (int)tmp_view[i].quality() << "\t" << (int)tmp_view[i].nLayers() << std::endl;
+    }
   }
   cudaCheck(cudaStreamDestroy(stream));
 
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
index 162fd2448ea6e..912ed44dba3ed 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
@@ -1,31 +1,37 @@
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test2.h"
+
 namespace testTrackSoAHeterogeneousT {
 
-  __global__ void fill(pixelTrack::TrackSoAView tracks, unsigned int soaSize) {
+  __global__ void fill(pixelTrack::TrackSoAView tracks, uint32_t soaSize) {
+    //assert(tracks);
+
     int i = threadIdx.x;
     if (i >= soaSize)
       return;
     tracks[i].pt() = (float)i;
+    tracks[i].eta() = (float)i;
+    tracks[i].chi2() = (float)i;
+    tracks[i].quality() = (uint8_t)i;
+    tracks[i].nLayers() = i%128;
   }
 
-  __global__ void verify(pixelTrack::TrackSoAConstView tracks, unsigned int soaSize) {
+  __global__ void verify(pixelTrack::TrackSoAConstView tracks, uint32_t soaSize) {
+    //assert(tracks);
+
     int i = threadIdx.x;
     if (i >= soaSize)
       return;
-    assert(tracks[i].pt() == (float)i);
+    assert(abs(tracks[i].pt() - (float)i) < .0001);
+    assert(abs(tracks[i].eta() - (float)i) < .0001);
+    assert(abs(tracks[i].chi2() - (float)i) < .0001);
+    assert(tracks[i].quality() == i);
+    assert(tracks[i].nLayers() == i%128);
   }
 
-  void runKernels(pixelTrack::TrackSoAView tracks, unsigned int soaSize) {
+  void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize) {
+    //assert(tracks);
     fill<<<1, 1024>>>(tracks, soaSize);
-    cudaError_t cudaerr = cudaDeviceSynchronize();
-    if (cudaerr != cudaSuccess)
-      printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
-
     verify<<<1, 1024>>>(tracks, soaSize);
-    cudaerr = cudaDeviceSynchronize();
-    if (cudaerr != cudaSuccess)
-      printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
   }
 
-}  // namespace testTrackSoAHeterogeneousT
+}  // namespace testTrackingRecHit2D

From 2899c8c8aef7f0dd885ca83160fa846102e736a8 Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Mon, 17 Oct 2022 16:15:19 +0200
Subject: [PATCH 043/110] Fixing header name in kernel tests

---
 CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp | 2 +-
 CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index 8b1504a4a60e7..dc525e2259b5c 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -1,4 +1,4 @@
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test2.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
index 912ed44dba3ed..9d26d2497e0b1 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
@@ -1,4 +1,4 @@
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test2.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 
 namespace testTrackSoAHeterogeneousT {
 

From f8239610c93e287e2973877c1ee98e955f0b7009 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 18 Oct 2022 15:08:59 +0200
Subject: [PATCH 044/110] Rework TrackSoA kernel tests to use a device copy and hitIndices

---
 .../Track/test/TrackSoAHeterogeneous_test.cpp | 58 ++++++++++++++-----
 .../Track/test/TrackSoAHeterogeneous_test.cu  | 52 +++++++++--------
 2 files changed, 74 insertions(+), 36 deletions(-)

diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index dc525e2259b5c..431dbd4577297 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -1,13 +1,12 @@
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
 
 namespace testTrackSoAHeterogeneousT {
 
-  void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize);
-
+  void runKernels(pixelTrack::TrackSoA *tracks, pixelTrack::TrackSoAView tracks_view);
 }
 
 int main() {
@@ -16,19 +15,52 @@ int main() {
   cudaStream_t stream;
   cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
-  const auto soaSize = 256;
   // inner scope to deallocate memory before destroying the stream
   {
-    TrackSoAHeterogeneousT<soaSize> tracks(stream);
-    auto ret = cms::cuda::make_host_unique<std::byte[]>(tracks.bufferSize(), stream);
-    testTrackSoAHeterogeneousT::runKernels(tracks.view(), soaSize);
-    cudaCheck(cudaMemcpy(ret.get(), tracks.buffer().get(),TrackSoAHeterogeneousT_test<>::computeDataSize(soaSize),cudaMemcpyDeviceToHost));
-    TrackSoAHeterogeneousT_test<> tmp_layout(ret.get(),soaSize);
+    // Instantiate tracks on host. Portabledevicecollection allocates
+    // SoA on device automatically.
+    int dev = cms::cuda::currentDevice();
+    pixelTrack::TrackSoA tracks_h(stream);
+
+    // Make a copy of tracks_h to device, so that we can
+    // modify hitIndices.
+    void *mem = cms::cuda::allocate_device(dev, sizeof(pixelTrack::TrackSoA), stream);
+    cudaCheck(cudaMemcpy(mem, &tracks_h, sizeof(pixelTrack::TrackSoA), cudaMemcpyHostToDevice));
+
+    // Run the tests
+    pixelTrack::TrackSoA *tracks_d = reinterpret_cast<pixelTrack::TrackSoA *>(mem);
+    testTrackSoAHeterogeneousT::runKernels(tracks_d, tracks_h.view());
+
+    // Copy SoA data back to host
+    auto ret = cms::cuda::make_host_unique<std::byte[]>(tracks_h.bufferSize(), stream);
+    cudaCheck(cudaMemcpy(ret.get(),
+                         tracks_h.buffer().get(),
+                         TrackSoAHeterogeneousT_test<>::computeDataSize(tracks_h.stride()),
+                         cudaMemcpyDeviceToHost));
+
+    // Copy tracks_d back to tracks_h
+    cudaCheck(cudaMemcpy(&tracks_h, mem, sizeof(pixelTrack::TrackSoA), cudaMemcpyDeviceToHost));
+
+    // Create a view to access the copied data
+    TrackSoAHeterogeneousT_test<> tmp_layout(ret.get(), tracks_h.stride());
     TrackSoAHeterogeneousT_test<>::View tmp_view(tmp_layout);
-    std::cout << "pt" << "\t" << "eta" << "\t" <<"chi2" << "\t" << "quality" << "\t" << "nLayers" << std::endl;
-    for(int i = 0; i < soaSize; ++i){
-      std::cout << tmp_view[i].pt() << "\t" << tmp_view[i].eta() << "\t" << tmp_view[i].chi2() << "\t" << (int)tmp_view[i].quality() << "\t" << (int)tmp_view[i].nLayers() << std::endl;
+    std::cout << "pt"
+              << "\t"
+              << "eta"
+              << "\t"
+              << "chi2"
+              << "\t"
+              << "quality"
+              << "\t"
+              << "nLayers"
+              << "\t"
+              << "hitIndices off" << std::endl;
+    for (int i = 0; i < tracks_h.stride(); ++i) {
+      std::cout << tmp_view[i].pt() << "\t" << tmp_view[i].eta() << "\t" << tmp_view[i].chi2() << "\t"
+                << (int)tmp_view[i].quality() << "\t" << (int)tmp_view[i].nLayers() << "\t"
+                << tracks_h.hitIndices.off[i] << std::endl;
     }
+    cudaCheck(cudaFree(mem));
   }
   cudaCheck(cudaStreamDestroy(stream));
 
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
index 3f95e5ed5fe3f..80b8e1c7ce140 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
@@ -1,37 +1,43 @@
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h"
 
 namespace testTrackSoAHeterogeneousT {
 
-  __global__ void fill(pixelTrack::TrackSoAView tracks, uint32_t soaSize) {
-    //assert(tracks);
+  __global__ void fill(pixelTrack::TrackSoA* __restrict__ tracks, pixelTrack::TrackSoAView tracks_view) {
+    assert(tracks);
 
     int i = threadIdx.x;
-    if (i >= soaSize)
-      return;
-    tracks[i].pt() = (float)i;
-    tracks[i].eta() = (float)i;
-    tracks[i].chi2() = (float)i;
-    tracks[i].quality() = (uint8_t)i;
-    tracks[i].nLayers() = i % 128;
+    for (int j = i; j < tracks->stride(); j += blockDim.x) {
+      tracks_view[j].pt() = (float)j;
+      tracks_view[j].eta() = (float)j;
+      tracks_view[j].chi2() = (float)j;
+      tracks_view[j].quality() = (uint8_t)j % 256;
+      tracks_view[j].nLayers() = j % 128;
+      tracks->hitIndices.off[j] = j;
+    }
   }
 
-  __global__ void verify(pixelTrack::TrackSoAConstView tracks, uint32_t soaSize) {
-    //assert(tracks);
+  __global__ void verify(pixelTrack::TrackSoA* const __restrict__ tracks, pixelTrack::TrackSoAConstView tracks_view) {
+    assert(tracks);
 
     int i = threadIdx.x;
-    if (i >= soaSize)
-      return;
-    assert(abs(tracks[i].pt() - (float)i) < .0001);
-    assert(abs(tracks[i].eta() - (float)i) < .0001);
-    assert(abs(tracks[i].chi2() - (float)i) < .0001);
-    assert(tracks[i].quality() == i);
-    assert(tracks[i].nLayers() == i % 128);
+    if (i == 0) {
+      printf("Stride: %d, block dims: %d\n", tracks->stride(), blockDim.x);
+    }
+    for (int j = i; j < tracks->stride(); j += blockDim.x) {
+      assert(abs(tracks_view[j].pt() - (float)j) < .0001);
+      assert(abs(tracks_view[j].eta() - (float)j) < .0001);
+      assert(abs(tracks_view[j].chi2() - (float)j) < .0001);
+      assert(tracks_view[j].quality() == j % 256);
+      assert(tracks_view[j].nLayers() == j % 128);
+      assert(tracks->hitIndices.off[j] == j);
+    }
   }
 
-  void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize) {
-    //assert(tracks);
-    fill<<<1, 1024>>>(tracks, soaSize);
-    verify<<<1, 1024>>>(tracks, soaSize);
+  void runKernels(pixelTrack::TrackSoA* tracks, pixelTrack::TrackSoAView tracks_view) {
+    assert(tracks);
+    fill<<<1, 1024>>>(tracks, tracks_view);
+    verify<<<1, 1024>>>(tracks, tracks_view);
   }
 
-}  // namespace testTrackingRecHit2D
+}  // namespace testTrackSoAHeterogeneousT

From e4db5066a690c9f9d3b59c18639304e6c79fd59b Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Tue, 18 Oct 2022 18:12:27 +0200
Subject: [PATCH 045/110] Changing dataFormats in RecoPixelVertexing

---
 .../plugins/PixelTrackSoAFromCUDA.cc          | 65 ++++++++++++++-----
 .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 15 +++--
 .../plugins/CAHitNtupletGeneratorOnGPU.cc     | 23 ++++---
 .../plugins/CAHitNtupletGeneratorOnGPU.h      |  4 +-
 4 files changed, 75 insertions(+), 32 deletions(-)

diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
index 57df7ae63b7a0..e43f6b028aa15 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
@@ -2,7 +2,8 @@
 
 #include "CUDADataFormats/Common/interface/Product.h"
 #include "CUDADataFormats/Common/interface/HostProduct.h"
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 #include "DataFormats/Common/interface/Handle.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
@@ -32,15 +33,27 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork>
                edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
   void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
 
-  edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenCUDA_;
-  edm::EDPutTokenT<PixelTrackHeterogeneous> tokenSOA_;
+  //edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenCUDA_;
+  //edm::EDPutTokenT<PixelTrackHeterogeneous> tokenSOA_;
 
-  cms::cuda::host::unique_ptr<pixelTrack::TrackSoA> soa_;
+  //edm::EDGetTokenT<cms::cuda::Product<TrackSoAHeterogeneousT<32768>>> tokenCUDA_;
+  edm::EDGetTokenT<cms::cuda::Product<pixelTrack::TrackSoA>> tokenCUDA_;
+  //edm::EDPutTokenT<TrackSoAHeterogeneousT_test<>::View> tokenSOA_;
+  edm::EDPutTokenT<pixelTrack::TrackSoA> tokenSOA_;
+
+  //cms::cuda::host::unique_ptr<pixelTrack::TrackSoA> soa_;
+  //cms::cuda::host::unique_ptr<pixelTrack::TrackSoA> soa_;
+  //TrackSoAHeterogeneousT_test<>::View soa_;
+  pixelTrack::TrackSoA soa_;
+  pixelTrack::TrackSoAView tmp_view_;
 };
 
 PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig)
-    : tokenCUDA_(consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("src"))),
-      tokenSOA_(produces<PixelTrackHeterogeneous>()) {}
+    //: tokenCUDA_(consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("src"))),
+    //  tokenSOA_(produces<PixelTrackHeterogeneous>()) {}
+    : tokenCUDA_(consumes<cms::cuda::Product<pixelTrack::TrackSoA>>(iConfig.getParameter<edm::InputTag>("src"))),
+      //tokenSOA_(produces<TrackSoAHeterogeneousT_test<>::View>()) {}
+      tokenSOA_(produces<pixelTrack::TrackSoA>()) {}
 
 void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
   edm::ParameterSetDescription desc;
@@ -49,7 +62,7 @@ void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& des
   descriptions.add("pixelTracksSoA", desc);
 }
 
-void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
+/*void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
                                     edm::EventSetup const& iSetup,
                                     edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
   cms::cuda::Product<PixelTrackHeterogeneous> const& inputDataWrapped = iEvent.get(tokenCUDA_);
@@ -57,13 +70,33 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
   auto const& inputData = ctx.get(inputDataWrapped);
 
   soa_ = inputData.toHostAsync(ctx.stream());
+}*/
+
+void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
+                                    edm::EventSetup const& iSetup,
+                                    edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  cms::cuda::Product<pixelTrack::TrackSoA> const& inputDataWrapped = iEvent.get(tokenCUDA_);
+  cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
+  auto const& inputData = ctx.get(inputDataWrapped);
+
+  //class_ = inputData.toHostAsync(ctx.stream());
+
+  pixelTrack::TrackSoA soa_(ctx.stream());
+  cudaCheck(cudaMemcpy(&soa_,&inputData,sizeof(pixelTrack::TrackSoA),cudaMemcpyDeviceToHost));
+
+  auto retView = cms::cuda::make_host_unique<std::byte[]>(inputData.bufferSize(), ctx.stream());
+  cudaCheck(cudaMemcpy(retView.get(),inputData.buffer().get(),TrackSoAHeterogeneousT_test<>::computeDataSize(32768),cudaMemcpyDeviceToHost));
+  TrackSoAHeterogeneousT_test<> tmp_layout(retView.get(),32768);
+  TrackSoAHeterogeneousT_test<>::View tmp_view_(tmp_layout);
+
 }
 
 void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
   // check that the fixed-size SoA does not overflow
-  auto const& tsoa = *soa_;
-  auto maxTracks = tsoa.stride();
-  auto nTracks = tsoa.view().nTracks();
+  //auto tsoa = soa_;
+  //auto maxTracks = tsoa.stride();
+  auto maxTracks = 32768;
+  auto nTracks = tmp_view_.nTracks();
   assert(nTracks < maxTracks);
   if (nTracks == maxTracks - 1) {
     edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1
@@ -71,13 +104,13 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i
   }
 
 #ifdef PIXEL_DEBUG_PRODUCE
-  std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl;
-  std::cout << "found " << nTracks << " tracks in cpu SoA at " << &tsoa << std::endl;
+  std::cout << "size of SoA " << sizeof(soa_) << " stride " << maxTracks << std::endl;
+  std::cout << "found " << nTracks << " tracks in cpu SoA at " << &soa_ << std::endl;
 
   int32_t nt = 0;
   for (int32_t it = 0; it < maxTracks; ++it) {
-    auto nHits = tsoa.nHits(it);
-    assert(nHits == int(tsoa.hitIndices.size(it)));
+    auto nHits = soa_.nHits(it);
+    assert(nHits == int(soa_.hitIndices.size(it)));
     if (nHits == 0)
       break;  // this is a guard: maybe we need to move to nTracks...
     nt++;
@@ -86,9 +119,9 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i
 #endif
 
   // DO NOT  make a copy  (actually TWO....)
-  iEvent.emplace(tokenSOA_, std::move(soa_));
+  iEvent.emplace(tokenSOA_, std::move(soa_));//, std::move(ret)); // view
 
-  assert(!soa_);
+  //assert(!soa_);
 }
 
 DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
index 72c482c6189db..c9831afc01067 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
@@ -20,7 +20,8 @@
 #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h"
 
 #include "CAHitNtupletGeneratorOnGPU.h"
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
 
 class CAHitNtupletCUDA : public edm::global::EDProducer<> {
@@ -40,9 +41,11 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> {
 
   edm::ESGetToken<MagneticField, IdealMagneticFieldRecord> tokenField_;
   edm::EDGetTokenT<cms::cuda::Product<TrackingRecHit2DGPU>> tokenHitGPU_;
-  edm::EDPutTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenTrackGPU_;
+  //edm::EDPutTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenTrackGPU_;
+  edm::EDPutTokenT<cms::cuda::Product<pixelTrack::TrackSoA>> tokenTrackGPU_;
   edm::EDGetTokenT<TrackingRecHit2DCPU> tokenHitCPU_;
-  edm::EDPutTokenT<PixelTrackHeterogeneous> tokenTrackCPU_;
+  //edm::EDPutTokenT<PixelTrackHeterogeneous> tokenTrackCPU_;
+  edm::EDPutTokenT<pixelTrack::TrackSoA> tokenTrackCPU_;
 
   CAHitNtupletGeneratorOnGPU gpuAlgo_;
 };
@@ -52,10 +55,12 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig)
   if (onGPU_) {
     tokenHitGPU_ =
         consumes<cms::cuda::Product<TrackingRecHit2DGPU>>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"));
-    tokenTrackGPU_ = produces<cms::cuda::Product<PixelTrackHeterogeneous>>();
+    //tokenTrackGPU_ = produces<cms::cuda::Product<PixelTrackHeterogeneous>>();
+    tokenTrackGPU_ = produces<cms::cuda::Product<pixelTrack::TrackSoA>>();
   } else {
     tokenHitCPU_ = consumes<TrackingRecHit2DCPU>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"));
-    tokenTrackCPU_ = produces<PixelTrackHeterogeneous>();
+    //tokenTrackCPU_ = produces<PixelTrackHeterogeneous>();
+    tokenTrackCPU_ = produces<pixelTrack::TrackSoA>();
   }
 }
 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
index 4a5689f572e47..6c5fdb36a9d46 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -20,6 +20,8 @@
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
 #include "TrackingTools/DetLayers/interface/BarrelDetLayer.h"
 
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+
 #include "CAHitNtupletGeneratorOnGPU.h"
 
 namespace {
@@ -184,13 +186,15 @@ void CAHitNtupletGeneratorOnGPU::endJob() {
   }
 }
 
-PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
+/*PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
                                                                     float bfield,
                                                                     cudaStream_t stream) const {
-  PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique<pixelTrack::TrackSoA>(stream));
-
-  auto* soa = tracks.get();
-  assert(soa);
+  PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique<pixelTrack::TrackSoA>(stream));*/
+  pixelTrack::TrackSoA CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
+                                                                      float bfield,
+                                                                      cudaStream_t stream) const {
+  pixelTrack::TrackSoA tracks(stream);
+  auto* soa = &tracks;
 
   CAHitNtupletGeneratorKernelsGPU kernels(m_params);
   kernels.setCounters(m_counters);
@@ -217,11 +221,12 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH
   return tracks;
 }
 
-PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const {
-  PixelTrackHeterogeneous tracks(std::make_unique<pixelTrack::TrackSoA>());
+pixelTrack::TrackSoA CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const {
+  //PixelTrackHeterogeneous tracks(std::make_unique<pixelTrack::TrackSoA>());
+  pixelTrack::TrackSoA tracks;
 
-  auto* soa = tracks.get();
-  assert(soa);
+  auto* soa = &tracks;
+  //assert(soa);
 
   CAHitNtupletGeneratorKernelsCPU kernels(m_params);
   kernels.setCounters(m_counters);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
index 36212298aac2f..ff13d09c1361a 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
@@ -47,9 +47,9 @@ class CAHitNtupletGeneratorOnGPU {
   void beginJob();
   void endJob();
 
-  PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const;
+  pixelTrack::TrackSoA makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const;
 
-  PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const;
+  pixelTrack::TrackSoA makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const;
 
 private:
   void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const;

From ba895c004bdf3153ca02d2e2b584fe44873211fc Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Wed, 19 Oct 2022 18:17:38 +0200
Subject: [PATCH 046/110] HitContainer added as SOA_SCALAR

---
 .../interface/TrackSoAHeterogeneousT_test.h   | 27 +++++++++++--------
 .../Track/test/TrackSoAHeterogeneous_test.cpp |  3 ++-
 .../Track/test/TrackSoAHeterogeneous_test.cu  |  2 ++
 3 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index cfb895f0c40b6..998f63e608244 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -1,6 +1,7 @@
 #ifndef CUDADataFormats_Track_TrackHeterogeneousT_H
 #define CUDADataFormats_Track_TrackHeterogeneousT_H
 
+#include <bits/stdint-uintn.h>
 #include <string>
 #include <algorithm>
 
@@ -19,6 +20,15 @@ namespace pixelTrack {
     auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName;
     return static_cast<Quality>(qp);
   }
+
+#ifdef GPU_SMALL_EVENTS
+  // kept for testing and debugging
+  constexpr uint32_t maxNumber() { return 2 * 1024; }
+#else
+  // tested on MC events with 55-75 pileup events
+  constexpr uint32_t maxNumber() { return 32 * 1024; }
+#endif
+
 }  // namespace pixelTrack
 
 using Vector5f = Eigen::Matrix<float, 5, 1>;
@@ -26,6 +36,7 @@ using Vector15f = Eigen::Matrix<float, 15, 1>;
 
 using Vector5d = Eigen::Matrix<double, 5, 1>;
 using Matrix5d = Eigen::Matrix<double, 5, 5>;
+using HitContainer = cms::cuda::OneToManyAssoc<uint32_t, pixelTrack::maxNumber() + 1, 5 * pixelTrack::maxNumber()>;
 
 GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test,
                     SOA_COLUMN(uint8_t, quality),
@@ -35,7 +46,9 @@ GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test,
                     SOA_COLUMN(float, pt),
                     SOA_EIGEN_COLUMN(Vector5f, state),
                     SOA_EIGEN_COLUMN(Vector15f, covariance),
-                    SOA_SCALAR(int, nTracks))
+                    SOA_SCALAR(int, nTracks),
+                    SOA_SCALAR(HitContainer, hitIndices),
+                    SOA_SCALAR(HitContainer, detIndices))
 
 // Previous TrajectoryStateSoAT class methods
 namespace pixelTrack {
@@ -110,7 +123,7 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
 
   using Quality = pixelTrack::Quality;
   using hindex_type = uint32_t;
-  using HitContainer = cms::cuda::OneToManyAssoc<hindex_type, S + 1, 5 * S>;
+  // using HitContainer = cms::cuda::OneToManyAssoc<hindex_type, S + 1, 5 * S>;
 
   // Always check quality is at least loose!
   // CUDA does not support enums  in __lgc ...
@@ -142,19 +155,11 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
 
 namespace pixelTrack {
 
-#ifdef GPU_SMALL_EVENTS
-  // kept for testing and debugging
-  constexpr uint32_t maxNumber() { return 2 * 1024; }
-#else
-  // tested on MC events with 55-75 pileup events
-  constexpr uint32_t maxNumber() { return 32 * 1024; }
-#endif
-
   using TrackSoA = TrackSoAHeterogeneousT<maxNumber()>;
   using TrackSoAView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::View;
   using TrackSoAConstView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::ConstView;
 
-  using HitContainer = TrackSoA::HitContainer;
+  // using HitContainer = TrackSoA::HitContainer;
 
 }  // namespace pixelTrack
 
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index 431dbd4577297..d40ec10af2fa4 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -58,8 +58,9 @@ int main() {
     for (int i = 0; i < tracks_h.stride(); ++i) {
       std::cout << tmp_view[i].pt() << "\t" << tmp_view[i].eta() << "\t" << tmp_view[i].chi2() << "\t"
                 << (int)tmp_view[i].quality() << "\t" << (int)tmp_view[i].nLayers() << "\t"
-                << tracks_h.hitIndices.off[i] << std::endl;
+                << tmp_view.hitIndices().off[i] << std::endl;
     }
+
     cudaCheck(cudaFree(mem));
   }
   cudaCheck(cudaStreamDestroy(stream));
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
index 80b8e1c7ce140..3ca5fc4994257 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
@@ -13,6 +13,7 @@ namespace testTrackSoAHeterogeneousT {
       tracks_view[j].chi2() = (float)j;
       tracks_view[j].quality() = (uint8_t)j % 256;
       tracks_view[j].nLayers() = j % 128;
+      tracks_view.hitIndices().off[j] = j;
       tracks->hitIndices.off[j] = j;
     }
   }
@@ -30,6 +31,7 @@ namespace testTrackSoAHeterogeneousT {
       assert(abs(tracks_view[j].chi2() - (float)j) < .0001);
       assert(tracks_view[j].quality() == j % 256);
       assert(tracks_view[j].nLayers() == j % 128);
+      assert(tracks_view.hitIndices().off[j] == j);
       assert(tracks->hitIndices.off[j] == j);
     }
   }

From 8d927933354b07c8eac74355a285bdfd82f18793 Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Wed, 19 Oct 2022 18:36:07 +0200
Subject: [PATCH 047/110] Changing object types in plugin files to
 pixelTrack::TrackSoA

---
 .../plugins/PixelTrackDumpCUDA.cc             | 22 ++++++++++++-------
 .../plugins/PixelTrackProducerFromSoA.cc      | 13 ++++++-----
 .../plugins/CAHitNtupletGeneratorKernels.h    |  3 ++-
 .../plugins/CAHitNtupletGeneratorOnGPU.h      |  3 ++-
 .../PixelTriplets/plugins/GPUCACell.h         |  3 ++-
 .../PixelTriplets/plugins/HelixFitOnGPU.h     |  3 ++-
 .../plugins/PixelVertexProducerCUDA.cc        | 20 +++++++++++------
 .../plugins/gpuVertexFinder.cc                | 21 ++++++++++--------
 .../plugins/gpuVertexFinder.h                 |  4 +++-
 9 files changed, 58 insertions(+), 34 deletions(-)

diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
index f3d6022e21654..59489c8e11f5f 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
@@ -1,7 +1,8 @@
 #include <cuda_runtime.h>
 
 #include "CUDADataFormats/Common/interface/Product.h"
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
 #include "DataFormats/Common/interface/Handle.h"
@@ -30,9 +31,11 @@ class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> {
 private:
   void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override;
   const bool m_onGPU;
-  edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenGPUTrack_;
+  //edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenGPUTrack_;
+  edm::EDGetTokenT<cms::cuda::Product<pixelTrack::TrackSoA>> tokenGPUTrack_;
   edm::EDGetTokenT<cms::cuda::Product<ZVertexHeterogeneous>> tokenGPUVertex_;
-  edm::EDGetTokenT<PixelTrackHeterogeneous> tokenSoATrack_;
+  //edm::EDGetTokenT<PixelTrackHeterogeneous> tokenSoATrack_;
+  edm::EDGetTokenT<pixelTrack::TrackSoA> tokenSoATrack_;
   edm::EDGetTokenT<ZVertexHeterogeneous> tokenSoAVertex_;
 };
 
@@ -40,11 +43,13 @@ PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig)
     : m_onGPU(iConfig.getParameter<bool>("onGPU")) {
   if (m_onGPU) {
     tokenGPUTrack_ =
-        consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
+        //consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
+        consumes<cms::cuda::Product<pixelTrack::TrackSoA>>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
     tokenGPUVertex_ =
         consumes<cms::cuda::Product<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
   } else {
-    tokenSoATrack_ = consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
+    //tokenSoATrack_ = consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
+    tokenSoATrack_ = consumes<pixelTrack::TrackSoA>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
     tokenSoAVertex_ = consumes<ZVertexHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
   }
 }
@@ -66,7 +71,8 @@ void PixelTrackDumpCUDA::analyze(edm::StreamID streamID,
     cms::cuda::ScopedContextProduce ctx{hTracks};
 
     auto const& tracks = ctx.get(hTracks);
-    auto const* tsoa = tracks.get();
+    //auto const* tsoa = tracks.get();
+    auto const* tsoa = &tracks;
     assert(tsoa);
 
     auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_));
@@ -74,8 +80,8 @@ void PixelTrackDumpCUDA::analyze(edm::StreamID streamID,
     assert(vsoa);
 
   } else {
-    auto const* tsoa = iEvent.get(tokenSoATrack_).get();
-    assert(tsoa);
+    auto const& tsoa = iEvent.get(tokenSoATrack_);
+    assert(tsoa.buffer());
 
     auto const* vsoa = iEvent.get(tokenSoAVertex_).get();
     assert(vsoa);
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
index e6d49cde90d6a..212d2571c09c7 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
@@ -27,7 +27,8 @@
 #include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h"
 
 #include "CUDADataFormats/Common/interface/HostProduct.h"
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
 
 #include "storeTracks.h"
@@ -35,7 +36,7 @@
 
 /**
  * This class creates "leagcy"  reco::Track
- * objects from the output of SoA CA. 
+ * objects from the output of SoA CA.
  */
 class PixelTrackProducerFromSoA : public edm::global::EDProducer<> {
 public:
@@ -54,7 +55,8 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> {
 
   // Event Data tokens
   const edm::EDGetTokenT<reco::BeamSpot> tBeamSpot_;
-  const edm::EDGetTokenT<PixelTrackHeterogeneous> tokenTrack_;
+  //const edm::EDGetTokenT<PixelTrackHeterogeneous> tokenTrack_;
+  const edm::EDGetTokenT<pixelTrack::TrackSoA> tokenTrack_;
   const edm::EDGetTokenT<SiPixelRecHitCollectionNew> cpuHits_;
   const edm::EDGetTokenT<HMSstorage> hmsToken_;
   // Event Setup tokens
@@ -67,7 +69,8 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> {
 
 PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig)
     : tBeamSpot_(consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpot"))),
-      tokenTrack_(consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("trackSrc"))),
+      //tokenTrack_(consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("trackSrc"))),
+      tokenTrack_(consumes<pixelTrack::TrackSoA>(iConfig.getParameter<edm::InputTag>("trackSrc"))),
       cpuHits_(consumes<SiPixelRecHitCollectionNew>(iConfig.getParameter<edm::InputTag>("pixelRecHitLegacySrc"))),
       hmsToken_(consumes<HMSstorage>(iConfig.getParameter<edm::InputTag>("pixelRecHitLegacySrc"))),
       idealMagneticFieldToken_(esConsumes()),
@@ -152,7 +155,7 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
   std::vector<const TrackingRecHit *> hits;
   hits.reserve(5);
 
-  const auto &tsoa = *iEvent.get(tokenTrack_);
+  const auto &tsoa = iEvent.get(tokenTrack_);
 
   auto const *quality = tsoa.qualityData();
   // auto const &fit = tsoa.stateAtBS;
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
index fcab52e96d210..372c7ccd3f96c 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
@@ -3,7 +3,8 @@
 
 // #define GPU_DEBUG
 
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 #include "GPUCACell.h"
 
 // #define DUMP_GPU_TK_TUPLES
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
index ff13d09c1361a..ad64ae19037a3 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
@@ -3,7 +3,8 @@
 
 #include <cuda_runtime.h>
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 
 #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
index 4ec7069ac8e1b..a0c3930d1a739 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
@@ -14,7 +14,8 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
 #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h"
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 #include "CAConstants.h"
 
 class GPUCACell {
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
index 67a180c53e887..7a356cf2d7dea 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
@@ -1,7 +1,8 @@
 #ifndef RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h
 #define RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h
 
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
 #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h"
 
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
index 34b0ed9e29fc1..f240e77727293 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
@@ -16,6 +16,7 @@
 #include "FWCore/Utilities/interface/EDGetToken.h"
 #include "FWCore/Utilities/interface/RunningAverage.h"
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 
 #include "gpuVertexFinder.h"
 
@@ -35,9 +36,11 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> {
 
   bool onGPU_;
 
-  edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenGPUTrack_;
+  //edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenGPUTrack_;
+  edm::EDGetTokenT<cms::cuda::Product<pixelTrack::TrackSoA>> tokenGPUTrack_;
   edm::EDPutTokenT<ZVertexCUDAProduct> tokenGPUVertex_;
-  edm::EDGetTokenT<PixelTrackHeterogeneous> tokenCPUTrack_;
+  //edm::EDGetTokenT<PixelTrackHeterogeneous> tokenCPUTrack_;
+  edm::EDGetTokenT<pixelTrack::TrackSoA> tokenCPUTrack_;
   edm::EDPutTokenT<ZVertexHeterogeneous> tokenCPUVertex_;
 
   const gpuVertexFinder::Producer gpuAlgo_;
@@ -62,10 +65,12 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf)
 {
   if (onGPU_) {
     tokenGPUTrack_ =
-        consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
+        //consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
+        consumes<cms::cuda::Product<pixelTrack::TrackSoA>>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
     tokenGPUVertex_ = produces<ZVertexCUDAProduct>();
   } else {
-    tokenCPUTrack_ = consumes<PixelTrackHeterogeneous>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
+    //tokenCPUTrack_ = consumes<PixelTrackHeterogeneous>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
+    tokenCPUTrack_ = consumes<pixelTrack::TrackSoA>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
     tokenCPUVertex_ = produces<ZVertexHeterogeneous>();
   }
 }
@@ -97,11 +102,12 @@ void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& d
 void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID,
                                            edm::Event& iEvent,
                                            const edm::EventSetup& iSetup) const {
-  edm::Handle<cms::cuda::Product<PixelTrackHeterogeneous>> hTracks;
+  //edm::Handle<cms::cuda::Product<PixelTrackHeterogeneous>> hTracks;
+  edm::Handle<cms::cuda::Product<pixelTrack::TrackSoA>> hTracks;
   iEvent.getByToken(tokenGPUTrack_, hTracks);
 
   cms::cuda::ScopedContextProduce ctx{*hTracks};
-  auto const* tracks = ctx.get(*hTracks).get();
+  auto const* tracks = &ctx.get(*hTracks);
 
   assert(tracks);
 
@@ -111,7 +117,7 @@ void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID,
 void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID,
                                            edm::Event& iEvent,
                                            const edm::EventSetup& iSetup) const {
-  auto const* tracks = iEvent.get(tokenCPUTrack_).get();
+  auto const* tracks = &iEvent.get(tokenCPUTrack_);
   assert(tracks);
 
 #ifdef PIXVERTEX_DEBUG_PRODUCE
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
index 20b007d2d029f..2fbd44147db33 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
@@ -7,6 +7,8 @@
 #include "gpuSortByPt2.h"
 #include "gpuSplitVertices.h"
 
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+
 #undef PIXVERTEX_DEBUG_PRODUCE
 
 namespace gpuVertexFinder {
@@ -17,28 +19,29 @@ namespace gpuVertexFinder {
 
   // split vertices with a chi2/NDoF greater than this
   constexpr float maxChi2ForSplit = 9.f;
+  using TkSoAConstView = pixelTrack::TrackSoAConstView;
 
-  __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) {
+  __global__ void loadTracks(TkSoA const* ptracks, TkSoAConstView ptracksView, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) {
     assert(ptracks);
     assert(soa);
     auto const& tracks = *ptracks;
-    auto const& fit = tracks.stateAtBS;
+    //auto const& fit = tracks.stateAtBS;
     auto const* quality = tracks.qualityData();
 
     auto first = blockIdx.x * blockDim.x + threadIdx.x;
-    for (int idx = first, nt = tracks.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) {
+    for (int idx = first, nt = ptracksView.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) {
       auto nHits = tracks.nHits(idx);
       assert(nHits >= 3);
 
       // initialize soa...
       soa->idv[idx] = -1;
 
-      if (tracks.isTriplet(idx))
+      if (pixelTrack::utilities::isTriplet(ptracksView,idx))
         continue;  // no triplets
       if (quality[idx] < pixelTrack::Quality::highPurity)
         continue;
 
-      auto pt = tracks.pt(idx);
+      auto pt = ptracksView[idx].pt();
 
       if (pt < ptMin)
         continue;
@@ -49,8 +52,8 @@ namespace gpuVertexFinder {
       auto& data = *pws;
       auto it = atomicAdd(&data.ntrks, 1);
       data.itrk[it] = idx;
-      data.zt[it] = tracks.zip(idx);
-      data.ezt2[it] = fit.covariance(idx)(14);
+      data.zt[it] = pixelTrack::utilities::zip(ptracksView,idx);
+      data.ezt2[it] = ptracksView[idx].covariance()(14);
       data.ptt2[it] = pt * pt;
     }
   }
@@ -121,11 +124,11 @@ namespace gpuVertexFinder {
     init<<<1, 1, 0, stream>>>(soa, ws_d.get());
     auto blockSize = 128;
     auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize;
-    loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin, ptMax);
+    loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, tksoa->view(), soa, ws_d.get(), ptMin, ptMax);
     cudaCheck(cudaGetLastError());
 #else
     init(soa, ws_d.get());
-    loadTracks(tksoa, soa, ws_d.get(), ptMin, ptMax);
+    loadTracks(tksoa, tksoa->view(), soa, ws_d.get(), ptMin, ptMax);
 #endif
 
 #ifdef __CUDACC__
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
index 2b6a8107d927f..66b70409b58fd 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
@@ -5,7 +5,8 @@
 #include <cstdint>
 
 #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 
 namespace gpuVertexFinder {
 
@@ -43,6 +44,7 @@ namespace gpuVertexFinder {
     using ZVertices = ZVertexSoA;
     using WorkSpace = gpuVertexFinder::WorkSpace;
     using TkSoA = pixelTrack::TrackSoA;
+    using TkSoAConstView = pixelTrack::TrackSoAConstView;
 
     Producer(bool oneKernel,
              bool useDensity,

From d1a14c2d79491ec68e302777da9238a3db60aff7 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Wed, 19 Oct 2022 19:52:33 +0200
Subject: [PATCH 048/110] Refactored Track class, updated test

---
 .../interface/TrackSoAHeterogeneousT_test.h   | 39 +++++++------------
 .../Track/test/TrackSoAHeterogeneous_test.cpp | 10 ++---
 .../Track/test/TrackSoAHeterogeneous_test.cu  | 30 +++++++-------
 3 files changed, 35 insertions(+), 44 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index 998f63e608244..298bd276390b0 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -107,13 +107,27 @@ namespace pixelTrack {
           cov(k, j) = cov(j, k) = tracks[i].covariance()(ind++);
       }
     }
+
+    __host__ __device__ inline int computeNumberOfLayers(TrackSoAConstView tracks, int32_t i) {
+      auto pdet = tracks.detIndices().begin(i);
+      int nl = 1;
+      auto ol = phase1PixelTopology::getLayer(*pdet);
+      for (; pdet < tracks.detIndices().end(i); ++pdet) {
+        auto il = phase1PixelTopology::getLayer(*pdet);
+        if (il != ol)
+          ++nl;
+        ol = il;
+      }
+      return nl;
+    }
+
+    __host__ __device__ inline int nHits(TrackSoAConstView tracks, int i) { return tracks.detIndices().size(i); }
   }  // namespace utilities
 }  // namespace pixelTrack
 
 template <int32_t S>
 class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>> {
 public:
-  // using cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::PortableDeviceCollection;
   TrackSoAHeterogeneousT() = default;
 
   explicit TrackSoAHeterogeneousT(cudaStream_t stream)
@@ -123,34 +137,13 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
 
   using Quality = pixelTrack::Quality;
   using hindex_type = uint32_t;
-  // using HitContainer = cms::cuda::OneToManyAssoc<hindex_type, S + 1, 5 * S>;
 
   // Always check quality is at least loose!
   // CUDA does not support enums  in __lgc ...
 private:
 public:
-  // TODO: static did not work; using reinterpret_cast
   constexpr Quality const *qualityData() const { return reinterpret_cast<Quality const *>(view().quality()); }
   constexpr Quality *qualityData() { return reinterpret_cast<Quality *>(view().quality()); }
-
-  constexpr int nHits(int i) const { return detIndices.size(i); }
-
-  constexpr int computeNumberOfLayers(int32_t i) const {
-    // layers are in order and we assume tracks are either forward or backward
-    auto pdet = detIndices.begin(i);
-    int nl = 1;
-    auto ol = phase1PixelTopology::getLayer(*pdet);
-    for (; pdet < detIndices.end(i); ++pdet) {
-      auto il = phase1PixelTopology::getLayer(*pdet);
-      if (il != ol)
-        ++nl;
-      ol = il;
-    }
-    return nl;
-  }
-
-  HitContainer hitIndices;
-  HitContainer detIndices;
 };
 
 namespace pixelTrack {
@@ -159,8 +152,6 @@ namespace pixelTrack {
   using TrackSoAView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::View;
   using TrackSoAConstView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::ConstView;
 
-  // using HitContainer = TrackSoA::HitContainer;
-
 }  // namespace pixelTrack
 
 #endif  // CUDADataFormats_Track_TrackHeterogeneousT_H
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index d40ec10af2fa4..34704f16c1840 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -1,3 +1,4 @@
+#include <bits/stdint-uintn.h>
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
@@ -6,7 +7,7 @@
 
 namespace testTrackSoAHeterogeneousT {
 
-  void runKernels(pixelTrack::TrackSoA *tracks, pixelTrack::TrackSoAView tracks_view);
+  void runKernels(pixelTrack::TrackSoAView tracks_view, uint32_t soaSize);
 }
 
 int main() {
@@ -28,8 +29,7 @@ int main() {
     cudaCheck(cudaMemcpy(mem, &tracks_h, sizeof(pixelTrack::TrackSoA), cudaMemcpyHostToDevice));
 
     // Run the tests
-    pixelTrack::TrackSoA *tracks_d = reinterpret_cast<pixelTrack::TrackSoA *>(mem);
-    testTrackSoAHeterogeneousT::runKernels(tracks_d, tracks_h.view());
+    testTrackSoAHeterogeneousT::runKernels(tracks_h.view(), tracks_h->metadata().size());
 
     // Copy SoA data back to host
     auto ret = cms::cuda::make_host_unique<std::byte[]>(tracks_h.bufferSize(), stream);
@@ -38,7 +38,6 @@ int main() {
                          TrackSoAHeterogeneousT_test<>::computeDataSize(tracks_h.stride()),
                          cudaMemcpyDeviceToHost));
 
-    // Copy tracks_d back to tracks_h
     cudaCheck(cudaMemcpy(&tracks_h, mem, sizeof(pixelTrack::TrackSoA), cudaMemcpyDeviceToHost));
 
     // Create a view to access the copied data
@@ -55,7 +54,8 @@ int main() {
               << "nLayers"
               << "\t"
               << "hitIndices off" << std::endl;
-    for (int i = 0; i < tracks_h.stride(); ++i) {
+    // for (int i = 0; i < tracks_h.stride(); ++i) {
+    for (int i = 0; i < 10; ++i) {
       std::cout << tmp_view[i].pt() << "\t" << tmp_view[i].eta() << "\t" << tmp_view[i].chi2() << "\t"
                 << (int)tmp_view[i].quality() << "\t" << (int)tmp_view[i].nLayers() << "\t"
                 << tmp_view.hitIndices().off[i] << std::endl;
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
index 3ca5fc4994257..38c7ab61eeece 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
@@ -3,43 +3,43 @@
 
 namespace testTrackSoAHeterogeneousT {
 
-  __global__ void fill(pixelTrack::TrackSoA* __restrict__ tracks, pixelTrack::TrackSoAView tracks_view) {
-    assert(tracks);
-
+  __global__ void fill(pixelTrack::TrackSoAView tracks_view) {
     int i = threadIdx.x;
-    for (int j = i; j < tracks->stride(); j += blockDim.x) {
+    if (i == 0) {
+      tracks_view.nTracks() = 420;
+    }
+
+    for (int j = i; j < tracks_view.metadata().size(); j += blockDim.x) {
       tracks_view[j].pt() = (float)j;
       tracks_view[j].eta() = (float)j;
       tracks_view[j].chi2() = (float)j;
       tracks_view[j].quality() = (uint8_t)j % 256;
       tracks_view[j].nLayers() = j % 128;
       tracks_view.hitIndices().off[j] = j;
-      tracks->hitIndices.off[j] = j;
     }
   }
 
-  __global__ void verify(pixelTrack::TrackSoA* const __restrict__ tracks, pixelTrack::TrackSoAConstView tracks_view) {
-    assert(tracks);
-
+  __global__ void verify(pixelTrack::TrackSoAView tracks_view) {
     int i = threadIdx.x;
+
     if (i == 0) {
-      printf("Stride: %d, block dims: %d\n", tracks->stride(), blockDim.x);
+      printf("SoA size: % d, block dims: % d\n", tracks_view.metadata().size(), blockDim.x);
+      assert(tracks_view.nTracks() == 420);
     }
-    for (int j = i; j < tracks->stride(); j += blockDim.x) {
+    for (int j = i; j < tracks_view.metadata().size(); j += blockDim.x) {
       assert(abs(tracks_view[j].pt() - (float)j) < .0001);
       assert(abs(tracks_view[j].eta() - (float)j) < .0001);
       assert(abs(tracks_view[j].chi2() - (float)j) < .0001);
       assert(tracks_view[j].quality() == j % 256);
       assert(tracks_view[j].nLayers() == j % 128);
       assert(tracks_view.hitIndices().off[j] == j);
-      assert(tracks->hitIndices.off[j] == j);
     }
   }
 
-  void runKernels(pixelTrack::TrackSoA* tracks, pixelTrack::TrackSoAView tracks_view) {
-    assert(tracks);
-    fill<<<1, 1024>>>(tracks, tracks_view);
-    verify<<<1, 1024>>>(tracks, tracks_view);
+  void runKernels(pixelTrack::TrackSoAView tracks_view, uint32_t soaSize) {
+    fill<<<1, 1024>>>(tracks_view);
+    cudaDeviceSynchronize();
+    verify<<<1, 1024>>>(tracks_view);
   }
 
 }  // namespace testTrackSoAHeterogeneousT

From 4739777264d8137ee5031c5e6d1d7b19e605d463 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 20 Oct 2022 11:22:03 +0200
Subject: [PATCH 049/110] Clean up test for TrackSoA

---
 .../Track/test/TrackSoAHeterogeneous_test.cpp | 33 +++++++------------
 .../Track/test/TrackSoAHeterogeneous_test.cu  |  2 +-
 2 files changed, 13 insertions(+), 22 deletions(-)

diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index 34704f16c1840..28dbd0c9b029b 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -7,7 +7,7 @@
 
 namespace testTrackSoAHeterogeneousT {
 
-  void runKernels(pixelTrack::TrackSoAView tracks_view, uint32_t soaSize);
+  void runKernels(pixelTrack::TrackSoAView tracks_view);
 }
 
 int main() {
@@ -16,33 +16,26 @@ int main() {
   cudaStream_t stream;
   cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
-  // inner scope to deallocate memory before destroying the stream
+  // Inner scope to deallocate memory before destroying the stream
   {
     // Instantiate tracks on host. Portabledevicecollection allocates
     // SoA on device automatically.
-    int dev = cms::cuda::currentDevice();
-    pixelTrack::TrackSoA tracks_h(stream);
-
-    // Make a copy of tracks_h to device, so that we can
-    // modify hitIndices.
-    void *mem = cms::cuda::allocate_device(dev, sizeof(pixelTrack::TrackSoA), stream);
-    cudaCheck(cudaMemcpy(mem, &tracks_h, sizeof(pixelTrack::TrackSoA), cudaMemcpyHostToDevice));
+    pixelTrack::TrackSoA tracks(stream);
+    uint32_t soaSize = tracks.bufferSize();               // SoA Layout size (bytes)
+    uint32_t soaNumElements = tracks->metadata().size();  // Length of each SoA array in elements
 
     // Run the tests
-    testTrackSoAHeterogeneousT::runKernels(tracks_h.view(), tracks_h->metadata().size());
+    testTrackSoAHeterogeneousT::runKernels(tracks.view());
 
     // Copy SoA data back to host
-    auto ret = cms::cuda::make_host_unique<std::byte[]>(tracks_h.bufferSize(), stream);
-    cudaCheck(cudaMemcpy(ret.get(),
-                         tracks_h.buffer().get(),
-                         TrackSoAHeterogeneousT_test<>::computeDataSize(tracks_h.stride()),
-                         cudaMemcpyDeviceToHost));
-
-    cudaCheck(cudaMemcpy(&tracks_h, mem, sizeof(pixelTrack::TrackSoA), cudaMemcpyDeviceToHost));
+    auto tracks_h_soa = cms::cuda::make_host_unique<std::byte[]>(soaSize, stream);
+    cudaCheck(cudaMemcpy(tracks_h_soa.get(), tracks.const_buffer().get(), soaSize, cudaMemcpyDeviceToHost));
 
     // Create a view to access the copied data
-    TrackSoAHeterogeneousT_test<> tmp_layout(ret.get(), tracks_h.stride());
+    TrackSoAHeterogeneousT_test<> tmp_layout(tracks_h_soa.get(), soaNumElements);
     TrackSoAHeterogeneousT_test<>::View tmp_view(tmp_layout);
+
+    // Print results
     std::cout << "pt"
               << "\t"
               << "eta"
@@ -54,14 +47,12 @@ int main() {
               << "nLayers"
               << "\t"
               << "hitIndices off" << std::endl;
-    // for (int i = 0; i < tracks_h.stride(); ++i) {
+
     for (int i = 0; i < 10; ++i) {
       std::cout << tmp_view[i].pt() << "\t" << tmp_view[i].eta() << "\t" << tmp_view[i].chi2() << "\t"
                 << (int)tmp_view[i].quality() << "\t" << (int)tmp_view[i].nLayers() << "\t"
                 << tmp_view.hitIndices().off[i] << std::endl;
     }
-
-    cudaCheck(cudaFree(mem));
   }
   cudaCheck(cudaStreamDestroy(stream));
 
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
index 38c7ab61eeece..4e3f7ee6c9388 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
@@ -36,7 +36,7 @@ namespace testTrackSoAHeterogeneousT {
     }
   }
 
-  void runKernels(pixelTrack::TrackSoAView tracks_view, uint32_t soaSize) {
+  void runKernels(pixelTrack::TrackSoAView tracks_view) {
     fill<<<1, 1024>>>(tracks_view);
     cudaDeviceSynchronize();
     verify<<<1, 1024>>>(tracks_view);

From c3599aeb170c648b8484c5950d6d40ded8e3594d Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 20 Oct 2022 11:23:02 +0200
Subject: [PATCH 050/110] Added TODO

---
 CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
index 4e3f7ee6c9388..9c59d867629b2 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
@@ -19,6 +19,7 @@ namespace testTrackSoAHeterogeneousT {
     }
   }
 
+  // TODO: Using TrackSoAConstView fails to assert hitIndices correctly
   __global__ void verify(pixelTrack::TrackSoAView tracks_view) {
     int i = threadIdx.x;
 

From f131e237bcf0e0c86c3cf0efa84b24d6be3133d5 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 20 Oct 2022 11:25:44 +0200
Subject: [PATCH 051/110] Docstring for test

---
 .../Track/test/TrackSoAHeterogeneous_test.cpp        | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index 28dbd0c9b029b..f473fd2023b8f 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -1,3 +1,15 @@
+/**
+   Simple test of the pixelTrack::TrackSoA data structure
+   which inherits from PortableDeviceCollection.
+
+   Creates an instance of the class (automatically allocates
+   memory on device), passes the view of the SoA data to
+   the CUDA kernels which:
+   - Fill the SoA with data.
+   - Verify that the data written is correct.
+
+ */
+
 #include <bits/stdint-uintn.h>
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"

From 0e110223163d718e17a81bd3d068be79d195ffbc Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 20 Oct 2022 11:27:20 +0200
Subject: [PATCH 052/110] More details in docstring

---
 CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index f473fd2023b8f..9bfd445bd786c 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -1,5 +1,5 @@
 /**
-   Simple test of the pixelTrack::TrackSoA data structure
+   Simple test for the pixelTrack::TrackSoA data structure
    which inherits from PortableDeviceCollection.
 
    Creates an instance of the class (automatically allocates
@@ -7,7 +7,10 @@
    the CUDA kernels which:
    - Fill the SoA with data.
    - Verify that the data written is correct.
-
+   
+   Then, the SoA data are copied back to Host, where
+   a temporary host-side view (tmp_view) is created using
+   the same Layout to access the data on host and print it.
  */
 
 #include <bits/stdint-uintn.h>

From ffd6f3ef202bc1f9a71b18bf9f5f47b22f844f9e Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 20 Oct 2022 11:38:15 +0200
Subject: [PATCH 053/110] Multiplicity kernels

---
 .../plugins/CAHitNtupletGeneratorKernels.cu       |  8 ++++----
 .../plugins/CAHitNtupletGeneratorKernelsImpl.h    | 15 ++++++---------
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
index 168ba3b0c8144..c5f3ff3a1b649 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
@@ -4,8 +4,8 @@
 template <>
 void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
   // these are pointer on GPU!
-  auto *tuples_d = &tracks_d->hitIndices;
-  auto *detId_d = &tracks_d->detIndices;
+  auto *tuples_d = &tracks_d->hitIndices();
+  auto *detId_d = &tracks_d->detIndices();
   auto *quality_d = tracks_d->qualityData();
 
   // zero tuples
@@ -102,8 +102,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
 
   blockSize = 128;
   numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize;
-  kernel_countMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-      tuples_d, tracks_d->view(), device_tupleMultiplicity_.get());
+  kernel_countMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(),
+                                                                         device_tupleMultiplicity_.get());
   cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream);
   kernel_fillMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
       tuples_d, tracks_d->view(), device_tupleMultiplicity_.get());
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index f38c042ed15c2..34079fe3c714b 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -375,12 +375,11 @@ __global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const *
   }
 }
 
-__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets,
-                                         TkSoAConstView tracks_view,
+__global__ void kernel_countMultiplicity(TkSoAConstView tracks_view,
                                          caConstants::TupleMultiplicity *tupleMultiplicity) {
   auto first = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
-    auto nhits = foundNtuplets->size(it);
+  for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) {
+    auto nhits = tracks_view.hitIndices().size(it);
     if (nhits < 3)
       continue;
     if (tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::edup)
@@ -393,12 +392,10 @@ __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundN
   }
 }
 
-__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets,
-                                        TkSoAConstView tracks_view,
-                                        caConstants::TupleMultiplicity *tupleMultiplicity) {
+__global__ void kernel_fillMultiplicity(TkSoAConstView tracks_view, caConstants::TupleMultiplicity *tupleMultiplicity) {
   auto first = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
-    auto nhits = foundNtuplets->size(it);
+  for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) {
+    auto nhits = tracks_view.hitIndices().size(it);
     if (nhits < 3)
       continue;
     if (tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::edup)

From d21b49f65b1a409c48d791567dd8e8435d64475f Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 20 Oct 2022 12:12:18 +0200
Subject: [PATCH 054/110] Replacing hitIndices in several parts in
 PixelTriplets

---
 .../plugins/CAHitNtupletGeneratorKernels.cu   | 22 ++++++------
 .../CAHitNtupletGeneratorKernelsImpl.h        | 36 ++++++++++---------
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
index c5f3ff3a1b649..abf4c28d4ee6f 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
@@ -4,12 +4,12 @@
 template <>
 void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
   // these are pointer on GPU!
-  auto *tuples_d = &tracks_d->hitIndices();
-  auto *detId_d = &tracks_d->detIndices();
+  // auto *tuples_d = &tracks_d->hitIndices();
+  auto *detId_d = tracks_d->view().detIndices();
   auto *quality_d = tracks_d->qualityData();
 
   // zero tuples
-  cms::cuda::launchZero(tuples_d, cudaStream);
+  cms::cuda::launchZero(tracks_d->view().hitIndices(), cudaStream);
 
   int32_t nhits = hh.nHits();
 
@@ -70,9 +70,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
                                                                      device_theCells_.get(),
                                                                      device_nCells_,
                                                                      device_theCellTracks_.get(),
-                                                                     tuples_d,
+                                                                     tracks_d->view(),
                                                                      device_hitTuple_apc_,
-                                                                     quality_d,
                                                                      params_.minHitsPerNtuplet_);
   cudaCheck(cudaGetLastError());
 
@@ -87,9 +86,11 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
 
   blockSize = 128;
   numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize;
-  cms::cuda::finalizeBulk<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_hitTuple_apc_, tuples_d);
+  cms::cuda::finalizeBulk<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_hitTuple_apc_,
+                                                                        tracks_d->view().hitIndices());
 
-  kernel_fillHitDetIndices<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, hh.view(), detId_d);
+  kernel_fillHitDetIndices<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+      tracks_d->view().hitIndices(), hh.view(), tracks_d->view().detIndices());
   cudaCheck(cudaGetLastError());
   kernel_fillNLayers<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d, tracks_d->view(), device_hitTuple_apc_);
   cudaCheck(cudaGetLastError());
@@ -105,8 +106,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   kernel_countMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(),
                                                                          device_tupleMultiplicity_.get());
   cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream);
-  kernel_fillMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-      tuples_d, tracks_d->view(), device_tupleMultiplicity_.get());
+  kernel_fillMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(),
+                                                                        device_tupleMultiplicity_.get());
   cudaCheck(cudaGetLastError());
 
   // do not run the fishbone if there are hits only in BPIX1
@@ -233,8 +234,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
 
   // classify tracks based on kinematics
   auto numberOfBlocks = nQuadrupletBlocks(blockSize);
-  kernel_classifyTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-      tuples_d, tracks_d->view(), quality_d, params_.cuts_);
+  kernel_classifyTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(), quality_d, params_.cuts_);
 
   cudaCheck(cudaGetLastError());
 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 34079fe3c714b..360aed23e90e9 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -338,9 +338,8 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
                                      GPUCACell *__restrict__ cells,
                                      uint32_t const *nCells,
                                      gpuPixelDoublets::CellTracksVector *cellTracks,
-                                     HitContainer *foundNtuplets,
+                                     pixelTrack::TrackSoaView tracks_view,
                                      cms::cuda::AtomicPairCounter *apc,
-                                     Quality *__restrict__ quality,
                                      unsigned int minHitsPerNtuplet) {
   // recursive: not obvious to widen
   auto const &hh = *hhp;
@@ -358,8 +357,15 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
     if (doit) {
       GPUCACell::TmpTuple stack;
       stack.reset();
-      thisCell.find_ntuplets<6>(
-          hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3);
+      thisCell.find_ntuplets<6>(hh,
+                                cells,
+                                *cellTracks,
+                                tracks_view.hitIndices(),
+                                *apc,
+                                tracks_view.qualityData(),
+                                stack,
+                                minHitsPerNtuplet,
+                                pid < 3);
       assert(stack.empty());
       // printf("in %d found quadruplets: %d\n", cellIndex, apc->get());
     }
@@ -412,14 +418,13 @@ __global__ void kernel_fillMultiplicity(TkSoAConstView tracks_view, caConstants:
   Supply both the original TkSoA and the TkSoAView which contains
 the SoA Data
  */
-__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
-                                      TkSoAView tracks_view,
+__global__ void kernel_classifyTracks(TkSoAView tracks_view,
                                       Quality *__restrict__ quality,
                                       CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
 
-  for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
-    auto nhits = tuples->size(it);
+  for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) {
+    auto nhits = tracks_view.hitIndices().size(it);
     if (nhits == 0)
       break;  // guard
 
@@ -440,7 +445,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
     }
     if (isNaN) {
 #ifdef NTUPLE_DEBUG
-      printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks_view[it].chi2());
+      printf("NaN in fit %d size %d chi2 %f\n", it, tracks_view.hitIndices().size(it), tracks_view[it].chi2());
 #endif
       continue;
     }
@@ -477,7 +482,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
 #ifdef NTUPLE_FIT_DEBUG
       printf("Bad chi2 %d size %d pt %f eta %f chi2 %f\n",
              it,
-             tuples->size(it),
+             tracks_view.hitIndices().size(it),
              tracks_view[it].pt(),
              tracks_view[it].eta(),
              tracks_view[it].chi2());
@@ -561,19 +566,16 @@ __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples
 /*
   Needs both TkSoA and TkSoAView for accessing SoA, computeNumberOfLayers(), nHits(), stride()
  */
-__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks,
-                                   TkSoAView tracks_view,
-                                   cms::cuda::AtomicPairCounter *apc) {
-  auto &tracks = *ptracks;
+__global__ void kernel_fillNLayers(TkSoAView tracks_view, cms::cuda::AtomicPairCounter *apc) {
   auto first = blockIdx.x * blockDim.x + threadIdx.x;
   // clamp the number of tracks to the capacity of the SoA
-  auto ntracks = std::min<int>(apc->get().m, tracks.stride() - 1);
+  auto ntracks = std::min<int>(apc->get().m, tracks_view.metadata().size() - 1);
   if (0 == first)
     tracks_view.nTracks() = ntracks;
   for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) {
-    auto nHits = tracks.nHits(idx);
+    auto nHits = pixelTrack::nHits(tracks_view, idx);
     assert(nHits >= 3);
-    tracks_view[idx].nLayers() = tracks.computeNumberOfLayers(idx);
+    tracks_view[idx].nLayers() = pixelTrack::computeNumberOfLayers(tracks_view, idx);
   }
 }
 

From 7eb507103b69b61bff4bd397a7e9b1b668623ef0 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 20 Oct 2022 12:18:32 +0200
Subject: [PATCH 055/110] Classify, checkOverflows kernels

---
 .../plugins/CAHitNtupletGeneratorKernels.cu   |  8 ++---
 .../CAHitNtupletGeneratorKernelsImpl.h        | 32 +++++++++----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
index abf4c28d4ee6f..1bd491d53f509 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
@@ -225,7 +225,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
 template <>
 void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
   // these are pointer on GPU!
-  auto const *tuples_d = &tracks_d->hitIndices;
+  // auto const *tuples_d = &tracks_d->hitIndices;
   auto *quality_d = tracks_d->qualityData();
 
   int32_t nhits = hh.nHits();
@@ -259,13 +259,13 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
     // fill hit->track "map"
     assert(hitToTupleView_.offSize > nhits);
     numberOfBlocks = nQuadrupletBlocks(blockSize);
-    kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, device_hitToTuple_.get());
+    kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(), device_hitToTuple_.get());
     cudaCheck(cudaGetLastError());
     assert((hitToTupleView_.assoc == device_hitToTuple_.get()) &&
            (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0));
     cms::cuda::launchFinalize(hitToTupleView_, cudaStream);
     cudaCheck(cudaGetLastError());
-    kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, device_hitToTuple_.get());
+    kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(), device_hitToTuple_.get());
     cudaCheck(cudaGetLastError());
 #ifdef GPU_DEBUG
     cudaCheck(cudaDeviceSynchronize());
@@ -297,7 +297,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
 
   if (params_.doStats_) {
     numberOfBlocks = (std::max(nhits, int(params_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize;
-    kernel_checkOverflows<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d,
+    kernel_checkOverflows<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(),
                                                                         device_tupleMultiplicity_.get(),
                                                                         device_hitToTuple_.get(),
                                                                         device_hitTuple_apc_,
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 360aed23e90e9..6833296cc89d5 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -43,7 +43,7 @@ namespace {
 
 }  // namespace
 
-__global__ void kernel_checkOverflows(HitContainer const *foundNtuplets,
+__global__ void kernel_checkOverflows(pixelTrack::TrackSoAView tracks_view,
                                       caConstants::TupleMultiplicity const *tupleMultiplicity,
                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple,
                                       cms::cuda::AtomicPairCounter *apc,
@@ -76,16 +76,16 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets,
            nHits,
            hitToTuple->totOnes());
     if (apc->get().m < caConstants::maxNumberOfQuadruplets) {
-      assert(foundNtuplets->size(apc->get().m) == 0);
-      assert(foundNtuplets->size() == apc->get().n);
+      assert(tracks_view.hitIndices().size(apc->get().m) == 0);
+      assert(tracks_view.hitIndices().size() == apc->get().n);
     }
   }
 
-  for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) {
-    if (foundNtuplets->size(idx) > 7)  // current real limit
-      printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx));
-    assert(foundNtuplets->size(idx) <= caConstants::maxHitsOnTrack);
-    for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih)
+  for (int idx = first, nt = tracks_view.hitIndices().nOnes(); idx < nt; idx += gridDim.x * blockDim.x) {
+    if (tracks_view.hitIndices().size(idx) > 7)  // current real limit
+      printf("ERROR %d, %d\n", idx, tracks_view.hitIndices().size(idx));
+    assert(tracks_view.hitIndices().size(idx) <= caConstants::maxHitsOnTrack);
+    for (auto ih = tracks_view.hitIndices().begin(idx); ih != tracks_view.hitIndices().end(idx); ++ih)
       assert(int(*ih) < nHits);
   }
 #endif
@@ -524,24 +524,24 @@ __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples,
   }
 }
 
-__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples,
+__global__ void kernel_countHitInTracks(pixelTrack::TrackSoAView tracks_view,
                                         CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
-    if (tuples->size(idx) == 0)
+  for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
+    if (tracks_view.hitIndices().size(idx) == 0)
       break;  // guard
-    for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
+    for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h)
       hitToTuple->count(*h);
   }
 }
 
-__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples,
+__global__ void kernel_fillHitInTracks(pixelTrack::TrackSoAView tracks_view,  // TODO: Make ConstView
                                        CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
-    if (tuples->size(idx) == 0)
+  for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
+    if (tracks_view.hitIndices().size(idx) == 0)
       break;  // guard
-    for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
+    for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h)
       hitToTuple->fill(*h, idx);
   }
 }

From 4794b6cc23803e81fcfe898a949264666f6ffcfc Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Thu, 20 Oct 2022 15:36:38 +0200
Subject: [PATCH 056/110] Move class helpers into pixelTrack::utilities

---
 .../interface/TrackSoAHeterogeneousT_test.h   | 22 ++++++++-----------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index 298bd276390b0..a9b2bb987fcec 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -29,6 +29,8 @@ namespace pixelTrack {
   constexpr uint32_t maxNumber() { return 32 * 1024; }
 #endif
 
+  using HitContainer = cms::cuda::OneToManyAssoc<uint32_t, pixelTrack::maxNumber() + 1, 5 * pixelTrack::maxNumber()>;
+
 }  // namespace pixelTrack
 
 using Vector5f = Eigen::Matrix<float, 5, 1>;
@@ -36,7 +38,7 @@ using Vector15f = Eigen::Matrix<float, 15, 1>;
 
 using Vector5d = Eigen::Matrix<double, 5, 1>;
 using Matrix5d = Eigen::Matrix<double, 5, 5>;
-using HitContainer = cms::cuda::OneToManyAssoc<uint32_t, pixelTrack::maxNumber() + 1, 5 * pixelTrack::maxNumber()>;
+using HitContainer = pixelTrack::HitContainer;
 
 GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test,
                     SOA_COLUMN(uint8_t, quality),
@@ -55,6 +57,8 @@ namespace pixelTrack {
   namespace utilities {
     using TrackSoAView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::View;
     using TrackSoAConstView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::ConstView;
+    using Quality = pixelTrack::Quality;
+    using hindex_type = uint32_t;
     // State at the Beam spot
     // phi,tip,1/pt,cotan(theta),zip
     __host__ __device__ inline float charge(TrackSoAConstView tracks, int32_t i) {
@@ -121,6 +125,9 @@ namespace pixelTrack {
       return nl;
     }
 
+    __host__ __device__ inline const Quality *qualityData(TrackSoAConstView tracks) { return reinterpret_cast<Quality const *>(tracks.quality()); }
+    __host__ __device__ inline Quality *qualityData(TrackSoAView tracks) { return reinterpret_cast<Quality *>(tracks.quality()); }
+
     __host__ __device__ inline int nHits(TrackSoAConstView tracks, int i) { return tracks.detIndices().size(i); }
   }  // namespace utilities
 }  // namespace pixelTrack
@@ -133,18 +140,7 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
   explicit TrackSoAHeterogeneousT(cudaStream_t stream)
       : PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>(S, stream) {}
 
-  static constexpr int32_t stride() { return S; }
-
-  using Quality = pixelTrack::Quality;
-  using hindex_type = uint32_t;
-
-  // Always check quality is at least loose!
-  // CUDA does not support enums  in __lgc ...
-private:
-public:
-  constexpr Quality const *qualityData() const { return reinterpret_cast<Quality const *>(view().quality()); }
-  constexpr Quality *qualityData() { return reinterpret_cast<Quality *>(view().quality()); }
-};
+}
 
 namespace pixelTrack {
 

From 8de9dfe22d5f2acac5027ac7177b4ee5d69dbfc7 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 20 Oct 2022 17:10:32 +0200
Subject: [PATCH 057/110] Removed unused names

---
 CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index a9b2bb987fcec..7814cfd32ae4d 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -35,9 +35,6 @@ namespace pixelTrack {
 
 using Vector5f = Eigen::Matrix<float, 5, 1>;
 using Vector15f = Eigen::Matrix<float, 15, 1>;
-
-using Vector5d = Eigen::Matrix<double, 5, 1>;
-using Matrix5d = Eigen::Matrix<double, 5, 5>;
 using HitContainer = pixelTrack::HitContainer;
 
 GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test,

From 53f4a91b6a840455b3ca8a7f14885a2951a0f973 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Fri, 21 Oct 2022 10:59:29 +0200
Subject: [PATCH 058/110] Simplified View alias, fixed calls in KernelsImpl,
 comments

---
 .../interface/TrackSoAHeterogeneousT_test.h   | 32 ++++++++++++-------
 .../plugins/PixelTrackProducerFromSoA.cc      |  6 ++--
 .../CAHitNtupletGeneratorKernelsImpl.h        |  4 +--
 3 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index 7814cfd32ae4d..e5485588ecb8a 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -33,6 +33,8 @@ namespace pixelTrack {
 
 }  // namespace pixelTrack
 
+// Aliases in order to not confuse the GENERATE_SOA_LAYOUT
+// macro with weird colons and angled brackets.
 using Vector5f = Eigen::Matrix<float, 5, 1>;
 using Vector15f = Eigen::Matrix<float, 15, 1>;
 using HitContainer = pixelTrack::HitContainer;
@@ -49,11 +51,12 @@ GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test,
                     SOA_SCALAR(HitContainer, hitIndices),
                     SOA_SCALAR(HitContainer, detIndices))
 
-// Previous TrajectoryStateSoAT class methods
+// Previous TrajectoryStateSoAT class methods.
+// They operate on View and ConstView of the TrackSoA.
 namespace pixelTrack {
   namespace utilities {
-    using TrackSoAView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::View;
-    using TrackSoAConstView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::ConstView;
+    using TrackSoAView = TrackSoAHeterogeneousT_test<>::View;
+    using TrackSoAConstView = TrackSoAHeterogeneousT_test<>::ConstView;
     using Quality = pixelTrack::Quality;
     using hindex_type = uint32_t;
     // State at the Beam spot
@@ -109,7 +112,8 @@ namespace pixelTrack {
       }
     }
 
-    __host__ __device__ inline int computeNumberOfLayers(TrackSoAConstView tracks, int32_t i) {
+    // TODO: Not using TrackSoAConstView due to weird bugs with HitContainer
+    __host__ __device__ inline int computeNumberOfLayers(TrackSoAView tracks, int32_t i) {
       auto pdet = tracks.detIndices().begin(i);
       int nl = 1;
       auto ol = phase1PixelTopology::getLayer(*pdet);
@@ -121,11 +125,17 @@ namespace pixelTrack {
       }
       return nl;
     }
+    __host__ __device__ inline int nHits(TrackSoAConstView tracks, int i) { return tracks.detIndices().size(i); }
 
-    __host__ __device__ inline const Quality *qualityData(TrackSoAConstView tracks) { return reinterpret_cast<Quality const *>(tracks.quality()); }
-    __host__ __device__ inline Quality *qualityData(TrackSoAView tracks) { return reinterpret_cast<Quality *>(tracks.quality()); }
+    // Casts quality SoA data (uint8_t) to pixelTrack::Quality. This is required
+    // to use the data as an enum instead of a plain uint8_t
+    __host__ __device__ inline const Quality *qualityData(TrackSoAConstView tracks) {
+      return reinterpret_cast<Quality const *>(tracks.quality());
+    }
+    __host__ __device__ inline Quality *qualityData(TrackSoAView tracks) {
+      return reinterpret_cast<Quality *>(tracks.quality());
+    }
 
-    __host__ __device__ inline int nHits(TrackSoAConstView tracks, int i) { return tracks.detIndices().size(i); }
   }  // namespace utilities
 }  // namespace pixelTrack
 
@@ -134,16 +144,16 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
 public:
   TrackSoAHeterogeneousT() = default;
 
+  // Constructor which specifies the SoA size
   explicit TrackSoAHeterogeneousT(cudaStream_t stream)
       : PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>(S, stream) {}
-
-}
+};
 
 namespace pixelTrack {
 
   using TrackSoA = TrackSoAHeterogeneousT<maxNumber()>;
-  using TrackSoAView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::View;
-  using TrackSoAConstView = cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>::ConstView;
+  using TrackSoAView = TrackSoAHeterogeneousT_test<>::View;
+  using TrackSoAConstView = TrackSoAHeterogeneousT_test<>::ConstView;
 
 }  // namespace pixelTrack
 
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
index e6d49cde90d6a..c5d31764b0fcb 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
@@ -154,9 +154,9 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
 
   const auto &tsoa = *iEvent.get(tokenTrack_);
 
-  auto const *quality = tsoa.qualityData();
+  auto const *quality = pixelTrack::utilities::qualityData(tsoa.view());
   // auto const &fit = tsoa.stateAtBS;
-  auto const &hitIndices = tsoa.hitIndices;
+  auto const &hitIndices = tsoa.view().hitIndices();
   auto nTracks = tsoa.view().nTracks();
 
   tracks.reserve(nTracks);
@@ -173,7 +173,7 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
   //store the index of the SoA: indToEdm[index_SoAtrack] -> index_edmTrack (if it exists)
   indToEdm.resize(sortIdxs.size(), -1);
   for (const auto &it : sortIdxs) {
-    auto nHits = tsoa.nHits(it);
+    auto nHits = pixelTrack::utilities::nHits(tsoa.view(), it);
     assert(nHits >= 3);
     auto q = quality[it];
     if (q < minQuality_)
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 6833296cc89d5..cdf1ab2193be1 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -338,7 +338,7 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
                                      GPUCACell *__restrict__ cells,
                                      uint32_t const *nCells,
                                      gpuPixelDoublets::CellTracksVector *cellTracks,
-                                     pixelTrack::TrackSoaView tracks_view,
+                                     TkSoAView tracks_view,
                                      cms::cuda::AtomicPairCounter *apc,
                                      unsigned int minHitsPerNtuplet) {
   // recursive: not obvious to widen
@@ -575,7 +575,7 @@ __global__ void kernel_fillNLayers(TkSoAView tracks_view, cms::cuda::AtomicPairC
   for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) {
     auto nHits = pixelTrack::nHits(tracks_view, idx);
     assert(nHits >= 3);
-    tracks_view[idx].nLayers() = pixelTrack::computeNumberOfLayers(tracks_view, idx);
+    tracks_view[idx].nLayers() = pixelTrack::utilities::computeNumberOfLayers(tracks_view, idx);
   }
 }
 

From 2a16c84609773436654a7532e25a8197ac530cf5 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Fri, 21 Oct 2022 13:07:35 +0200
Subject: [PATCH 059/110] I DID IT

---
 .../plugins/CAHitNtupletGeneratorKernels.cc   | 37 +++++++-------
 .../CAHitNtupletGeneratorKernelsImpl.h        | 51 +++++++++----------
 .../plugins/CAHitNtupletGeneratorOnGPU.cc     |  8 +--
 3 files changed, 46 insertions(+), 50 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
index a34e0f280dd9d..80497d3dd706b 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -79,14 +79,14 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
 
 template <>
 void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
-  auto *tuples_d = &tracks_d->hitIndices;
-  auto *detId_d = &tracks_d->detIndices;
-  auto *quality_d = tracks_d->qualityData();
+  // auto *tuples_d = tracks_d->view().hitIndices();
+  // auto *detId_d = tracks_d->view().detIndices();
+  // auto *quality_d = tracks_d->qualityData();
 
   // assert(tuples_d && quality_d); // TODO Find equivalent for View
 
   // zero tuples
-  cms::cuda::launchZero(tuples_d, cudaStream);
+  cms::cuda::launchZero(&tracks_d->view().hitIndices(), cudaStream);
 
   auto nhits = hh.nHits();
 
@@ -119,23 +119,22 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
                        device_theCells_.get(),
                        device_nCells_,
                        device_theCellTracks_.get(),
-                       tuples_d,
+                       tracks_d->view(),
                        device_hitTuple_apc_,
-                       quality_d,
                        params_.minHitsPerNtuplet_);
   if (params_.doStats_)
     kernel_mark_used(device_theCells_.get(), device_nCells_);
 
-  cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d);
+  cms::cuda::finalizeBulk(device_hitTuple_apc_, &tracks_d->view().hitIndices());
 
-  kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d);
-  kernel_fillNLayers(tracks_d, tracks_d->view(), device_hitTuple_apc_);
+  kernel_fillHitDetIndices(tracks_d->view(), hh.view());
+  kernel_fillNLayers(tracks_d->view(), device_hitTuple_apc_);
 
   // remove duplicates (tracks that share a doublet)
   kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_);
-  kernel_countMultiplicity(tuples_d, tracks_d->view(), device_tupleMultiplicity_.get());
+  kernel_countMultiplicity(tracks_d->view(), device_tupleMultiplicity_.get());
   cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream);
-  kernel_fillMultiplicity(tuples_d, tracks_d->view(), device_tupleMultiplicity_.get());
+  kernel_fillMultiplicity(tracks_d->view(), device_tupleMultiplicity_.get());
 
   if (nhits > 1 && params_.lateFishbone_) {
     gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true);
@@ -146,10 +145,10 @@ template <>
 void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
   int32_t nhits = hh.nHits();
 
-  auto const *tuples_d = &tracks_d->hitIndices;
-  auto *quality_d = tracks_d->qualityData();
+  // auto const *tuples_d = &tracks_d->hitIndices;
+  auto *quality_d = pixelTrack::utilities::qualityData(tracks_d->view());
   // classify tracks based on kinematics
-  kernel_classifyTracks(tuples_d, tracks_d->view(), quality_d, params_.cuts_);
+  kernel_classifyTracks(tracks_d->view(), quality_d, params_.cuts_);
 
   if (params_.lateFishbone_) {
     // apply fishbone cleaning to good tracks
@@ -161,9 +160,9 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
 
   // fill hit->track "map"
   if (params_.doSharedHitCut_ || params_.doStats_) {
-    kernel_countHitInTracks(tuples_d, device_hitToTuple_.get());
+    kernel_countHitInTracks(tracks_d->view(), device_hitToTuple_.get());
     cms::cuda::launchFinalize(hitToTupleView_, cudaStream);
-    kernel_fillHitInTracks(tuples_d, device_hitToTuple_.get());
+    kernel_fillHitInTracks(tracks_d->view(), device_hitToTuple_.get());
   }
 
   // remove duplicates (tracks that share at least one hit)
@@ -184,7 +183,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
 
   if (params_.doStats_) {
     std::lock_guard guard(lock_stat);
-    kernel_checkOverflows(tuples_d,
+    kernel_checkOverflows(tracks_d->view(),
                           device_tupleMultiplicity_.get(),
                           device_hitToTuple_.get(),
                           device_hitTuple_apc_,
@@ -202,7 +201,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
     // counters (add flag???)
     std::lock_guard guard(lock_stat);
     kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_);
-    kernel_doStatsForTracks(tuples_d, quality_d, counters_);
+    kernel_doStatsForTracks(tracks_d->view(), quality_d, counters_);
   }
 
 #ifdef DUMP_GPU_TK_TUPLES
@@ -211,7 +210,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
   {
     std::lock_guard<std::mutex> guard(lock);
     ++iev;
-    kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d->view(), device_hitToTuple_.get(), 0, 1000000, iev);
+    kernel_print_found_ntuplets(hh.view(), tracks_d->view(), device_hitToTuple_.get(), 0, 1000000, iev);
   }
 #endif
 }
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index cdf1ab2193be1..a3a8eb97a43d7 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -43,7 +43,7 @@ namespace {
 
 }  // namespace
 
-__global__ void kernel_checkOverflows(pixelTrack::TrackSoAView tracks_view,
+__global__ void kernel_checkOverflows(TkSoAView tracks_view,
                                       caConstants::TupleMultiplicity const *tupleMultiplicity,
                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple,
                                       cms::cuda::AtomicPairCounter *apc,
@@ -362,7 +362,7 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
                                 *cellTracks,
                                 tracks_view.hitIndices(),
                                 *apc,
-                                tracks_view.qualityData(),
+                                pixelTrack::utilities::qualityData(tracks_view),
                                 stack,
                                 minHitsPerNtuplet,
                                 pid < 3);
@@ -508,12 +508,12 @@ __global__ void kernel_classifyTracks(TkSoAView tracks_view,
   }
 }
 
-__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples,
+__global__ void kernel_doStatsForTracks(TkSoAView tracks_view,
                                         Quality const *__restrict__ quality,
                                         CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
-    if (tuples->size(idx) == 0)
+  for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
+    if (tracks_view.hitIndices().size(idx) == 0)
       break;  //guard
     if (quality[idx] < pixelTrack::Quality::loose)
       continue;
@@ -524,7 +524,7 @@ __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples,
   }
 }
 
-__global__ void kernel_countHitInTracks(pixelTrack::TrackSoAView tracks_view,
+__global__ void kernel_countHitInTracks(TkSoAView tracks_view,
                                         CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
   for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
@@ -535,7 +535,7 @@ __global__ void kernel_countHitInTracks(pixelTrack::TrackSoAView tracks_view,
   }
 }
 
-__global__ void kernel_fillHitInTracks(pixelTrack::TrackSoAView tracks_view,  // TODO: Make ConstView
+__global__ void kernel_fillHitInTracks(TkSoAView tracks_view,  // TODO: Make ConstView
                                        CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
   for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
@@ -546,20 +546,18 @@ __global__ void kernel_fillHitInTracks(pixelTrack::TrackSoAView tracks_view,  //
   }
 }
 
-__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples,
-                                         TrackingRecHit2DSOAView const *__restrict__ hhp,
-                                         HitContainer *__restrict__ hitDetIndices) {
+__global__ void kernel_fillHitDetIndices(TkSoAView tracks_view, TrackingRecHit2DSOAView const *__restrict__ hhp) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
   // copy offsets
-  for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
-    hitDetIndices->off[idx] = tuples->off[idx];
+  for (int idx = first, ntot = tracks_view.hitIndices().totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
+    tracks_view.detIndices().off[idx] = tracks_view.hitIndices().off[idx];
   }
   // fill hit indices
   auto const &hh = *hhp;
   auto nhits = hh.nHits();
-  for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) {
-    assert(tuples->content[idx] < nhits);
-    hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]);
+  for (int idx = first, ntot = tracks_view.hitIndices().size(); idx < ntot; idx += gridDim.x * blockDim.x) {
+    assert(tracks_view.hitIndices().content[idx] < nhits);
+    tracks_view.detIndices().content[idx] = hh.detectorIndex(tracks_view.hitIndices().content[idx]);
   }
 }
 
@@ -573,7 +571,7 @@ __global__ void kernel_fillNLayers(TkSoAView tracks_view, cms::cuda::AtomicPairC
   if (0 == first)
     tracks_view.nTracks() = ntracks;
   for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) {
-    auto nHits = pixelTrack::nHits(tracks_view, idx);
+    auto nHits = pixelTrack::utilities::nHits(tracks_view, idx);
     assert(nHits >= 3);
     tracks_view[idx].nLayers() = pixelTrack::utilities::computeNumberOfLayers(tracks_view, idx);
   }
@@ -859,7 +857,6 @@ __global__ void kernel_simpleTripletCleaner(
 }
 
 __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp,
-                                            HitContainer const *__restrict__ ptuples,
                                             TkSoAConstView tracks_view,
                                             CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple,
                                             int32_t firstPrint,
@@ -867,11 +864,11 @@ __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__res
                                             int iev) {
   constexpr auto loose = (uint8_t)pixelTrack::Quality::loose;
   auto const &hh = *hhp;
-  auto const &foundNtuplets = *ptuples;
+  // auto const &foundNtuplets = *ptuples;
 
   int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x;
-  for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) {
-    auto nh = foundNtuplets.size(i);
+  for (int i = first, np = std::min(lastPrint, tracks_view.hitIndices().nOnes()); i < np; i += blockDim.x * gridDim.x) {
+    auto nh = tracks_view.hitIndices().size(i);
     if (nh < 3)
       continue;
     if (tracks_view[i].quality() < loose)
@@ -889,13 +886,13 @@ __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__res
            pixelTrack::utilities::zip(tracks_view, i),
            //           asinhf(fit_results[i].par(3)),
            tracks_view[i].chi2(),
-           hh.zGlobal(*foundNtuplets.begin(i)),
-           hh.zGlobal(*(foundNtuplets.begin(i) + 1)),
-           hh.zGlobal(*(foundNtuplets.begin(i) + 2)),
-           nh > 3 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 3))) : 0,
-           nh > 4 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 4))) : 0,
-           nh > 5 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 5))) : 0,
-           nh > 6 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + nh - 1))) : 0);
+           hh.zGlobal(*tracks_view.hitIndices().begin(i)),
+           hh.zGlobal(*(tracks_view.hitIndices().begin(i) + 1)),
+           hh.zGlobal(*(tracks_view.hitIndices().begin(i) + 2)),
+           nh > 3 ? hh.zGlobal(int(*(tracks_view.hitIndices().begin(i) + 3))) : 0,
+           nh > 4 ? hh.zGlobal(int(*(tracks_view.hitIndices().begin(i) + 4))) : 0,
+           nh > 5 ? hh.zGlobal(int(*(tracks_view.hitIndices().begin(i) + 5))) : 0,
+           nh > 6 ? hh.zGlobal(int(*(tracks_view.hitIndices().begin(i) + nh - 1))) : 0);
   }
 }
 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
index 6c5fdb36a9d46..8aa12a2fe5283 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -240,7 +240,7 @@ pixelTrack::TrackSoA CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU
 
   // now fit
   HelixFitOnGPU fitter(bfield, m_params.fitNas4_);
-  fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa->view());
+  fitter.allocateOnGPU(&(soa->view().hitIndices()), kernels.tupleMultiplicity(), soa->view());
 
   if (m_params.useRiemannFit_) {
     fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets);
@@ -255,9 +255,9 @@ pixelTrack::TrackSoA CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU
 #endif
 
   // check that the fixed-size SoA does not overflow
-  auto const& tsoa = *soa;
-  auto maxTracks = tsoa.stride();
-  auto nTracks = tsoa.view().nTracks();
+
+  auto maxTracks = soa->view().metadata().size();
+  auto nTracks = soa->view().nTracks();
   assert(nTracks < maxTracks);
   if (nTracks == maxTracks - 1) {
     edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1

From bd40481320d3126a376b930fcc40313762f5be20 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Fri, 21 Oct 2022 15:18:25 +0200
Subject: [PATCH 060/110] Mostly ported? Unused class rules messages

---
 CUDADataFormats/Track/src/classes_def.xml     |  8 +++----
 .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 15 ++++++-------
 .../plugins/CAHitNtupletGeneratorKernels.cu   | 22 ++++++++-----------
 .../plugins/CAHitNtupletGeneratorOnGPU.cc     | 18 +++++++--------
 .../plugins/CAHitNtupletGeneratorOnGPU.h      |  4 ++--
 .../PixelTriplets/plugins/HelixFitOnGPU.cc    |  6 ++---
 .../PixelTriplets/plugins/HelixFitOnGPU.h     |  2 +-
 7 files changed, 34 insertions(+), 41 deletions(-)

diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml
index 9c80ae91baf29..0255d34cf80d5 100644
--- a/CUDADataFormats/Track/src/classes_def.xml
+++ b/CUDADataFormats/Track/src/classes_def.xml
@@ -1,6 +1,6 @@
 <lcgdict>
-  <class name="cms::cuda::Product<HeterogeneousSoA<pixelTrack::TrackSoA>>" persistent="false"/>
-  <class name="edm::Wrapper<cms::cuda::Product<HeterogeneousSoA<pixelTrack::TrackSoA>>>" persistent="false"/>
-  <class name="HeterogeneousSoA<pixelTrack::TrackSoA>" persistent="false"/>
-  <class name="edm::Wrapper<HeterogeneousSoA<pixelTrack::TrackSoA>>" persistent="false"/>
+  <class name="pixelTrack::TrackSoAView" persistent="false"/>
+  <class name="edm::Wrapper<pixelTrack::TrackSoAView>" persistent="false"/>
+  <class name="cms::cuda::Product<pixelTrack::TrackSoAView>" persistent="false"/>
+  <class name="edm::Wrapper<cms::cuda::Product<pixelTrack::TrackSoAView>>" persistent="false"/>
 </lcgdict>
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
index c9831afc01067..c539f74b85af8 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
@@ -40,12 +40,13 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> {
   bool onGPU_;
 
   edm::ESGetToken<MagneticField, IdealMagneticFieldRecord> tokenField_;
+  // GPU
   edm::EDGetTokenT<cms::cuda::Product<TrackingRecHit2DGPU>> tokenHitGPU_;
-  //edm::EDPutTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenTrackGPU_;
-  edm::EDPutTokenT<cms::cuda::Product<pixelTrack::TrackSoA>> tokenTrackGPU_;
+  edm::EDPutTokenT<cms::cuda::Product<pixelTrack::TrackSoAView>> tokenTrackGPU_;
+
+  // CPU
   edm::EDGetTokenT<TrackingRecHit2DCPU> tokenHitCPU_;
-  //edm::EDPutTokenT<PixelTrackHeterogeneous> tokenTrackCPU_;
-  edm::EDPutTokenT<pixelTrack::TrackSoA> tokenTrackCPU_;
+  edm::EDPutTokenT<pixelTrack::TrackSoAView> tokenTrackCPU_;
 
   CAHitNtupletGeneratorOnGPU gpuAlgo_;
 };
@@ -55,12 +56,10 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig)
   if (onGPU_) {
     tokenHitGPU_ =
         consumes<cms::cuda::Product<TrackingRecHit2DGPU>>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"));
-    //tokenTrackGPU_ = produces<cms::cuda::Product<PixelTrackHeterogeneous>>();
-    tokenTrackGPU_ = produces<cms::cuda::Product<pixelTrack::TrackSoA>>();
+    tokenTrackGPU_ = produces<cms::cuda::Product<pixelTrack::TrackSoAView>>();
   } else {
     tokenHitCPU_ = consumes<TrackingRecHit2DCPU>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"));
-    //tokenTrackCPU_ = produces<PixelTrackHeterogeneous>();
-    tokenTrackCPU_ = produces<pixelTrack::TrackSoA>();
+    tokenTrackCPU_ = produces<pixelTrack::TrackSoAView>();
   }
 }
 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
index 1bd491d53f509..b9a77bd48737d 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
@@ -4,12 +4,10 @@
 template <>
 void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
   // these are pointer on GPU!
-  // auto *tuples_d = &tracks_d->hitIndices();
-  auto *detId_d = tracks_d->view().detIndices();
-  auto *quality_d = tracks_d->qualityData();
+  auto *quality_d = pixelTrack::utilities::qualityData(tracks_d->view());
 
   // zero tuples
-  cms::cuda::launchZero(tracks_d->view().hitIndices(), cudaStream);
+  cms::cuda::launchZero(&(tracks_d->view().hitIndices()), cudaStream);
 
   int32_t nhits = hh.nHits();
 
@@ -87,12 +85,11 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   blockSize = 128;
   numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize;
   cms::cuda::finalizeBulk<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_hitTuple_apc_,
-                                                                        tracks_d->view().hitIndices());
+                                                                        &tracks_d->view().hitIndices());
 
-  kernel_fillHitDetIndices<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-      tracks_d->view().hitIndices(), hh.view(), tracks_d->view().detIndices());
+  kernel_fillHitDetIndices<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(), hh.view());
   cudaCheck(cudaGetLastError());
-  kernel_fillNLayers<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d, tracks_d->view(), device_hitTuple_apc_);
+  kernel_fillNLayers<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(), device_hitTuple_apc_);
   cudaCheck(cudaGetLastError());
 
   // remove duplicates (tracks that share a doublet)
@@ -225,8 +222,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
 template <>
 void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
   // these are pointer on GPU!
-  // auto const *tuples_d = &tracks_d->hitIndices;
-  auto *quality_d = tracks_d->qualityData();
+  auto *quality_d = pixelTrack::utilities::qualityData(tracks_d->view());
 
   int32_t nhits = hh.nHits();
 
@@ -318,7 +314,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
     kernel_doStatsForHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_hitToTuple_.get(), counters_);
     cudaCheck(cudaGetLastError());
     numberOfBlocks = (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize;
-    kernel_doStatsForTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, counters_);
+    kernel_doStatsForTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(), quality_d, counters_);
     cudaCheck(cudaGetLastError());
   }
 #ifdef GPU_DEBUG
@@ -334,11 +330,11 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
     ++iev;
     for (int k = 0; k < 20000; k += 500) {
       kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(
-          hh.view(), tuples_d, tracks_d->view(), device_hitToTuple_.get(), k, k + 500, iev);
+          hh.view(), tracks_d->view(), device_hitToTuple_.get(), k, k + 500, iev);
       cudaDeviceSynchronize();
     }
     kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(
-        hh.view(), tuples_d, tracks_d->view(), device_hitToTuple_.get(), 20000, 1000000, iev);
+        hh.view(), tracks_d->view(), device_hitToTuple_.get(), 20000, 1000000, iev);
     cudaDeviceSynchronize();
     // cudaStreamSynchronize(cudaStream);
   }
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
index 8aa12a2fe5283..5dd3de3e232f8 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -190,9 +190,9 @@ void CAHitNtupletGeneratorOnGPU::endJob() {
                                                                     float bfield,
                                                                     cudaStream_t stream) const {
   PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique<pixelTrack::TrackSoA>(stream));*/
-  pixelTrack::TrackSoA CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
-                                                                      float bfield,
-                                                                      cudaStream_t stream) const {
+pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
+                                                                     float bfield,
+                                                                     cudaStream_t stream) const {
   pixelTrack::TrackSoA tracks(stream);
   auto* soa = &tracks;
 
@@ -204,7 +204,7 @@ void CAHitNtupletGeneratorOnGPU::endJob() {
   kernels.launchKernels(hits_d, soa, stream);
 
   HelixFitOnGPU fitter(bfield, m_params.fitNas4_);
-  fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa->view());
+  fitter.allocateOnGPU(kernels.tupleMultiplicity(), soa->view());
   if (m_params.useRiemannFit_) {
     fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream);
   } else {
@@ -218,10 +218,10 @@ void CAHitNtupletGeneratorOnGPU::endJob() {
   std::cout << "finished building pixel tracks on GPU" << std::endl;
 #endif
 
-  return tracks;
+  return tracks.view();
 }
 
-pixelTrack::TrackSoA CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const {
+pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const {
   //PixelTrackHeterogeneous tracks(std::make_unique<pixelTrack::TrackSoA>());
   pixelTrack::TrackSoA tracks;
 
@@ -236,11 +236,11 @@ pixelTrack::TrackSoA CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU
   kernels.launchKernels(hits_d, soa, nullptr);
 
   if (0 == hits_d.nHits())
-    return tracks;
+    return tracks.view();
 
   // now fit
   HelixFitOnGPU fitter(bfield, m_params.fitNas4_);
-  fitter.allocateOnGPU(&(soa->view().hitIndices()), kernels.tupleMultiplicity(), soa->view());
+  fitter.allocateOnGPU(kernels.tupleMultiplicity(), soa->view());
 
   if (m_params.useRiemannFit_) {
     fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets);
@@ -264,5 +264,5 @@ pixelTrack::TrackSoA CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU
                                    << " candidates";
   }
 
-  return tracks;
+  return tracks.view();
 }
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
index ff13d09c1361a..0c5b9531fed0c 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
@@ -47,9 +47,9 @@ class CAHitNtupletGeneratorOnGPU {
   void beginJob();
   void endJob();
 
-  pixelTrack::TrackSoA makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const;
+  pixelTrack::TrackSoAView makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const;
 
-  pixelTrack::TrackSoA makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const;
+  pixelTrack::TrackSoAView makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const;
 
 private:
   void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const;
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc
index 624934645338b..f757f574f6142 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc
@@ -1,10 +1,8 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HelixFitOnGPU.h"
 
-void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples,
-                                  TupleMultiplicity const *tupleMultiplicity,
-                                  OutputSoAView helix_fit_results) {
-  tuples_ = tuples;
+void HelixFitOnGPU::allocateOnGPU(TupleMultiplicity const *tupleMultiplicity, OutputSoAView helix_fit_results) {
+  tuples_ = &helix_fit_results.hitIndices();
   tupleMultiplicity_ = tupleMultiplicity;
   outputSoa_ = helix_fit_results;
 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
index 67a180c53e887..9bda40749c052 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
@@ -50,7 +50,7 @@ class HelixFitOnGPU {
   void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
   void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
 
-  void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoAView outputSoA);
+  void allocateOnGPU(TupleMultiplicity const *tupleMultiplicity, OutputSoAView outputSoA);
   void deallocateOnGPU();
 
 private:

From 21eb31b0a35f91bb75eb197c876e8ff8552143e8 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Fri, 21 Oct 2022 16:52:02 +0200
Subject: [PATCH 061/110] PixelTrackSoAFromCUDA adapted to new formats

---
 CUDADataFormats/Track/src/classes.h           |  2 +-
 CUDADataFormats/Track/src/classes_def.xml     |  4 +
 .../plugins/PixelTrackSoAFromCUDA.cc          | 76 ++++++++-----------
 3 files changed, 35 insertions(+), 47 deletions(-)

diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h
index 97c116f6c88d3..5870985315f14 100644
--- a/CUDADataFormats/Track/src/classes.h
+++ b/CUDADataFormats/Track/src/classes.h
@@ -3,7 +3,7 @@
 
 #include "CUDADataFormats/Common/interface/Product.h"
 #include "CUDADataFormats/Common/interface/HostProduct.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 #include "DataFormats/Common/interface/Wrapper.h"
 
 #endif  // CUDADataFormats_Track_src_classes_h
diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml
index 0255d34cf80d5..9f320a3833ff0 100644
--- a/CUDADataFormats/Track/src/classes_def.xml
+++ b/CUDADataFormats/Track/src/classes_def.xml
@@ -3,4 +3,8 @@
   <class name="edm::Wrapper<pixelTrack::TrackSoAView>" persistent="false"/>
   <class name="cms::cuda::Product<pixelTrack::TrackSoAView>" persistent="false"/>
   <class name="edm::Wrapper<cms::cuda::Product<pixelTrack::TrackSoAView>>" persistent="false"/>
+  <class name="pixelTrack::TrackSoALayout" persistent="false"/>
+  <class name="edm::Wrapper<pixelTrack::TrackSoALayout>" persistent="false"/>
+  <class name="cms::cuda::Product<pixelTrack::TrackSoALayout>" persistent="false"/>
+  <class name="edm::Wrapper<cms::cuda::Product<pixelTrack::TrackSoALayout>>" persistent="false"/>  
 </lcgdict>
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
index e43f6b028aa15..7cffdcb80a273 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
@@ -33,27 +33,16 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork>
                edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
   void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
 
-  //edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenCUDA_;
-  //edm::EDPutTokenT<PixelTrackHeterogeneous> tokenSOA_;
-
-  //edm::EDGetTokenT<cms::cuda::Product<TrackSoAHeterogeneousT<32768>>> tokenCUDA_;
-  edm::EDGetTokenT<cms::cuda::Product<pixelTrack::TrackSoA>> tokenCUDA_;
-  //edm::EDPutTokenT<TrackSoAHeterogeneousT_test<>::View> tokenSOA_;
-  edm::EDPutTokenT<pixelTrack::TrackSoA> tokenSOA_;
-
-  //cms::cuda::host::unique_ptr<pixelTrack::TrackSoA> soa_;
-  //cms::cuda::host::unique_ptr<pixelTrack::TrackSoA> soa_;
-  //TrackSoAHeterogeneousT_test<>::View soa_;
-  pixelTrack::TrackSoA soa_;
-  pixelTrack::TrackSoAView tmp_view_;
+  edm::EDGetTokenT<cms::cuda::Product<pixelTrack::TrackSoALayout>> tokenCUDA_;
+  edm::EDPutTokenT<pixelTrack::TrackSoAView> tokenSOA_;
+
+  pixelTrack::TrackSoAView soa_view_h;
+  pixelTrack::TrackSoALayout soa_layout_h;
 };
 
 PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig)
-    //: tokenCUDA_(consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("src"))),
-    //  tokenSOA_(produces<PixelTrackHeterogeneous>()) {}
-    : tokenCUDA_(consumes<cms::cuda::Product<pixelTrack::TrackSoA>>(iConfig.getParameter<edm::InputTag>("src"))),
-      //tokenSOA_(produces<TrackSoAHeterogeneousT_test<>::View>()) {}
-      tokenSOA_(produces<pixelTrack::TrackSoA>()) {}
+    : tokenCUDA_(consumes<cms::cuda::Product<pixelTrack::TrackSoALayout>>(iConfig.getParameter<edm::InputTag>("src"))),
+      tokenSOA_(produces<pixelTrack::TrackSoAView>()) {}
 
 void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
   edm::ParameterSetDescription desc;
@@ -62,41 +51,36 @@ void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& des
   descriptions.add("pixelTracksSoA", desc);
 }
 
-/*void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
-                                    edm::EventSetup const& iSetup,
-                                    edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  cms::cuda::Product<PixelTrackHeterogeneous> const& inputDataWrapped = iEvent.get(tokenCUDA_);
-  cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
-  auto const& inputData = ctx.get(inputDataWrapped);
-
-  soa_ = inputData.toHostAsync(ctx.stream());
-}*/
-
 void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
                                     edm::EventSetup const& iSetup,
                                     edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  cms::cuda::Product<pixelTrack::TrackSoA> const& inputDataWrapped = iEvent.get(tokenCUDA_);
+  cms::cuda::Product<pixelTrack::TrackSoALayout> const& inputDataWrapped = iEvent.get(tokenCUDA_);
   cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
-  auto const& inputData = ctx.get(inputDataWrapped);
+  auto const& soa_layout_d = ctx.get(inputDataWrapped);  // Layout of data on device
 
-  //class_ = inputData.toHostAsync(ctx.stream());
+  auto soa_buffer_h = cms::cuda::make_host_unique<std::byte[]>(soa_layout_d.metadata().byteSize(), ctx.stream());
 
-  pixelTrack::TrackSoA soa_(ctx.stream());
-  cudaCheck(cudaMemcpy(&soa_,&inputData,sizeof(pixelTrack::TrackSoA),cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpyAsync(soa_buffer_h.get(),
+                            soa_layout_d.metadata().data(),
+                            soa_layout_d.metadata().byteSize(),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+  pixelTrack::TrackSoALayout soa_layout_h(soa_buffer_h.get(), soa_layout_d.metadata().size());
+  pixelTrack::TrackSoAView soa_view_h(soa_layout_h);
 
-  auto retView = cms::cuda::make_host_unique<std::byte[]>(inputData.bufferSize(), ctx.stream());
-  cudaCheck(cudaMemcpy(retView.get(),inputData.buffer().get(),TrackSoAHeterogeneousT_test<>::computeDataSize(32768),cudaMemcpyDeviceToHost));
-  TrackSoAHeterogeneousT_test<> tmp_layout(retView.get(),32768);
-  TrackSoAHeterogeneousT_test<>::View tmp_view_(tmp_layout);
+  // // Allocate enough host memory to fit the SoA data in the input view
+  // auto soa_buffer_host = cms::cuda::make_host_unique<std::byte[]>(soa_.layout()., ctx.stream());
 
+  // // Copy data from the view on device to host memory
+  // cudaCheck(cudaMemcpy(soa_buffer_host.get(), soa_.buffer().get(), soa_.metadata().byteSize(), cudaMemcpyDeviceToHost));
+  // TrackSoAHeterogeneousT_test<> soa_layout(soa_buffer_host.get(), soa_.metadata().size());
+  // TrackSoAHeterogeneousT_test<>::View soa_host_view_(soa_layout);  // Store the host-side view
 }
 
 void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
   // check that the fixed-size SoA does not overflow
-  //auto tsoa = soa_;
-  //auto maxTracks = tsoa.stride();
-  auto maxTracks = 32768;
-  auto nTracks = tmp_view_.nTracks();
+  auto maxTracks = soa_layout_h.metadata().size();
+  auto nTracks = soa_view_h.nTracks();
   assert(nTracks < maxTracks);
   if (nTracks == maxTracks - 1) {
     edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1
@@ -104,13 +88,13 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i
   }
 
 #ifdef PIXEL_DEBUG_PRODUCE
-  std::cout << "size of SoA " << sizeof(soa_) << " stride " << maxTracks << std::endl;
-  std::cout << "found " << nTracks << " tracks in cpu SoA at " << &soa_ << std::endl;
+  std::cout << "size of SoA " << soa_layout_h.metadata().byteSize() << " stride " << maxTracks << std::endl;
+  std::cout << "found " << nTracks << " tracks in cpu SoA at " << soa_layout_h.metadata().data() << std::endl;
 
   int32_t nt = 0;
   for (int32_t it = 0; it < maxTracks; ++it) {
-    auto nHits = soa_.nHits(it);
-    assert(nHits == int(soa_.hitIndices.size(it)));
+    auto nHits = pixelTrack::utilities::nHits(soa_view_h, it);
+    assert(nHits == int(soa_view_h.hitIndices().size(it)));
     if (nHits == 0)
       break;  // this is a guard: maybe we need to move to nTracks...
     nt++;
@@ -119,7 +103,7 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i
 #endif
 
   // DO NOT  make a copy  (actually TWO....)
-  iEvent.emplace(tokenSOA_, std::move(soa_));//, std::move(ret)); // view
+  iEvent.emplace(tokenSOA_, std::move(soa_view_h));  //, std::move(ret)); // view
 
   //assert(!soa_);
 }

From 2234467c25abad84e0b92948dd74067e1d2cc216 Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Fri, 21 Oct 2022 16:57:40 +0200
Subject: [PATCH 062/110] Adding copyToHost function

---
 .../Track/interface/TrackSoAHeterogeneousT_test.h    | 12 ++++++++++++
 .../Track/test/TrackSoAHeterogeneous_test.cpp        |  8 ++------
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index e5485588ecb8a..630c5b70bb22d 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -11,6 +11,10 @@
 #include "DataFormats/SoATemplate/interface/SoALayout.h"
 #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
 #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/allocate_host.h"
 
 namespace pixelTrack {
   enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality };
@@ -147,11 +151,19 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
   // Constructor which specifies the SoA size
   explicit TrackSoAHeterogeneousT(cudaStream_t stream)
       : PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>(S, stream) {}
+
+  // Copy data from device to host
+  __host__ cms::cuda::host::unique_ptr<std::byte[]> copyToHost(cudaStream_t stream) {
+    auto tracks_h_soa = cms::cuda::make_host_unique<std::byte[]>(bufferSize(), stream);
+    cudaCheck(cudaMemcpy(tracks_h_soa.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost));
+    return tracks_h_soa;
+  }
 };
 
 namespace pixelTrack {
 
   using TrackSoA = TrackSoAHeterogeneousT<maxNumber()>;
+  using TrackSoALayout = TrackSoAHeterogeneousT_test<>;
   using TrackSoAView = TrackSoAHeterogeneousT_test<>::View;
   using TrackSoAConstView = TrackSoAHeterogeneousT_test<>::ConstView;
 
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index 9bfd445bd786c..4be8343e3474d 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -7,7 +7,7 @@
    the CUDA kernels which:
    - Fill the SoA with data.
    - Verify that the data written is correct.
-   
+
    Then, the SoA data are copied back to Host, where
    a temporary host-side view (tmp_view) is created using
    the same Layout to access the data on host and print it.
@@ -36,17 +36,13 @@ int main() {
     // Instantiate tracks on host. Portabledevicecollection allocates
     // SoA on device automatically.
     pixelTrack::TrackSoA tracks(stream);
-    uint32_t soaSize = tracks.bufferSize();               // SoA Layout size (bytes)
     uint32_t soaNumElements = tracks->metadata().size();  // Length of each SoA array in elements
 
     // Run the tests
     testTrackSoAHeterogeneousT::runKernels(tracks.view());
 
-    // Copy SoA data back to host
-    auto tracks_h_soa = cms::cuda::make_host_unique<std::byte[]>(soaSize, stream);
-    cudaCheck(cudaMemcpy(tracks_h_soa.get(), tracks.const_buffer().get(), soaSize, cudaMemcpyDeviceToHost));
-
     // Create a view to access the copied data
+    auto tracks_h_soa = tracks.copyToHost(stream);
     TrackSoAHeterogeneousT_test<> tmp_layout(tracks_h_soa.get(), soaNumElements);
     TrackSoAHeterogeneousT_test<>::View tmp_view(tmp_layout);
 

From f98898e2fef9ad932da862d65c1859f749b84a91 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Fri, 21 Oct 2022 17:50:34 +0200
Subject: [PATCH 063/110] CAHitNtuplet outputs modified to Layout(GPU) or
 View(CPU)

---
 .../PixelTriplets/plugins/CAHitNtupletCUDA.cc          |  6 ++++--
 .../plugins/CAHitNtupletGeneratorOnGPU.cc              | 10 ++++++----
 .../PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h |  6 +++++-
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
index c539f74b85af8..219dc21ec93d9 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
@@ -41,10 +41,12 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> {
 
   edm::ESGetToken<MagneticField, IdealMagneticFieldRecord> tokenField_;
   // GPU
+  // Produces a view on GPU, which is used by PixelTrackSoAFromCUDA
   edm::EDGetTokenT<cms::cuda::Product<TrackingRecHit2DGPU>> tokenHitGPU_;
-  edm::EDPutTokenT<cms::cuda::Product<pixelTrack::TrackSoAView>> tokenTrackGPU_;
+  edm::EDPutTokenT<cms::cuda::Product<pixelTrack::TrackSoALayout>> tokenTrackGPU_;
 
   // CPU
+  // Produces a view on CPU, which is used by PixelTrackProducerFromSoA
   edm::EDGetTokenT<TrackingRecHit2DCPU> tokenHitCPU_;
   edm::EDPutTokenT<pixelTrack::TrackSoAView> tokenTrackCPU_;
 
@@ -56,7 +58,7 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig)
   if (onGPU_) {
     tokenHitGPU_ =
         consumes<cms::cuda::Product<TrackingRecHit2DGPU>>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"));
-    tokenTrackGPU_ = produces<cms::cuda::Product<pixelTrack::TrackSoAView>>();
+    tokenTrackGPU_ = produces<cms::cuda::Product<pixelTrack::TrackSoALayout>>();
   } else {
     tokenHitCPU_ = consumes<TrackingRecHit2DCPU>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"));
     tokenTrackCPU_ = produces<pixelTrack::TrackSoAView>();
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
index 5dd3de3e232f8..7233e0e241fcc 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -190,10 +190,12 @@ void CAHitNtupletGeneratorOnGPU::endJob() {
                                                                     float bfield,
                                                                     cudaStream_t stream) const {
   PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique<pixelTrack::TrackSoA>(stream));*/
-pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
-                                                                     float bfield,
-                                                                     cudaStream_t stream) const {
+pixelTrack::TrackSoALayout CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
+                                                                       float bfield,
+                                                                       cudaStream_t stream) const {
   pixelTrack::TrackSoA tracks(stream);
+  auto soaNumElements = tracks->metadata().size();
+  TrackSoAHeterogeneousT_test<> tmp_layout(tracks.buffer().get(), soaNumElements);
   auto* soa = &tracks;
 
   CAHitNtupletGeneratorKernelsGPU kernels(m_params);
@@ -218,7 +220,7 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRec
   std::cout << "finished building pixel tracks on GPU" << std::endl;
 #endif
 
-  return tracks.view();
+  return tmp_layout;
 }
 
 pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const {
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
index 0c5b9531fed0c..621503f3e22a4 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
@@ -47,8 +47,12 @@ class CAHitNtupletGeneratorOnGPU {
   void beginJob();
   void endJob();
 
-  pixelTrack::TrackSoAView makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const;
+  // On GPU
+  pixelTrack::TrackSoALayout makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
+                                             float bfield,
+                                             cudaStream_t stream) const;
 
+  // On CPU
   pixelTrack::TrackSoAView makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const;
 
 private:

From d566db8155ead426f7e9dfd91e8170b824f09015 Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Fri, 21 Oct 2022 18:15:44 +0200
Subject: [PATCH 064/110] Changing products to soa views

---
 .../plugins/PixelTrackSoAFromCUDA.cc          | 22 +++++++++----------
 .../PixelTriplets/plugins/CAHitNtupletCUDA.cc |  4 ++--
 .../plugins/CAHitNtupletGeneratorOnGPU.cc     |  6 ++---
 .../plugins/CAHitNtupletGeneratorOnGPU.h      |  2 +-
 .../PixelTriplets/src/classes.h               |  1 +
 .../PixelTriplets/src/classes_def.xml         |  4 ++++
 .../plugins/PixelVertexProducerCUDA.cc        | 11 +++++-----
 7 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
index 7cffdcb80a273..e31f195578f35 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
@@ -33,15 +33,15 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork>
                edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
   void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
 
-  edm::EDGetTokenT<cms::cuda::Product<pixelTrack::TrackSoALayout>> tokenCUDA_;
+  edm::EDGetTokenT<pixelTrack::TrackSoAView> tokenCUDA_;
   edm::EDPutTokenT<pixelTrack::TrackSoAView> tokenSOA_;
 
   pixelTrack::TrackSoAView soa_view_h;
-  pixelTrack::TrackSoALayout soa_layout_h;
+  //pixelTrack::TrackSoALayout soa_layout_h;
 };
 
 PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig)
-    : tokenCUDA_(consumes<cms::cuda::Product<pixelTrack::TrackSoALayout>>(iConfig.getParameter<edm::InputTag>("src"))),
+    : tokenCUDA_(consumes<pixelTrack::TrackSoAView>(iConfig.getParameter<edm::InputTag>("src"))),
       tokenSOA_(produces<pixelTrack::TrackSoAView>()) {}
 
 void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
@@ -54,11 +54,11 @@ void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& des
 void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
                                     edm::EventSetup const& iSetup,
                                     edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  cms::cuda::Product<pixelTrack::TrackSoALayout> const& inputDataWrapped = iEvent.get(tokenCUDA_);
-  cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
-  auto const& soa_layout_d = ctx.get(inputDataWrapped);  // Layout of data on device
+  soa_view_h = iEvent.get(tokenCUDA_);
+  //cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
+  //auto const& soa_view_h = ctx.get(inputDataWrapped);  // Layout of data on device
 
-  auto soa_buffer_h = cms::cuda::make_host_unique<std::byte[]>(soa_layout_d.metadata().byteSize(), ctx.stream());
+  /*auto soa_buffer_h = cms::cuda::make_host_unique<std::byte[]>(soa_layout_d.metadata().byteSize(), ctx.stream());
 
   cudaCheck(cudaMemcpyAsync(soa_buffer_h.get(),
                             soa_layout_d.metadata().data(),
@@ -66,7 +66,7 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
                             cudaMemcpyDeviceToHost,
                             ctx.stream()));
   pixelTrack::TrackSoALayout soa_layout_h(soa_buffer_h.get(), soa_layout_d.metadata().size());
-  pixelTrack::TrackSoAView soa_view_h(soa_layout_h);
+  pixelTrack::TrackSoAView soa_view_h(soa_layout_h);*/
 
   // // Allocate enough host memory to fit the SoA data in the input view
   // auto soa_buffer_host = cms::cuda::make_host_unique<std::byte[]>(soa_.layout()., ctx.stream());
@@ -79,7 +79,7 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
 
 void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
   // check that the fixed-size SoA does not overflow
-  auto maxTracks = soa_layout_h.metadata().size();
+  auto maxTracks = soa_view_h.metadata().size();
   auto nTracks = soa_view_h.nTracks();
   assert(nTracks < maxTracks);
   if (nTracks == maxTracks - 1) {
@@ -88,8 +88,8 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i
   }
 
 #ifdef PIXEL_DEBUG_PRODUCE
-  std::cout << "size of SoA " << soa_layout_h.metadata().byteSize() << " stride " << maxTracks << std::endl;
-  std::cout << "found " << nTracks << " tracks in cpu SoA at " << soa_layout_h.metadata().data() << std::endl;
+  std::cout << " stride " << maxTracks << std::endl;
+  std::cout << "found " << nTracks << std::endl;
 
   int32_t nt = 0;
   for (int32_t it = 0; it < maxTracks; ++it) {
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
index 219dc21ec93d9..502717f263a90 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
@@ -43,7 +43,7 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> {
   // GPU
   // Produces a view on GPU, which is used by PixelTrackSoAFromCUDA
   edm::EDGetTokenT<cms::cuda::Product<TrackingRecHit2DGPU>> tokenHitGPU_;
-  edm::EDPutTokenT<cms::cuda::Product<pixelTrack::TrackSoALayout>> tokenTrackGPU_;
+  edm::EDPutTokenT<pixelTrack::TrackSoAView> tokenTrackGPU_;
 
   // CPU
   // Produces a view on CPU, which is used by PixelTrackProducerFromSoA
@@ -58,7 +58,7 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig)
   if (onGPU_) {
     tokenHitGPU_ =
         consumes<cms::cuda::Product<TrackingRecHit2DGPU>>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"));
-    tokenTrackGPU_ = produces<cms::cuda::Product<pixelTrack::TrackSoALayout>>();
+    tokenTrackGPU_ = produces<pixelTrack::TrackSoAView>();
   } else {
     tokenHitCPU_ = consumes<TrackingRecHit2DCPU>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"));
     tokenTrackCPU_ = produces<pixelTrack::TrackSoAView>();
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
index 7233e0e241fcc..6a8de7fc49f66 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -190,12 +190,10 @@ void CAHitNtupletGeneratorOnGPU::endJob() {
                                                                     float bfield,
                                                                     cudaStream_t stream) const {
   PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique<pixelTrack::TrackSoA>(stream));*/
-pixelTrack::TrackSoALayout CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
+pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
                                                                        float bfield,
                                                                        cudaStream_t stream) const {
   pixelTrack::TrackSoA tracks(stream);
-  auto soaNumElements = tracks->metadata().size();
-  TrackSoAHeterogeneousT_test<> tmp_layout(tracks.buffer().get(), soaNumElements);
   auto* soa = &tracks;
 
   CAHitNtupletGeneratorKernelsGPU kernels(m_params);
@@ -220,7 +218,7 @@ pixelTrack::TrackSoALayout CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingR
   std::cout << "finished building pixel tracks on GPU" << std::endl;
 #endif
 
-  return tmp_layout;
+  return tracks.view();
 }
 
 pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const {
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
index 621503f3e22a4..85457f30fd19d 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
@@ -48,7 +48,7 @@ class CAHitNtupletGeneratorOnGPU {
   void endJob();
 
   // On GPU
-  pixelTrack::TrackSoALayout makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
+  pixelTrack::TrackSoAView makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
                                              float bfield,
                                              cudaStream_t stream) const;
 
diff --git a/RecoPixelVertexing/PixelTriplets/src/classes.h b/RecoPixelVertexing/PixelTriplets/src/classes.h
index 4f495027ac186..db84e140b26de 100644
--- a/RecoPixelVertexing/PixelTriplets/src/classes.h
+++ b/RecoPixelVertexing/PixelTriplets/src/classes.h
@@ -1,5 +1,6 @@
 #include "RecoPixelVertexing/PixelTriplets/interface/IntermediateHitTriplets.h"
 #include "DataFormats/Common/interface/Wrapper.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 
 #include <vector>
 
diff --git a/RecoPixelVertexing/PixelTriplets/src/classes_def.xml b/RecoPixelVertexing/PixelTriplets/src/classes_def.xml
index ea89a65a45dbb..78018a50bfff3 100644
--- a/RecoPixelVertexing/PixelTriplets/src/classes_def.xml
+++ b/RecoPixelVertexing/PixelTriplets/src/classes_def.xml
@@ -1,4 +1,8 @@
 <lcgdict>
   <class name="IntermediateHitTriplets" persistent="false"/>
   <class name="edm::Wrapper<IntermediateHitTriplets>" persistent="false"/>
+  <class name="pixelTrack::TrackSoAView" persistent="false"/>
+  <class name="edm::Wrapper<pixelTrack::TrackSoAView>" persistent="false"/>
+  <class name="cms::cuda::Product<pixelTrack::TrackSoAView>" persistent="false"/>
+  <class name="edm::Wrapper<cms::cuda::Product<pixelTrack::TrackSoAView>>" persistent="false"/>
 </lcgdict>
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
index 34b0ed9e29fc1..5b316a53a691e 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
@@ -16,6 +16,7 @@
 #include "FWCore/Utilities/interface/EDGetToken.h"
 #include "FWCore/Utilities/interface/RunningAverage.h"
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 
 #include "gpuVertexFinder.h"
 
@@ -35,9 +36,9 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> {
 
   bool onGPU_;
 
-  edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenGPUTrack_;
+  edm::EDGetTokenT<cms::cuda::Product<pixelTrack::TrackSoALayout>> tokenGPUTrack_;
   edm::EDPutTokenT<ZVertexCUDAProduct> tokenGPUVertex_;
-  edm::EDGetTokenT<PixelTrackHeterogeneous> tokenCPUTrack_;
+  edm::EDGetTokenT<pixelTrack::TrackSoAView> tokenCPUTrack_;
   edm::EDPutTokenT<ZVertexHeterogeneous> tokenCPUVertex_;
 
   const gpuVertexFinder::Producer gpuAlgo_;
@@ -62,10 +63,10 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf)
 {
   if (onGPU_) {
     tokenGPUTrack_ =
-        consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
+        consumes<cms::cuda::Product<pixelTrack::TrackSoALayout>>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
     tokenGPUVertex_ = produces<ZVertexCUDAProduct>();
   } else {
-    tokenCPUTrack_ = consumes<PixelTrackHeterogeneous>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
+    tokenCPUTrack_ = consumes<pixelTrack::TrackSoAView>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
     tokenCPUVertex_ = produces<ZVertexHeterogeneous>();
   }
 }
@@ -97,7 +98,7 @@ void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& d
 void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID,
                                            edm::Event& iEvent,
                                            const edm::EventSetup& iSetup) const {
-  edm::Handle<cms::cuda::Product<PixelTrackHeterogeneous>> hTracks;
+  edm::Handle<cms::cuda::Product<pixelTrack::TrackSoALayout>> hTracks;
   iEvent.getByToken(tokenGPUTrack_, hTracks);
 
   cms::cuda::ScopedContextProduce ctx{*hTracks};

From d544dc3308fe1bb803656f4d074bcc7561c82c63 Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Tue, 25 Oct 2022 16:43:57 +0200
Subject: [PATCH 065/110] Solving issue with ctx.emplace

---
 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc  | 4 ++--
 .../PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h        | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
index 502717f263a90..d8a634328af7a 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
@@ -43,7 +43,7 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> {
   // GPU
   // Produces a view on GPU, which is used by PixelTrackSoAFromCUDA
   edm::EDGetTokenT<cms::cuda::Product<TrackingRecHit2DGPU>> tokenHitGPU_;
-  edm::EDPutTokenT<pixelTrack::TrackSoAView> tokenTrackGPU_;
+  edm::EDPutTokenT<cms::cuda::Product<pixelTrack::TrackSoAView>> tokenTrackGPU_;
 
   // CPU
   // Produces a view on CPU, which is used by PixelTrackProducerFromSoA
@@ -58,7 +58,7 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig)
   if (onGPU_) {
     tokenHitGPU_ =
         consumes<cms::cuda::Product<TrackingRecHit2DGPU>>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"));
-    tokenTrackGPU_ = produces<pixelTrack::TrackSoAView>();
+    tokenTrackGPU_ = produces<cms::cuda::Product<pixelTrack::TrackSoAView>>();
   } else {
     tokenHitCPU_ = consumes<TrackingRecHit2DCPU>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"));
     tokenTrackCPU_ = produces<pixelTrack::TrackSoAView>();
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
index 85457f30fd19d..6b9a00ef9757f 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
@@ -3,7 +3,8 @@
 
 #include <cuda_runtime.h>
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 
 #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"

From 656d9d1acd28e332958046806f7f6b9452e1bae4 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 25 Oct 2022 17:54:35 +0200
Subject: [PATCH 066/110] PixelVertexProducer adapted to new inputs

---
 .../PixelTriplets/src/classes_def.xml         |  4 +--
 .../plugins/PixelVertexProducerCUDA.cc        | 29 ++++++++---------
 .../plugins/gpuVertexFinder.cc                | 31 +++++++++----------
 .../plugins/gpuVertexFinder.h                 |  6 ++--
 4 files changed, 32 insertions(+), 38 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/src/classes_def.xml b/RecoPixelVertexing/PixelTriplets/src/classes_def.xml
index 78018a50bfff3..405eedfe74760 100644
--- a/RecoPixelVertexing/PixelTriplets/src/classes_def.xml
+++ b/RecoPixelVertexing/PixelTriplets/src/classes_def.xml
@@ -3,6 +3,6 @@
   <class name="edm::Wrapper<IntermediateHitTriplets>" persistent="false"/>
   <class name="pixelTrack::TrackSoAView" persistent="false"/>
   <class name="edm::Wrapper<pixelTrack::TrackSoAView>" persistent="false"/>
-  <class name="cms::cuda::Product<pixelTrack::TrackSoAView>" persistent="false"/>
-  <class name="edm::Wrapper<cms::cuda::Product<pixelTrack::TrackSoAView>>" persistent="false"/>
+  <!-- <class name="cms::cuda::Product<pixelTrack::TrackSoAView>" persistent="false"/> -->
+  <!-- <class name="edm::Wrapper<cms::cuda::Product<pixelTrack::TrackSoAView>>" persistent="false"/> -->
 </lcgdict>
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
index 5b316a53a691e..16b3267a326ce 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
@@ -36,7 +36,7 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> {
 
   bool onGPU_;
 
-  edm::EDGetTokenT<cms::cuda::Product<pixelTrack::TrackSoALayout>> tokenGPUTrack_;
+  edm::EDGetTokenT<cms::cuda::Product<pixelTrack::TrackSoAView>> tokenGPUTrack_;
   edm::EDPutTokenT<ZVertexCUDAProduct> tokenGPUVertex_;
   edm::EDGetTokenT<pixelTrack::TrackSoAView> tokenCPUTrack_;
   edm::EDPutTokenT<ZVertexHeterogeneous> tokenCPUVertex_;
@@ -63,7 +63,7 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf)
 {
   if (onGPU_) {
     tokenGPUTrack_ =
-        consumes<cms::cuda::Product<pixelTrack::TrackSoALayout>>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
+        consumes<cms::cuda::Product<pixelTrack::TrackSoAView>>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
     tokenGPUVertex_ = produces<ZVertexCUDAProduct>();
   } else {
     tokenCPUTrack_ = consumes<pixelTrack::TrackSoAView>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
@@ -98,40 +98,37 @@ void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& d
 void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID,
                                            edm::Event& iEvent,
                                            const edm::EventSetup& iSetup) const {
-  edm::Handle<cms::cuda::Product<pixelTrack::TrackSoALayout>> hTracks;
+  edm::Handle<cms::cuda::Product<pixelTrack::TrackSoAView>> hTracks;
   iEvent.getByToken(tokenGPUTrack_, hTracks);
 
   cms::cuda::ScopedContextProduce ctx{*hTracks};
-  auto const* tracks = ctx.get(*hTracks).get();
+  auto tracks_view = ctx.get(*hTracks);
 
-  assert(tracks);
-
-  ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_, ptMax_));
+  ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks_view, ptMin_, ptMax_));
 }
 
 void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID,
                                            edm::Event& iEvent,
                                            const edm::EventSetup& iSetup) const {
-  auto const* tracks = iEvent.get(tokenCPUTrack_).get();
-  assert(tracks);
+  auto tracks_view = iEvent.get(tokenCPUTrack_);
 
 #ifdef PIXVERTEX_DEBUG_PRODUCE
-  auto const& tsoa = *tracks;
-  auto maxTracks = tsoa.stride();
-  std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl;
+
+  auto maxTracks = tracks_view.metadata().size();
+  // std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl;
 
   int32_t nt = 0;
   for (int32_t it = 0; it < maxTracks; ++it) {
-    auto nHits = tsoa.nHits(it);
-    assert(nHits == int(tsoa.hitIndices.size(it)));
+    auto nHits = pixelTrack::utilities::nHits(tracks_view, it);
+    assert(nHits == int(tracks_view.hitIndices().size(it)));
     if (nHits == 0)
       break;  // this is a guard: maybe we need to move to nTracks...
     nt++;
   }
-  std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl;
+  // std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
 
-  iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks, ptMin_, ptMax_));
+  iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks_view, ptMin_, ptMax_));
 }
 
 void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
index 20b007d2d029f..fe2f00f91b495 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
@@ -18,27 +18,24 @@ namespace gpuVertexFinder {
   // split vertices with a chi2/NDoF greater than this
   constexpr float maxChi2ForSplit = 9.f;
 
-  __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) {
-    assert(ptracks);
+  __global__ void loadTracks(TkSoAView tracks_view, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) {
     assert(soa);
-    auto const& tracks = *ptracks;
-    auto const& fit = tracks.stateAtBS;
-    auto const* quality = tracks.qualityData();
+    auto const* quality = pixelTrack::utilities::qualityData(tracks_view);
 
     auto first = blockIdx.x * blockDim.x + threadIdx.x;
-    for (int idx = first, nt = tracks.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) {
-      auto nHits = tracks.nHits(idx);
+    for (int idx = first, nt = tracks_view.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) {
+      auto nHits = pixelTrack::utilities::nHits(tracks_view, idx);
       assert(nHits >= 3);
 
       // initialize soa...
       soa->idv[idx] = -1;
 
-      if (tracks.isTriplet(idx))
+      if (pixelTrack::utilities::isTriplet(tracks_view, idx))
         continue;  // no triplets
       if (quality[idx] < pixelTrack::Quality::highPurity)
         continue;
 
-      auto pt = tracks.pt(idx);
+      auto pt = tracks_view[idx].pt();
 
       if (pt < ptMin)
         continue;
@@ -49,8 +46,8 @@ namespace gpuVertexFinder {
       auto& data = *pws;
       auto it = atomicAdd(&data.ntrks, 1);
       data.itrk[it] = idx;
-      data.zt[it] = tracks.zip(idx);
-      data.ezt2[it] = fit.covariance(idx)(14);
+      data.zt[it] = pixelTrack::utilities::zip(tracks_view, idx);
+      data.ezt2[it] = tracks_view[idx].covariance()(14);
       data.ptt2[it] = pt * pt;
     }
   }
@@ -95,19 +92,19 @@ namespace gpuVertexFinder {
 #endif
 
 #ifdef __CUDACC__
-  ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const {
+  ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoAView tracks_view, float ptMin, float ptMax) const {
 #ifdef PIXVERTEX_DEBUG_PRODUCE
     std::cout << "producing Vertices on GPU" << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
     ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream));
 #else
-  ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin, float ptMax) const {
+  ZVertexHeterogeneous Producer::make(TkSoAView tracks_view, float ptMin, float ptMax) const {
 #ifdef PIXVERTEX_DEBUG_PRODUCE
     std::cout << "producing Vertices on  CPU" << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
     ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>());
 #endif
-    assert(tksoa);
+    // tracks_view is passed by value, so there is no pointer to null-check here.
     auto* soa = vertices.get();
     assert(soa);
 
@@ -120,12 +117,12 @@ namespace gpuVertexFinder {
 #ifdef __CUDACC__
     init<<<1, 1, 0, stream>>>(soa, ws_d.get());
     auto blockSize = 128;
-    auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize;
-    loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin, ptMax);
+    auto numberOfBlocks = (tracks_view.metadata().size() + blockSize - 1) / blockSize;
+    loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tracks_view, soa, ws_d.get(), ptMin, ptMax);
     cudaCheck(cudaGetLastError());
 #else
     init(soa, ws_d.get());
-    loadTracks(tksoa, soa, ws_d.get(), ptMin, ptMax);
+    loadTracks(tracks_view, soa, ws_d.get(), ptMin, ptMax);
 #endif
 
 #ifdef __CUDACC__
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
index 2b6a8107d927f..514c9b6a881fd 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
@@ -11,6 +11,7 @@ namespace gpuVertexFinder {
 
   using ZVertices = ZVertexSoA;
   using TkSoA = pixelTrack::TrackSoA;
+  using TkSoAView = pixelTrack::TrackSoAView;
 
   // workspace used in the vertex reco algos
   struct WorkSpace {
@@ -42,7 +43,6 @@ namespace gpuVertexFinder {
   public:
     using ZVertices = ZVertexSoA;
     using WorkSpace = gpuVertexFinder::WorkSpace;
-    using TkSoA = pixelTrack::TrackSoA;
 
     Producer(bool oneKernel,
              bool useDensity,
@@ -64,8 +64,8 @@ namespace gpuVertexFinder {
 
     ~Producer() = default;
 
-    ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const;
-    ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin, float ptMax) const;
+    ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoAView tracks_view, float ptMin, float ptMax) const;
+    ZVertexHeterogeneous make(TkSoAView tracks_view, float ptMin, float ptMax) const;
 
   private:
     const bool oneKernel_;

From 3cdfab43ba24dcd3874c616afa29296d153ffae3 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Wed, 26 Oct 2022 12:15:35 +0200
Subject: [PATCH 067/110] Removed duplicate entries in classes_def

---
 RecoPixelVertexing/PixelTriplets/src/classes_def.xml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/src/classes_def.xml b/RecoPixelVertexing/PixelTriplets/src/classes_def.xml
index 405eedfe74760..ea89a65a45dbb 100644
--- a/RecoPixelVertexing/PixelTriplets/src/classes_def.xml
+++ b/RecoPixelVertexing/PixelTriplets/src/classes_def.xml
@@ -1,8 +1,4 @@
 <lcgdict>
   <class name="IntermediateHitTriplets" persistent="false"/>
   <class name="edm::Wrapper<IntermediateHitTriplets>" persistent="false"/>
-  <class name="pixelTrack::TrackSoAView" persistent="false"/>
-  <class name="edm::Wrapper<pixelTrack::TrackSoAView>" persistent="false"/>
-  <!-- <class name="cms::cuda::Product<pixelTrack::TrackSoAView>" persistent="false"/> -->
-  <!-- <class name="edm::Wrapper<cms::cuda::Product<pixelTrack::TrackSoAView>>" persistent="false"/> -->
 </lcgdict>

From f01e173eb624eaf16237ec5d1dd207e819484a5e Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Wed, 26 Oct 2022 14:37:49 +0200
Subject: [PATCH 068/110] Changing PixelTrackProducerFromSoA to use view

---
 .../plugins/PixelTrackProducerFromSoA.cc      | 38 ++++++++++++-------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
index c5d31764b0fcb..9e4839ec8b644 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
@@ -27,7 +27,8 @@
 #include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h"
 
 #include "CUDADataFormats/Common/interface/HostProduct.h"
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
 
 #include "storeTracks.h"
@@ -35,7 +36,7 @@
 
 /**
  * This class creates "leagcy"  reco::Track
- * objects from the output of SoA CA. 
+ * objects from the output of SoA CA.
  */
 class PixelTrackProducerFromSoA : public edm::global::EDProducer<> {
 public:
@@ -54,7 +55,8 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> {
 
   // Event Data tokens
   const edm::EDGetTokenT<reco::BeamSpot> tBeamSpot_;
-  const edm::EDGetTokenT<PixelTrackHeterogeneous> tokenTrack_;
+  //const edm::EDGetTokenT<PixelTrackHeterogeneous> tokenTrack_;
+  const edm::EDGetTokenT<pixelTrack::TrackSoAView> tokenTrack_;
   const edm::EDGetTokenT<SiPixelRecHitCollectionNew> cpuHits_;
   const edm::EDGetTokenT<HMSstorage> hmsToken_;
   // Event Setup tokens
@@ -67,7 +69,8 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> {
 
 PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig)
     : tBeamSpot_(consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpot"))),
-      tokenTrack_(consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("trackSrc"))),
+      //tokenTrack_(consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("trackSrc"))),
+      tokenTrack_(consumes<pixelTrack::TrackSoAView>(iConfig.getParameter<edm::InputTag>("trackSrc"))),
       cpuHits_(consumes<SiPixelRecHitCollectionNew>(iConfig.getParameter<edm::InputTag>("pixelRecHitLegacySrc"))),
       hmsToken_(consumes<HMSstorage>(iConfig.getParameter<edm::InputTag>("pixelRecHitLegacySrc"))),
       idealMagneticFieldToken_(esConsumes()),
@@ -152,12 +155,16 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
   std::vector<const TrackingRecHit *> hits;
   hits.reserve(5);
 
-  const auto &tsoa = *iEvent.get(tokenTrack_);
+  //const auto &tsoa = *iEvent.get(tokenTrack_);
+  auto tsoa = iEvent.get(tokenTrack_);
 
-  auto const *quality = pixelTrack::utilities::qualityData(tsoa.view());
+  //auto const *quality = pixelTrack::utilities::qualityData(tsoa.view());
   // auto const &fit = tsoa.stateAtBS;
-  auto const &hitIndices = tsoa.view().hitIndices();
-  auto nTracks = tsoa.view().nTracks();
+  //auto const &hitIndices = tsoa.view().hitIndices();
+  //auto nTracks = tsoa.view().nTracks();
+  auto const *quality = pixelTrack::utilities::qualityData(tsoa);
+  auto const hitIndices = tsoa.hitIndices();
+  auto nTracks = tsoa.nTracks();
 
   tracks.reserve(nTracks);
 
@@ -167,13 +174,15 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
   std::vector<int32_t> sortIdxs(nTracks);
   std::iota(sortIdxs.begin(), sortIdxs.end(), 0);
   std::sort(sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) {
-    return tsoa.view()[i1].pt() > tsoa.view()[i2].pt();
+    //return tsoa.view()[i1].pt() > tsoa.view()[i2].pt();
+    return tsoa[i1].pt() > tsoa[i2].pt();
   });
 
   //store the index of the SoA: indToEdm[index_SoAtrack] -> index_edmTrack (if it exists)
   indToEdm.resize(sortIdxs.size(), -1);
   for (const auto &it : sortIdxs) {
-    auto nHits = pixelTrack::utilities::nHits(tsoa.view(), it);
+    //auto nHits = pixelTrack::utilities::nHits(tsoa.view(), it);
+    auto nHits = pixelTrack::utilities::nHits(tsoa, it);
     assert(nHits >= 3);
     auto q = quality[it];
     if (q < minQuality_)
@@ -190,12 +199,15 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
 
     // mind: this values are respect the beamspot!
 
-    float chi2 = tsoa.view()[it].chi2();
-    float phi = pixelTrack::utilities::phi(tsoa.view(), it);
+    //float chi2 = tsoa.view()[it].chi2();
+    //float phi = pixelTrack::utilities::phi(tsoa.view(), it);
+    float chi2 = tsoa[it].chi2();
+    float phi = pixelTrack::utilities::phi(tsoa, it);
 
     riemannFit::Vector5d ipar, opar;
     riemannFit::Matrix5d icov, ocov;
-    pixelTrack::utilities::copyToDense<riemannFit::Vector5d, riemannFit::Matrix5d>(tsoa.view(), ipar, icov, it);
+    //pixelTrack::utilities::copyToDense<riemannFit::Vector5d, riemannFit::Matrix5d>(tsoa.view(), ipar, icov, it);
+    pixelTrack::utilities::copyToDense<riemannFit::Vector5d, riemannFit::Matrix5d>(tsoa, ipar, icov, it);
     riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov);
 
     LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);

From 598ea08fc25579899b027fb317d31666c9d33340 Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Wed, 26 Oct 2022 15:07:30 +0200
Subject: [PATCH 069/110] Changing layout name to avoid underscore

---
 .../interface/TrackSoAHeterogeneousT_test.h      | 16 ++++++++--------
 .../Track/test/TrackSoAHeterogeneous_test.cpp    |  4 ++--
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index 630c5b70bb22d..f7edf60840fce 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -43,7 +43,7 @@ using Vector5f = Eigen::Matrix<float, 5, 1>;
 using Vector15f = Eigen::Matrix<float, 15, 1>;
 using HitContainer = pixelTrack::HitContainer;
 
-GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test,
+GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousLayout,
                     SOA_COLUMN(uint8_t, quality),
                     SOA_COLUMN(float, chi2),  // this is chi2/ndof as not necessarely all hits are used in the fit
                     SOA_COLUMN(int8_t, nLayers),
@@ -59,8 +59,8 @@ GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test,
 // They operate on View and ConstView of the TrackSoA.
 namespace pixelTrack {
   namespace utilities {
-    using TrackSoAView = TrackSoAHeterogeneousT_test<>::View;
-    using TrackSoAConstView = TrackSoAHeterogeneousT_test<>::ConstView;
+    using TrackSoAView = TrackSoAHeterogeneousLayout<>::View;
+    using TrackSoAConstView = TrackSoAHeterogeneousLayout<>::ConstView;
     using Quality = pixelTrack::Quality;
     using hindex_type = uint32_t;
     // State at the Beam spot
@@ -144,13 +144,13 @@ namespace pixelTrack {
 }  // namespace pixelTrack
 
 template <int32_t S>
-class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousT_test<>> {
+class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousLayout<>> {
 public:
   TrackSoAHeterogeneousT() = default;
 
   // Constructor which specifies the SoA size
   explicit TrackSoAHeterogeneousT(cudaStream_t stream)
-      : PortableDeviceCollection<TrackSoAHeterogeneousT_test<>>(S, stream) {}
+      : PortableDeviceCollection<TrackSoAHeterogeneousLayout<>>(S, stream) {}
 
   // Copy data from device to host
   __host__ cms::cuda::host::unique_ptr<std::byte[]> copyToHost(cudaStream_t stream) {
@@ -163,9 +163,9 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
 namespace pixelTrack {
 
   using TrackSoA = TrackSoAHeterogeneousT<maxNumber()>;
-  using TrackSoALayout = TrackSoAHeterogeneousT_test<>;
-  using TrackSoAView = TrackSoAHeterogeneousT_test<>::View;
-  using TrackSoAConstView = TrackSoAHeterogeneousT_test<>::ConstView;
+  using TrackSoALayout = TrackSoAHeterogeneousLayout<>;
+  using TrackSoAView = TrackSoAHeterogeneousLayout<>::View;
+  using TrackSoAConstView = TrackSoAHeterogeneousLayout<>::ConstView;
 
 }  // namespace pixelTrack
 
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index 4be8343e3474d..db26e83428f56 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -43,8 +43,8 @@ int main() {
 
     // Create a view to access the copied data
     auto tracks_h_soa = tracks.copyToHost(stream);
-    TrackSoAHeterogeneousT_test<> tmp_layout(tracks_h_soa.get(), soaNumElements);
-    TrackSoAHeterogeneousT_test<>::View tmp_view(tmp_layout);
+    TrackSoAHeterogeneousLayout<> tmp_layout(tracks_h_soa.get(), soaNumElements);
+    TrackSoAHeterogeneousLayout<>::View tmp_view(tmp_layout);
 
     // Print results
     std::cout << "pt"

From 11acc8dad5fe537c3ec877716a2bd561b1affac8 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Wed, 26 Oct 2022 15:34:40 +0200
Subject: [PATCH 070/110] Correct instantiation of tracks on host-side

---
 .../interface/TrackSoAHeterogeneousT_test.h   |  5 +-
 .../plugins/CAHitNtupletGeneratorKernels.cc   | 48 +++++++++--------
 .../plugins/CAHitNtupletGeneratorKernels.cu   | 54 ++++++++++---------
 .../plugins/CAHitNtupletGeneratorKernels.h    |  4 +-
 .../plugins/CAHitNtupletGeneratorOnGPU.cc     | 29 +++++-----
 5 files changed, 75 insertions(+), 65 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
index f7edf60840fce..323b41226bee0 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
@@ -153,8 +153,9 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackS
       : PortableDeviceCollection<TrackSoAHeterogeneousLayout<>>(S, stream) {}
 
   // Copy data from device to host
-  __host__ cms::cuda::host::unique_ptr<std::byte[]> copyToHost(cudaStream_t stream) {
-    auto tracks_h_soa = cms::cuda::make_host_unique<std::byte[]>(bufferSize(), stream);
+  // NOTE(review): uses a synchronous cudaMemcpy into std::make_unique host memory; the stream argument is unused.
+  __host__ std::unique_ptr<std::byte[]> copyToHost(cudaStream_t stream) {
+    auto tracks_h_soa = std::make_unique<std::byte[]>(bufferSize());
     cudaCheck(cudaMemcpy(tracks_h_soa.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost));
     return tracks_h_soa;
   }
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
index 80497d3dd706b..cdefeab9e36b7 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -78,7 +78,9 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
 }
 
 template <>
-void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
+void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh,
+                                                    TkSoAView tracks_view,
+                                                    cudaStream_t cudaStream) {
   // auto *tuples_d = tracks_d->view().hitIndices();
   // auto *detId_d = tracks_d->view().detIndices();
   // auto *quality_d = tracks_d->qualityData();
@@ -86,7 +88,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   // assert(tuples_d && quality_d); // TODO Find equivalent for View
 
   // zero tuples
-  cms::cuda::launchZero(&tracks_d->view().hitIndices(), cudaStream);
+  cms::cuda::launchZero(&tracks_view.hitIndices(), cudaStream);
 
   auto nhits = hh.nHits();
 
@@ -119,22 +121,22 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
                        device_theCells_.get(),
                        device_nCells_,
                        device_theCellTracks_.get(),
-                       tracks_d->view(),
+                       tracks_view,
                        device_hitTuple_apc_,
                        params_.minHitsPerNtuplet_);
   if (params_.doStats_)
     kernel_mark_used(device_theCells_.get(), device_nCells_);
 
-  cms::cuda::finalizeBulk(device_hitTuple_apc_, &tracks_d->view().hitIndices());
+  cms::cuda::finalizeBulk(device_hitTuple_apc_, &tracks_view.hitIndices());
 
-  kernel_fillHitDetIndices(tracks_d->view(), hh.view());
-  kernel_fillNLayers(tracks_d->view(), device_hitTuple_apc_);
+  kernel_fillHitDetIndices(tracks_view, hh.view());
+  kernel_fillNLayers(tracks_view, device_hitTuple_apc_);
 
   // remove duplicates (tracks that share a doublet)
-  kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_);
-  kernel_countMultiplicity(tracks_d->view(), device_tupleMultiplicity_.get());
+  kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_);
+  kernel_countMultiplicity(tracks_view, device_tupleMultiplicity_.get());
   cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream);
-  kernel_fillMultiplicity(tracks_d->view(), device_tupleMultiplicity_.get());
+  kernel_fillMultiplicity(tracks_view, device_tupleMultiplicity_.get());
 
   if (nhits > 1 && params_.lateFishbone_) {
     gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true);
@@ -142,13 +144,15 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
 }
 
 template <>
-void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
+void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh,
+                                                     TkSoAView tracks_view,
+                                                     cudaStream_t cudaStream) {
   int32_t nhits = hh.nHits();
 
   // auto const *tuples_d = &tracks_d->hitIndices;
-  auto *quality_d = pixelTrack::utilities::qualityData(tracks_d->view());
+  auto *quality_d = pixelTrack::utilities::qualityData(tracks_view);
   // classify tracks based on kinematics
-  kernel_classifyTracks(tracks_d->view(), quality_d, params_.cuts_);
+  kernel_classifyTracks(tracks_view, quality_d, params_.cuts_);
 
   if (params_.lateFishbone_) {
     // apply fishbone cleaning to good tracks
@@ -156,34 +160,34 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
   }
 
   // remove duplicates (tracks that share a doublet)
-  kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_);
+  kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_);
 
   // fill hit->track "map"
   if (params_.doSharedHitCut_ || params_.doStats_) {
-    kernel_countHitInTracks(tracks_d->view(), device_hitToTuple_.get());
+    kernel_countHitInTracks(tracks_view, device_hitToTuple_.get());
     cms::cuda::launchFinalize(hitToTupleView_, cudaStream);
-    kernel_fillHitInTracks(tracks_d->view(), device_hitToTuple_.get());
+    kernel_fillHitInTracks(tracks_view, device_hitToTuple_.get());
   }
 
   // remove duplicates (tracks that share at least one hit)
   if (params_.doSharedHitCut_) {
     kernel_rejectDuplicate(
-        tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+        tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
 
     kernel_sharedHitCleaner(
-        hh.view(), tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+        hh.view(), tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
     if (params_.useSimpleTripletCleaner_) {
       kernel_simpleTripletCleaner(
-          tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+          tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
     } else {
       kernel_tripletCleaner(
-          tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+          tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
     }
   }
 
   if (params_.doStats_) {
     std::lock_guard guard(lock_stat);
-    kernel_checkOverflows(tracks_d->view(),
+    kernel_checkOverflows(tracks_view,
                           device_tupleMultiplicity_.get(),
                           device_hitToTuple_.get(),
                           device_hitTuple_apc_,
@@ -201,7 +205,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
     // counters (add flag???)
     std::lock_guard guard(lock_stat);
     kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_);
-    kernel_doStatsForTracks(tracks_d->view(), quality_d, counters_);
+    kernel_doStatsForTracks(tracks_view, quality_d, counters_);
   }
 
 #ifdef DUMP_GPU_TK_TUPLES
@@ -210,7 +214,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
   {
     std::lock_guard<std::mutex> guard(lock);
     ++iev;
-    kernel_print_found_ntuplets(hh.view(), tracks_d->view(), device_hitToTuple_.get(), 0, 1000000, iev);
+    kernel_print_found_ntuplets(hh.view(), tracks_view, device_hitToTuple_.get(), 0, 1000000, iev);
   }
 #endif
 }
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
index b9a77bd48737d..9cbdcae1a13d8 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
@@ -2,12 +2,14 @@
 #include <mutex>
 
 template <>
-void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
+void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh,
+                                                    TkSoAView tracks_view,
+                                                    cudaStream_t cudaStream) {
   // these are pointer on GPU!
-  auto *quality_d = pixelTrack::utilities::qualityData(tracks_d->view());
+  auto *quality_d = pixelTrack::utilities::qualityData(tracks_view);
 
   // zero tuples
-  cms::cuda::launchZero(&(tracks_d->view().hitIndices()), cudaStream);
+  cms::cuda::launchZero(&(tracks_view.hitIndices()), cudaStream);
 
   int32_t nhits = hh.nHits();
 
@@ -68,7 +70,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
                                                                      device_theCells_.get(),
                                                                      device_nCells_,
                                                                      device_theCellTracks_.get(),
-                                                                     tracks_d->view(),
+                                                                     tracks_view,
                                                                      device_hitTuple_apc_,
                                                                      params_.minHitsPerNtuplet_);
   cudaCheck(cudaGetLastError());
@@ -85,26 +87,24 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   blockSize = 128;
   numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize;
   cms::cuda::finalizeBulk<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_hitTuple_apc_,
-                                                                        &tracks_d->view().hitIndices());
+                                                                        &tracks_view.hitIndices());
 
-  kernel_fillHitDetIndices<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(), hh.view());
+  kernel_fillHitDetIndices<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view, hh.view());
   cudaCheck(cudaGetLastError());
-  kernel_fillNLayers<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(), device_hitTuple_apc_);
+  kernel_fillNLayers<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view, device_hitTuple_apc_);
   cudaCheck(cudaGetLastError());
 
   // remove duplicates (tracks that share a doublet)
   numberOfBlocks = nDoubletBlocks(blockSize);
   kernel_earlyDuplicateRemover<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-      device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_);
+      device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_);
   cudaCheck(cudaGetLastError());
 
   blockSize = 128;
   numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize;
-  kernel_countMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(),
-                                                                         device_tupleMultiplicity_.get());
+  kernel_countMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view, device_tupleMultiplicity_.get());
   cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream);
-  kernel_fillMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(),
-                                                                        device_tupleMultiplicity_.get());
+  kernel_fillMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view, device_tupleMultiplicity_.get());
   cudaCheck(cudaGetLastError());
 
   // do not run the fishbone if there are hits only in BPIX1
@@ -220,9 +220,11 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
 }
 
 template <>
-void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
+void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh,
+                                                     TkSoAView tracks_view,
+                                                     cudaStream_t cudaStream) {
   // these are pointer on GPU!
-  auto *quality_d = pixelTrack::utilities::qualityData(tracks_d->view());
+  auto *quality_d = pixelTrack::utilities::qualityData(tracks_view);
 
   int32_t nhits = hh.nHits();
 
@@ -230,7 +232,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
 
   // classify tracks based on kinematics
   auto numberOfBlocks = nQuadrupletBlocks(blockSize);
-  kernel_classifyTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(), quality_d, params_.cuts_);
+  kernel_classifyTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view, quality_d, params_.cuts_);
 
   cudaCheck(cudaGetLastError());
 
@@ -245,7 +247,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
   // mark duplicates (tracks that share a doublet)
   numberOfBlocks = nDoubletBlocks(blockSize);
   kernel_fastDuplicateRemover<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-      device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_);
+      device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_);
   cudaCheck(cudaGetLastError());
 #ifdef GPU_DEBUG
   cudaCheck(cudaDeviceSynchronize());
@@ -255,13 +257,13 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
     // fill hit->track "map"
     assert(hitToTupleView_.offSize > nhits);
     numberOfBlocks = nQuadrupletBlocks(blockSize);
-    kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(), device_hitToTuple_.get());
+    kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view, device_hitToTuple_.get());
     cudaCheck(cudaGetLastError());
     assert((hitToTupleView_.assoc == device_hitToTuple_.get()) &&
            (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0));
     cms::cuda::launchFinalize(hitToTupleView_, cudaStream);
     cudaCheck(cudaGetLastError());
-    kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(), device_hitToTuple_.get());
+    kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view, device_hitToTuple_.get());
     cudaCheck(cudaGetLastError());
 #ifdef GPU_DEBUG
     cudaCheck(cudaDeviceSynchronize());
@@ -273,17 +275,17 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
     numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize;
 
     kernel_rejectDuplicate<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-        tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+        tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
 
     kernel_sharedHitCleaner<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-        hh.view(), tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+        hh.view(), tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
 
     if (params_.useSimpleTripletCleaner_) {
       kernel_simpleTripletCleaner<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-          tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+          tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
     } else {
       kernel_tripletCleaner<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-          tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
+          tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
     }
     cudaCheck(cudaGetLastError());
 #ifdef GPU_DEBUG
@@ -293,7 +295,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
 
   if (params_.doStats_) {
     numberOfBlocks = (std::max(nhits, int(params_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize;
-    kernel_checkOverflows<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(),
+    kernel_checkOverflows<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view,
                                                                         device_tupleMultiplicity_.get(),
                                                                         device_hitToTuple_.get(),
                                                                         device_hitTuple_apc_,
@@ -314,7 +316,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
     kernel_doStatsForHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_hitToTuple_.get(), counters_);
     cudaCheck(cudaGetLastError());
     numberOfBlocks = (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize;
-    kernel_doStatsForTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d->view(), quality_d, counters_);
+    kernel_doStatsForTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view, quality_d, counters_);
     cudaCheck(cudaGetLastError());
   }
 #ifdef GPU_DEBUG
@@ -330,11 +332,11 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
     ++iev;
     for (int k = 0; k < 20000; k += 500) {
       kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(
-          hh.view(), tracks_d->view(), device_hitToTuple_.get(), k, k + 500, iev);
+          hh.view(), tracks_view, device_hitToTuple_.get(), k, k + 500, iev);
       cudaDeviceSynchronize();
     }
     kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(
-        hh.view(), tracks_d->view(), device_hitToTuple_.get(), 20000, 1000000, iev);
+        hh.view(), tracks_view, device_hitToTuple_.get(), 20000, 1000000, iev);
     cudaDeviceSynchronize();
     // cudaStreamSynchronize(cudaStream);
   }
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
index fcab52e96d210..5a82798905b13 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
@@ -186,9 +186,9 @@ class CAHitNtupletGeneratorKernels {
 
   TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); }
 
-  void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream);
+  void launchKernels(HitsOnCPU const& hh, TkSoAView tracks_view, cudaStream_t cudaStream);
 
-  void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream);
+  void classifyTuples(HitsOnCPU const& hh, TkSoAView tracks_view, cudaStream_t cudaStream);
 
   void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream);
   void allocateOnGPU(int32_t nHits, cudaStream_t stream);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
index 6a8de7fc49f66..f4ab7d3e83504 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -191,8 +191,8 @@ void CAHitNtupletGeneratorOnGPU::endJob() {
                                                                     cudaStream_t stream) const {
   PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique<pixelTrack::TrackSoA>(stream));*/
 pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
-                                                                       float bfield,
-                                                                       cudaStream_t stream) const {
+                                                                     float bfield,
+                                                                     cudaStream_t stream) const {
   pixelTrack::TrackSoA tracks(stream);
   auto* soa = &tracks;
 
@@ -201,7 +201,7 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRec
   kernels.allocateOnGPU(hits_d.nHits(), stream);
 
   kernels.buildDoublets(hits_d, stream);
-  kernels.launchKernels(hits_d, soa, stream);
+  kernels.launchKernels(hits_d, soa->view(), stream);
 
   HelixFitOnGPU fitter(bfield, m_params.fitNas4_);
   fitter.allocateOnGPU(kernels.tupleMultiplicity(), soa->view());
@@ -210,7 +210,7 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRec
   } else {
     fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream);
   }
-  kernels.classifyTuples(hits_d, soa, stream);
+  kernels.classifyTuples(hits_d, soa->view(), stream);
 
 #ifdef GPU_DEBUG
   cudaDeviceSynchronize();
@@ -223,9 +223,12 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRec
 
 pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const {
   //PixelTrackHeterogeneous tracks(std::make_unique<pixelTrack::TrackSoA>());
-  pixelTrack::TrackSoA tracks;
+  // pixelTrack::TrackSoA tracks;
 
-  auto* soa = &tracks;
+  auto tracks_h_soa =
+      std::make_unique<std::byte[]>(TrackSoAHeterogeneousLayout<>::computeDataSize(pixelTrack::maxNumber()));
+  TrackSoAHeterogeneousLayout<> tracks_layout(tracks_h_soa.get(), pixelTrack::maxNumber());
+  TrackSoAHeterogeneousLayout<>::View tracks_view(tracks_layout);
   //assert(soa);
 
   CAHitNtupletGeneratorKernelsCPU kernels(m_params);
@@ -233,14 +236,14 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2D
   kernels.allocateOnGPU(hits_d.nHits(), nullptr);
 
   kernels.buildDoublets(hits_d, nullptr);
-  kernels.launchKernels(hits_d, soa, nullptr);
+  kernels.launchKernels(hits_d, tracks_view, nullptr);
 
   if (0 == hits_d.nHits())
-    return tracks.view();
+    return tracks_view;
 
   // now fit
   HelixFitOnGPU fitter(bfield, m_params.fitNas4_);
-  fitter.allocateOnGPU(kernels.tupleMultiplicity(), soa->view());
+  fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks_view);
 
   if (m_params.useRiemannFit_) {
     fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets);
@@ -248,7 +251,7 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2D
     fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets);
   }
 
-  kernels.classifyTuples(hits_d, soa, nullptr);
+  kernels.classifyTuples(hits_d, tracks_view, nullptr);
 
 #ifdef GPU_DEBUG
   std::cout << "finished building pixel tracks on CPU" << std::endl;
@@ -256,13 +259,13 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2D
 
   // check that the fixed-size SoA does not overflow
 
-  auto maxTracks = soa->view().metadata().size();
-  auto nTracks = soa->view().nTracks();
+  auto maxTracks = tracks_view.metadata().size();
+  auto nTracks = tracks_view.nTracks();
   assert(nTracks < maxTracks);
   if (nTracks == maxTracks - 1) {
     edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1
                                    << " candidates";
   }
 
-  return tracks.view();
+  return tracks_view;
 }

From 34bf18b58c4571601209d443d86d1c0f11ee1777 Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Mon, 31 Oct 2022 15:48:29 +0100
Subject: [PATCH 071/110] Creating two PortableCollections: host and device

---
 ...ogeneousT_test.h => PixelTrackUtilities.h} | 35 ++-----------------
 .../interface/TrackSoAHeterogeneousDevice.h   | 35 +++++++++++++++++++
 .../interface/TrackSoAHeterogeneousHost.h     | 32 +++++++++++++++++
 .../Track/src/TrackSoAHeterogeneous_t_test.cc |  3 +-
 CUDADataFormats/Track/src/classes.h           |  3 +-
 .../Track/test/TrackSoAHeterogeneous_t.cpp    |  3 +-
 .../Track/test/TrackSoAHeterogeneous_test.cpp | 35 ++++++++++++-------
 .../Track/test/TrackSoAHeterogeneous_test.cu  | 17 +++++----
 8 files changed, 109 insertions(+), 54 deletions(-)
 rename CUDADataFormats/Track/interface/{TrackSoAHeterogeneousT_test.h => PixelTrackUtilities.h} (80%)
 create mode 100644 CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
 create mode 100644 CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h
similarity index 80%
rename from CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
rename to CUDADataFormats/Track/interface/PixelTrackUtilities.h
index 323b41226bee0..08ed721b20052 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h
+++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h
@@ -1,20 +1,10 @@
-#ifndef CUDADataFormats_Track_TrackHeterogeneousT_H
-#define CUDADataFormats_Track_TrackHeterogeneousT_H
-
-#include <bits/stdint-uintn.h>
-#include <string>
-#include <algorithm>
+#ifndef CUDADataFormats_Track_PixelTrackUtilities_h
+#define CUDADataFormats_Track_PixelTrackUtilities_h
 
 #include <Eigen/Dense>
 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 #include "DataFormats/SoATemplate/interface/SoALayout.h"
-#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
-#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/allocate_host.h"
 
 namespace pixelTrack {
   enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality };
@@ -143,31 +133,12 @@ namespace pixelTrack {
   }  // namespace utilities
 }  // namespace pixelTrack
 
-template <int32_t S>
-class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousLayout<>> {
-public:
-  TrackSoAHeterogeneousT() = default;
-
-  // Constructor which specifies the SoA size
-  explicit TrackSoAHeterogeneousT(cudaStream_t stream)
-      : PortableDeviceCollection<TrackSoAHeterogeneousLayout<>>(S, stream) {}
-
-  // Copy data from device to host
-  // Copy data from device to host
-  __host__ std::unique_ptr<std::byte[]> copyToHost(cudaStream_t stream) {
-    auto tracks_h_soa = std::make_unique<std::byte[]>(bufferSize());
-    cudaCheck(cudaMemcpy(tracks_h_soa.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost));
-    return tracks_h_soa;
-  }
-};
-
 namespace pixelTrack {
 
-  using TrackSoA = TrackSoAHeterogeneousT<maxNumber()>;
   using TrackSoALayout = TrackSoAHeterogeneousLayout<>;
   using TrackSoAView = TrackSoAHeterogeneousLayout<>::View;
   using TrackSoAConstView = TrackSoAHeterogeneousLayout<>::ConstView;
 
 }  // namespace pixelTrack
 
-#endif  // CUDADataFormats_Track_TrackHeterogeneousT_H
+#endif  // CUDADataFormats_Track_PixelTrackUtilities_h
diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
new file mode 100644
index 0000000000000..cbafb46c9e099
--- /dev/null
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
@@ -0,0 +1,35 @@
+#ifndef CUDADataFormats_Track_TrackHeterogeneousDevice_H
+#define CUDADataFormats_Track_TrackHeterogeneousDevice_H
+
+#include <bits/stdint-uintn.h>
+
+#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
+#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+//#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+//#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h"
+//#include "HeterogeneousCore/CUDAUtilities/interface/allocate_host.h"
+
+// Device-side pixel-track SoA: a PortableDeviceCollection constructed with S elements.
+template <int32_t S>
+class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousLayout<>> {
+public:
+  // Constructor which specifies the SoA size (S) and the stream handed to the base class
+  explicit TrackSoAHeterogeneousDevice(cudaStream_t stream)
+      : PortableDeviceCollection<TrackSoAHeterogeneousLayout<>>(S, stream) {}
+
+  // Blocking device-to-host copy of bufferSize() bytes, ordered after work already queued on `stream`
+  __host__ void copyToHost(cms::cuda::host::unique_ptr<std::byte[]> &host_ptr, cudaStream_t stream) {
+    cudaCheck(cudaMemcpyAsync(host_ptr.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaStreamSynchronize(stream));
+  }
+};
+
+namespace pixelTrack {
+
+  using TrackSoADevice = TrackSoAHeterogeneousDevice<maxNumber()>;
+
+}  // namespace pixelTrack
+
+#endif  // CUDADataFormats_Track_TrackHeterogeneousDevice_H
diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h
new file mode 100644
index 0000000000000..276ddabcc39d4
--- /dev/null
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h
@@ -0,0 +1,32 @@
+#ifndef CUDADataFormats_Track_TrackHeterogeneousHost_H
+#define CUDADataFormats_Track_TrackHeterogeneousHost_H
+
+#include <bits/stdint-uintn.h>
+
+#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
+#include "CUDADataFormats/Common/interface/PortableHostCollection.h"
+//#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+//#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h"
+//#include "HeterogeneousCore/CUDAUtilities/interface/allocate_host.h"
+
+// Pixel-track SoA collection built on cms::cuda::PortableHostCollection, with the
+// number of elements fixed at compile time by the template parameter S.
+template <int32_t S>
+class TrackSoAHeterogeneousHost : public cms::cuda::PortableHostCollection<TrackSoAHeterogeneousLayout<>> {
+public:
+  // Defaulted: relies on the base class default constructor.
+  TrackSoAHeterogeneousHost() = default;
+
+  // Constructor which specifies the SoA size: passes S and the stream to the base class.
+  explicit TrackSoAHeterogeneousHost(cudaStream_t stream)
+      : PortableHostCollection<TrackSoAHeterogeneousLayout<>>(S, stream) {}
+};
+
+namespace pixelTrack {
+
+  using TrackSoAHost = TrackSoAHeterogeneousHost<maxNumber()>;
+
+}  // namespace pixelTrack
+
+#endif  // CUDADataFormats_Track_TrackHeterogeneousHost_H
diff --git a/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc b/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc
index b15debe3cb72b..24792bb6350f8 100644
--- a/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc
+++ b/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc
@@ -1 +1,2 @@
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h
index 5870985315f14..338598e28ebf5 100644
--- a/CUDADataFormats/Track/src/classes.h
+++ b/CUDADataFormats/Track/src/classes.h
@@ -3,7 +3,8 @@
 
 #include "CUDADataFormats/Common/interface/Product.h"
 #include "CUDADataFormats/Common/interface/HostProduct.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
 #include "DataFormats/Common/interface/Wrapper.h"
 
 #endif  // CUDADataFormats_Track_src_classes_h
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp
index 9708b689dd05b..b3d62ffa810f6 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp
@@ -1,4 +1,5 @@
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
 
 #include <iostream>
 #include <cassert>
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index db26e83428f56..be7d5fc7e6c1c 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -14,7 +14,8 @@
  */
 
 #include <bits/stdint-uintn.h>
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h"
@@ -22,7 +23,7 @@
 
 namespace testTrackSoAHeterogeneousT {
 
-  void runKernels(pixelTrack::TrackSoAView tracks_view);
+  void runKernels(pixelTrack::TrackSoAView tracks_view, cudaStream_t stream);
 }
 
 int main() {
@@ -35,16 +36,24 @@ int main() {
   {
     // Instantiate tracks on host. Portabledevicecollection allocates
     // SoA on device automatically.
-    pixelTrack::TrackSoA tracks(stream);
-    uint32_t soaNumElements = tracks->metadata().size();  // Length of each SoA array in elements
+    // pixelTrack::TrackSoADevice tracks(stream);
+    // uint32_t soaNumElements = tracks->metadata().size();  // Length of each SoA array in elements
+    //
+    // // Run the tests
+    // testTrackSoAHeterogeneousT::runKernels(tracks.view());
+    //
+    // // Create a view to access the copied data
+    // auto tracks_h_soa = tracks.copyToHost(stream);
+    // TrackSoAHeterogeneousLayout<> tmp_layout(tracks_h_soa.get(), soaNumElements);
+    // TrackSoAHeterogeneousLayout<>::View tmp_view(tmp_layout);
 
-    // Run the tests
-    testTrackSoAHeterogeneousT::runKernels(tracks.view());
+    // pixelTrack::TrackSoAHost tracks_h(stream);
+    // pixelTrack::TrackSoADevice tracks_d(stream);
+    // testTrackSoAHeterogeneousT::runKernels(tracks_d.view());
+    // tracks_d.copyToHost(tracks_h.buffer(), stream);
 
-    // Create a view to access the copied data
-    auto tracks_h_soa = tracks.copyToHost(stream);
-    TrackSoAHeterogeneousLayout<> tmp_layout(tracks_h_soa.get(), soaNumElements);
-    TrackSoAHeterogeneousLayout<>::View tmp_view(tmp_layout);
+    pixelTrack::TrackSoAHost tracks_h(stream);
+    testTrackSoAHeterogeneousT::runKernels(tracks_h.view(), stream);
 
     // Print results
     std::cout << "pt"
@@ -60,9 +69,9 @@ int main() {
               << "hitIndices off" << std::endl;
 
     for (int i = 0; i < 10; ++i) {
-      std::cout << tmp_view[i].pt() << "\t" << tmp_view[i].eta() << "\t" << tmp_view[i].chi2() << "\t"
-                << (int)tmp_view[i].quality() << "\t" << (int)tmp_view[i].nLayers() << "\t"
-                << tmp_view.hitIndices().off[i] << std::endl;
+      std::cout << tracks_h.view()[i].pt() << "\t" << tracks_h.view()[i].eta() << "\t" << tracks_h.view()[i].chi2() << "\t"
+                << (int)tracks_h.view()[i].quality() << "\t" << (int)tracks_h.view()[i].nLayers() << "\t"
+                << tracks_h.view().hitIndices().off[i] << std::endl;
     }
   }
   cudaCheck(cudaStreamDestroy(stream));
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
index 9c59d867629b2..b7602fb790752 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
@@ -1,5 +1,7 @@
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
 namespace testTrackSoAHeterogeneousT {
 
@@ -20,7 +22,7 @@ namespace testTrackSoAHeterogeneousT {
   }
 
   // TODO: Using TrackSoAConstView fails to assert hitIndices correctly
-  __global__ void verify(pixelTrack::TrackSoAView tracks_view) {
+  __global__ void verify(pixelTrack::TrackSoAConstView tracks_view) {
     int i = threadIdx.x;
 
     if (i == 0) {
@@ -37,10 +39,13 @@ namespace testTrackSoAHeterogeneousT {
     }
   }
 
-  void runKernels(pixelTrack::TrackSoAView tracks_view) {
-    fill<<<1, 1024>>>(tracks_view);
-    cudaDeviceSynchronize();
-    verify<<<1, 1024>>>(tracks_view);
+  void runKernels(pixelTrack::TrackSoAView tracks_view, cudaStream_t stream) {
+    fill<<<1, 1024, 0, stream>>>(tracks_view);
+    cudaCheck(cudaGetLastError());
+    cudaCheck(cudaDeviceSynchronize());
+    verify<<<1, 1024, 0, stream>>>(tracks_view);
+    cudaCheck(cudaGetLastError());
+    cudaCheck(cudaDeviceSynchronize());
   }
 
 }  // namespace testTrackSoAHeterogeneousT

From 45ace5b170a606cc9851822311be355cd7ddb63e Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Mon, 31 Oct 2022 16:31:48 +0100
Subject: [PATCH 072/110] Updating utilities to use references instead of
 instances

---
 .../Track/interface/PixelTrackUtilities.h     | 24 +++++++++----------
 .../Track/test/TrackSoAHeterogeneous_test.cpp |  4 ++--
 .../Track/test/TrackSoAHeterogeneous_test.cu  |  5 ++--
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/CUDADataFormats/Track/interface/PixelTrackUtilities.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h
index 08ed721b20052..e5b44edb3b752 100644
--- a/CUDADataFormats/Track/interface/PixelTrackUtilities.h
+++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h
@@ -55,21 +55,21 @@ namespace pixelTrack {
     using hindex_type = uint32_t;
     // State at the Beam spot
     // phi,tip,1/pt,cotan(theta),zip
-    __host__ __device__ inline float charge(TrackSoAConstView tracks, int32_t i) {
+    __host__ __device__ inline float charge(const TrackSoAConstView &tracks, int32_t i) {
       return std::copysign(1.f, tracks[i].state()(2));
     }
 
-    __host__ __device__ inline float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); }
+    __host__ __device__ inline float phi(const TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(0); }
 
-    __host__ __device__ inline float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); }
+    __host__ __device__ inline float tip(const TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(1); }
 
-    __host__ __device__ inline float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); }
+    __host__ __device__ inline float zip(const TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(4); }
 
-    __host__ __device__ inline bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; }
+    __host__ __device__ inline bool isTriplet(TrackSoAConstView &tracks, int i) { return tracks[i].nLayers() == 3; }
 
     template <typename V3, typename M3, typename V2, typename M2>
     __host__ __device__ inline void copyFromCircle(
-        TrackSoAView tracks, V3 const &cp, M3 const &ccov, V2 const &lp, M2 const &lcov, float b, int32_t i) {
+        TrackSoAView &tracks, V3 const &cp, M3 const &ccov, V2 const &lp, M2 const &lcov, float b, int32_t i) {
       tracks[i].state() << cp.template cast<float>(), lp.template cast<float>();
 
       tracks[i].state()(2) = tracks[i].state()(2) * b;
@@ -89,7 +89,7 @@ namespace pixelTrack {
     }
 
     template <typename V5, typename M5>
-    __host__ __device__ inline void copyFromDense(TrackSoAView tracks, V5 const &v, M5 const &cov, int32_t i) {
+    __host__ __device__ inline void copyFromDense(TrackSoAView &tracks, V5 const &v, M5 const &cov, int32_t i) {
       tracks[i].state() = v.template cast<float>();
       for (int j = 0, ind = 0; j < 5; ++j)
         for (auto k = j; k < 5; ++k)
@@ -97,7 +97,7 @@ namespace pixelTrack {
     }
 
     template <typename V5, typename M5>
-    __host__ __device__ inline void copyToDense(TrackSoAConstView tracks, V5 &v, M5 &cov, int32_t i) {
+    __host__ __device__ inline void copyToDense(TrackSoAConstView &tracks, V5 &v, M5 &cov, int32_t i) {
       v = tracks[i].state().template cast<typename V5::Scalar>();
       for (int j = 0, ind = 0; j < 5; ++j) {
         cov(j, j) = tracks[i].covariance()(ind++);
@@ -107,7 +107,7 @@ namespace pixelTrack {
     }
 
     // TODO: Not using TrackSoAConstView due to weird bugs with HitContainer
-    __host__ __device__ inline int computeNumberOfLayers(TrackSoAView tracks, int32_t i) {
+    __host__ __device__ inline int computeNumberOfLayers(TrackSoAView &tracks, int32_t i) {
       auto pdet = tracks.detIndices().begin(i);
       int nl = 1;
       auto ol = phase1PixelTopology::getLayer(*pdet);
@@ -119,14 +119,14 @@ namespace pixelTrack {
       }
       return nl;
     }
-    __host__ __device__ inline int nHits(TrackSoAConstView tracks, int i) { return tracks.detIndices().size(i); }
+    __host__ __device__ inline int nHits(TrackSoAConstView &tracks, int i) { return tracks.detIndices().size(i); }
 
     // Casts quality SoA data (uint8_t) to pixelTrack::Quality. This is required
     // to use the data as an enum instead of a plain uint8_t
-    __host__ __device__ inline const Quality *qualityData(TrackSoAConstView tracks) {
+    __host__ __device__ inline const Quality *qualityData(const TrackSoAConstView &tracks) {
       return reinterpret_cast<Quality const *>(tracks.quality());
     }
-    __host__ __device__ inline Quality *qualityData(TrackSoAView tracks) {
+    __host__ __device__ inline Quality *qualityData(TrackSoAView &tracks) {
       return reinterpret_cast<Quality *>(tracks.quality());
     }
 
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index be7d5fc7e6c1c..ef04698c3a104 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -69,8 +69,8 @@ int main() {
               << "hitIndices off" << std::endl;
 
     for (int i = 0; i < 10; ++i) {
-      std::cout << tracks_h.view()[i].pt() << "\t" << tracks_h.view()[i].eta() << "\t" << tracks_h.view()[i].chi2() << "\t"
-                << (int)tracks_h.view()[i].quality() << "\t" << (int)tracks_h.view()[i].nLayers() << "\t"
+      std::cout << tracks_h.view()[i].pt() << "\t" << tracks_h.view()[i].eta() << "\t" << tracks_h.view()[i].chi2()
+                << "\t" << (int)tracks_h.view()[i].quality() << "\t" << (int)tracks_h.view()[i].nLayers() << "\t"
                 << tracks_h.view().hitIndices().off[i] << std::endl;
     }
   }
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
index b7602fb790752..8273f011ace80 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu
@@ -21,8 +21,8 @@ namespace testTrackSoAHeterogeneousT {
     }
   }
 
-  // TODO: Using TrackSoAConstView fails to assert hitIndices correctly
-  __global__ void verify(pixelTrack::TrackSoAConstView tracks_view) {
+  // TODO: Use TrackSoAConstView when https://github.com/cms-sw/cmssw/pull/39919 is merged
+  __global__ void verify(pixelTrack::TrackSoAView tracks_view) {
     int i = threadIdx.x;
 
     if (i == 0) {
@@ -43,6 +43,7 @@ namespace testTrackSoAHeterogeneousT {
     fill<<<1, 1024, 0, stream>>>(tracks_view);
     cudaCheck(cudaGetLastError());
     cudaCheck(cudaDeviceSynchronize());
+
     verify<<<1, 1024, 0, stream>>>(tracks_view);
     cudaCheck(cudaGetLastError());
     cudaCheck(cudaDeviceSynchronize());

From eff7567607df8d3f00a6e8f2632c68e934f42524 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Mon, 31 Oct 2022 16:35:34 +0100
Subject: [PATCH 073/110] Cleanup test

---
 .../Track/test/TrackSoAHeterogeneous_test.cpp | 23 +++++--------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index ef04698c3a104..0ad6863d4f8c7 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -34,26 +34,15 @@ int main() {
 
   // Inner scope to deallocate memory before destroying the stream
   {
-    // Instantiate tracks on host. Portabledevicecollection allocates
+    // Instantiate tracks on device. PortableDeviceCollection allocates
     // SoA on device automatically.
-    // pixelTrack::TrackSoADevice tracks(stream);
-    // uint32_t soaNumElements = tracks->metadata().size();  // Length of each SoA array in elements
-    //
-    // // Run the tests
-    // testTrackSoAHeterogeneousT::runKernels(tracks.view());
-    //
-    // // Create a view to access the copied data
-    // auto tracks_h_soa = tracks.copyToHost(stream);
-    // TrackSoAHeterogeneousLayout<> tmp_layout(tracks_h_soa.get(), soaNumElements);
-    // TrackSoAHeterogeneousLayout<>::View tmp_view(tmp_layout);
-
-    // pixelTrack::TrackSoAHost tracks_h(stream);
-    // pixelTrack::TrackSoADevice tracks_d(stream);
-    // testTrackSoAHeterogeneousT::runKernels(tracks_d.view());
-    // tracks_d.copyToHost(tracks_h.buffer(), stream);
+    pixelTrack::TrackSoADevice tracks_d(stream);
+    testTrackSoAHeterogeneousT::runKernels(tracks_d.view(), stream);
 
+    // Instantiate tracks on host. This is where the data will be
+    // copied to from device.
     pixelTrack::TrackSoAHost tracks_h(stream);
-    testTrackSoAHeterogeneousT::runKernels(tracks_h.view(), stream);
+    tracks_d.copyToHost(tracks_h.buffer(), stream);
 
     // Print results
     std::cout << "pt"

From 8aeb647f3cd9760d74ccfed0a55a9d4fd8c43b43 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Mon, 31 Oct 2022 16:40:46 +0100
Subject: [PATCH 074/110] Minor cleanup, comments, namespace usage

---
 CUDADataFormats/Track/interface/PixelTrackUtilities.h     | 6 +++---
 .../Track/interface/TrackSoAHeterogeneousDevice.h         | 5 +----
 .../Track/interface/TrackSoAHeterogeneousHost.h           | 8 +-------
 3 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/CUDADataFormats/Track/interface/PixelTrackUtilities.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h
index e5b44edb3b752..1c7ffe22711e8 100644
--- a/CUDADataFormats/Track/interface/PixelTrackUtilities.h
+++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h
@@ -119,7 +119,7 @@ namespace pixelTrack {
       }
       return nl;
     }
-    __host__ __device__ inline int nHits(TrackSoAConstView &tracks, int i) { return tracks.detIndices().size(i); }
+    __host__ __device__ inline int nHits(TrackSoAView &tracks, int i) { return tracks.detIndices().size(i); }
 
     // Casts quality SoA data (uint8_t) to pixelTrack::Quality. This is required
     // to use the data as an enum instead of a plain uint8_t
@@ -134,11 +134,11 @@ namespace pixelTrack {
 }  // namespace pixelTrack
 
 namespace pixelTrack {
-
+  // Common types for both Host and Device code
   using TrackSoALayout = TrackSoAHeterogeneousLayout<>;
   using TrackSoAView = TrackSoAHeterogeneousLayout<>::View;
   using TrackSoAConstView = TrackSoAHeterogeneousLayout<>::ConstView;
 
 }  // namespace pixelTrack
 
-#endif  // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
+#endif
diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
index cbafb46c9e099..a77643de29001 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
@@ -6,10 +6,7 @@
 #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
 #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-//#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-//#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h"
-//#include "HeterogeneousCore/CUDAUtilities/interface/allocate_host.h"
 
 template <int32_t S>
 class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousLayout<>> {
@@ -28,7 +25,7 @@ class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection<T
 
 namespace pixelTrack {
 
-  using TrackSoADevice = TrackSoAHeterogeneousDevice<maxNumber()>;
+  using TrackSoADevice = TrackSoAHeterogeneousDevice<pixelTrack::maxNumber()>;
 
 }  // namespace pixelTrack
 
diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h
index 276ddabcc39d4..a4b18134066a3 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h
@@ -5,10 +5,7 @@
 
 #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
 #include "CUDADataFormats/Common/interface/PortableHostCollection.h"
-//#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-//#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h"
-//#include "HeterogeneousCore/CUDAUtilities/interface/allocate_host.h"
 
 template <int32_t S>
 class TrackSoAHeterogeneousHost : public cms::cuda::PortableHostCollection<TrackSoAHeterogeneousLayout<>> {
@@ -18,14 +15,11 @@ class TrackSoAHeterogeneousHost : public cms::cuda::PortableHostCollection<Track
   // Constructor which specifies the SoA size
   explicit TrackSoAHeterogeneousHost(cudaStream_t stream)
       : PortableHostCollection<TrackSoAHeterogeneousLayout<>>(S, stream) {}
-
-
-
 };
 
 namespace pixelTrack {
 
-  using TrackSoAHost = TrackSoAHeterogeneousHost<maxNumber()>;
+  using TrackSoAHost = TrackSoAHeterogeneousHost<pixelTrack::maxNumber()>;
 
 }  // namespace pixelTrack
 

From 672aab19a7333dceba0bcf0f5e33305b575b91be Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Mon, 31 Oct 2022 17:06:45 +0100
Subject: [PATCH 075/110] Updating products in modules

---
 .../plugins/PixelTrackProducerFromSoA.cc      |  7 +--
 .../plugins/PixelTrackSoAFromCUDA.cc          | 53 +++++++------------
 .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 12 ++---
 .../plugins/CAHitNtupletGeneratorOnGPU.cc     | 40 ++++++--------
 .../plugins/CAHitNtupletGeneratorOnGPU.h      | 11 ++--
 5 files changed, 52 insertions(+), 71 deletions(-)

diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
index 9e4839ec8b644..5ffa051c27cfc 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
@@ -27,8 +27,9 @@
 #include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h"
 
 #include "CUDADataFormats/Common/interface/HostProduct.h"
-//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
+#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
 #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
 
 #include "storeTracks.h"
@@ -56,7 +57,7 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> {
   // Event Data tokens
   const edm::EDGetTokenT<reco::BeamSpot> tBeamSpot_;
   //const edm::EDGetTokenT<PixelTrackHeterogeneous> tokenTrack_;
-  const edm::EDGetTokenT<pixelTrack::TrackSoAView> tokenTrack_;
+  const edm::EDGetTokenT<pixelTrack::TrackSoAHost> tokenTrack_;
   const edm::EDGetTokenT<SiPixelRecHitCollectionNew> cpuHits_;
   const edm::EDGetTokenT<HMSstorage> hmsToken_;
   // Event Setup tokens
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
index e31f195578f35..594081963bb90 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
@@ -2,8 +2,9 @@
 
 #include "CUDADataFormats/Common/interface/Product.h"
 #include "CUDADataFormats/Common/interface/HostProduct.h"
-//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
+#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
 #include "DataFormats/Common/interface/Handle.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
@@ -33,16 +34,15 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork>
                edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
   void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
 
-  edm::EDGetTokenT<pixelTrack::TrackSoAView> tokenCUDA_;
-  edm::EDPutTokenT<pixelTrack::TrackSoAView> tokenSOA_;
+  edm::EDGetTokenT<cms::cuda::Product<pixelTrack::TrackSoADevice>> tokenCUDA_;
+  edm::EDPutTokenT<pixelTrack::TrackSoAHost> tokenSOA_;
 
-  pixelTrack::TrackSoAView soa_view_h;
-  //pixelTrack::TrackSoALayout soa_layout_h;
+  pixelTrack::TrackSoAHost tracks_h;
 };
 
 PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig)
-    : tokenCUDA_(consumes<pixelTrack::TrackSoAView>(iConfig.getParameter<edm::InputTag>("src"))),
-      tokenSOA_(produces<pixelTrack::TrackSoAView>()) {}
+    : tokenCUDA_(consumes<cms::cuda::Product<pixelTrack::TrackSoADevice>>(iConfig.getParameter<edm::InputTag>("src"))),
+      tokenSOA_(produces<pixelTrack::TrackSoAHost>()) {}
 
 void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
   edm::ParameterSetDescription desc;
@@ -54,33 +54,18 @@ void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& des
 void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
                                     edm::EventSetup const& iSetup,
                                     edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  soa_view_h = iEvent.get(tokenCUDA_);
-  //cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
-  //auto const& soa_view_h = ctx.get(inputDataWrapped);  // Layout of data on device
-
-  /*auto soa_buffer_h = cms::cuda::make_host_unique<std::byte[]>(soa_layout_d.metadata().byteSize(), ctx.stream());
-
-  cudaCheck(cudaMemcpyAsync(soa_buffer_h.get(),
-                            soa_layout_d.metadata().data(),
-                            soa_layout_d.metadata().byteSize(),
-                            cudaMemcpyDeviceToHost,
-                            ctx.stream()));
-  pixelTrack::TrackSoALayout soa_layout_h(soa_buffer_h.get(), soa_layout_d.metadata().size());
-  pixelTrack::TrackSoAView soa_view_h(soa_layout_h);*/
-
-  // // Allocate enough host memory to fit the SoA data in the input view
-  // auto soa_buffer_host = cms::cuda::make_host_unique<std::byte[]>(soa_.layout()., ctx.stream());
-
-  // // Copy data from the view on device to host memory
-  // cudaCheck(cudaMemcpy(soa_buffer_host.get(), soa_.buffer().get(), soa_.metadata().byteSize(), cudaMemcpyDeviceToHost));
-  // TrackSoAHeterogeneousT_test<> soa_layout(soa_buffer_host.get(), soa_.metadata().size());
-  // TrackSoAHeterogeneousT_test<>::View soa_host_view_(soa_layout);  // Store the host-side view
+  cms::cuda::Product<pixelTrack::TrackSoADevice> const& inputDataWrapped = iEvent.get(tokenCUDA_);
+  cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
+  auto const& tracks_d = ctx.get(inputDataWrapped);  // Tracks on device, borrowed from the event (no copy)
+  // Assign the member (not a shadowing local) so the host copy outlives acquire() and is what produce() reads
+  tracks_h = pixelTrack::TrackSoAHost(ctx.stream());
+  tracks_d.copyToHost(tracks_h.buffer(), ctx.stream());
 }
 
 void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
   // check that the fixed-size SoA does not overflow
-  auto maxTracks = soa_view_h.metadata().size();
-  auto nTracks = soa_view_h.nTracks();
+  auto maxTracks = tracks_h.view().metadata().size();
+  auto nTracks = tracks_h.view().nTracks();
   assert(nTracks < maxTracks);
   if (nTracks == maxTracks - 1) {
     edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1
@@ -93,8 +78,8 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i
 
   int32_t nt = 0;
   for (int32_t it = 0; it < maxTracks; ++it) {
-    auto nHits = pixelTrack::utilities::nHits(soa_view_h, it);
-    assert(nHits == int(soa_view_h.hitIndices().size(it)));
+    auto nHits = pixelTrack::utilities::nHits(tracks_h.view(), it);
+    assert(nHits == int(tracks_h.view().hitIndices().size(it)));
     if (nHits == 0)
       break;  // this is a guard: maybe we need to move to nTracks...
     nt++;
@@ -103,7 +88,7 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i
 #endif
 
   // DO NOT  make a copy  (actually TWO....)
-  iEvent.emplace(tokenSOA_, std::move(soa_view_h));  //, std::move(ret)); // view
+  iEvent.emplace(tokenSOA_, std::move(tracks_h));  //, std::move(ret)); // view
 
   //assert(!soa_);
 }
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
index d8a634328af7a..2e48865b682bf 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
@@ -20,8 +20,8 @@
 #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h"
 
 #include "CAHitNtupletGeneratorOnGPU.h"
-//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
 
 class CAHitNtupletCUDA : public edm::global::EDProducer<> {
@@ -43,12 +43,12 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> {
   // GPU
   // Produces a view on GPU, which is used by PixelTrackSoAFromCUDA
   edm::EDGetTokenT<cms::cuda::Product<TrackingRecHit2DGPU>> tokenHitGPU_;
-  edm::EDPutTokenT<cms::cuda::Product<pixelTrack::TrackSoAView>> tokenTrackGPU_;
+  edm::EDPutTokenT<cms::cuda::Product<pixelTrack::TrackSoADevice>> tokenTrackGPU_;
 
   // CPU
   // Produces a view on CPU, which is used by PixelTrackProducerFromSoA
   edm::EDGetTokenT<TrackingRecHit2DCPU> tokenHitCPU_;
-  edm::EDPutTokenT<pixelTrack::TrackSoAView> tokenTrackCPU_;
+  edm::EDPutTokenT<pixelTrack::TrackSoAHost> tokenTrackCPU_;
 
   CAHitNtupletGeneratorOnGPU gpuAlgo_;
 };
@@ -58,10 +58,10 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig)
   if (onGPU_) {
     tokenHitGPU_ =
         consumes<cms::cuda::Product<TrackingRecHit2DGPU>>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"));
-    tokenTrackGPU_ = produces<cms::cuda::Product<pixelTrack::TrackSoAView>>();
+    tokenTrackGPU_ = produces<cms::cuda::Product<pixelTrack::TrackSoADevice>>();
   } else {
     tokenHitCPU_ = consumes<TrackingRecHit2DCPU>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"));
-    tokenTrackCPU_ = produces<pixelTrack::TrackSoAView>();
+    tokenTrackCPU_ = produces<pixelTrack::TrackSoAHost>();
   }
 }
 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
index f4ab7d3e83504..180711886c8d1 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -19,8 +19,8 @@
 #include "FWCore/Utilities/interface/isFinite.h"
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
 #include "TrackingTools/DetLayers/interface/BarrelDetLayer.h"
-
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
 
 #include "CAHitNtupletGeneratorOnGPU.h"
 
@@ -190,10 +190,10 @@ void CAHitNtupletGeneratorOnGPU::endJob() {
                                                                     float bfield,
                                                                     cudaStream_t stream) const {
   PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique<pixelTrack::TrackSoA>(stream));*/
-pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
-                                                                     float bfield,
-                                                                     cudaStream_t stream) const {
-  pixelTrack::TrackSoA tracks(stream);
+pixelTrack::TrackSoADevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
+                                                                       float bfield,
+                                                                       cudaStream_t stream) const {
+  pixelTrack::TrackSoADevice tracks(stream);
   auto* soa = &tracks;
 
   CAHitNtupletGeneratorKernelsGPU kernels(m_params);
@@ -218,32 +218,26 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRec
   std::cout << "finished building pixel tracks on GPU" << std::endl;
 #endif
 
-  return tracks.view();
+  return tracks;
 }
 
-pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const {
-  //PixelTrackHeterogeneous tracks(std::make_unique<pixelTrack::TrackSoA>());
-  // pixelTrack::TrackSoA tracks;
-
-  auto tracks_h_soa =
-      std::make_unique<std::byte[]>(TrackSoAHeterogeneousLayout<>::computeDataSize(pixelTrack::maxNumber()));
-  TrackSoAHeterogeneousLayout<> tracks_layout(tracks_h_soa.get(), pixelTrack::maxNumber());
-  TrackSoAHeterogeneousLayout<>::View tracks_view(tracks_layout);
-  //assert(soa);
+pixelTrack::TrackSoAHost CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const {
+  // CPU path has no CUDA stream: pass nullptr as for the kernels below (TODO confirm host allocation accepts it)
+  pixelTrack::TrackSoAHost tracks(nullptr);
 
   CAHitNtupletGeneratorKernelsCPU kernels(m_params);
   kernels.setCounters(m_counters);
   kernels.allocateOnGPU(hits_d.nHits(), nullptr);
 
   kernels.buildDoublets(hits_d, nullptr);
-  kernels.launchKernels(hits_d, tracks_view, nullptr);
+  kernels.launchKernels(hits_d, tracks.view(), nullptr);
 
   if (0 == hits_d.nHits())
-    return tracks_view;
+    return tracks;
 
   // now fit
   HelixFitOnGPU fitter(bfield, m_params.fitNas4_);
-  fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks_view);
+  fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks.view());
 
   if (m_params.useRiemannFit_) {
     fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets);
@@ -251,7 +245,7 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2D
     fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets);
   }
 
-  kernels.classifyTuples(hits_d, tracks_view, nullptr);
+  kernels.classifyTuples(hits_d, tracks.view(), nullptr);
 
 #ifdef GPU_DEBUG
   std::cout << "finished building pixel tracks on CPU" << std::endl;
@@ -259,13 +253,13 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2D
 
   // check that the fixed-size SoA does not overflow
 
-  auto maxTracks = tracks_view.metadata().size();
-  auto nTracks = tracks_view.nTracks();
+  auto maxTracks = tracks.view().metadata().size();
+  auto nTracks = tracks.view().nTracks();
   assert(nTracks < maxTracks);
   if (nTracks == maxTracks - 1) {
     edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1
                                    << " candidates";
   }
 
-  return tracks_view;
+  return tracks;
 }
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
index 6b9a00ef9757f..323ad0d071f0c 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
@@ -3,8 +3,8 @@
 
 #include <cuda_runtime.h>
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
-//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
 
 #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
@@ -29,7 +29,8 @@ class CAHitNtupletGeneratorOnGPU {
   using hindex_type = TrackingRecHit2DSOAView::hindex_type;
 
   using Quality = pixelTrack::Quality;
-  using OutputSoAView = pixelTrack::TrackSoAView;
+  using OutputSoAHost = pixelTrack::TrackSoAHost;
+  using OutputSoADevice = pixelTrack::TrackSoADevice;
   using HitContainer = pixelTrack::HitContainer;
   using Tuple = HitContainer;
 
@@ -49,12 +50,12 @@ class CAHitNtupletGeneratorOnGPU {
   void endJob();
 
   // On GPU
-  pixelTrack::TrackSoAView makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
+  pixelTrack::TrackSoADevice makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
                                              float bfield,
                                              cudaStream_t stream) const;
 
   // On CPU
-  pixelTrack::TrackSoAView makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const;
+  pixelTrack::TrackSoAHost makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const;
 
 private:
   void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const;

From be09c9aa34ed9a18e47cb482032185e40318a8bb Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Mon, 31 Oct 2022 17:17:30 +0100
Subject: [PATCH 076/110] GPUCACell, PixelTrackDumpCUDA fix include

---
 .../Track/interface/PixelTrackHeterogeneous.h |  6 ------
 .../plugins/PixelTrackDumpCUDA.cc             | 20 +++++++++----------
 .../PixelTriplets/plugins/GPUCACell.h         |  4 ++--
 3 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
index c0e5c99b6fd28..73ec80e6322a2 100644
--- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
+++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
@@ -1,10 +1,4 @@
 #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
 #define CUDADataFormats_Track_PixelTrackHeterogeneous_h
 
-#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
-//#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
-
-using PixelTrackHeterogeneous = HeterogeneousSoA<pixelTrack::TrackSoA>;
-
 #endif  // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
index 59489c8e11f5f..f97dfecfff370 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
@@ -1,8 +1,8 @@
 #include <cuda_runtime.h>
 
 #include "CUDADataFormats/Common/interface/Product.h"
-//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
 #include "DataFormats/Common/interface/Handle.h"
@@ -31,11 +31,12 @@ class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> {
 private:
   void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override;
   const bool m_onGPU;
-  //edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenGPUTrack_;
-  edm::EDGetTokenT<cms::cuda::Product<pixelTrack::TrackSoA>> tokenGPUTrack_;
+  // GPU
+  edm::EDGetTokenT<cms::cuda::Product<pixelTrack::TrackSoADevice>> tokenGPUTrack_;
   edm::EDGetTokenT<cms::cuda::Product<ZVertexHeterogeneous>> tokenGPUVertex_;
-  //edm::EDGetTokenT<PixelTrackHeterogeneous> tokenSoATrack_;
-  edm::EDGetTokenT<pixelTrack::TrackSoA> tokenSoATrack_;
+
+  // CPU
+  edm::EDGetTokenT<pixelTrack::TrackSoAHost> tokenSoATrack_;
   edm::EDGetTokenT<ZVertexHeterogeneous> tokenSoAVertex_;
 };
 
@@ -43,13 +44,11 @@ PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig)
     : m_onGPU(iConfig.getParameter<bool>("onGPU")) {
   if (m_onGPU) {
     tokenGPUTrack_ =
-        //consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
-        consumes<cms::cuda::Product<pixelTrack::TrackSoA>>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
+        consumes<cms::cuda::Product<pixelTrack::TrackSoADevice>>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
     tokenGPUVertex_ =
         consumes<cms::cuda::Product<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
   } else {
-    //tokenSoATrack_ = consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
-    tokenSoATrack_ = consumes<pixelTrack::TrackSoA>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
+    tokenSoATrack_ = consumes<pixelTrack::TrackSoAHost>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
     tokenSoAVertex_ = consumes<ZVertexHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
   }
 }
@@ -71,7 +70,6 @@ void PixelTrackDumpCUDA::analyze(edm::StreamID streamID,
     cms::cuda::ScopedContextProduce ctx{hTracks};
 
     auto const& tracks = ctx.get(hTracks);
-    //auto const* tsoa = tracks.get();
     auto const* tsoa = &tracks;
     assert(tsoa);
 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
index a0c3930d1a739..0e1c322c051f8 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
@@ -14,8 +14,8 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
 #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h"
-//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
 #include "CAConstants.h"
 
 class GPUCACell {

From a985dc6f28084712bf9bf60e95d00e073ac9ac90 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Mon, 31 Oct 2022 17:18:05 +0100
Subject: [PATCH 077/110] Removed unused include

---
 RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
index 0e1c322c051f8..b448e16a35e4b 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
@@ -15,7 +15,7 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
 #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h"
 #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
+
 #include "CAConstants.h"
 
 class GPUCACell {

From 5c4e8c953af09c51b587a5f84c732d30d2bcf7e6 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Mon, 31 Oct 2022 17:22:25 +0100
Subject: [PATCH 078/110] Updated CAHitNtuplet.. inclusions

---
 .../PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h    | 6 ++----
 .../plugins/CAHitNtupletGeneratorKernelsImpl.h              | 3 +--
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
index d7901041902d3..529f7de99ea98 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
@@ -3,8 +3,8 @@
 
 // #define GPU_DEBUG
 
-//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
+
 #include "GPUCACell.h"
 
 // #define DUMP_GPU_TK_TUPLES
@@ -35,7 +35,6 @@ namespace cAHitNtupletGenerator {
   using TupleMultiplicity = caConstants::TupleMultiplicity;
 
   using Quality = pixelTrack::Quality;
-  using TkSoA = pixelTrack::TrackSoA;
   using TkSoAView = pixelTrack::TrackSoAView;
   using TkSoAConstView = pixelTrack::TrackSoAConstView;
   using HitContainer = pixelTrack::HitContainer;
@@ -176,7 +175,6 @@ class CAHitNtupletGeneratorKernels {
   using TupleMultiplicity = caConstants::TupleMultiplicity;
 
   using Quality = pixelTrack::Quality;
-  using TkSoA = pixelTrack::TrackSoA;
   using TkSoAView = pixelTrack::TrackSoAView;
   using TkSoAConstView = pixelTrack::TrackSoAConstView;
   using HitContainer = pixelTrack::HitContainer;
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index a3a8eb97a43d7..afe4aaa11f70b 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -15,7 +15,7 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
 #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
-#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h"
+#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
 
 #include "CAConstants.h"
 #include "CAHitNtupletGeneratorKernels.h"
@@ -30,7 +30,6 @@ using HitToTuple = caConstants::HitToTuple;
 using TupleMultiplicity = caConstants::TupleMultiplicity;
 
 using Quality = pixelTrack::Quality;
-using TkSoA = pixelTrack::TrackSoA;
 using TkSoAView = pixelTrack::TrackSoAView;
 using TkSoAConstView = pixelTrack::TrackSoAConstView;
 using HitContainer = pixelTrack::HitContainer;

From 9c1885a7ab00408ff47ad4d4ac5a644de70fb253 Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Mon, 31 Oct 2022 17:41:33 +0100
Subject: [PATCH 079/110] Updating RecoPixelVertexing/PixelVertexFinding to
 Host/Device

---
 .../plugins/PixelVertexProducerCUDA.cc        | 27 ++++++++++---------
 .../plugins/gpuVertexFinder.cc                |  9 ++++---
 .../plugins/gpuVertexFinder.h                 |  9 +++----
 3 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
index 16b3267a326ce..7d8ea3485c447 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
@@ -16,7 +16,8 @@
 #include "FWCore/Utilities/interface/EDGetToken.h"
 #include "FWCore/Utilities/interface/RunningAverage.h"
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
 
 #include "gpuVertexFinder.h"
 
@@ -36,9 +37,9 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> {
 
   bool onGPU_;
 
-  edm::EDGetTokenT<cms::cuda::Product<pixelTrack::TrackSoAView>> tokenGPUTrack_;
+  edm::EDGetTokenT<cms::cuda::Product<pixelTrack::TrackSoADevice>> tokenGPUTrack_;
   edm::EDPutTokenT<ZVertexCUDAProduct> tokenGPUVertex_;
-  edm::EDGetTokenT<pixelTrack::TrackSoAView> tokenCPUTrack_;
+  edm::EDGetTokenT<pixelTrack::TrackSoAHost> tokenCPUTrack_;
   edm::EDPutTokenT<ZVertexHeterogeneous> tokenCPUVertex_;
 
   const gpuVertexFinder::Producer gpuAlgo_;
@@ -63,10 +64,10 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf)
 {
   if (onGPU_) {
     tokenGPUTrack_ =
-        consumes<cms::cuda::Product<pixelTrack::TrackSoAView>>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
+        consumes<cms::cuda::Product<pixelTrack::TrackSoADevice>>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
     tokenGPUVertex_ = produces<ZVertexCUDAProduct>();
   } else {
-    tokenCPUTrack_ = consumes<pixelTrack::TrackSoAView>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
+    tokenCPUTrack_ = consumes<pixelTrack::TrackSoAHost>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
     tokenCPUVertex_ = produces<ZVertexHeterogeneous>();
   }
 }
@@ -98,29 +99,29 @@ void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& d
 void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID,
                                            edm::Event& iEvent,
                                            const edm::EventSetup& iSetup) const {
-  edm::Handle<cms::cuda::Product<pixelTrack::TrackSoAView>> hTracks;
+  edm::Handle<cms::cuda::Product<pixelTrack::TrackSoADevice>> hTracks;
   iEvent.getByToken(tokenGPUTrack_, hTracks);
 
   cms::cuda::ScopedContextProduce ctx{*hTracks};
-  auto tracks_view = ctx.get(*hTracks);
+  auto &tracks = ctx.get(*hTracks);
 
-  ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks_view, ptMin_, ptMax_));
+  ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks.view(), ptMin_, ptMax_));
 }
 
 void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID,
                                            edm::Event& iEvent,
                                            const edm::EventSetup& iSetup) const {
-  auto tracks_view = iEvent.get(tokenCPUTrack_);
+  auto & tracks = iEvent.get(tokenCPUTrack_);
 
 #ifdef PIXVERTEX_DEBUG_PRODUCE
 
-  auto maxTracks = tracks_view.metadata().size();
+  auto maxTracks = tracks.view().metadata().size();
   // std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl;
 
   int32_t nt = 0;
   for (int32_t it = 0; it < maxTracks; ++it) {
-    auto nHits = pixelTrack::utilities::nHits(tracks_view, it);
-    assert(nHits == int(tracks_view.hitIndices().size(it)));
+    auto nHits = pixelTrack::utilities::nHits(tracks.view(), it);
+    assert(nHits == int(tracks.view().hitIndices().size(it)));
     if (nHits == 0)
       break;  // this is a guard: maybe we need to move to nTracks...
     nt++;
@@ -128,7 +129,7 @@ void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID,
   // std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
 
-  iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks_view, ptMin_, ptMax_));
+  iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks.view(), ptMin_, ptMax_));
 }
 
 void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
index a476c95a5e78a..66de3fe8c99f7 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
@@ -7,7 +7,7 @@
 #include "gpuSortByPt2.h"
 #include "gpuSplitVertices.h"
 
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
 
 #undef PIXVERTEX_DEBUG_PRODUCE
 
@@ -19,9 +19,10 @@ namespace gpuVertexFinder {
 
   // split vertices with a chi2/NDoF greater than this
   constexpr float maxChi2ForSplit = 9.f;
+  //using TkSoAView = pixelTrack::TrackSoAView;
   using TkSoAConstView = pixelTrack::TrackSoAConstView;
 
-  __global__ void loadTracks(TkSoAView tracks_view, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) {
+  __global__ void loadTracks(TkSoAConstView tracks_view, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) {
     assert(soa);
     auto const* quality = pixelTrack::utilities::qualityData(tracks_view);
 
@@ -95,13 +96,13 @@ namespace gpuVertexFinder {
 #endif
 
 #ifdef __CUDACC__
-  ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoAView tracks_view, float ptMin, float ptMax) const {
+  ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoAConstView tracks_view, float ptMin, float ptMax) const {
 #ifdef PIXVERTEX_DEBUG_PRODUCE
     std::cout << "producing Vertices on GPU" << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
     ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream));
 #else
-  ZVertexHeterogeneous Producer::make(TkSoAView tracks_view, float ptMin, float ptMax) const {
+  ZVertexHeterogeneous Producer::make(TkSoAConstView tracks_view, float ptMin, float ptMax) const {
 #ifdef PIXVERTEX_DEBUG_PRODUCE
     std::cout << "producing Vertices on  CPU" << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
index a890c53b20cb8..98bb9d75530d4 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
@@ -6,13 +6,12 @@
 
 #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
 //#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
 
 namespace gpuVertexFinder {
 
   using ZVertices = ZVertexSoA;
-  using TkSoA = pixelTrack::TrackSoA;
-  using TkSoAView = pixelTrack::TrackSoAView;
+  using TkSoAConstView = pixelTrack::TrackSoAConstView;
 
   // workspace used in the vertex reco algos
   struct WorkSpace {
@@ -65,8 +64,8 @@ namespace gpuVertexFinder {
 
     ~Producer() = default;
 
-    ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoAView tracks_view, float ptMin, float ptMax) const;
-    ZVertexHeterogeneous make(TkSoAView tracks_view, float ptMin, float ptMax) const;
+    ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoAConstView tracks_view, float ptMin, float ptMax) const;
+    ZVertexHeterogeneous make(TkSoAConstView tracks_view, float ptMin, float ptMax) const;
 
   private:
     const bool oneKernel_;

From 5b98c63b0ded588e3eb4bef013a2a3ff01bb8cf2 Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Mon, 31 Oct 2022 17:49:49 +0100
Subject: [PATCH 080/110] Changing nHits to ConstView

---
 CUDADataFormats/Track/interface/PixelTrackUtilities.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CUDADataFormats/Track/interface/PixelTrackUtilities.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h
index 1c7ffe22711e8..e202dced07307 100644
--- a/CUDADataFormats/Track/interface/PixelTrackUtilities.h
+++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h
@@ -119,7 +119,7 @@ namespace pixelTrack {
       }
       return nl;
     }
-    __host__ __device__ inline int nHits(TrackSoAView &tracks, int i) { return tracks.detIndices().size(i); }
+    __host__ __device__ inline int nHits(TrackSoAConstView &tracks, int i) { return tracks.detIndices().size(i); }
 
     // Casts quality SoA data (uint8_t) to pixelTrack::Quality. This is required
     // to use the data as an enum instead of a plain uint8_t

From f6c5e8ae554696c2afd302172b120aa789a10b0b Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Mon, 31 Oct 2022 17:50:59 +0100
Subject: [PATCH 081/110] Fixes for HelixFit, classes_def

---
 .../Track/interface/TrackSoAHeterogeneousDevice.h  |  2 +-
 CUDADataFormats/Track/src/classes_def.xml          | 14 ++++++--------
 .../plugins/CAHitNtupletGeneratorOnGPU.cc          |  3 +--
 .../PixelTriplets/plugins/HelixFitOnGPU.h          |  3 +--
 RecoPixelVertexing/PixelTriplets/src/classes.h     |  1 -
 5 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
index a77643de29001..611f98d7d9dae 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
@@ -11,7 +11,7 @@
 template <int32_t S>
 class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection<TrackSoAHeterogeneousLayout<>> {
 public:
-  //TrackSoAHeterogeneousDevice() = default;
+  TrackSoAHeterogeneousDevice() = default;  // cms::cuda::Product needs this
 
   // Constructor which specifies the SoA size
   explicit TrackSoAHeterogeneousDevice(cudaStream_t stream)
diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml
index 9f320a3833ff0..c4337b0b7ee06 100644
--- a/CUDADataFormats/Track/src/classes_def.xml
+++ b/CUDADataFormats/Track/src/classes_def.xml
@@ -1,10 +1,8 @@
 <lcgdict>
-  <class name="pixelTrack::TrackSoAView" persistent="false"/>
-  <class name="edm::Wrapper<pixelTrack::TrackSoAView>" persistent="false"/>
-  <class name="cms::cuda::Product<pixelTrack::TrackSoAView>" persistent="false"/>
-  <class name="edm::Wrapper<cms::cuda::Product<pixelTrack::TrackSoAView>>" persistent="false"/>
-  <class name="pixelTrack::TrackSoALayout" persistent="false"/>
-  <class name="edm::Wrapper<pixelTrack::TrackSoALayout>" persistent="false"/>
-  <class name="cms::cuda::Product<pixelTrack::TrackSoALayout>" persistent="false"/>
-  <class name="edm::Wrapper<cms::cuda::Product<pixelTrack::TrackSoALayout>>" persistent="false"/>  
+  <class name="pixelTrack::TrackSoAHost" persistent="false"/>
+  <class name="edm::Wrapper<pixelTrack::TrackSoAHost>" persistent="false"/>
+  <class name="pixelTrack::TrackSoADevice" persistent="false"/>
+  <class name="edm::Wrapper<pixelTrack::TrackSoADevice>" persistent="false"/>
+  <class name="cms::cuda::Product<pixelTrack::TrackSoADevice>" persistent="false"/>
+  <class name="edm::Wrapper<cms::cuda::Product<pixelTrack::TrackSoADevice>>" persistent="false"/>
 </lcgdict>
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
index 180711886c8d1..37454b9065f0a 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -222,8 +222,7 @@ pixelTrack::TrackSoADevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingR
 }
 
 pixelTrack::TrackSoAHost CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const {
-  pixelTrack::TrackSoADevice tracks(stream);
-  auto* soa = &tracks;
+  pixelTrack::TrackSoAHost tracks;
 
   CAHitNtupletGeneratorKernelsCPU kernels(m_params);
   kernels.setCounters(m_counters);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
index 1a1283b9079c9..9fd2112476c9d 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
@@ -1,8 +1,7 @@
 #ifndef RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h
 #define RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h
 
-//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
+#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
 #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h"
 
diff --git a/RecoPixelVertexing/PixelTriplets/src/classes.h b/RecoPixelVertexing/PixelTriplets/src/classes.h
index db84e140b26de..4f495027ac186 100644
--- a/RecoPixelVertexing/PixelTriplets/src/classes.h
+++ b/RecoPixelVertexing/PixelTriplets/src/classes.h
@@ -1,6 +1,5 @@
 #include "RecoPixelVertexing/PixelTriplets/interface/IntermediateHitTriplets.h"
 #include "DataFormats/Common/interface/Wrapper.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h"
 
 #include <vector>
 

From 800e2a6daa26e6b9732481ab97468b420d94e1cd Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Mon, 31 Oct 2022 18:14:10 +0100
Subject: [PATCH 082/110] Fixing types, still not compiling

---
 .../interface/TrackSoAHeterogeneousDevice.h   |  2 +-
 .../plugins/PixelTrackProducerFromSoA.cc      | 30 ++++++++-----------
 .../plugins/PixelTrackSoAFromCUDA.cc          |  2 +-
 3 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
index 611f98d7d9dae..aaf4035d460e5 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
@@ -18,7 +18,7 @@ class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection<T
       : PortableDeviceCollection<TrackSoAHeterogeneousLayout<>>(S, stream) {}
 
   // Copy data from device to host
-  __host__ void copyToHost(cms::cuda::host::unique_ptr<std::byte[]> &host_ptr, cudaStream_t stream) {
+  __host__ void copyToHost(cms::cuda::host::unique_ptr<std::byte[]> &host_ptr, cudaStream_t stream) const {
     cudaCheck(cudaMemcpy(host_ptr.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost));
   }
 };
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
index 5ffa051c27cfc..6a38ba45e96d9 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
@@ -56,7 +56,6 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> {
 
   // Event Data tokens
   const edm::EDGetTokenT<reco::BeamSpot> tBeamSpot_;
-  //const edm::EDGetTokenT<PixelTrackHeterogeneous> tokenTrack_;
   const edm::EDGetTokenT<pixelTrack::TrackSoAHost> tokenTrack_;
   const edm::EDGetTokenT<SiPixelRecHitCollectionNew> cpuHits_;
   const edm::EDGetTokenT<HMSstorage> hmsToken_;
@@ -70,8 +69,7 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> {
 
 PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig)
     : tBeamSpot_(consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpot"))),
-      //tokenTrack_(consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("trackSrc"))),
-      tokenTrack_(consumes<pixelTrack::TrackSoAView>(iConfig.getParameter<edm::InputTag>("trackSrc"))),
+      tokenTrack_(consumes<pixelTrack::TrackSoAHost>(iConfig.getParameter<edm::InputTag>("trackSrc"))),
       cpuHits_(consumes<SiPixelRecHitCollectionNew>(iConfig.getParameter<edm::InputTag>("pixelRecHitLegacySrc"))),
       hmsToken_(consumes<HMSstorage>(iConfig.getParameter<edm::InputTag>("pixelRecHitLegacySrc"))),
       idealMagneticFieldToken_(esConsumes()),
@@ -156,16 +154,15 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
   std::vector<const TrackingRecHit *> hits;
   hits.reserve(5);
 
-  //const auto &tsoa = *iEvent.get(tokenTrack_);
-  auto tsoa = iEvent.get(tokenTrack_);
+  const auto &tsoa = *iEvent.get(tokenTrack_);
 
   //auto const *quality = pixelTrack::utilities::qualityData(tsoa.view());
   // auto const &fit = tsoa.stateAtBS;
   //auto const &hitIndices = tsoa.view().hitIndices();
   //auto nTracks = tsoa.view().nTracks();
-  auto const *quality = pixelTrack::utilities::qualityData(tsoa);
-  auto const hitIndices = tsoa.hitIndices();
-  auto nTracks = tsoa.nTracks();
+  auto const *quality = pixelTrack::utilities::qualityData(tsoa.view());
+  auto const hitIndices = tsoa.view().hitIndices();
+  auto nTracks = tsoa.view().nTracks();
 
   tracks.reserve(nTracks);
 
@@ -175,15 +172,14 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
   std::vector<int32_t> sortIdxs(nTracks);
   std::iota(sortIdxs.begin(), sortIdxs.end(), 0);
   std::sort(sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) {
-    //return tsoa.view()[i1].pt() > tsoa.view()[i2].pt();
-    return tsoa[i1].pt() > tsoa[i2].pt();
+    return tsoa.view()[i1].pt() > tsoa.view()[i2].pt();
   });
 
   //store the index of the SoA: indToEdm[index_SoAtrack] -> index_edmTrack (if it exists)
   indToEdm.resize(sortIdxs.size(), -1);
   for (const auto &it : sortIdxs) {
-    //auto nHits = pixelTrack::utilities::nHits(tsoa.view(), it);
-    auto nHits = pixelTrack::utilities::nHits(tsoa, it);
+    auto nHits = pixelTrack::utilities::nHits(tsoa.view(), it);
+
     assert(nHits >= 3);
     auto q = quality[it];
     if (q < minQuality_)
@@ -200,15 +196,13 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
 
     // mind: this values are respect the beamspot!
 
-    //float chi2 = tsoa.view()[it].chi2();
-    //float phi = pixelTrack::utilities::phi(tsoa.view(), it);
-    float chi2 = tsoa[it].chi2();
-    float phi = pixelTrack::utilities::phi(tsoa, it);
+    float chi2 = tsoa.view()[it].chi2();
+    float phi = pixelTrack::utilities::phi(tsoa.view(), it);
 
     riemannFit::Vector5d ipar, opar;
     riemannFit::Matrix5d icov, ocov;
-    //pixelTrack::utilities::copyToDense<riemannFit::Vector5d, riemannFit::Matrix5d>(tsoa.view(), ipar, icov, it);
-    pixelTrack::utilities::copyToDense<riemannFit::Vector5d, riemannFit::Matrix5d>(tsoa, ipar, icov, it);
+    pixelTrack::utilities::copyToDense<riemannFit::Vector5d, riemannFit::Matrix5d>(tsoa.view(), ipar, icov, it);
+
     riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov);
 
     LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
index 594081963bb90..d06b988d4a5f5 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
@@ -56,7 +56,7 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
                                     edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
   cms::cuda::Product<pixelTrack::TrackSoADevice> const& inputDataWrapped = iEvent.get(tokenCUDA_);
   cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
-  auto const tracks_d = ctx.get(inputDataWrapped);  // Tracks on device
+  auto const& tracks_d = ctx.get(inputDataWrapped);  // Tracks on device
 
   pixelTrack::TrackSoAHost tracks_h(ctx.stream());
   tracks_d.copyToHost(tracks_h.buffer(), ctx.stream());

From 8eff092fc3af646ad936c84cef46b7a2bbf6e0dd Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Tue, 1 Nov 2022 10:56:29 +0100
Subject: [PATCH 083/110] Const reference everything

---
 .../Track/interface/PixelTrackUtilities.h        | 16 ++++++++--------
 .../plugins/PixelTrackProducerFromSoA.cc         |  3 ++-
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/CUDADataFormats/Track/interface/PixelTrackUtilities.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h
index e202dced07307..828a8261ca259 100644
--- a/CUDADataFormats/Track/interface/PixelTrackUtilities.h
+++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h
@@ -59,13 +59,13 @@ namespace pixelTrack {
       return std::copysign(1.f, tracks[i].state()(2));
     }
 
-    __host__ __device__ inline float phi(TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(0); }
+    __host__ __device__ inline float phi(const TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(0); }
 
-    __host__ __device__ inline float tip(TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(1); }
+    __host__ __device__ inline float tip(const TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(1); }
 
-    __host__ __device__ inline float zip(TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(4); }
+    __host__ __device__ inline float zip(const TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(4); }
 
-    __host__ __device__ inline bool isTriplet(TrackSoAConstView &tracks, int i) { return tracks[i].nLayers() == 3; }
+    __host__ __device__ inline bool isTriplet(const TrackSoAConstView &tracks, int i) { return tracks[i].nLayers() == 3; }
 
     template <typename V3, typename M3, typename V2, typename M2>
     __host__ __device__ inline void copyFromCircle(
@@ -97,7 +97,7 @@ namespace pixelTrack {
     }
 
     template <typename V5, typename M5>
-    __host__ __device__ inline void copyToDense(TrackSoAConstView &tracks, V5 &v, M5 &cov, int32_t i) {
+    __host__ __device__ inline void copyToDense(const TrackSoAConstView &tracks, V5 &v, M5 &cov, int32_t i) {
       v = tracks[i].state().template cast<typename V5::Scalar>();
       for (int j = 0, ind = 0; j < 5; ++j) {
         cov(j, j) = tracks[i].covariance()(ind++);
@@ -119,14 +119,14 @@ namespace pixelTrack {
       }
       return nl;
     }
-    __host__ __device__ inline int nHits(TrackSoAConstView &tracks, int i) { return tracks.detIndices().size(i); }
+    __host__ __device__ inline int nHits(const TrackSoAConstView &tracks, int i) { return tracks.detIndices().size(i); }
 
     // Casts quality SoA data (uint8_t) to pixelTrack::Quality. This is required
     // to use the data as an enum instead of a plain uint8_t
-    __host__ __device__ inline const Quality *qualityData(TrackSoAConstView &tracks) {
+    __host__ __device__ inline const Quality *qualityData(const TrackSoAConstView &tracks) {
       return reinterpret_cast<Quality const *>(tracks.quality());
     }
-    __host__ __device__ inline Quality *qualityData(TrackSoAView &tracks) {
+    __host__ __device__ inline Quality *qualityData(TrackSoAView tracks) {
       return reinterpret_cast<Quality *>(tracks.quality());
     }
 
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
index 6a38ba45e96d9..ec5be3b7f05c7 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
@@ -154,7 +154,8 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
   std::vector<const TrackingRecHit *> hits;
   hits.reserve(5);
 
-  const auto &tsoa = *iEvent.get(tokenTrack_);
+  //const auto &tsoa = *iEvent.get(tokenTrack_);
+  auto & tsoa = iEvent.get(tokenTrack_);
 
   //auto const *quality = pixelTrack::utilities::qualityData(tsoa.view());
   // auto const &fit = tsoa.stateAtBS;

From a55739365ba1e681efcf214317d962ebf2acb184 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 1 Nov 2022 12:09:34 +0100
Subject: [PATCH 084/110] Updating RecoTauTag

---
 .../Track/interface/PixelTrackUtilities.h     |  2 +-
 .../HLTProducers/src/L2TauTagNNProducer.cc    | 49 ++++++++++---------
 2 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/CUDADataFormats/Track/interface/PixelTrackUtilities.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h
index 828a8261ca259..4208dfe93f69c 100644
--- a/CUDADataFormats/Track/interface/PixelTrackUtilities.h
+++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h
@@ -55,7 +55,7 @@ namespace pixelTrack {
     using hindex_type = uint32_t;
     // State at the Beam spot
     // phi,tip,1/pt,cotan(theta),zip
-    __host__ __device__ inline float charge(TrackSoAConstView &tracks, int32_t i) {
+    __host__ __device__ inline float charge(const TrackSoAConstView &tracks, int32_t i) {
       return std::copysign(1.f, tracks[i].state()(2));
     }
 
diff --git a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc
index 34e04b0f7aedb..db650684e7578 100644
--- a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc
+++ b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc
@@ -45,10 +45,11 @@
 #include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h"
 #include "DataFormats/GeometrySurface/interface/Plane.h"
 #include "DataFormats/BeamSpot/interface/BeamSpot.h"
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
 #include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"
 #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
+#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexSoA.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
 
@@ -179,16 +180,16 @@ class L2TauNNProducer : public edm::stream::EDProducer<edm::GlobalCache<L2TauNNP
                        const caloRecHitCollections& caloRecHits);
   void fillPatatracks(tensorflow::Tensor& cellGridMatrix,
                       const std::vector<l1t::TauRef>& allTaus,
-                      const pixelTrack::TrackSoA& patatracks_tsoa,
+                      const pixelTrack::TrackSoAHost& patatracks_tsoa,
                       const ZVertexSoA& patavtx_soa,
                       const reco::BeamSpot& beamspot,
                       const MagneticField* magfi);
   void selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa,
-                                   const pixelTrack::TrackSoA& patatracks_tsoa,
+                                   const pixelTrack::TrackSoAHost& patatracks_tsoa,
                                    std::vector<int>& trkGood,
                                    std::vector<int>& vtxGood);
   std::pair<float, float> impactParameter(int it,
-                                          const pixelTrack::TrackSoA& patatracks_tsoa,
+                                          const pixelTrack::TrackSoAHost& patatracks_tsoa,
                                           float patatrackPhi,
                                           const reco::BeamSpot& beamspot,
                                           const MagneticField* magfi);
@@ -208,7 +209,7 @@ class L2TauNNProducer : public edm::stream::EDProducer<edm::GlobalCache<L2TauNNP
   const edm::ESGetToken<CaloGeometry, CaloGeometryRecord> geometryToken_;
   const edm::ESGetToken<MagneticField, IdealMagneticFieldRecord> bFieldToken_;
   const edm::EDGetTokenT<ZVertexHeterogeneous> pataVerticesToken_;
-  const edm::EDGetTokenT<PixelTrackHeterogeneous> pataTracksToken_;
+  const edm::EDGetTokenT<pixelTrack::TrackSoAHost> pataTracksToken_;
   const edm::EDGetTokenT<reco::BeamSpot> beamSpotToken_;
   const unsigned int maxVtx_;
   const float fractionSumPt2_;
@@ -293,7 +294,7 @@ L2TauNNProducer::L2TauNNProducer(const edm::ParameterSet& cfg, const L2TauNNProd
       geometryToken_(esConsumes<CaloGeometry, CaloGeometryRecord>()),
       bFieldToken_(esConsumes<MagneticField, IdealMagneticFieldRecord>()),
       pataVerticesToken_(consumes<ZVertexHeterogeneous>(cfg.getParameter<edm::InputTag>("pataVertices"))),
-      pataTracksToken_(consumes<PixelTrackHeterogeneous>(cfg.getParameter<edm::InputTag>("pataTracks"))),
+      pataTracksToken_(consumes<pixelTrack::TrackSoAHost>(cfg.getParameter<edm::InputTag>("pataTracks"))),
       beamSpotToken_(consumes<reco::BeamSpot>(cfg.getParameter<edm::InputTag>("BeamSpot"))),
       maxVtx_(cfg.getParameter<uint>("maxVtx")),
       fractionSumPt2_(cfg.getParameter<double>("fractionSumPt2")),
@@ -570,31 +571,31 @@ void L2TauNNProducer::fillCaloRecHits(tensorflow::Tensor& cellGridMatrix,
 }
 
 void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa,
-                                                  const pixelTrack::TrackSoA& patatracks_tsoa,
+                                                  const pixelTrack::TrackSoAHost& patatracks_tsoa,
                                                   std::vector<int>& trkGood,
                                                   std::vector<int>& vtxGood) {
-  const auto maxTracks = patatracks_tsoa.stride();
+  const auto maxTracks = patatracks_tsoa.view().metadata().size();
   const int nv = patavtx_soa.nvFinal;
   trkGood.clear();
   trkGood.reserve(maxTracks);
   vtxGood.clear();
   vtxGood.reserve(nv);
-  auto const* quality = patatracks_tsoa.qualityData();
+  auto const* quality = pixelTrack::utilities::qualityData(patatracks_tsoa.view());
 
   // No need to sort either as the algorithms is just using the max (not even the location, just the max value of pt2sum).
   std::vector<float> pTSquaredSum(nv, 0);
   std::vector<int> nTrkAssociated(nv, 0);
 
   for (int32_t trk_idx = 0; trk_idx < maxTracks; ++trk_idx) {
-    auto nHits = patatracks_tsoa.nHits(trk_idx);
+    auto nHits = pixelTrack::utilities::nHits(patatracks_tsoa.view(), trk_idx);
     if (nHits == 0) {
       break;
     }
     int vtx_ass_to_track = patavtx_soa.idv[trk_idx];
     if (vtx_ass_to_track >= 0 && vtx_ass_to_track < nv) {
-      auto patatrackPt = patatracks_tsoa.pt[trk_idx];
+      auto patatrackPt = patatracks_tsoa.view()[trk_idx].pt();
       ++nTrkAssociated[vtx_ass_to_track];
-      if (patatrackPt >= trackPtMin_ && patatracks_tsoa.chi2(trk_idx) <= trackChi2Max_) {
+      if (patatrackPt >= trackPtMin_ && patatracks_tsoa.const_view()[trk_idx].chi2() <= trackChi2Max_) {
         patatrackPt = std::min(patatrackPt, trackPtMax_);
         pTSquaredSum[vtx_ass_to_track] += patatrackPt * patatrackPt;
       }
@@ -617,15 +618,15 @@ void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa,
 }
 
 std::pair<float, float> L2TauNNProducer::impactParameter(int it,
-                                                         const pixelTrack::TrackSoA& patatracks_tsoa,
+                                                         const pixelTrack::TrackSoAHost& patatracks_tsoa,
                                                          float patatrackPhi,
                                                          const reco::BeamSpot& beamspot,
                                                          const MagneticField* magfi) {
-  auto const& fit = patatracks_tsoa.stateAtBS;
+  // auto const& fit = patatracks_tsoa.stateAtBS;
   /* dxy and dz */
   riemannFit::Vector5d ipar, opar;
   riemannFit::Matrix5d icov, ocov;
-  fit.copyToDense(ipar, icov, it);
+  pixelTrack::utilities::copyToDense(patatracks_tsoa.view(), ipar, icov, it);
   riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov);
   LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);
   float sp = std::sin(patatrackPhi);
@@ -650,7 +651,7 @@ std::pair<float, float> L2TauNNProducer::impactParameter(int it,
 
 void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix,
                                      const std::vector<l1t::TauRef>& allTaus,
-                                     const pixelTrack::TrackSoA& patatracks_tsoa,
+                                     const pixelTrack::TrackSoAHost& patatracks_tsoa,
                                      const ZVertexSoA& patavtx_soa,
                                      const reco::BeamSpot& beamspot,
                                      const MagneticField* magfi) {
@@ -675,14 +676,14 @@ void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix,
     const float tauPhi = allTaus[tau_idx]->phi();
 
     for (const auto it : trkGood) {
-      const float patatrackPt = patatracks_tsoa.pt[it];
+      const float patatrackPt = patatracks_tsoa.const_view()[it].pt();
       if (patatrackPt <= 0)
         continue;
-      const float patatrackPhi = patatracks_tsoa.phi(it);
-      const float patatrackEta = patatracks_tsoa.eta(it);
-      const float patatrackCharge = patatracks_tsoa.charge(it);
-      const float patatrackChi2OverNdof = patatracks_tsoa.chi2(it);
-      const auto nHits = patatracks_tsoa.nHits(it);
+      const float patatrackPhi = pixelTrack::utilities::phi(patatracks_tsoa.const_view(), it);
+      const float patatrackEta = patatracks_tsoa.const_view()[it].eta();
+      const float patatrackCharge = pixelTrack::utilities::charge(patatracks_tsoa.const_view(), it);
+      const float patatrackChi2OverNdof = patatracks_tsoa.view()[it].chi2();
+      const auto nHits = pixelTrack::utilities::nHits(patatracks_tsoa.const_view(), it);
       if (nHits <= 0)
         continue;
       const int patatrackNdof = 2 * std::min(6, nHits) - 5;
@@ -763,7 +764,7 @@ void L2TauNNProducer::produce(edm::Event& event, const edm::EventSetup& eventset
   const auto eeCal = event.getHandle(eeToken_);
   const auto hbhe = event.getHandle(hbheToken_);
   const auto ho = event.getHandle(hoToken_);
-  const auto& patatracks_SoA = *event.get(pataTracksToken_);
+  auto& patatracks_SoA = event.get(pataTracksToken_);
   const auto& vertices_SoA = *event.get(pataVerticesToken_);
   const auto bsHandle = event.getHandle(beamSpotToken_);
 

From 44663213ef8a764b8bba20831a10704d1ce9a9da Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 1 Nov 2022 13:21:48 +0100
Subject: [PATCH 085/110] DQM/MonitorTrackSoA adapted

---
 .../plugins/SiPixelPhase1MonitorTrackSoA.cc   | 32 ++++++++++---------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc
index 622895ba07bcc..b4c996afc7055 100644
--- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc
+++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc
@@ -21,7 +21,8 @@
 #include "DQMServices/Core/interface/MonitorElement.h"
 #include "DQMServices/Core/interface/DQMEDAnalyzer.h"
 #include "DQMServices/Core/interface/DQMStore.h"
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
 // for string manipulations
 #include <fmt/printf.h>
 
@@ -34,7 +35,7 @@ class SiPixelPhase1MonitorTrackSoA : public DQMEDAnalyzer {
   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
 
 private:
-  edm::EDGetTokenT<PixelTrackHeterogeneous> tokenSoATrack_;
+  edm::EDGetTokenT<pixelTrack::TrackSoAHost> tokenSoATrack_;
   std::string topFolderName_;
   bool useQualityCut_;
   pixelTrack::Quality minQuality_;
@@ -62,7 +63,7 @@ class SiPixelPhase1MonitorTrackSoA : public DQMEDAnalyzer {
 //
 
 SiPixelPhase1MonitorTrackSoA::SiPixelPhase1MonitorTrackSoA(const edm::ParameterSet& iConfig) {
-  tokenSoATrack_ = consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
+  tokenSoATrack_ = consumes<pixelTrack::TrackSoAHost>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
   topFolderName_ = iConfig.getParameter<std::string>("topFolderName");  //"SiPixelHeterogeneous/PixelTrackSoA";
   useQualityCut_ = iConfig.getParameter<bool>("useQualityCut");
   minQuality_ = pixelTrack::qualityByName(iConfig.getParameter<std::string>("minQuality"));
@@ -78,23 +79,24 @@ void SiPixelPhase1MonitorTrackSoA::analyze(const edm::Event& iEvent, const edm::
     return;
   }
 
-  auto const& tsoa = *((tsoaHandle.product())->get());
-  auto maxTracks = tsoa.stride();
-  auto const* quality = tsoa.qualityData();
+  auto& tsoa = *tsoaHandle.product();
+  auto maxTracks = tsoa.view().metadata().size();
+  auto const* quality = pixelTrack::utilities::qualityData(tsoa.const_view());
   int32_t nTracks = 0;
   int32_t nLooseAndAboveTracks = 0;
 
   for (int32_t it = 0; it < maxTracks; ++it) {
-    auto nHits = tsoa.nHits(it);
-    auto nLayers = tsoa.nLayers(it);
+    auto nHits = pixelTrack::utilities::nHits(tsoa.const_view(), it);
+    auto nLayers = tsoa.view()[it].nLayers();
     if (nHits == 0)
       break;  // this is a guard
-    float pt = tsoa.pt(it);
+    float pt = tsoa.view()[it].pt();
     if (!(pt > 0.))
       continue;
 
     // fill the quality for all tracks
-    pixelTrack::Quality qual = tsoa.quality(it);
+    // pixelTrack::Quality qual = tsoa.view()[it].quality();
+    pixelTrack::Quality qual = quality[it];
     hquality->Fill(int(qual));
     nTracks++;
 
@@ -102,11 +104,11 @@ void SiPixelPhase1MonitorTrackSoA::analyze(const edm::Event& iEvent, const edm::
       continue;
 
     // fill parameters only for quality >= loose
-    float chi2 = tsoa.chi2(it);
-    float phi = tsoa.phi(it);
-    float zip = tsoa.zip(it);
-    float eta = tsoa.eta(it);
-    float tip = tsoa.tip(it);
+    float chi2 = tsoa.view()[it].chi2();
+    float phi = pixelTrack::utilities::phi(tsoa.const_view(), it);
+    float zip = pixelTrack::utilities::zip(tsoa.const_view(), it);
+    float eta = tsoa.view()[it].eta();
+    float tip = pixelTrack::utilities::tip(tsoa.const_view(), it);
 
     hchi2->Fill(chi2);
     hChi2VsPhi->Fill(phi, chi2);

From cf50b7d62d324454bc67d9f12b35294aa4372e60 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 1 Nov 2022 13:39:49 +0100
Subject: [PATCH 086/110] Fixed utilities function calls arguments

---
 .../plugins/SiPixelPhase1CompareTrackSoA.cc   | 94 +++++++++++--------
 1 file changed, 56 insertions(+), 38 deletions(-)

diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc
index 7b12f694d4e8c..dedff1f758e8f 100644
--- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc
+++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc
@@ -2,7 +2,7 @@
 // Package:    SiPixelPhase1CompareTrackSoA
 // Class:      SiPixelPhase1CompareTrackSoA
 //
-/**\class SiPixelPhase1CompareTrackSoA SiPixelPhase1CompareTrackSoA.cc 
+/**\class SiPixelPhase1CompareTrackSoA SiPixelPhase1CompareTrackSoA.cc
 */
 //
 // Author: Suvankar Roy Chowdhury
@@ -20,7 +20,9 @@
 #include "DQMServices/Core/interface/MonitorElement.h"
 #include "DQMServices/Core/interface/DQMEDAnalyzer.h"
 #include "DQMServices/Core/interface/DQMStore.h"
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
 // for string manipulations
 #include <fmt/printf.h>
 
@@ -71,8 +73,8 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer {
   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
 
 private:
-  const edm::EDGetTokenT<PixelTrackHeterogeneous> tokenSoATrackCPU_;
-  const edm::EDGetTokenT<PixelTrackHeterogeneous> tokenSoATrackGPU_;
+  const edm::EDGetTokenT<pixelTrack::TrackSoAHost> tokenSoATrackCPU_;
+  const edm::EDGetTokenT<pixelTrack::TrackSoADevice> tokenSoATrackGPU_;
   const std::string topFolderName_;
   const bool useQualityCut_;
   const pixelTrack::Quality minQuality_;
@@ -113,8 +115,8 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer {
 //
 
 SiPixelPhase1CompareTrackSoA::SiPixelPhase1CompareTrackSoA(const edm::ParameterSet& iConfig)
-    : tokenSoATrackCPU_(consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelTrackSrcCPU"))),
-      tokenSoATrackGPU_(consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelTrackSrcGPU"))),
+    : tokenSoATrackCPU_(consumes<pixelTrack::TrackSoAHost>(iConfig.getParameter<edm::InputTag>("pixelTrackSrcCPU"))),
+      tokenSoATrackGPU_(consumes<pixelTrack::TrackSoADevice>(iConfig.getParameter<edm::InputTag>("pixelTrackSrcGPU"))),
       topFolderName_(iConfig.getParameter<std::string>("topFolderName")),
       useQualityCut_(iConfig.getParameter<bool>("useQualityCut")),
       minQuality_(pixelTrack::qualityByName(iConfig.getParameter<std::string>("minQuality"))),
@@ -138,12 +140,12 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm::
     return;
   }
 
-  auto const& tsoaCPU = *tsoaHandleCPU->get();
-  auto const& tsoaGPU = *tsoaHandleGPU->get();
-  auto maxTracksCPU = tsoaCPU.stride();  //this should be same for both?
-  auto maxTracksGPU = tsoaGPU.stride();  //this should be same for both?
-  auto const* qualityCPU = tsoaCPU.qualityData();
-  auto const* qualityGPU = tsoaGPU.qualityData();
+  auto& tsoaCPU = *tsoaHandleCPU.product();
+  auto& tsoaGPU = *tsoaHandleGPU.product();
+  auto maxTracksCPU = tsoaCPU.view().metadata().size();  //this should be same for both?
+  auto maxTracksGPU = tsoaGPU.view().metadata().size();  //this should be same for both?
+  auto const* qualityCPU = pixelTrack::utilities::qualityData(tsoaCPU.view());
+  auto const* qualityGPU = pixelTrack::utilities::qualityData(tsoaGPU.view());
   int32_t nTracksCPU = 0;
   int32_t nTracksGPU = 0;
   int32_t nLooseAndAboveTracksCPU = 0;
@@ -153,9 +155,9 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm::
   //Loop over GPU tracks and store the indices of the loose tracks. Whats happens if useQualityCut_ is false?
   std::vector<int32_t> looseTrkidxGPU;
   for (int32_t jt = 0; jt < maxTracksGPU; ++jt) {
-    if (tsoaGPU.nHits(jt) == 0)
+    if (pixelTrack::utilities::nHits(tsoaGPU.view(), jt) == 0)
       break;  // this is a guard
-    if (!(tsoaGPU.pt(jt) > 0.))
+    if (!(tsoaGPU.view()[jt].pt() > 0.))
       continue;
     nTracksGPU++;
     if (useQualityCut_ && qualityGPU[jt] < minQuality_)
@@ -166,9 +168,18 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm::
 
   //Now loop over CPU tracks//nested loop for loose gPU tracks
   for (int32_t it = 0; it < maxTracksCPU; ++it) {
-    if (tsoaCPU.nHits(it) == 0)
+    float chi2CPU = tsoaCPU.view()[it].chi2();
+    int nHitsCPU = pixelTrack::utilities::nHits(tsoaCPU.view(), it);
+    int8_t nLayersCPU = tsoaCPU.view()[it].nLayers();
+    float ptCPU = tsoaCPU.view()[it].pt();
+    float etaCPU = tsoaCPU.view()[it].eta();
+    float phiCPU = pixelTrack::utilities::phi(tsoaCPU.view(), it);
+    float zipCPU = pixelTrack::utilities::zip(tsoaCPU.view(), it);
+    float tipCPU = pixelTrack::utilities::tip(tsoaCPU.view(), it);
+
+    if (nHitsCPU == 0)
       break;  // this is a guard
-    if (!(tsoaCPU.pt(it) > 0.))
+    if (!(ptCPU > 0.))
       continue;
     nTracksCPU++;
     if (useQualityCut_ && qualityCPU[it] < minQuality_)
@@ -178,12 +189,10 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm::
     const int32_t notFound = -1;
     int32_t closestTkidx = notFound;
     float mindr2 = dr2cut_;
-    float etacpu = tsoaCPU.eta(it);
-    float phicpu = tsoaCPU.phi(it);
     for (auto gid : looseTrkidxGPU) {
-      float etagpu = tsoaGPU.eta(gid);
-      float phigpu = tsoaGPU.phi(gid);
-      float dr2 = reco::deltaR2(etacpu, phicpu, etagpu, phigpu);
+      float etaGPU = tsoaGPU.view()[gid].eta();
+      float phiGPU = pixelTrack::utilities::phi(tsoaGPU.view(), gid);
+      float dr2 = reco::deltaR2(etaCPU, phiCPU, etaGPU, phiGPU);
       if (dr2 > dr2cut_)
         continue;  // this is arbitrary
       if (mindr2 > dr2) {
@@ -192,27 +201,36 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm::
       }
     }
 
-    hpt_eta_tkAllCPU_->Fill(etacpu, tsoaCPU.pt(it));  //all CPU tk
-    hphi_z_tkAllCPU_->Fill(phicpu, tsoaCPU.zip(it));
+    hpt_eta_tkAllCPU_->Fill(etaCPU, ptCPU);  //all CPU tk
+    hphi_z_tkAllCPU_->Fill(phiCPU, zipCPU);
     if (closestTkidx == notFound)
       continue;
     nLooseAndAboveTracksCPU_matchedGPU++;
 
-    hchi2_->Fill(tsoaCPU.chi2(it), tsoaGPU.chi2(closestTkidx));
-    hnHits_->Fill(tsoaCPU.nHits(it), tsoaGPU.nHits(closestTkidx));
-    hnLayers_->Fill(tsoaCPU.nLayers(it), tsoaGPU.nLayers(closestTkidx));
-    hpt_->Fill(tsoaCPU.pt(it), tsoaGPU.pt(closestTkidx));
-    hptLogLog_->Fill(tsoaCPU.pt(it), tsoaGPU.pt(closestTkidx));
-    heta_->Fill(etacpu, tsoaGPU.eta(closestTkidx));
-    hphi_->Fill(phicpu, tsoaGPU.phi(closestTkidx));
-    hz_->Fill(tsoaCPU.zip(it), tsoaGPU.zip(closestTkidx));
-    htip_->Fill(tsoaCPU.tip(it), tsoaGPU.tip(closestTkidx));
-    hptdiffMatched_->Fill(tsoaCPU.pt(it) - tsoaGPU.pt(closestTkidx));
-    hetadiffMatched_->Fill(etacpu - tsoaGPU.eta(closestTkidx));
-    hphidiffMatched_->Fill(reco::deltaPhi(phicpu, tsoaGPU.phi(closestTkidx)));
-    hzdiffMatched_->Fill(tsoaCPU.zip(it) - tsoaGPU.zip(closestTkidx));
-    hpt_eta_tkAllCPUMatched_->Fill(etacpu, tsoaCPU.pt(it));  //matched to gpu
-    hphi_z_tkAllCPUMatched_->Fill(phicpu, tsoaCPU.zip(it));
+    float chi2GPU = tsoaGPU.view()[closestTkidx].chi2();
+    int nHitsGPU = pixelTrack::utilities::nHits(tsoaGPU.view(), closestTkidx);
+    int8_t nLayersGPU = tsoaGPU.view()[closestTkidx].nLayers();
+    float ptGPU = tsoaGPU.view()[closestTkidx].pt();
+    float etaGPU = tsoaGPU.view()[closestTkidx].eta();
+    float phiGPU = pixelTrack::utilities::phi(tsoaGPU.view(), closestTkidx);
+    float zipGPU = pixelTrack::utilities::zip(tsoaGPU.view(), closestTkidx);
+    float tipGPU = pixelTrack::utilities::tip(tsoaGPU.view(), closestTkidx);
+
+    hchi2_->Fill(chi2CPU, chi2GPU);
+    hnHits_->Fill(nHitsCPU, nHitsGPU);
+    hnLayers_->Fill(nLayersCPU, nLayersGPU);
+    hpt_->Fill(ptCPU, ptGPU);  // CPU pt vs. pt of the matched GPU track
+    hptLogLog_->Fill(ptCPU, ptGPU);
+    heta_->Fill(etaCPU, etaGPU);
+    hphi_->Fill(phiCPU, phiGPU);
+    hz_->Fill(zipCPU, zipGPU);
+    htip_->Fill(tipCPU, tipGPU);
+    hptdiffMatched_->Fill(ptCPU - ptGPU);
+    hetadiffMatched_->Fill(etaCPU - etaGPU);
+    hphidiffMatched_->Fill(reco::deltaPhi(phiCPU, phiGPU));
+    hzdiffMatched_->Fill(zipCPU - zipGPU);
+    hpt_eta_tkAllCPUMatched_->Fill(etaCPU, ptCPU);  //matched to gpu
+    hphi_z_tkAllCPUMatched_->Fill(phiCPU, zipCPU);
   }
   hnTracks_->Fill(nTracksCPU, nTracksGPU);
   hnLooseAndAboveTracks_->Fill(nLooseAndAboveTracksCPU, nLooseAndAboveTracksGPU);

From ebb47181ea6ebfffd78312a44ac9c708c0142007 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 1 Nov 2022 13:55:41 +0100
Subject: [PATCH 087/110] SeedProducer adapted to new Data format

---
 .../plugins/SeedProducerFromSoA.cc            | 24 ++++++++++---------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
index 0e5823fc46c46..a5cc27c338ebe 100644
--- a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
+++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
@@ -1,4 +1,6 @@
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
 #include "DataFormats/BeamSpot/interface/BeamSpot.h"
 #include "DataFormats/GeometrySurface/interface/Plane.h"
 #include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
@@ -45,7 +47,7 @@ class SeedProducerFromSoA : public edm::global::EDProducer<> {
 
   // Event data tokens
   const edm::EDGetTokenT<reco::BeamSpot> tBeamSpot_;
-  const edm::EDGetTokenT<PixelTrackHeterogeneous> tokenTrack_;
+  const edm::EDGetTokenT<pixelTrack::TrackSoAHost> tokenTrack_;
   // Event setup tokens
   const edm::ESGetToken<MagneticField, IdealMagneticFieldRecord> idealMagneticFieldToken_;
   const edm::ESGetToken<TrackerGeometry, TrackerDigiGeometryRecord> trackerDigiGeometryToken_;
@@ -55,7 +57,7 @@ class SeedProducerFromSoA : public edm::global::EDProducer<> {
 
 SeedProducerFromSoA::SeedProducerFromSoA(const edm::ParameterSet& iConfig)
     : tBeamSpot_(consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpot"))),
-      tokenTrack_(consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("src"))),
+      tokenTrack_(consumes<pixelTrack::TrackSoAHost>(iConfig.getParameter<edm::InputTag>("src"))),
       idealMagneticFieldToken_(esConsumes()),
       trackerDigiGeometryToken_(esConsumes()),
       trackerPropagatorToken_(esConsumes(edm::ESInputTag("PropagatorWithMaterial"))),
@@ -89,16 +91,16 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co
   // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl;
   GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0());
 
-  const auto& tsoa = *(iEvent.get(tokenTrack_));
+  auto & tsoa = iEvent.get(tokenTrack_);
 
-  auto const* quality = tsoa.qualityData();
-  auto const& fit = tsoa.stateAtBS;
-  auto const& detIndices = tsoa.detIndices;
-  auto maxTracks = tsoa.stride();
+  auto const* quality = pixelTrack::utilities::qualityData(tsoa.view());
+  //auto const& fit = tsoa.stateAtBS;
+  auto const& detIndices = tsoa.view().detIndices();
+  auto maxTracks = tsoa.view().metadata().size();
 
   int32_t nt = 0;
   for (int32_t it = 0; it < maxTracks; ++it) {
-    auto nHits = tsoa.nHits(it);
+    auto nHits = pixelTrack::utilities::nHits(tsoa.view(),it);
     if (nHits == 0)
       break;  // this is a guard: maybe we need to move to nTracks...
 
@@ -120,11 +122,11 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co
 
     // mind: this values are respect the beamspot!
 
-    float phi = tsoa.phi(it);
+    float phi = pixelTrack::utilities::phi(tsoa.view(),it);
 
     riemannFit::Vector5d ipar, opar;
     riemannFit::Matrix5d icov, ocov;
-    fit.copyToDense(ipar, icov, it);
+    pixelTrack::utilities::copyToDense(tsoa.view(),ipar, icov, it);
     riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov);
 
     LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);

From 9ace379c4e88131602f962153830d7e9cefad519 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Tue, 1 Nov 2022 16:20:14 +0100
Subject: [PATCH 088/110] Fix tracks SoA instantiation on host

---
 .../PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
index 37454b9065f0a..9e778bae66158 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -222,7 +222,7 @@ pixelTrack::TrackSoADevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingR
 }
 
 pixelTrack::TrackSoAHost CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const {
-  pixelTrack::TrackSoAHost tracks;
+  pixelTrack::TrackSoAHost tracks(nullptr);
 
   CAHitNtupletGeneratorKernelsCPU kernels(m_params);
   kernels.setCounters(m_counters);

From 33a9741ffdfae24d42c50e38d8a29da362a2a8c2 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Wed, 2 Nov 2022 17:04:54 +0100
Subject: [PATCH 089/110] Fixed segfault due to using local variable instead of
 the class attribute

---
 .../Track/interface/TrackSoAHeterogeneousDevice.h  |  3 ++-
 .../plugins/PixelTrackSoAFromCUDA.cc               | 14 ++++++--------
 .../plugins/CAHitNtupletGeneratorKernelsImpl.h     |  9 +++++----
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
index aaf4035d460e5..b79f8d959720c 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
@@ -19,7 +19,8 @@ class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection<T
 
   // Copy data from device to host
   __host__ void copyToHost(cms::cuda::host::unique_ptr<std::byte[]> &host_ptr, cudaStream_t stream) const {
-    cudaCheck(cudaMemcpy(host_ptr.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost));
+    cudaCheck(cudaMemcpyAsync(host_ptr.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaGetLastError());
   }
 };
 
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
index d06b988d4a5f5..283e5b0292464 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
@@ -17,6 +17,7 @@
 #include "FWCore/Utilities/interface/EDGetToken.h"
 #include "FWCore/Utilities/interface/InputTag.h"
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
 // Switch on to enable checks and printout for found tracks
 // #define PIXEL_DEBUG_PRODUCE
@@ -56,10 +57,9 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
                                     edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
   cms::cuda::Product<pixelTrack::TrackSoADevice> const& inputDataWrapped = iEvent.get(tokenCUDA_);
   cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
-  auto const& tracks_d = ctx.get(inputDataWrapped);  // Tracks on device
-
-  pixelTrack::TrackSoAHost tracks_h(ctx.stream());
-  tracks_d.copyToHost(tracks_h.buffer(), ctx.stream());
+  auto const& tracks_d = ctx.get(inputDataWrapped);      // Tracks on device
+  tracks_h = pixelTrack::TrackSoAHost(ctx.stream());     // Create an instance of Tracks on Host, using the stream
+  tracks_d.copyToHost(tracks_h.buffer(), ctx.stream());  // Copy data from Device to Host
 }
 
 void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
@@ -67,6 +67,7 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i
   auto maxTracks = tracks_h.view().metadata().size();
   auto nTracks = tracks_h.view().nTracks();
   assert(nTracks < maxTracks);
+
   if (nTracks == maxTracks - 1) {
     edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1
                                    << " candidates";
@@ -86,11 +87,8 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i
   }
   assert(nTracks == nt);
 #endif
-
   // DO NOT  make a copy  (actually TWO....)
-  iEvent.emplace(tokenSOA_, std::move(tracks_h));  //, std::move(ret)); // view
-
-  //assert(!soa_);
+  iEvent.emplace(tokenSOA_, std::move(tracks_h));
 }
 
 DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index afe4aaa11f70b..f8657cf7bae89 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -380,8 +380,8 @@ __global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const *
   }
 }
 
-__global__ void kernel_countMultiplicity(TkSoAConstView tracks_view,
-                                         caConstants::TupleMultiplicity *tupleMultiplicity) {
+// TODO: change arg type to TkSoAConstView
+__global__ void kernel_countMultiplicity(TkSoAView tracks_view, caConstants::TupleMultiplicity *tupleMultiplicity) {
   auto first = blockIdx.x * blockDim.x + threadIdx.x;
   for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) {
     auto nhits = tracks_view.hitIndices().size(it);
@@ -397,7 +397,8 @@ __global__ void kernel_countMultiplicity(TkSoAConstView tracks_view,
   }
 }
 
-__global__ void kernel_fillMultiplicity(TkSoAConstView tracks_view, caConstants::TupleMultiplicity *tupleMultiplicity) {
+// TODO: change arg type to TkSoAConstView
+__global__ void kernel_fillMultiplicity(TkSoAView tracks_view, caConstants::TupleMultiplicity *tupleMultiplicity) {
   auto first = blockIdx.x * blockDim.x + threadIdx.x;
   for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) {
     auto nhits = tracks_view.hitIndices().size(it);
@@ -856,7 +857,7 @@ __global__ void kernel_simpleTripletCleaner(
 }
 
 __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp,
-                                            TkSoAConstView tracks_view,
+                                            TkSoAView tracks_view,
                                             CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple,
                                             int32_t firstPrint,
                                             int32_t lastPrint,

From afdec7a7e2d9acfba46d8421866136607028fbfe Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 3 Nov 2022 10:37:38 +0100
Subject: [PATCH 090/110] Cleanup, removed unused includes

---
 .../Track/interface/PixelTrackHeterogeneous.h |   4 -
 .../Track/interface/TrackSoAHeterogeneousT.h  | 107 ------------------
 .../plugins/PixelTrackProducerFromSoA.cc      |  10 +-
 .../plugins/BrokenLineFitOnGPU.h              |   1 -
 .../plugins/CAHitNtupletGeneratorKernels.cc   |   7 --
 .../CAHitNtupletGeneratorKernelsImpl.h        |   2 -
 .../plugins/CAHitNtupletGeneratorOnGPU.cc     |   4 -
 .../plugins/PixelVertexProducerCUDA.cc        |   4 +-
 .../plugins/gpuVertexFinder.cc                |   7 +-
 .../plugins/gpuVertexFinder.h                 |   1 -
 10 files changed, 6 insertions(+), 141 deletions(-)
 delete mode 100644 CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
 delete mode 100644 CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h

diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
deleted file mode 100644
index 73ec80e6322a2..0000000000000
--- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
-#define CUDADataFormats_Track_PixelTrackHeterogeneous_h
-
-#endif  // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
deleted file mode 100644
index 356ea3eddeb7f..0000000000000
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
+++ /dev/null
@@ -1,107 +0,0 @@
-#ifndef CUDADataFormats_Track_TrackHeterogeneousT_H
-#define CUDADataFormats_Track_TrackHeterogeneousT_H
-
-#include <string>
-#include <algorithm>
-
-#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h"
-#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
-
-#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
-
-namespace pixelTrack {
-  enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality };
-  constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)};
-  const std::string qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"};
-  inline Quality qualityByName(std::string const &name) {
-    auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName;
-    return static_cast<Quality>(qp);
-  }
-}  // namespace pixelTrack
-
-template <int32_t S>
-class TrackSoAHeterogeneousT {
-public:
-  static constexpr int32_t stride() { return S; }
-
-  using Quality = pixelTrack::Quality;
-  using hindex_type = uint32_t;
-  using HitContainer = cms::cuda::OneToManyAssoc<hindex_type, S + 1, 5 * S>;
-
-  // Always check quality is at least loose!
-  // CUDA does not support enums  in __lgc ...
-private:
-  eigenSoA::ScalarSoA<uint8_t, S> quality_;
-
-public:
-  constexpr Quality quality(int32_t i) const { return (Quality)(quality_(i)); }
-  constexpr Quality &quality(int32_t i) { return (Quality &)(quality_(i)); }
-  constexpr Quality const *qualityData() const { return (Quality const *)(quality_.data()); }
-  constexpr Quality *qualityData() { return (Quality *)(quality_.data()); }
-
-  // this is chi2/ndof as not necessarely all hits are used in the fit
-  eigenSoA::ScalarSoA<float, S> chi2;
-
-  eigenSoA::ScalarSoA<int8_t, S> nLayers;
-
-  constexpr int nTracks() const { return nTracks_; }
-  constexpr void setNTracks(int n) { nTracks_ = n; }
-
-  constexpr int nHits(int i) const { return detIndices.size(i); }
-
-  constexpr bool isTriplet(int i) const { return nLayers(i) == 3; }
-
-  constexpr int computeNumberOfLayers(int32_t i) const {
-    // layers are in order and we assume tracks are either forward or backward
-    auto pdet = detIndices.begin(i);
-    int nl = 1;
-    auto ol = phase1PixelTopology::getLayer(*pdet);
-    for (; pdet < detIndices.end(i); ++pdet) {
-      auto il = phase1PixelTopology::getLayer(*pdet);
-      if (il != ol)
-        ++nl;
-      ol = il;
-    }
-    return nl;
-  }
-
-  // State at the Beam spot
-  // phi,tip,1/pt,cotan(theta),zip
-  TrajectoryStateSoAT<S> stateAtBS;
-  eigenSoA::ScalarSoA<float, S> eta;
-  eigenSoA::ScalarSoA<float, S> pt;
-  constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); }
-  constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); }
-  constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); }
-  constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); }
-
-  // state at the detector of the outermost hit
-  // representation to be decided...
-  // not yet filled on GPU
-  // TrajectoryStateSoA<S> stateAtOuterDet;
-
-  HitContainer hitIndices;
-  HitContainer detIndices;
-
-private:
-  int nTracks_;
-};
-
-namespace pixelTrack {
-
-#ifdef GPU_SMALL_EVENTS
-  // kept for testing and debugging
-  constexpr uint32_t maxNumber() { return 2 * 1024; }
-#else
-  // tested on MC events with 55-75 pileup events
-  constexpr uint32_t maxNumber() { return 32 * 1024; }
-#endif
-
-  using TrackSoA = TrackSoAHeterogeneousT<maxNumber()>;
-  using TrajectoryState = TrajectoryStateSoAT<maxNumber()>;
-  using HitContainer = TrackSoA::HitContainer;
-
-}  // namespace pixelTrack
-
-#endif  // CUDADataFormats_Track_TrackHeterogeneousT_H
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
index ec5be3b7f05c7..36d3dd8c3dcc7 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
@@ -106,7 +106,6 @@ void PixelTrackProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions
 void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
                                         edm::Event &iEvent,
                                         const edm::EventSetup &iSetup) const {
-  // enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity };
   reco::TrackBase::TrackQuality recoQuality[] = {reco::TrackBase::undefQuality,
                                                  reco::TrackBase::undefQuality,
                                                  reco::TrackBase::discarded,
@@ -154,13 +153,7 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
   std::vector<const TrackingRecHit *> hits;
   hits.reserve(5);
 
-  //const auto &tsoa = *iEvent.get(tokenTrack_);
-  auto & tsoa = iEvent.get(tokenTrack_);
-
-  //auto const *quality = pixelTrack::utilities::qualityData(tsoa.view());
-  // auto const &fit = tsoa.stateAtBS;
-  //auto const &hitIndices = tsoa.view().hitIndices();
-  //auto nTracks = tsoa.view().nTracks();
+  auto &tsoa = iEvent.get(tokenTrack_);
   auto const *quality = pixelTrack::utilities::qualityData(tsoa.view());
   auto const hitIndices = tsoa.view().hitIndices();
   auto nTracks = tsoa.view().nTracks();
@@ -246,7 +239,6 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
     // filter???
     tracks.emplace_back(track.release(), hits);
   }
-  // std::cout << "processed " << nt << " good tuples " << tracks.size() << "out of " << indToEdm.size() << std::endl;
 
   // store tracks
   storeTracks(iEvent, tracks, httopo);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
index aefde7ac602b1..2b2d93cf7415a 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
@@ -8,7 +8,6 @@
 
 #include <cuda_runtime.h>
 
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
index cdefeab9e36b7..65a3f3a8dff4c 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -81,12 +81,6 @@ template <>
 void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh,
                                                     TkSoAView tracks_view,
                                                     cudaStream_t cudaStream) {
-  // auto *tuples_d = tracks_d->view().hitIndices();
-  // auto *detId_d = tracks_d->view().detIndices();
-  // auto *quality_d = tracks_d->qualityData();
-
-  // assert(tuples_d && quality_d); // TODO Find equivalent for View
-
   // zero tuples
   cms::cuda::launchZero(&tracks_view.hitIndices(), cudaStream);
 
@@ -149,7 +143,6 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh,
                                                      cudaStream_t cudaStream) {
   int32_t nhits = hh.nHits();
 
-  // auto const *tuples_d = &tracks_d->hitIndices;
   auto *quality_d = pixelTrack::utilities::qualityData(tracks_view);
   // classify tracks based on kinematics
   kernel_classifyTracks(tracks_view, quality_d, params_.cuts_);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index f8657cf7bae89..4f2272db13354 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -630,10 +630,8 @@ __global__ void kernel_markSharedHit(int const *__restrict__ nshared,
                                      HitContainer const *__restrict__ tuples,
                                      Quality *__restrict__ quality,
                                      bool dupPassThrough) {
-  // constexpr auto bad = (uint8_t)pixelTrack::Quality::bad;
   constexpr auto dup = pixelTrack::Quality::dup;
   constexpr auto loose = pixelTrack::Quality::loose;
-  // constexpr auto strict = (uint8_t)pixelTrack::Quality::strict;
 
   // quality to mark rejected
   auto const reject = dupPassThrough ? loose : dup;
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
index 9e778bae66158..4893ebdcc828f 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -186,10 +186,6 @@ void CAHitNtupletGeneratorOnGPU::endJob() {
   }
 }
 
-/*PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
-                                                                    float bfield,
-                                                                    cudaStream_t stream) const {
-  PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique<pixelTrack::TrackSoA>(stream));*/
 pixelTrack::TrackSoADevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
                                                                        float bfield,
                                                                        cudaStream_t stream) const {
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
index 7d8ea3485c447..9dd8a016dc02d 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
@@ -111,12 +111,11 @@ void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID,
 void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID,
                                            edm::Event& iEvent,
                                            const edm::EventSetup& iSetup) const {
-  auto & tracks = iEvent.get(tokenCPUTrack_);
+  auto& tracks = iEvent.get(tokenCPUTrack_);
 
 #ifdef PIXVERTEX_DEBUG_PRODUCE
 
   auto maxTracks = tracks.view().metadata().size();
-  // std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl;
 
   int32_t nt = 0;
   for (int32_t it = 0; it < maxTracks; ++it) {
@@ -126,7 +125,6 @@ void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID,
       break;  // this is a guard: maybe we need to move to nTracks...
     nt++;
   }
-  // std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
 
   iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks.view(), ptMin_, ptMax_));
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
index 66de3fe8c99f7..c92060f8ba2cc 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
@@ -19,7 +19,6 @@ namespace gpuVertexFinder {
 
   // split vertices with a chi2/NDoF greater than this
   constexpr float maxChi2ForSplit = 9.f;
-  //using TkSoAView = pixelTrack::TrackSoAView;
   using TkSoAConstView = pixelTrack::TrackSoAConstView;
 
   __global__ void loadTracks(TkSoAConstView tracks_view, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) {
@@ -96,7 +95,10 @@ namespace gpuVertexFinder {
 #endif
 
 #ifdef __CUDACC__
-  ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoAConstView tracks_view, float ptMin, float ptMax) const {
+  ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream,
+                                           TkSoAConstView tracks_view,
+                                           float ptMin,
+                                           float ptMax) const {
 #ifdef PIXVERTEX_DEBUG_PRODUCE
     std::cout << "producing Vertices on GPU" << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
@@ -108,7 +110,6 @@ namespace gpuVertexFinder {
 #endif  // PIXVERTEX_DEBUG_PRODUCE
     ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>());
 #endif
-    // assert(tksoa);
     auto* soa = vertices.get();
     assert(soa);
 
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
index 98bb9d75530d4..8c542607812b9 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
@@ -5,7 +5,6 @@
 #include <cstdint>
 
 #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
-//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
 #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
 
 namespace gpuVertexFinder {

From 182ffb802d8452c2443b75f7228ee9ae302ebf46 Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Thu, 3 Nov 2022 14:45:48 +0100
Subject: [PATCH 091/110] Changing dataformats to ZVertex{Device/Host}

---
 .../interface/ZVertexSoAHeterogeneousDevice.h | 33 +++++++++++++++
 .../interface/ZVertexSoAHeterogeneousHost.h   | 26 ++++++++++++
 .../Vertex/interface/ZVertexUtilities.h       | 41 +++++++++++++++++++
 CUDADataFormats/Vertex/src/classes.h          |  4 +-
 CUDADataFormats/Vertex/src/classes_def.xml    | 10 +++--
 5 files changed, 109 insertions(+), 5 deletions(-)
 create mode 100644 CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h
 create mode 100644 CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h
 create mode 100644 CUDADataFormats/Vertex/interface/ZVertexUtilities.h

diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h
new file mode 100644
index 0000000000000..47cb8af2b4cc6
--- /dev/null
+++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h
@@ -0,0 +1,33 @@
+#ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H
+#define CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H
+
+#include <cstdint>
+
+#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
+#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+template <int32_t S>
+class ZVertexSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection<ZVertexSoAHeterogeneousLayout<>> {
+public:
+  ZVertexSoAHeterogeneousDevice() = default;  // cms::cuda::Product needs this
+
+  // Builds the device collection with S elements, forwarding the CUDA stream to the base class
+  explicit ZVertexSoAHeterogeneousDevice(cudaStream_t stream)
+      : PortableDeviceCollection<ZVertexSoAHeterogeneousLayout<>>(S, stream) {}
+
+  // Enqueue on `stream` an asynchronous copy of bufferSize() bytes from the device buffer into host_ptr
+  __host__ void copyToHost(cms::cuda::host::unique_ptr<std::byte[]> &host_ptr, cudaStream_t stream) const {
+    cudaCheck(cudaMemcpyAsync(host_ptr.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaGetLastError());
+  }
+};
+
+namespace ZVertex {
+
+  using ZVertexSoADevice = ZVertexSoAHeterogeneousDevice<ZVertex::utilities::MAXTRACKS>;
+
+}  // namespace ZVertex
+
+#endif  // CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H
diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h
new file mode 100644
index 0000000000000..e751e2da8f5de
--- /dev/null
+++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h
@@ -0,0 +1,26 @@
+#ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H
+#define CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H
+
+#include <cstdint>
+
+#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
+#include "CUDADataFormats/Common/interface/PortableHostCollection.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+template <int32_t S>
+class ZVertexSoAHeterogeneousHost : public cms::cuda::PortableHostCollection<ZVertexSoAHeterogeneousLayout<>> {
+public:
+  ZVertexSoAHeterogeneousHost() = default;
+
+  // Builds the host collection with S elements, forwarding the CUDA stream to the base class
+  explicit ZVertexSoAHeterogeneousHost(cudaStream_t stream)
+      : PortableHostCollection<ZVertexSoAHeterogeneousLayout<>>(S, stream) {}
+};
+
+namespace ZVertex {
+
+  using ZVertexSoAHost = ZVertexSoAHeterogeneousHost<ZVertex::utilities::MAXTRACKS>;
+
+}  // namespace ZVertex
+
+#endif  // CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H
diff --git a/CUDADataFormats/Vertex/interface/ZVertexUtilities.h b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h
new file mode 100644
index 0000000000000..4c5dece118f50
--- /dev/null
+++ b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h
@@ -0,0 +1,41 @@
+#ifndef CUDADataFormats_Vertex_ZVertexUtilities_h
+#define CUDADataFormats_Vertex_ZVertexUtilities_h
+
+//#include <Eigen/Dense>
+//#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "DataFormats/SoATemplate/interface/SoALayout.h"
+
+GENERATE_SOA_LAYOUT(ZVertexSoAHeterogeneousLayout,
+                    SOA_COLUMN(int16_t, idv),
+                    SOA_COLUMN(float, zv),  // presumably the vertex z position (TODO confirm), not a chi2/ndof
+                    SOA_COLUMN(float, wv),
+                    SOA_COLUMN(float, chi2),
+                    SOA_COLUMN(float, ptv2),
+                    SOA_COLUMN(int32_t, ndof),
+                    SOA_COLUMN(uint16_t, sortInd),
+                    SOA_SCALAR(uint32_t, nvFinal))
+
+// Previous TrajectoryStateSoAT class methods.
+// They operate on View and ConstView of the TrackSoA.
+namespace ZVertex {
+  namespace utilities {
+    using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View;
+
+    static constexpr uint32_t MAXTRACKS = 32 * 1024;
+    static constexpr uint32_t MAXVTX = 1024;
+
+    __host__ __device__ inline void init(ZVertexSoAView &vertices) { vertices.nvFinal() = 0; }
+
+  }  // namespace utilities
+}  // namespace pixelTrack
+
+namespace ZVertex {
+  // Common types for both Host and Device code
+  using ZVertexSoALayout = ZVertexSoAHeterogeneousLayout<>;
+  using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View;
+  using ZVertexSoAConstView = ZVertexSoAHeterogeneousLayout<>::ConstView;
+
+}  // namespace pixelTrack
+
+#endif
diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h
index 7931beaa8f4bd..6f087ecb2cf46 100644
--- a/CUDADataFormats/Vertex/src/classes.h
+++ b/CUDADataFormats/Vertex/src/classes.h
@@ -1,7 +1,9 @@
 #ifndef CUDADataFormats_Vertex_src_classes_h
 #define CUDADataFormats_Vertex_src_classes_h
 
-#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+//#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h"
 #include "CUDADataFormats/Common/interface/Product.h"
 #include "DataFormats/Common/interface/Wrapper.h"
 
diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml
index ea633080af9af..58616cbb534fa 100644
--- a/CUDADataFormats/Vertex/src/classes_def.xml
+++ b/CUDADataFormats/Vertex/src/classes_def.xml
@@ -1,6 +1,8 @@
 <lcgdict>
-  <class name="cms::cuda::Product<ZVertexHeterogeneous>" persistent="false"/>
-  <class name="edm::Wrapper<ZVertexCUDAProduct>" persistent="false"/>
-  <class name="ZVertexHeterogeneous" persistent="false"/>
-  <class name="edm::Wrapper<ZVertexHeterogeneous>" persistent="false"/>
+  <class name="ZVertex::ZVertexSoAHost" persistent="false"/>
+  <class name="edm::Wrapper<ZVertex::ZVertexSoAHost>" persistent="false"/>
+  <class name="ZVertex::ZVertexSoADevice" persistent="false"/>
+  <class name="edm::Wrapper<ZVertex::ZVertexSoADevice>" persistent="false"/>
+  <class name="cms::cuda::Product<ZVertex::ZVertexSoADevice>" persistent="false"/>
+  <class name="edm::Wrapper<cms::cuda::Product<ZVertex::ZVertexSoADevice>>" persistent="false"/>
 </lcgdict>

From 8cb5f20a618077b43608e8035c259fbb3281bec1 Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Thu, 3 Nov 2022 15:03:31 +0100
Subject: [PATCH 092/110] Fixing headers in ZVertexUtilities.h

---
 CUDADataFormats/Vertex/interface/ZVertexUtilities.h | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/CUDADataFormats/Vertex/interface/ZVertexUtilities.h b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h
index 4c5dece118f50..05ed34e2e8d69 100644
--- a/CUDADataFormats/Vertex/interface/ZVertexUtilities.h
+++ b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h
@@ -1,9 +1,7 @@
 #ifndef CUDADataFormats_Vertex_ZVertexUtilities_h
 #define CUDADataFormats_Vertex_ZVertexUtilities_h
 
-//#include <Eigen/Dense>
-//#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include <cuda_runtime.h>
 #include "DataFormats/SoATemplate/interface/SoALayout.h"
 
 GENERATE_SOA_LAYOUT(ZVertexSoAHeterogeneousLayout,
@@ -16,8 +14,8 @@ GENERATE_SOA_LAYOUT(ZVertexSoAHeterogeneousLayout,
                     SOA_COLUMN(uint16_t, sortInd),
                     SOA_SCALAR(uint32_t, nvFinal))
 
-// Previous TrajectoryStateSoAT class methods.
-// They operate on View and ConstView of the TrackSoA.
+// Previous ZVertexSoA class methods.
+// They operate on View and ConstView of the ZVertexSoA.
 namespace ZVertex {
   namespace utilities {
     using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View;
@@ -28,7 +26,7 @@ namespace ZVertex {
     __host__ __device__ inline void init(ZVertexSoAView &vertices) { vertices.nvFinal() = 0; }
 
   }  // namespace utilities
-}  // namespace pixelTrack
+}  // namespace ZVertex
 
 namespace ZVertex {
   // Common types for both Host and Device code
@@ -36,6 +34,6 @@ namespace ZVertex {
   using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View;
   using ZVertexSoAConstView = ZVertexSoAHeterogeneousLayout<>::ConstView;
 
-}  // namespace pixelTrack
+}  // namespace ZVertex
 
 #endif

From 107cc41317c237fb15e611146018f54a4d1f3eab Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 3 Nov 2022 16:04:06 +0100
Subject: [PATCH 093/110] Changed input to correct type

---
 .../plugins/SiPixelPhase1CompareTrackSoA.cc                 | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc
index dedff1f758e8f..36c045582c942 100644
--- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc
+++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc
@@ -74,7 +74,7 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer {
 
 private:
   const edm::EDGetTokenT<pixelTrack::TrackSoAHost> tokenSoATrackCPU_;
-  const edm::EDGetTokenT<pixelTrack::TrackSoADevice> tokenSoATrackGPU_;
+  const edm::EDGetTokenT<pixelTrack::TrackSoAHost> tokenSoATrackGPU_;
   const std::string topFolderName_;
   const bool useQualityCut_;
   const pixelTrack::Quality minQuality_;
@@ -113,10 +113,12 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer {
 //
 // constructors
 //
+// Note that the GPU TrackSoA is also of type TrackSoAHost, as the data have
+// been copied from Device to Host
 
 SiPixelPhase1CompareTrackSoA::SiPixelPhase1CompareTrackSoA(const edm::ParameterSet& iConfig)
     : tokenSoATrackCPU_(consumes<pixelTrack::TrackSoAHost>(iConfig.getParameter<edm::InputTag>("pixelTrackSrcCPU"))),
-      tokenSoATrackGPU_(consumes<pixelTrack::TrackSoADevice>(iConfig.getParameter<edm::InputTag>("pixelTrackSrcGPU"))),
+      tokenSoATrackGPU_(consumes<pixelTrack::TrackSoAHost>(iConfig.getParameter<edm::InputTag>("pixelTrackSrcGPU"))),
       topFolderName_(iConfig.getParameter<std::string>("topFolderName")),
       useQualityCut_(iConfig.getParameter<bool>("useQualityCut")),
       minQuality_(pixelTrack::qualityByName(iConfig.getParameter<std::string>("minQuality"))),

From b683719c459eeb34caf2c39edb85b2a3a800735a Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 3 Nov 2022 16:15:33 +0100
Subject: [PATCH 094/110] Adapted DQM modules to new Vertex type

---
 .../plugins/SiPixelPhase1CompareVertexSoA.cc  | 43 ++++++++++---------
 .../plugins/SiPixelPhase1MonitorVertexSoA.cc  | 24 +++++------
 2 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc
index 0113ea50973d8..68b553c45a48a 100644
--- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc
+++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc
@@ -18,7 +18,7 @@
 #include "DQMServices/Core/interface/MonitorElement.h"
 #include "DQMServices/Core/interface/DQMEDAnalyzer.h"
 #include "DQMServices/Core/interface/DQMStore.h"
-#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h"
 #include "DataFormats/BeamSpot/interface/BeamSpot.h"
 
 class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer {
@@ -31,8 +31,9 @@ class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer {
   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
 
 private:
-  const edm::EDGetTokenT<ZVertexHeterogeneous> tokenSoAVertexCPU_;
-  const edm::EDGetTokenT<ZVertexHeterogeneous> tokenSoAVertexGPU_;
+  const edm::EDGetTokenT<ZVertexSoAHeterogeneousHost> tokenSoAVertexCPU_;
+  // Note that this has been copied from device to host, hence is a HostCollection
+  const edm::EDGetTokenT<ZVertexSoAHeterogeneousHost> tokenSoAVertexGPU_;
   const edm::EDGetTokenT<reco::BeamSpot> tokenBeamSpot_;
   const std::string topFolderName_;
   const float dzCut_;
@@ -54,8 +55,8 @@ class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer {
 //
 
 SiPixelPhase1CompareVertexSoA::SiPixelPhase1CompareVertexSoA(const edm::ParameterSet& iConfig)
-    : tokenSoAVertexCPU_(consumes<ZVertexHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelVertexSrcCPU"))),
-      tokenSoAVertexGPU_(consumes<ZVertexHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelVertexSrcGPU"))),
+    : tokenSoAVertexCPU_(consumes<ZVertexHeterogeneousHost>(iConfig.getParameter<edm::InputTag>("pixelVertexSrcCPU"))),
+      tokenSoAVertexGPU_(consumes<ZVertexHeterogeneousHost>(iConfig.getParameter<edm::InputTag>("pixelVertexSrcGPU"))),
       tokenBeamSpot_(consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpotSrc"))),
       topFolderName_(iConfig.getParameter<std::string>("topFolderName")),
       dzCut_(iConfig.getParameter<double>("dzCut")) {}
@@ -64,8 +65,8 @@ SiPixelPhase1CompareVertexSoA::SiPixelPhase1CompareVertexSoA(const edm::Paramete
 // -- Analyze
 //
 void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) {
-  const auto& vsoaHandleCPU = iEvent.getHandle(tokenSoAVertexCPU_);
-  const auto& vsoaHandleGPU = iEvent.getHandle(tokenSoAVertexGPU_);
+  auto& vsoaHandleCPU = iEvent.getHandle(tokenSoAVertexCPU_);
+  auto& vsoaHandleGPU = iEvent.getHandle(tokenSoAVertexGPU_);
   if (not vsoaHandleCPU or not vsoaHandleGPU) {
     edm::LogWarning out("SiPixelPhase1CompareTrackSoA");
     if (not vsoaHandleCPU) {
@@ -78,10 +79,10 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm:
     return;
   }
 
-  auto const& vsoaCPU = *vsoaHandleCPU->get();
-  int nVerticesCPU = vsoaCPU.nvFinal;
-  auto const& vsoaGPU = *vsoaHandleGPU->get();
-  int nVerticesGPU = vsoaGPU.nvFinal;
+  auto& vsoaCPU = *vsoaHandleCPU->get();
+  int nVerticesCPU = vsoaCPU.view().nvFinal();
+  auto& vsoaGPU = *vsoaHandleGPU->get();
+  int nVerticesGPU = vsoaGPU.view().nvFinal();
 
   auto bsHandle = iEvent.getHandle(tokenBeamSpot_);
   float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.;
@@ -97,22 +98,22 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm:
   }
 
   for (int ivc = 0; ivc < nVerticesCPU; ivc++) {
-    auto sic = vsoaCPU.sortInd[ivc];
-    auto zc = vsoaCPU.zv[sic];
+    auto sic = vsoaCPU.view()[ivc].sortInd();
+    auto zc = vsoaCPU.view()[sic].zv();
     auto xc = x0 + dxdz * zc;
     auto yc = y0 + dydz * zc;
     zc += z0;
 
-    auto ndofCPU = vsoaCPU.ndof[sic];
-    auto chi2CPU = vsoaCPU.chi2[sic];
+    auto ndofCPU = vsoaCPU.view()[sic].ndof();
+    auto chi2CPU = vsoaCPU.view()[sic].chi2();
 
     const int32_t notFound = -1;
     int32_t closestVtxidx = notFound;
     float mindz = dzCut_;
 
     for (int ivg = 0; ivg < nVerticesGPU; ivg++) {
-      auto sig = vsoaGPU.sortInd[ivg];
-      auto zgc = vsoaGPU.zv[sig] + z0;
+      auto sig = vsoaGPU.view()[ivg].sortInd();
+      auto zgc = vsoaGPU.view()[sig].zv() + z0;
       auto zDist = std::abs(zc - zgc);
       //insert some matching condition
       if (zDist > dzCut_)
@@ -125,12 +126,12 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm:
     if (closestVtxidx == notFound)
       continue;
 
-    auto zg = vsoaGPU.zv[closestVtxidx];
+    auto zg = vsoaGPU.view()[closestVtxidx].zv();
     auto xg = x0 + dxdz * zg;
     auto yg = y0 + dydz * zg;
     zg += z0;
-    auto ndofGPU = vsoaGPU.ndof[closestVtxidx];
-    auto chi2GPU = vsoaGPU.chi2[closestVtxidx];
+    auto ndofGPU = vsoaGPU.view()[closestVtxidx].ndof();
+    auto chi2GPU = vsoaGPU.view()[closestVtxidx].chi2();
 
     hx_->Fill(xc - x0, xg - x0);
     hy_->Fill(yc - y0, yg - y0);
@@ -140,7 +141,7 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm:
     hzdiff_->Fill(zc - zg);
     hchi2_->Fill(chi2CPU, chi2GPU);
     hchi2oNdof_->Fill(chi2CPU / ndofCPU, chi2GPU / ndofGPU);
-    hptv2_->Fill(vsoaCPU.ptv2[sic], vsoaGPU.ptv2[closestVtxidx]);
+    hptv2_->Fill(vsoaCPU.view()[sic].ptv2(), vsoaGPU.view()[closestVtxidx].ptv2());
     hntrks_->Fill(ndofCPU + 1, ndofGPU + 1);
   }
   hnVertex_->Fill(nVerticesCPU, nVerticesGPU);
diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc
index af6c240a69172..23e93816981b3 100644
--- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc
+++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc
@@ -21,7 +21,7 @@
 #include "DQMServices/Core/interface/MonitorElement.h"
 #include "DQMServices/Core/interface/DQMEDAnalyzer.h"
 #include "DQMServices/Core/interface/DQMStore.h"
-#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneousHost.h"
 #include "DataFormats/BeamSpot/interface/BeamSpot.h"
 
 class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer {
@@ -34,7 +34,7 @@ class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer {
   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
 
 private:
-  edm::EDGetTokenT<ZVertexHeterogeneous> tokenSoAVertex_;
+  edm::EDGetTokenT<ZVertexHeterogeneousHost> tokenSoAVertex_;
   edm::EDGetTokenT<reco::BeamSpot> tokenBeamSpot_;
   std::string topFolderName_;
   MonitorElement* hnVertex;
@@ -52,7 +52,7 @@ class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer {
 //
 
 SiPixelPhase1MonitorVertexSoA::SiPixelPhase1MonitorVertexSoA(const edm::ParameterSet& iConfig) {
-  tokenSoAVertex_ = consumes<ZVertexHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
+  tokenSoAVertex_ = consumes<ZVertexHeterogeneousHost>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
   tokenBeamSpot_ = consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpotSrc"));
   topFolderName_ = iConfig.getParameter<std::string>("topFolderName");
 }
@@ -61,14 +61,14 @@ SiPixelPhase1MonitorVertexSoA::SiPixelPhase1MonitorVertexSoA(const edm::Paramete
 // -- Analyze
 //
 void SiPixelPhase1MonitorVertexSoA::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) {
-  const auto& vsoaHandle = iEvent.getHandle(tokenSoAVertex_);
+  auto& vsoaHandle = iEvent.getHandle(tokenSoAVertex_);
   if (!vsoaHandle.isValid()) {
     edm::LogWarning("SiPixelPhase1MonitorTrackSoA") << "No Vertex SoA found \n returning!" << std::endl;
     return;
   }
 
-  auto const& vsoa = *((vsoaHandle.product())->get());
-  int nVertices = vsoa.nvFinal;
+  auto& vsoa = *((vsoaHandle.product())->get());
+  int nVertices = vsoa.view().nvFinal();
   auto bsHandle = iEvent.getHandle(tokenBeamSpot_);
   float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.;
   if (!bsHandle.isValid()) {
@@ -82,18 +82,18 @@ void SiPixelPhase1MonitorVertexSoA::analyze(const edm::Event& iEvent, const edm:
     dydz = bs.dydz();
   }
   for (int iv = 0; iv < nVertices; iv++) {
-    auto si = vsoa.sortInd[iv];
-    auto z = vsoa.zv[si];
+    auto si = vsoa.view()[iv].sortInd();
+    auto z = vsoa.view()[si].zv();
     auto x = x0 + dxdz * z;
     auto y = y0 + dydz * z;
     z += z0;
     hx->Fill(x);
     hy->Fill(y);
     hz->Fill(z);
-    auto ndof = vsoa.ndof[si];
-    hchi2->Fill(vsoa.chi2[si]);
-    hchi2oNdof->Fill(vsoa.chi2[si] / ndof);
-    hptv2->Fill(vsoa.ptv2[si]);
+    auto ndof = vsoa.view()[si].ndof();
+    hchi2->Fill(vsoa.view()[si].chi2());
+    hchi2oNdof->Fill(vsoa.view()[si].chi2() / ndof);
+    hptv2->Fill(vsoa.view()[si].ptv2());
     hntrks->Fill(ndof + 1);
   }
   hnVertex->Fill(nVertices);

From 52ca1b07c6ce5184ea4a3be1da5a8a83ee634686 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 3 Nov 2022 16:27:44 +0100
Subject: [PATCH 095/110] Use alias from namespace

---
 .../plugins/SiPixelPhase1CompareVertexSoA.cc              | 8 ++++----
 .../plugins/SiPixelPhase1MonitorVertexSoA.cc              | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc
index 68b553c45a48a..d14aba06019bf 100644
--- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc
+++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc
@@ -31,9 +31,9 @@ class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer {
   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
 
 private:
-  const edm::EDGetTokenT<ZVertexSoAHeterogeneousHost> tokenSoAVertexCPU_;
+  const edm::EDGetTokenT<ZVertex::ZVertexSoAHost> tokenSoAVertexCPU_;
   // Note that this has been copied from device to host, hence is a HostCollection
-  const edm::EDGetTokenT<ZVertexSoAHeterogeneousHost> tokenSoAVertexGPU_;
+  const edm::EDGetTokenT<ZVertex::ZVertexSoAHost> tokenSoAVertexGPU_;
   const edm::EDGetTokenT<reco::BeamSpot> tokenBeamSpot_;
   const std::string topFolderName_;
   const float dzCut_;
@@ -55,8 +55,8 @@ class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer {
 //
 
 SiPixelPhase1CompareVertexSoA::SiPixelPhase1CompareVertexSoA(const edm::ParameterSet& iConfig)
-    : tokenSoAVertexCPU_(consumes<ZVertexHeterogeneousHost>(iConfig.getParameter<edm::InputTag>("pixelVertexSrcCPU"))),
-      tokenSoAVertexGPU_(consumes<ZVertexHeterogeneousHost>(iConfig.getParameter<edm::InputTag>("pixelVertexSrcGPU"))),
+    : tokenSoAVertexCPU_(consumes<ZVertex::ZVertexSoAHost>(iConfig.getParameter<edm::InputTag>("pixelVertexSrcCPU"))),
+      tokenSoAVertexGPU_(consumes<ZVertex::ZVertexSoAHost>(iConfig.getParameter<edm::InputTag>("pixelVertexSrcGPU"))),
       tokenBeamSpot_(consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpotSrc"))),
       topFolderName_(iConfig.getParameter<std::string>("topFolderName")),
       dzCut_(iConfig.getParameter<double>("dzCut")) {}
diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc
index 23e93816981b3..914be969a9ff5 100644
--- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc
+++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc
@@ -34,7 +34,7 @@ class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer {
   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
 
 private:
-  edm::EDGetTokenT<ZVertexHeterogeneousHost> tokenSoAVertex_;
+  edm::EDGetTokenT<ZVertex::ZVertexSoAHost> tokenSoAVertex_;
   edm::EDGetTokenT<reco::BeamSpot> tokenBeamSpot_;
   std::string topFolderName_;
   MonitorElement* hnVertex;
@@ -52,7 +52,7 @@ class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer {
 //
 
 SiPixelPhase1MonitorVertexSoA::SiPixelPhase1MonitorVertexSoA(const edm::ParameterSet& iConfig) {
-  tokenSoAVertex_ = consumes<ZVertexHeterogeneousHost>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
+  tokenSoAVertex_ = consumes<ZVertex::ZVertexSoAHost>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
   tokenBeamSpot_ = consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpotSrc"));
   topFolderName_ = iConfig.getParameter<std::string>("topFolderName");
 }

From 91752db111d55964270713ac713d7ae576d6ae2e Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Thu, 3 Nov 2022 16:43:26 +0100
Subject: [PATCH 096/110] Fixed handle and instance creations

---
 .../plugins/SiPixelPhase1CompareVertexSoA.cc              | 8 ++++----
 .../plugins/SiPixelPhase1MonitorVertexSoA.cc              | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc
index d14aba06019bf..9172824631da2 100644
--- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc
+++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc
@@ -65,8 +65,8 @@ SiPixelPhase1CompareVertexSoA::SiPixelPhase1CompareVertexSoA(const edm::Paramete
 // -- Analyze
 //
 void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) {
-  auto& vsoaHandleCPU = iEvent.getHandle(tokenSoAVertexCPU_);
-  auto& vsoaHandleGPU = iEvent.getHandle(tokenSoAVertexGPU_);
+  const auto& vsoaHandleCPU = iEvent.getHandle(tokenSoAVertexCPU_);
+  const auto& vsoaHandleGPU = iEvent.getHandle(tokenSoAVertexGPU_);
   if (not vsoaHandleCPU or not vsoaHandleGPU) {
     edm::LogWarning out("SiPixelPhase1CompareTrackSoA");
     if (not vsoaHandleCPU) {
@@ -79,9 +79,9 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm:
     return;
   }
 
-  auto& vsoaCPU = *vsoaHandleCPU->get();
+  auto& vsoaCPU = *vsoaHandleCPU;
   int nVerticesCPU = vsoaCPU.view().nvFinal();
-  auto& vsoaGPU = *vsoaHandleGPU->get();
+  auto& vsoaGPU = *vsoaHandleGPU;
   int nVerticesGPU = vsoaGPU.view().nvFinal();
 
   auto bsHandle = iEvent.getHandle(tokenBeamSpot_);
diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc
index 914be969a9ff5..27e0df36a17a4 100644
--- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc
+++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc
@@ -21,7 +21,7 @@
 #include "DQMServices/Core/interface/MonitorElement.h"
 #include "DQMServices/Core/interface/DQMEDAnalyzer.h"
 #include "DQMServices/Core/interface/DQMStore.h"
-#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneousHost.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h"
 #include "DataFormats/BeamSpot/interface/BeamSpot.h"
 
 class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer {
@@ -61,13 +61,13 @@ SiPixelPhase1MonitorVertexSoA::SiPixelPhase1MonitorVertexSoA(const edm::Paramete
 // -- Analyze
 //
 void SiPixelPhase1MonitorVertexSoA::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) {
-  auto& vsoaHandle = iEvent.getHandle(tokenSoAVertex_);
+  const auto& vsoaHandle = iEvent.getHandle(tokenSoAVertex_);
   if (!vsoaHandle.isValid()) {
     edm::LogWarning("SiPixelPhase1MonitorTrackSoA") << "No Vertex SoA found \n returning!" << std::endl;
     return;
   }
 
-  auto& vsoa = *((vsoaHandle.product())->get());
+  auto& vsoa = *vsoaHandle.product();
   int nVertices = vsoa.view().nvFinal();
   auto bsHandle = iEvent.getHandle(tokenBeamSpot_);
   float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.;

From 1c1028dc7c78a44d85421990529a776cd90fccae Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Thu, 3 Nov 2022 17:18:20 +0100
Subject: [PATCH 097/110] Updating dataformats in vertexing to
 ZVertex{Device/Host}

---
 .../plugins/PixelTrackDumpCUDA.cc             | 19 +++++++-----
 .../plugins/PixelVertexProducerCUDA.cc        | 10 ++++---
 .../plugins/PixelVertexProducerFromSoA.cc     | 21 ++++++-------
 .../plugins/PixelVertexSoAFromCUDA.cc         | 23 +++++++-------
 .../plugins/gpuClusterTracksByDensity.h       | 17 ++++++-----
 .../plugins/gpuClusterTracksDBSCAN.h          | 15 +++++-----
 .../plugins/gpuClusterTracksIterative.h       | 15 +++++-----
 .../plugins/gpuFitVertices.h                  | 19 ++++++------
 .../PixelVertexFinding/plugins/gpuSortByPt2.h | 14 ++++-----
 .../plugins/gpuSplitVertices.h                | 19 ++++++------
 .../plugins/gpuVertexFinder.cc                | 30 +++++++++++--------
 .../plugins/gpuVertexFinder.h                 | 21 +++++++------
 12 files changed, 122 insertions(+), 101 deletions(-)

diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
index f97dfecfff370..a1acc6376e111 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
@@ -4,7 +4,9 @@
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
-#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+//#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h"
 #include "DataFormats/Common/interface/Handle.h"
 #include "FWCore/Framework/interface/ConsumesCollector.h"
 #include "FWCore/Framework/interface/Event.h"
@@ -33,11 +35,11 @@ class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> {
   const bool m_onGPU;
   // GPU
   edm::EDGetTokenT<cms::cuda::Product<pixelTrack::TrackSoADevice>> tokenGPUTrack_;
-  edm::EDGetTokenT<cms::cuda::Product<ZVertexHeterogeneous>> tokenGPUVertex_;
+  edm::EDGetTokenT<cms::cuda::Product<ZVertex::ZVertexSoADevice>> tokenGPUVertex_;
 
   // CPU
   edm::EDGetTokenT<pixelTrack::TrackSoAHost> tokenSoATrack_;
-  edm::EDGetTokenT<ZVertexHeterogeneous> tokenSoAVertex_;
+  edm::EDGetTokenT<ZVertex::ZVertexSoAHost> tokenSoAVertex_;
 };
 
 PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig)
@@ -46,10 +48,10 @@ PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig)
     tokenGPUTrack_ =
         consumes<cms::cuda::Product<pixelTrack::TrackSoADevice>>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
     tokenGPUVertex_ =
-        consumes<cms::cuda::Product<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
+        consumes<cms::cuda::Product<ZVertex::ZVertexSoADevice>>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
   } else {
     tokenSoATrack_ = consumes<pixelTrack::TrackSoAHost>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
-    tokenSoAVertex_ = consumes<ZVertexHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
+    tokenSoAVertex_ = consumes<ZVertex::ZVertexSoAHost>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
   }
 }
 
@@ -74,15 +76,16 @@ void PixelTrackDumpCUDA::analyze(edm::StreamID streamID,
     assert(tsoa);
 
     auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_));
-    auto const* vsoa = vertices.get();
+    //auto const* vsoa = vertices.get();
+    auto const* vsoa = &vertices;
     assert(vsoa);
 
   } else {
     auto const& tsoa = iEvent.get(tokenSoATrack_);
     assert(tsoa.buffer());
 
-    auto const* vsoa = iEvent.get(tokenSoAVertex_).get();
-    assert(vsoa);
+    auto const& vsoa = iEvent.get(tokenSoAVertex_);
+    assert(vsoa.buffer());
   }
 }
 
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
index 9dd8a016dc02d..45d1a9d52d99e 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
@@ -18,6 +18,8 @@
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h"
 
 #include "gpuVertexFinder.h"
 
@@ -38,9 +40,9 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> {
   bool onGPU_;
 
   edm::EDGetTokenT<cms::cuda::Product<pixelTrack::TrackSoADevice>> tokenGPUTrack_;
-  edm::EDPutTokenT<ZVertexCUDAProduct> tokenGPUVertex_;
+  edm::EDPutTokenT<cms::cuda::Product<ZVertex::ZVertexSoADevice>> tokenGPUVertex_;
   edm::EDGetTokenT<pixelTrack::TrackSoAHost> tokenCPUTrack_;
-  edm::EDPutTokenT<ZVertexHeterogeneous> tokenCPUVertex_;
+  edm::EDPutTokenT<ZVertex::ZVertexSoAHost> tokenCPUVertex_;
 
   const gpuVertexFinder::Producer gpuAlgo_;
 
@@ -65,10 +67,10 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf)
   if (onGPU_) {
     tokenGPUTrack_ =
         consumes<cms::cuda::Product<pixelTrack::TrackSoADevice>>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
-    tokenGPUVertex_ = produces<ZVertexCUDAProduct>();
+    tokenGPUVertex_ = produces<cms::cuda::Product<ZVertex::ZVertexSoADevice>>();
   } else {
     tokenCPUTrack_ = consumes<pixelTrack::TrackSoAHost>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
-    tokenCPUVertex_ = produces<ZVertexHeterogeneous>();
+    tokenCPUVertex_ = produces<ZVertex::ZVertexSoAHost>();
   }
 }
 
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
index 029c619b42e58..61ec3f9a6a5be 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
@@ -1,4 +1,5 @@
-#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h"
 #include "DataFormats/BeamSpot/interface/BeamSpot.h"
 #include "DataFormats/Common/interface/OrphanHandle.h"
 #include "DataFormats/TrackReco/interface/Track.h"
@@ -35,14 +36,14 @@ class PixelVertexProducerFromSoA : public edm::global::EDProducer<> {
 private:
   void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override;
 
-  edm::EDGetTokenT<ZVertexHeterogeneous> tokenVertex_;
+  edm::EDGetTokenT<ZVertex::ZVertexSoAHost> tokenVertex_;
   edm::EDGetTokenT<reco::BeamSpot> tokenBeamSpot_;
   edm::EDGetTokenT<reco::TrackCollection> tokenTracks_;
   edm::EDGetTokenT<IndToEdm> tokenIndToEdm_;
 };
 
 PixelVertexProducerFromSoA::PixelVertexProducerFromSoA(const edm::ParameterSet &conf)
-    : tokenVertex_(consumes<ZVertexHeterogeneous>(conf.getParameter<edm::InputTag>("src"))),
+    : tokenVertex_(consumes<ZVertex::ZVertexSoAHost>(conf.getParameter<edm::InputTag>("src"))),
       tokenBeamSpot_(consumes<reco::BeamSpot>(conf.getParameter<edm::InputTag>("beamSpot"))),
       tokenTracks_(consumes<reco::TrackCollection>(conf.getParameter<edm::InputTag>("TrackCollection"))),
       tokenIndToEdm_(consumes<IndToEdm>(conf.getParameter<edm::InputTag>("TrackCollection"))) {
@@ -81,9 +82,9 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv
     dydz = bs.dydz();
   }
 
-  auto const &soa = *(iEvent.get(tokenVertex_).get());
+  auto const &soa = iEvent.get(tokenVertex_);
 
-  int nv = soa.nvFinal;
+  int nv = soa.view().nvFinal();
 
 #ifdef PIXVERTEX_DEBUG_PRODUCE
   std::cout << "converting " << nv << " vertices "
@@ -92,20 +93,20 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv
 
   std::set<uint16_t> uind;  // for verifing index consistency
   for (int j = nv - 1; j >= 0; --j) {
-    auto i = soa.sortInd[j];  // on gpu sorted in ascending order....
+    auto i = soa.view()[j].sortInd();  // on gpu sorted in ascending order....
     assert(i < nv);
     uind.insert(i);
     assert(itrk.empty());
-    auto z = soa.zv[i];
+    auto z = soa.view()[i].zv();
     auto x = x0 + dxdz * z;
     auto y = y0 + dydz * z;
     z += z0;
     reco::Vertex::Error err;
-    err(2, 2) = 1.f / soa.wv[i];
+    err(2, 2) = 1.f / soa.view()[i].wv();
     err(2, 2) *= 2.;  // artifically inflate error
     //Copy also the tracks (no intention to be efficient....)
     for (auto k = 0U; k < indToEdm.size(); ++k) {
-      if (soa.idv[k] == int16_t(i))
+      if (soa.view()[k].idv() == int16_t(i))
         itrk.push_back(k);
     }
     auto nt = itrk.size();
@@ -119,7 +120,7 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv
       itrk.clear();
       continue;
     }  // remove outliers
-    (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.chi2[i], soa.ndof[i], nt);
+    (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.view()[i].chi2(), soa.view()[i].ndof(), nt);
     auto &v = (*vertexes).back();
     v.reserve(itrk.size());
     for (auto it : itrk) {
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
index dc125878b1058..ef97c9a2b6ea7 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
@@ -2,7 +2,8 @@
 
 #include "CUDADataFormats/Common/interface/Product.h"
 #include "CUDADataFormats/Common/interface/HostProduct.h"
-#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h"
 #include "DataFormats/Common/interface/Handle.h"
 #include "FWCore/Framework/interface/ESHandle.h"
 #include "FWCore/Framework/interface/Event.h"
@@ -30,15 +31,15 @@ class PixelVertexSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork>
                edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
   void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
 
-  edm::EDGetTokenT<cms::cuda::Product<ZVertexHeterogeneous>> tokenCUDA_;
-  edm::EDPutTokenT<ZVertexHeterogeneous> tokenSOA_;
+  edm::EDGetTokenT<cms::cuda::Product<ZVertex::ZVertexSoADevice>> tokenCUDA_;
+  edm::EDPutTokenT<ZVertex::ZVertexSoAHost> tokenSOA_;
 
-  cms::cuda::host::unique_ptr<ZVertexSoA> m_soa;
+  ZVertex::ZVertexSoAHost zvertex_h;
 };
 
 PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig)
-    : tokenCUDA_(consumes<cms::cuda::Product<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("src"))),
-      tokenSOA_(produces<ZVertexHeterogeneous>()) {}
+    : tokenCUDA_(consumes<cms::cuda::Product<ZVertex::ZVertexSoADevice>>(iConfig.getParameter<edm::InputTag>("src"))),
+      tokenSOA_(produces<ZVertex::ZVertexSoAHost>()) {}
 
 void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
   edm::ParameterSetDescription desc;
@@ -50,16 +51,16 @@ void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& de
 void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent,
                                      edm::EventSetup const& iSetup,
                                      edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  auto const& inputDataWrapped = iEvent.get(tokenCUDA_);
+  cms::cuda::Product<ZVertex::ZVertexSoADevice> const& inputDataWrapped = iEvent.get(tokenCUDA_);
   cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
-  auto const& inputData = ctx.get(inputDataWrapped);
-
-  m_soa = inputData.toHostAsync(ctx.stream());
+  auto const& zvertex_d = ctx.get(inputDataWrapped);
+  zvertex_h = ZVertex::ZVertexSoAHost(ctx.stream());
+  zvertex_d.copyToHost(zvertex_h.buffer(), ctx.stream());
 }
 
 void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
   // No copies....
-  iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa)));
+  iEvent.emplace(tokenSOA_, std::move(zvertex_h));
 }
 
 DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
index f71aa56842a67..f920586117078 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
@@ -17,7 +17,7 @@ namespace gpuVertexFinder {
   //
   // based on Rodrighez&Laio algo
   //
-  __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata,
+  __device__ __forceinline__ void clusterTracksByDensity(VtxSoAView pdata,
                                                          gpuVertexFinder::WorkSpace* pws,
                                                          int minT,      // min number of neighbours to be "seed"
                                                          float eps,     // max absolute distance to cluster
@@ -32,20 +32,21 @@ namespace gpuVertexFinder {
 
     auto er2mx = errmax * errmax;
 
-    auto& __restrict__ data = *pdata;
+    auto& __restrict__ data = pdata;
     auto& __restrict__ ws = *pws;
     auto nt = ws.ntrks;
     float const* __restrict__ zt = ws.zt;
     float const* __restrict__ ezt2 = ws.ezt2;
 
-    uint32_t& nvFinal = data.nvFinal;
+    uint32_t& nvFinal = data.nvFinal();
     uint32_t& nvIntermediate = ws.nvIntermediate;
 
     uint8_t* __restrict__ izt = ws.izt;
-    int32_t* __restrict__ nn = data.ndof;
+    int32_t* __restrict__ nn = data.ndof();
     int32_t* __restrict__ iv = ws.iv;
 
-    assert(pdata);
+    //TODO: check if there is a way to assert this
+    //assert(pdata);
     assert(zt);
 
     using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
@@ -63,7 +64,7 @@ namespace gpuVertexFinder {
 
     // fill hist  (bin shall be wider than "eps")
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
-      assert(i < ZVertices::MAXTRACKS);
+      assert(i < ZVertex::utilities::MAXTRACKS);
       int iz = int(zt[i] * 10.);  // valid if eps<=0.1
       // iz = std::clamp(iz, INT8_MIN, INT8_MAX);  // sorry c++17 only
       iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
@@ -197,7 +198,7 @@ namespace gpuVertexFinder {
     }
     __syncthreads();
 
-    assert(foundClusters < ZVertices::MAXVTX);
+    assert(foundClusters < ZVertex::utilities::MAXVTX);
 
     // propagate the negative id to all the tracks in the cluster.
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
@@ -219,7 +220,7 @@ namespace gpuVertexFinder {
       printf("found %d proto vertices\n", foundClusters);
   }
 
-  __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata,
+  __global__ void clusterTracksByDensityKernel(VtxSoAView pdata,
                                                gpuVertexFinder::WorkSpace* pws,
                                                int minT,      // min number of neighbours to be "seed"
                                                float eps,     // max absolute distance to cluster
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
index a11283a7b2065..0476cfbae5fef 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
@@ -14,7 +14,7 @@ namespace gpuVertexFinder {
 
   // this algo does not really scale as it works in a single block...
   // enough for <10K tracks we have
-  __global__ void clusterTracksDBSCAN(ZVertices* pdata,
+  __global__ void clusterTracksDBSCAN(VtxSoAView pdata,
                                       WorkSpace* pws,
                                       int minT,      // min number of neighbours to be "core"
                                       float eps,     // max absolute distance to cluster
@@ -28,20 +28,21 @@ namespace gpuVertexFinder {
 
     auto er2mx = errmax * errmax;
 
-    auto& __restrict__ data = *pdata;
+    auto& __restrict__ data = pdata;
     auto& __restrict__ ws = *pws;
     auto nt = ws.ntrks;
     float const* __restrict__ zt = ws.zt;
     float const* __restrict__ ezt2 = ws.ezt2;
 
-    uint32_t& nvFinal = data.nvFinal;
+    uint32_t& nvFinal = data.nvFinal();
     uint32_t& nvIntermediate = ws.nvIntermediate;
 
     uint8_t* __restrict__ izt = ws.izt;
-    int32_t* __restrict__ nn = data.ndof;
+    int32_t* __restrict__ nn = data.ndof();
     int32_t* __restrict__ iv = ws.iv;
 
-    assert(pdata);
+    //TODO: check if there is a way to assert this
+    //assert(pdata);
     assert(zt);
 
     using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
@@ -59,7 +60,7 @@ namespace gpuVertexFinder {
 
     // fill hist  (bin shall be wider than "eps")
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
-      assert(i < ZVertices::MAXTRACKS);
+      assert(i < ZVertex::utilities::MAXTRACKS);
       int iz = int(zt[i] * 10.);  // valid if eps<=0.1
       iz = std::clamp(iz, INT8_MIN, INT8_MAX);
       izt[i] = iz - INT8_MIN;
@@ -214,7 +215,7 @@ namespace gpuVertexFinder {
     }
     __syncthreads();
 
-    assert(foundClusters < ZVertices::MAXVTX);
+    assert(foundClusters < ZVertex::utilities::MAXVTX);
 
     // propagate the negative id to all the tracks in the cluster.
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
index 66d246fcfa4fa..230405c47366a 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
@@ -14,7 +14,7 @@ namespace gpuVertexFinder {
 
   // this algo does not really scale as it works in a single block...
   // enough for <10K tracks we have
-  __global__ void clusterTracksIterative(ZVertices* pdata,
+  __global__ void clusterTracksIterative(VtxSoAView pdata,
                                          WorkSpace* pws,
                                          int minT,      // min number of neighbours to be "core"
                                          float eps,     // max absolute distance to cluster
@@ -28,20 +28,21 @@ namespace gpuVertexFinder {
 
     auto er2mx = errmax * errmax;
 
-    auto& __restrict__ data = *pdata;
+    auto& __restrict__ data = pdata;
     auto& __restrict__ ws = *pws;
     auto nt = ws.ntrks;
     float const* __restrict__ zt = ws.zt;
     float const* __restrict__ ezt2 = ws.ezt2;
 
-    uint32_t& nvFinal = data.nvFinal;
+    uint32_t& nvFinal = data.nvFinal();
     uint32_t& nvIntermediate = ws.nvIntermediate;
 
     uint8_t* __restrict__ izt = ws.izt;
-    int32_t* __restrict__ nn = data.ndof;
+    int32_t* __restrict__ nn = data.ndof();
     int32_t* __restrict__ iv = ws.iv;
 
-    assert(pdata);
+    //TODO: check if there is a way to assert this
+    //assert(pdata);
     assert(zt);
 
     using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
@@ -59,7 +60,7 @@ namespace gpuVertexFinder {
 
     // fill hist  (bin shall be wider than "eps")
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
-      assert(i < ZVertices::MAXTRACKS);
+      assert(i < ZVertex::utilities::MAXTRACKS);
       int iz = int(zt[i] * 10.);  // valid if eps<=0.1
       iz = std::clamp(iz, INT8_MIN, INT8_MAX);
       izt[i] = iz - INT8_MIN;
@@ -185,7 +186,7 @@ namespace gpuVertexFinder {
     }
     __syncthreads();
 
-    assert(foundClusters < ZVertices::MAXVTX);
+    assert(foundClusters < ZVertex::utilities::MAXVTX);
 
     // propagate the negative id to all the tracks in the cluster.
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
index 0acf67244528a..51364e78ee92e 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
@@ -12,27 +12,28 @@
 
 namespace gpuVertexFinder {
 
-  __device__ __forceinline__ void fitVertices(ZVertices* pdata,
+  __device__ __forceinline__ void fitVertices(VtxSoAView pdata,
                                               WorkSpace* pws,
                                               float chi2Max  // for outlier rejection
   ) {
     constexpr bool verbose = false;  // in principle the compiler should optmize out if false
 
-    auto& __restrict__ data = *pdata;
+    auto& __restrict__ data = pdata;
     auto& __restrict__ ws = *pws;
     auto nt = ws.ntrks;
     float const* __restrict__ zt = ws.zt;
     float const* __restrict__ ezt2 = ws.ezt2;
-    float* __restrict__ zv = data.zv;
-    float* __restrict__ wv = data.wv;
-    float* __restrict__ chi2 = data.chi2;
-    uint32_t& nvFinal = data.nvFinal;
+    float* __restrict__ zv = data.zv();
+    float* __restrict__ wv = data.wv();
+    float* __restrict__ chi2 = data.chi2();
+    uint32_t& nvFinal = data.nvFinal();
     uint32_t& nvIntermediate = ws.nvIntermediate;
 
-    int32_t* __restrict__ nn = data.ndof;
+    int32_t* __restrict__ nn = data.ndof();
     int32_t* __restrict__ iv = ws.iv;
 
-    assert(pdata);
+    //TODO: check if there is a way to assert this
+    //assert(pdata);
     assert(zt);
 
     assert(nvFinal <= nvIntermediate);
@@ -101,7 +102,7 @@ namespace gpuVertexFinder {
       printf("and %d noise\n", noise);
   }
 
-  __global__ void fitVerticesKernel(ZVertices* pdata,
+  __global__ void fitVerticesKernel(VtxSoAView pdata,
                                     WorkSpace* pws,
                                     float chi2Max  // for outlier rejection
   ) {
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
index 93f78d498b26f..c705fc1f4065e 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
@@ -15,16 +15,16 @@
 
 namespace gpuVertexFinder {
 
-  __device__ __forceinline__ void sortByPt2(ZVertices* pdata, WorkSpace* pws) {
-    auto& __restrict__ data = *pdata;
+  __device__ __forceinline__ void sortByPt2(VtxSoAView pdata, WorkSpace* pws) {
+    auto& __restrict__ data = pdata;
     auto& __restrict__ ws = *pws;
     auto nt = ws.ntrks;
     float const* __restrict__ ptt2 = ws.ptt2;
-    uint32_t const& nvFinal = data.nvFinal;
+    uint32_t const& nvFinal = data.nvFinal();
 
     int32_t const* __restrict__ iv = ws.iv;
-    float* __restrict__ ptv2 = data.ptv2;
-    uint16_t* __restrict__ sortInd = data.sortInd;
+    float* __restrict__ ptv2 = data.ptv2();
+    uint16_t* __restrict__ sortInd = data.sortInd();
 
     // if (threadIdx.x == 0)
     //    printf("sorting %d vertices\n",nvFinal);
@@ -34,7 +34,7 @@ namespace gpuVertexFinder {
 
     // fill indexing
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
-      data.idv[ws.itrk[i]] = iv[i];
+      data[ws.itrk[i]].idv() = iv[i];
     }
 
     // can be done asynchronoisly at the end of previous event
@@ -66,7 +66,7 @@ namespace gpuVertexFinder {
 #endif
   }
 
-  __global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) { sortByPt2(pdata, pws); }
+  __global__ void sortByPt2Kernel(VtxSoAView pdata, WorkSpace* pws) { sortByPt2(pdata, pws); }
 
 }  // namespace gpuVertexFinder
 
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
index 0fe8bd882dcc5..ad72c489ed67e 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
@@ -12,23 +12,24 @@
 
 namespace gpuVertexFinder {
 
-  __device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
+  __device__ __forceinline__ void splitVertices(VtxSoAView pdata, WorkSpace* pws, float maxChi2) {
     constexpr bool verbose = false;  // in principle the compiler should optmize out if false
 
-    auto& __restrict__ data = *pdata;
+    auto& __restrict__ data = pdata;
     auto& __restrict__ ws = *pws;
     auto nt = ws.ntrks;
     float const* __restrict__ zt = ws.zt;
     float const* __restrict__ ezt2 = ws.ezt2;
-    float* __restrict__ zv = data.zv;
-    float* __restrict__ wv = data.wv;
-    float const* __restrict__ chi2 = data.chi2;
-    uint32_t& nvFinal = data.nvFinal;
+    float* __restrict__ zv = data.zv();
+    float* __restrict__ wv = data.wv();
+    float const* __restrict__ chi2 = data.chi2();
+    uint32_t& nvFinal = data.nvFinal();
 
-    int32_t const* __restrict__ nn = data.ndof;
+    int32_t const* __restrict__ nn = data.ndof();
     int32_t* __restrict__ iv = ws.iv;
 
-    assert(pdata);
+    //TODO: check if there is a way to assert this
+    //assert(pdata);
     assert(zt);
 
     // one vertex per block
@@ -130,7 +131,7 @@ namespace gpuVertexFinder {
     }  // loop on vertices
   }
 
-  __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
+  __global__ void splitVerticesKernel(VtxSoAView pdata, WorkSpace* pws, float maxChi2) {
     splitVertices(pdata, pws, maxChi2);
   }
 
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
index c92060f8ba2cc..f8755996c3980 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
@@ -1,5 +1,7 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
+#include <Eigen/Core> //TODO: understand why this is needed
+
 #include "gpuClusterTracksByDensity.h"
 #include "gpuClusterTracksDBSCAN.h"
 #include "gpuClusterTracksIterative.h"
@@ -8,6 +10,7 @@
 #include "gpuSplitVertices.h"
 
 #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
 
 #undef PIXVERTEX_DEBUG_PRODUCE
 
@@ -20,9 +23,11 @@ namespace gpuVertexFinder {
   // split vertices with a chi2/NDoF greater than this
   constexpr float maxChi2ForSplit = 9.f;
   using TkSoAConstView = pixelTrack::TrackSoAConstView;
+  using VtxSoAView = ZVertex::ZVertexSoAView;
 
-  __global__ void loadTracks(TkSoAConstView tracks_view, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) {
-    assert(soa);
+  __global__ void loadTracks(TkSoAConstView tracks_view, VtxSoAView soa, WorkSpace* pws, float ptMin, float ptMax) {
+    //TODO: check if there is a way to assert this
+    //assert(soa);
     auto const* quality = pixelTrack::utilities::qualityData(tracks_view);
 
     auto first = blockIdx.x * blockDim.x + threadIdx.x;
@@ -31,7 +36,7 @@ namespace gpuVertexFinder {
       assert(nHits >= 3);
 
       // initialize soa...
-      soa->idv[idx] = -1;
+      soa[idx].idv() = -1;
 
       if (pixelTrack::utilities::isTriplet(tracks_view, idx))
         continue;  // no triplets
@@ -57,7 +62,7 @@ namespace gpuVertexFinder {
 
 // #define THREE_KERNELS
 #ifndef THREE_KERNELS
-  __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
+  __global__ void vertexFinderOneKernel(VtxSoAView pdata,
                                         gpuVertexFinder::WorkSpace* pws,
                                         int minT,      // min number of neighbours to be "seed"
                                         float eps,     // max absolute distance to cluster
@@ -75,7 +80,7 @@ namespace gpuVertexFinder {
     sortByPt2(pdata, pws);
   }
 #else
-  __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata,
+  __global__ void vertexFinderKernel1(gpuVertexFinder::VtxSoAView pdata,
                                       gpuVertexFinder::WorkSpace* pws,
                                       int minT,      // min number of neighbours to be "seed"
                                       float eps,     // max absolute distance to cluster
@@ -87,7 +92,7 @@ namespace gpuVertexFinder {
     fitVertices(pdata, pws, maxChi2ForFirstFit);
   }
 
-  __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) {
+  __global__ void vertexFinderKernel2(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WorkSpace* pws) {
     fitVertices(pdata, pws, maxChi2ForFinalFit);
     __syncthreads();
     sortByPt2(pdata, pws);
@@ -95,23 +100,24 @@ namespace gpuVertexFinder {
 #endif
 
 #ifdef __CUDACC__
-  ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream,
+  ZVertex::ZVertexSoADevice Producer::makeAsync(cudaStream_t stream,
                                            TkSoAConstView tracks_view,
                                            float ptMin,
                                            float ptMax) const {
 #ifdef PIXVERTEX_DEBUG_PRODUCE
     std::cout << "producing Vertices on GPU" << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
-    ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream));
+    ZVertex::ZVertexSoADevice vertices(stream);
 #else
-  ZVertexHeterogeneous Producer::make(TkSoAConstView tracks_view, float ptMin, float ptMax) const {
+  ZVertex::ZVertexSoAHost Producer::make(TkSoAConstView tracks_view, float ptMin, float ptMax) const {
 #ifdef PIXVERTEX_DEBUG_PRODUCE
     std::cout << "producing Vertices on  CPU" << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
-    ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>());
+    ZVertex::ZVertexSoAHost vertices;
 #endif
-    auto* soa = vertices.get();
-    assert(soa);
+    auto soa = vertices.view();
+    //TODO: check if there is a way to assert this
+    //assert(soa);
 
 #ifdef __CUDACC__
     auto ws_d = cms::cuda::make_device_unique<WorkSpace>(stream);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
index 8c542607812b9..b8a81ea04a03d 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
@@ -4,18 +4,21 @@
 #include <cstddef>
 #include <cstdint>
 
-#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+//#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
 #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
 
 namespace gpuVertexFinder {
 
-  using ZVertices = ZVertexSoA;
+  using VtxSoAView = ZVertex::ZVertexSoAView;
   using TkSoAConstView = pixelTrack::TrackSoAConstView;
 
   // workspace used in the vertex reco algos
   struct WorkSpace {
-    static constexpr uint32_t MAXTRACKS = ZVertexSoA::MAXTRACKS;
-    static constexpr uint32_t MAXVTX = ZVertexSoA::MAXVTX;
+    static constexpr uint32_t MAXTRACKS = ZVertex::utilities::MAXTRACKS;
+    static constexpr uint32_t MAXVTX = ZVertex::utilities::MAXVTX;
 
     uint32_t ntrks;            // number of "selected tracks"
     uint16_t itrk[MAXTRACKS];  // index of original track
@@ -33,14 +36,14 @@ namespace gpuVertexFinder {
     }
   };
 
-  __global__ void init(ZVertexSoA* pdata, WorkSpace* pws) {
-    pdata->init();
+  __global__ void init(VtxSoAView pdata, WorkSpace* pws) {
+    ZVertex::utilities::init(pdata);
     pws->init();
   }
 
   class Producer {
   public:
-    using ZVertices = ZVertexSoA;
+    using VtxSoAView = ZVertex::ZVertexSoAView;
     using WorkSpace = gpuVertexFinder::WorkSpace;
 
     Producer(bool oneKernel,
@@ -63,8 +66,8 @@ namespace gpuVertexFinder {
 
     ~Producer() = default;
 
-    ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoAConstView tracks_view, float ptMin, float ptMax) const;
-    ZVertexHeterogeneous make(TkSoAConstView tracks_view, float ptMin, float ptMax) const;
+    ZVertex::ZVertexSoADevice makeAsync(cudaStream_t stream, TkSoAConstView tracks_view, float ptMin, float ptMax) const;
+    ZVertex::ZVertexSoAHost make(TkSoAConstView tracks_view, float ptMin, float ptMax) const;
 
   private:
     const bool oneKernel_;

From 152bb647d4539b9ef6b2bb7dceca7ee66b8f5c2e Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Fri, 4 Nov 2022 12:06:35 +0100
Subject: [PATCH 098/110] Removing copyToHost and <bits/stdint-uintn.h>

---
 .../Track/interface/TrackSoAHeterogeneousDevice.h          | 7 +------
 .../Track/interface/TrackSoAHeterogeneousHost.h            | 2 +-
 CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp  | 6 ++++--
 .../Vertex/interface/ZVertexSoAHeterogeneousDevice.h       | 7 +------
 .../Vertex/interface/ZVertexSoAHeterogeneousHost.h         | 2 +-
 .../PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc     | 3 ++-
 .../plugins/CAHitNtupletGeneratorKernelsImpl.h             | 3 +--
 .../PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc   | 7 ++++---
 .../PixelVertexFinding/plugins/gpuVertexFinder.cc          | 6 ++----
 .../PixelVertexFinding/plugins/gpuVertexFinder.h           | 3 +--
 10 files changed, 18 insertions(+), 28 deletions(-)

diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
index b79f8d959720c..fb1c45f331d19 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
@@ -1,7 +1,7 @@
 #ifndef CUDADataFormats_Track_TrackHeterogeneousDevice_H
 #define CUDADataFormats_Track_TrackHeterogeneousDevice_H
 
-#include <bits/stdint-uintn.h>
+#include <cstdint>
 
 #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
 #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
@@ -17,11 +17,6 @@ class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection<T
   explicit TrackSoAHeterogeneousDevice(cudaStream_t stream)
       : PortableDeviceCollection<TrackSoAHeterogeneousLayout<>>(S, stream) {}
 
-  // Copy data from device to host
-  __host__ void copyToHost(cms::cuda::host::unique_ptr<std::byte[]> &host_ptr, cudaStream_t stream) const {
-    cudaCheck(cudaMemcpyAsync(host_ptr.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost, stream));
-    cudaCheck(cudaGetLastError());
-  }
 };
 
 namespace pixelTrack {
diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h
index a4b18134066a3..70427f2bfd559 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h
@@ -1,7 +1,7 @@
 #ifndef CUDADataFormats_Track_TrackHeterogeneousHost_H
 #define CUDADataFormats_Track_TrackHeterogeneousHost_H
 
-#include <bits/stdint-uintn.h>
+#include <cstdint>
 
 #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
 #include "CUDADataFormats/Common/interface/PortableHostCollection.h"
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index 0ad6863d4f8c7..0647296b9ef40 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -13,7 +13,7 @@
    the same Layout to access the data on host and print it.
  */
 
-#include <bits/stdint-uintn.h>
+#include <cstdint>
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
@@ -42,7 +42,9 @@ int main() {
     // Instantate tracks on host. This is where the data will be
     // copied to from device.
     pixelTrack::TrackSoAHost tracks_h(stream);
-    tracks_d.copyToHost(tracks_h.buffer(), stream);
+    //tracks_d.copyToHost(tracks_h.buffer(), stream);
+    cudaCheck(cudaMemcpyAsync(tracks_h.buffer().get(), tracks_d.const_buffer().get(), tracks_d.bufferSize(), cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaGetLastError());
 
     // Print results
     std::cout << "pt"
diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h
index 47cb8af2b4cc6..d1ff67b042701 100644
--- a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h
+++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h
@@ -1,7 +1,7 @@
 #ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H
 #define CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H
 
-#include <bits/stdint-uintn.h>
+#include <cstdint>
 
 #include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
 #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
@@ -17,11 +17,6 @@ class ZVertexSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection
   explicit ZVertexSoAHeterogeneousDevice(cudaStream_t stream)
       : PortableDeviceCollection<ZVertexSoAHeterogeneousLayout<>>(S, stream) {}
 
-  // Copy data from device to host
-  __host__ void copyToHost(cms::cuda::host::unique_ptr<std::byte[]> &host_ptr, cudaStream_t stream) const {
-    cudaCheck(cudaMemcpyAsync(host_ptr.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost, stream));
-    cudaCheck(cudaGetLastError());
-  }
 };
 
 namespace ZVertex {
diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h
index e751e2da8f5de..4867c49d15bab 100644
--- a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h
+++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h
@@ -1,7 +1,7 @@
 #ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H
 #define CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H
 
-#include <bits/stdint-uintn.h>
+#include <cstdint>
 
 #include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
 #include "CUDADataFormats/Common/interface/PortableHostCollection.h"
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
index 283e5b0292464..1dadeb9d0dcc1 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
@@ -59,7 +59,8 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
   cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
   auto const& tracks_d = ctx.get(inputDataWrapped);      // Tracks on device
   tracks_h = pixelTrack::TrackSoAHost(ctx.stream());     // Create an instance of Tracks on Host, using the stream
-  tracks_d.copyToHost(tracks_h.buffer(), ctx.stream());  // Copy data from Device to Host
+  cudaCheck(cudaMemcpyAsync(tracks_h.buffer().get(), tracks_d.const_buffer().get(), tracks_d.bufferSize(), cudaMemcpyDeviceToHost, ctx.stream())); // Copy data from Device to Host
+  cudaCheck(cudaGetLastError());
 }
 
 void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 4f2272db13354..75f52305ab39b 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -9,7 +9,6 @@
 #include <cstdint>
 #include <limits>
 
-#include <bits/stdint-uintn.h>
 #include <cuda_runtime.h>
 
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
@@ -196,7 +195,7 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,
 
     /* chi2 penalize higher-pt tracks  (try rescale it?)
     auto score = [&](auto it) {
-      return tracks_view[it].nLayers() < 4 ? 
+      return tracks_view[it].nLayers() < 4 ?
               std::abs(pixelTrack::utilities::tip(tracks_view, it)) :  // tip for triplets
               tracks_view[it].chi2(it);            //chi2 for quads
     };
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
index ef97c9a2b6ea7..f373c95e02760 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
@@ -53,9 +53,10 @@ void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent,
                                      edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
   cms::cuda::Product<ZVertex::ZVertexSoADevice> const& inputDataWrapped = iEvent.get(tokenCUDA_);
   cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
-  auto const& zvertex_d = ctx.get(inputDataWrapped);
-  zvertex_h = ZVertex::ZVertexSoAHost(ctx.stream());
-  zvertex_d.copyToHost(zvertex_h.buffer(), ctx.stream());
+  auto const& zvertex_d = ctx.get(inputDataWrapped); // Tracks on device
+  zvertex_h = ZVertex::ZVertexSoAHost(ctx.stream()); // Create an instance of Tracks on Host, using the stream
+  cudaCheck(cudaMemcpyAsync(zvertex_h.buffer().get(), zvertex_d.const_buffer().get(), zvertex_d.bufferSize(), cudaMemcpyDeviceToHost, ctx.stream())); // Copy data from Device to Host
+  cudaCheck(cudaGetLastError());
 }
 
 void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
index f8755996c3980..0e6327c6ed05b 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
@@ -1,6 +1,7 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-#include <Eigen/Core> //TODO: understand why this is needed
+#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
 
 #include "gpuClusterTracksByDensity.h"
 #include "gpuClusterTracksDBSCAN.h"
@@ -9,9 +10,6 @@
 #include "gpuSortByPt2.h"
 #include "gpuSplitVertices.h"
 
-#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
-#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
-
 #undef PIXVERTEX_DEBUG_PRODUCE
 
 namespace gpuVertexFinder {
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
index b8a81ea04a03d..d56d68470acd8 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
@@ -4,11 +4,10 @@
 #include <cstddef>
 #include <cstdint>
 
-//#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
-#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
 
 namespace gpuVertexFinder {
 

From 90b8e3af01e5aea45d64241afe6f35adcc5a9642 Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Fri, 4 Nov 2022 12:14:41 +0100
Subject: [PATCH 099/110] Removing ZVertexHeterogeneous (not needed anymore)

---
 .../Vertex/interface/ZVertexHeterogeneous.h   | 13 ----------
 CUDADataFormats/Vertex/interface/ZVertexSoA.h | 26 -------------------
 2 files changed, 39 deletions(-)
 delete mode 100644 CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
 delete mode 100644 CUDADataFormats/Vertex/interface/ZVertexSoA.h

diff --git a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
deleted file mode 100644
index 417a960951fb1..0000000000000
--- a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef CUDADataFormatsVertexZVertexHeterogeneous_H
-#define CUDADataFormatsVertexZVertexHeterogeneous_H
-
-#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h"
-#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
-
-using ZVertexHeterogeneous = HeterogeneousSoA<ZVertexSoA>;
-#ifndef __CUDACC__
-#include "CUDADataFormats/Common/interface/Product.h"
-using ZVertexCUDAProduct = cms::cuda::Product<ZVertexHeterogeneous>;
-#endif
-
-#endif
diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
deleted file mode 100644
index e31b87f30fa11..0000000000000
--- a/CUDADataFormats/Vertex/interface/ZVertexSoA.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef CUDADataFormats_Vertex_ZVertexSoA_h
-#define CUDADataFormats_Vertex_ZVertexSoA_h
-
-#include <cstdint>
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
-
-// SOA for vertices
-// These vertices are clusterized and fitted only along the beam line (z)
-// to obtain their global coordinate the beam spot position shall be added (eventually correcting for the beam angle as well)
-struct ZVertexSoA {
-  static constexpr uint32_t MAXTRACKS = 32 * 1024;
-  static constexpr uint32_t MAXVTX = 1024;
-
-  int16_t idv[MAXTRACKS];    // vertex index for each associated (original) track  (-1 == not associate)
-  float zv[MAXVTX];          // output z-posistion of found vertices
-  float wv[MAXVTX];          // output weight (1/error^2) on the above
-  float chi2[MAXVTX];        // vertices chi2
-  float ptv2[MAXVTX];        // vertices pt^2
-  int32_t ndof[MAXTRACKS];   // vertices number of dof (reused as workspace for the number of nearest neighbours FIXME)
-  uint16_t sortInd[MAXVTX];  // sorted index (by pt2)  ascending
-  uint32_t nvFinal;          // the number of vertices
-
-  __host__ __device__ void init() { nvFinal = 0; }
-};
-
-#endif  // CUDADataFormats_Vertex_ZVertexSoA_h

From 3071132831016924fb83632827740df0b907057e Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Fri, 4 Nov 2022 12:32:41 +0100
Subject: [PATCH 100/110] Cleanup and updating dataformat in
 L2TauTagNNProducer.cc

---
 .../interface/ZVertexSoAHeterogeneousDevice.h |  2 +-
 .../interface/ZVertexSoAHeterogeneousHost.h   |  2 +-
 CUDADataFormats/Vertex/src/classes.h          |  1 -
 .../plugins/PixelTrackDumpCUDA.cc             |  1 -
 .../HLTProducers/src/L2TauTagNNProducer.cc    | 27 ++++++++++---------
 5 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h
index d1ff67b042701..b1b9779ddf400 100644
--- a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h
+++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h
@@ -25,4 +25,4 @@ namespace ZVertex {
 
 }  // namespace pixelTrack
 
-#endif  // CUDADataFormats_Vertex_ZVertexHeterogeneousT_H
+#endif  // CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H
diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h
index 4867c49d15bab..0c02356192c4e 100644
--- a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h
+++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h
@@ -23,4 +23,4 @@ namespace ZVertex {
 
 }  // namespace ZVertex
 
-#endif  // CUDADataFormats_Vertex_ZVertexHeterogeneousT_H
+#endif  // CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H
diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h
index 6f087ecb2cf46..0340affffa06c 100644
--- a/CUDADataFormats/Vertex/src/classes.h
+++ b/CUDADataFormats/Vertex/src/classes.h
@@ -1,7 +1,6 @@
 #ifndef CUDADataFormats_Vertex_src_classes_h
 #define CUDADataFormats_Vertex_src_classes_h
 
-//#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h"
 #include "CUDADataFormats/Common/interface/Product.h"
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
index a1acc6376e111..6bf47b7302da1 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
@@ -4,7 +4,6 @@
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
-//#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h"
 #include "DataFormats/Common/interface/Handle.h"
diff --git a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc
index db650684e7578..aa8565e9aed1f 100644
--- a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc
+++ b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc
@@ -50,8 +50,9 @@
 #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
-#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h"
-#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h"
 
 namespace L2TauTagNNv1 {
   constexpr int nCellEta = 5;
@@ -181,10 +182,10 @@ class L2TauNNProducer : public edm::stream::EDProducer<edm::GlobalCache<L2TauNNP
   void fillPatatracks(tensorflow::Tensor& cellGridMatrix,
                       const std::vector<l1t::TauRef>& allTaus,
                       const pixelTrack::TrackSoAHost& patatracks_tsoa,
-                      const ZVertexSoA& patavtx_soa,
+                      const ZVertex::ZVertexSoAHost& patavtx_soa,
                       const reco::BeamSpot& beamspot,
                       const MagneticField* magfi);
-  void selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa,
+  void selectGoodTracksAndVertices(const ZVertex::ZVertexSoAHost& patavtx_soa,
                                    const pixelTrack::TrackSoAHost& patatracks_tsoa,
                                    std::vector<int>& trkGood,
                                    std::vector<int>& vtxGood);
@@ -208,7 +209,7 @@ class L2TauNNProducer : public edm::stream::EDProducer<edm::GlobalCache<L2TauNNP
   const edm::EDGetTokenT<EcalRecHitCollection> eeToken_;
   const edm::ESGetToken<CaloGeometry, CaloGeometryRecord> geometryToken_;
   const edm::ESGetToken<MagneticField, IdealMagneticFieldRecord> bFieldToken_;
-  const edm::EDGetTokenT<ZVertexHeterogeneous> pataVerticesToken_;
+  const edm::EDGetTokenT<ZVertex::ZVertexSoAHost> pataVerticesToken_;
   const edm::EDGetTokenT<pixelTrack::TrackSoAHost> pataTracksToken_;
   const edm::EDGetTokenT<reco::BeamSpot> beamSpotToken_;
   const unsigned int maxVtx_;
@@ -293,7 +294,7 @@ L2TauNNProducer::L2TauNNProducer(const edm::ParameterSet& cfg, const L2TauNNProd
       eeToken_(consumes<EcalRecHitCollection>(cfg.getParameter<edm::InputTag>("eeInput"))),
       geometryToken_(esConsumes<CaloGeometry, CaloGeometryRecord>()),
       bFieldToken_(esConsumes<MagneticField, IdealMagneticFieldRecord>()),
-      pataVerticesToken_(consumes<ZVertexHeterogeneous>(cfg.getParameter<edm::InputTag>("pataVertices"))),
+      pataVerticesToken_(consumes<ZVertex::ZVertexSoAHost>(cfg.getParameter<edm::InputTag>("pataVertices"))),
       pataTracksToken_(consumes<pixelTrack::TrackSoAHost>(cfg.getParameter<edm::InputTag>("pataTracks"))),
       beamSpotToken_(consumes<reco::BeamSpot>(cfg.getParameter<edm::InputTag>("BeamSpot"))),
       maxVtx_(cfg.getParameter<uint>("maxVtx")),
@@ -570,12 +571,12 @@ void L2TauNNProducer::fillCaloRecHits(tensorflow::Tensor& cellGridMatrix,
   }
 }
 
-void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa,
+void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertex::ZVertexSoAHost& patavtx_soa,
                                                   const pixelTrack::TrackSoAHost& patatracks_tsoa,
                                                   std::vector<int>& trkGood,
                                                   std::vector<int>& vtxGood) {
   const auto maxTracks = patatracks_tsoa.view().metadata().size();
-  const int nv = patavtx_soa.nvFinal;
+  const int nv = patavtx_soa.view().nvFinal();
   trkGood.clear();
   trkGood.reserve(maxTracks);
   vtxGood.clear();
@@ -591,7 +592,7 @@ void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa,
     if (nHits == 0) {
       break;
     }
-    int vtx_ass_to_track = patavtx_soa.idv[trk_idx];
+    int vtx_ass_to_track = patavtx_soa.view()[trk_idx].idv();
     if (vtx_ass_to_track >= 0 && vtx_ass_to_track < nv) {
       auto patatrackPt = patatracks_tsoa.view()[trk_idx].pt();
       ++nTrkAssociated[vtx_ass_to_track];
@@ -607,7 +608,7 @@ void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa,
   if (nv > 0) {
     const auto minFOM_fromFrac = (*std::max_element(pTSquaredSum.begin(), pTSquaredSum.end())) * fractionSumPt2_;
     for (int j = nv - 1; j >= 0 && vtxGood.size() < maxVtx_; --j) {
-      auto vtx_idx = patavtx_soa.sortInd[j];
+      auto vtx_idx = patavtx_soa.view()[j].sortInd();
       assert(vtx_idx < nv);
       if (nTrkAssociated[vtx_idx] >= 2 && pTSquaredSum[vtx_idx] >= minFOM_fromFrac &&
           pTSquaredSum[vtx_idx] > minSumPt2_) {
@@ -652,7 +653,7 @@ std::pair<float, float> L2TauNNProducer::impactParameter(int it,
 void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix,
                                      const std::vector<l1t::TauRef>& allTaus,
                                      const pixelTrack::TrackSoAHost& patatracks_tsoa,
-                                     const ZVertexSoA& patavtx_soa,
+                                     const ZVertex::ZVertexSoAHost& patavtx_soa,
                                      const reco::BeamSpot& beamspot,
                                      const MagneticField* magfi) {
   using NNInputs = L2TauTagNNv1::NNInputs;
@@ -688,7 +689,7 @@ void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix,
         continue;
       const int patatrackNdof = 2 * std::min(6, nHits) - 5;
 
-      const int vtx_idx_assTrk = patavtx_soa.idv[it];
+      const int vtx_idx_assTrk = patavtx_soa.view()[it].idv();
       if (reco::deltaR2(patatrackEta, patatrackPhi, tauEta, tauPhi) < dR2_max) {
         std::tie(deta, dphi, eta_idx, phi_idx) =
             getEtaPhiIndices(patatrackEta, patatrackPhi, allTaus[tau_idx]->polarP4());
@@ -765,7 +766,7 @@ void L2TauNNProducer::produce(edm::Event& event, const edm::EventSetup& eventset
   const auto hbhe = event.getHandle(hbheToken_);
   const auto ho = event.getHandle(hoToken_);
   auto& patatracks_SoA = event.get(pataTracksToken_);
-  const auto& vertices_SoA = *event.get(pataVerticesToken_);
+  auto& vertices_SoA = event.get(pataVerticesToken_);
   const auto bsHandle = event.getHandle(beamSpotToken_);
 
   auto const fieldESH = eventsetup.getHandle(bFieldToken_);

From 551398bbfbcbf9f294aaa46b8a6aa12a7c2ff455 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Fri, 4 Nov 2022 12:45:55 +0100
Subject: [PATCH 101/110] Adapted test

---
 .../PixelVertexFinding/test/VertexFinder_t.h  | 111 ++++++++++--------
 1 file changed, 60 insertions(+), 51 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index 5f8a0646c726a..cf6fccf04ffc0 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -7,6 +7,13 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
+
+#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"  // TODO: included in order to compile Eigen columns first!!!
+#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h"
 #ifdef USE_DBSCAN
 #include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h"
 #define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN
@@ -23,7 +30,7 @@
 
 #ifdef ONE_KERNEL
 #ifdef __CUDACC__
-__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
+__global__ void vertexFinderOneKernel(gpuVertexFinder::VtxSoAView pdata,
                                       gpuVertexFinder::WorkSpace* pws,
                                       int minT,      // min number of neighbours to be "seed"
                                       float eps,     // max absolute distance to cluster
@@ -102,23 +109,26 @@ struct ClusterGenerator {
 };
 
 // a macro SORRY
-#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(gpuVertexFinder::ZVertices, M))
 #define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M))
 
-__global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder::WorkSpace const* pws) {
-  auto const& __restrict__ data = *pdata;
+__global__ void print(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WorkSpace const* pws) {
   auto const& __restrict__ ws = *pws;
-  printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate);
+  printf("nt,nv %d %d,%d\n", ws.ntrks, pdata.nvFinal(), ws.nvIntermediate);
 }
 
 int main() {
+  cudaStream_t stream;
 #ifdef __CUDACC__
   cms::cudatest::requireDevices();
+  cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
-  auto onGPU_d = cms::cuda::make_device_unique<gpuVertexFinder::ZVertices[]>(1, nullptr);
+  // auto onGPU_d = cms::cuda::make_device_unique<gpuVertexFinder::ZVertices[]>(1, nullptr);
+  ZVertex::ZVertexSoADevice onGPU_d(stream);
   auto ws_d = cms::cuda::make_device_unique<gpuVertexFinder::WorkSpace[]>(1, nullptr);
 #else
-  auto onGPU_d = std::make_unique<gpuVertexFinder::ZVertices>();
+  stream = nullptr;
+  // auto onGPU_d = std::make_unique<gpuVertexFinder::ZVertices>();
+  ZVertex::ZVertexSoAHost onGPU_d(stream);
   auto ws_d = std::make_unique<gpuVertexFinder::WorkSpace>();
 #endif
 
@@ -135,10 +145,9 @@ int main() {
       gen(ev);
 
 #ifdef __CUDACC__
-      init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+      gpuVertexFinder::init<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.get());
 #else
-      onGPU_d->init();
-      ws_d->init();
+      gpuVertexFinder::init(onGPU_d.view(), ws_d.get());
 #endif
 
       std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
@@ -168,30 +177,30 @@ int main() {
 
       uint32_t nv = 0;
 #ifdef __CUDACC__
-      print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+      print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.get());
       cudaCheck(cudaGetLastError());
       cudaDeviceSynchronize();
 
 #ifdef ONE_KERNEL
-      cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+      cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.view(), ws_d.get(), kk, par[0], par[1], par[2]);
 #else
-      cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+      cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.view(), ws_d.get(), kk, par[0], par[1], par[2]);
 #endif
-      print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+      print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.get());
 
       cudaCheck(cudaGetLastError());
       cudaDeviceSynchronize();
 
-      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.get(), 50.f);
       cudaCheck(cudaGetLastError());
-      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
 #else
-      print(onGPU_d.get(), ws_d.get());
-      CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
-      print(onGPU_d.get(), ws_d.get());
-      fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
-      nv = onGPU_d->nvFinal;
+      print(onGPU_d.view(), ws_d.get());
+      CLUSTERIZE(onGPU_d.view(), ws_d.get(), kk, par[0], par[1], par[2]);
+      print(onGPU_d.view(), ws_d.get());
+      fitVertices(onGPU_d.view(), ws_d.get(), 50.f);
+      nv = onGPU_d.view().nvFinal();
 #endif
 
       if (nv == 0) {
@@ -221,18 +230,18 @@ int main() {
       nn = hnn;
       ind = hind;
 #else
-      zv = onGPU_d->zv;
-      wv = onGPU_d->wv;
-      ptv2 = onGPU_d->ptv2;
-      nn = onGPU_d->ndof;
-      ind = onGPU_d->sortInd;
+      zv = onGPU_d.view().zv();
+      wv = onGPU_d.view().wv();
+      ptv2 = onGPU_d.view().ptv2();
+      nn = onGPU_d.view().ndof();
+      ind = onGPU_d.view().sortInd();
 #endif
 
 #ifdef __CUDACC__
-      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost));
 #else
-      memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+      memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float));
 #endif
 
       for (auto j = 0U; j < nv; ++j)
@@ -244,14 +253,14 @@ int main() {
       }
 
 #ifdef __CUDACC__
-      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
-      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.get(), 50.f);
+      cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost));
 #else
-      fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
-      nv = onGPU_d->nvFinal;
-      memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+      fitVertices(onGPU_d.view(), ws_d.get(), 50.f);
+      nv = onGPU_d.view().nvFinal();
+      memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float));
 #endif
 
       for (auto j = 0U; j < nv; ++j)
@@ -264,26 +273,26 @@ int main() {
 
 #ifdef __CUDACC__
       // one vertex per block!!!
-      cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
+      cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.view(), ws_d.get(), 9.f);
       cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
+      splitVertices(onGPU_d.view(), ws_d.get(), 9.f);
       nv = ws_d->nvIntermediate;
 #endif
       std::cout << "after split " << nv << std::endl;
 
 #ifdef __CUDACC__
-      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.get(), 5000.f);
       cudaCheck(cudaGetLastError());
 
-      cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
+      cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.view(), ws_d.get());
       cudaCheck(cudaGetLastError());
-      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      fitVertices(onGPU_d.get(), ws_d.get(), 5000.f);
-      sortByPt2(onGPU_d.get(), ws_d.get());
-      nv = onGPU_d->nvFinal;
-      memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+      fitVertices(onGPU_d.view(), ws_d.get(), 5000.f);
+      sortByPt2(onGPU_d.view(), ws_d.get());
+      nv = onGPU_d.view().nvFinal();
+      memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float));
 #endif
 
       if (nv == 0) {
@@ -292,12 +301,12 @@ int main() {
       }
 
 #ifdef __CUDACC__
-      cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(zv, onGPU_d.view().zv(), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(wv, onGPU_d.view().wv(), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(ptv2, onGPU_d.view().ptv2(), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(ind, onGPU_d.view().sortInd(), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost));
 #endif
       for (auto j = 0U; j < nv; ++j)
         if (nn[j] > 0)

From 71cef1bc5e4e9e3a5e8290fbb1619956e8d2b8d1 Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Fri, 4 Nov 2022 14:45:59 +0100
Subject: [PATCH 102/110] Adding nullptr to Host collection

---
 .../PixelVertexFinding/plugins/gpuVertexFinder.cc               | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
index 0e6327c6ed05b..b12926b95c707 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
@@ -111,7 +111,7 @@ namespace gpuVertexFinder {
 #ifdef PIXVERTEX_DEBUG_PRODUCE
     std::cout << "producing Vertices on  CPU" << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
-    ZVertex::ZVertexSoAHost vertices;
+    ZVertex::ZVertexSoAHost vertices(nullptr);
 #endif
     auto soa = vertices.view();
     //TODO: check if there is a way to assert this

From ce16830e724b92375e972d34b1aa705208b45659 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Fri, 4 Nov 2022 15:12:24 +0100
Subject: [PATCH 103/110] Initial implementation for WorkSpace port

---
 .../interface/ZVertexSoAHeterogeneousHost.h   |  3 +-
 .../Vertex/interface/ZVertexUtilities.h       | 16 ++++-----
 .../plugins/WorkSpaceSoAHeterogeneousDevice.h | 24 +++++++++++++
 .../plugins/WorkSpaceSoAHeterogeneousHost.h   | 24 +++++++++++++
 .../plugins/WorkSpaceUtilities.h              | 36 +++++++++++++++++++
 .../plugins/gpuVertexFinder.h                 | 32 ++++-------------
 6 files changed, 97 insertions(+), 38 deletions(-)
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h

diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h
index 0c02356192c4e..4c07bb3ffedb4 100644
--- a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h
+++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h
@@ -5,14 +5,13 @@
 
 #include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
 #include "CUDADataFormats/Common/interface/PortableHostCollection.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
 template <int32_t S>
 class ZVertexSoAHeterogeneousHost : public cms::cuda::PortableHostCollection<ZVertexSoAHeterogeneousLayout<>> {
 public:
   ZVertexSoAHeterogeneousHost() = default;
 
-  // Constructor which specifies the SoA size
+  // Constructor which specifies the SoA size and CUDA stream
   explicit ZVertexSoAHeterogeneousHost(cudaStream_t stream)
       : PortableHostCollection<ZVertexSoAHeterogeneousLayout<>>(S, stream) {}
 };
diff --git a/CUDADataFormats/Vertex/interface/ZVertexUtilities.h b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h
index 05ed34e2e8d69..d0614abee91c9 100644
--- a/CUDADataFormats/Vertex/interface/ZVertexUtilities.h
+++ b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h
@@ -6,7 +6,7 @@
 
 GENERATE_SOA_LAYOUT(ZVertexSoAHeterogeneousLayout,
                     SOA_COLUMN(int16_t, idv),
-                    SOA_COLUMN(float, zv),  // this is chi2/ndof as not necessarely all hits are used in the fit
+                    SOA_COLUMN(float, zv),
                     SOA_COLUMN(float, wv),
                     SOA_COLUMN(float, chi2),
                     SOA_COLUMN(float, ptv2),
@@ -17,8 +17,12 @@ GENERATE_SOA_LAYOUT(ZVertexSoAHeterogeneousLayout,
 // Previous ZVertexSoA class methods.
 // They operate on View and ConstView of the ZVertexSoA.
 namespace ZVertex {
+  // Common types for both Host and Device code
+  using ZVertexSoALayout = ZVertexSoAHeterogeneousLayout<>;
+  using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View;
+  using ZVertexSoAConstView = ZVertexSoAHeterogeneousLayout<>::ConstView;
+
   namespace utilities {
-    using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View;
 
     static constexpr uint32_t MAXTRACKS = 32 * 1024;
     static constexpr uint32_t MAXVTX = 1024;
@@ -28,12 +32,4 @@ namespace ZVertex {
   }  // namespace utilities
 }  // namespace ZVertex
 
-namespace ZVertex {
-  // Common types for both Host and Device code
-  using ZVertexSoALayout = ZVertexSoAHeterogeneousLayout<>;
-  using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View;
-  using ZVertexSoAConstView = ZVertexSoAHeterogeneousLayout<>::ConstView;
-
-}  // namespace ZVertex
-
 #endif
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h
new file mode 100644
index 0000000000000..abe77cf84a777
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h
@@ -0,0 +1,24 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousDevice_h
+#define RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousDevice_h
+
+#include <cstdint>
+#include "WorkSpaceUtilities.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
+#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
+
+// Vertex-finder workspace SoA stored in a PortableDeviceCollection; S is the size passed to the base class.
+template <int32_t S>
+class WorkSpaceSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection<WorkSpaceSoAHeterogeneousLayout<>> {
+public:
+  WorkSpaceSoAHeterogeneousDevice() = default;
+  // Constructor which specifies the SoA size (S) and CUDA stream
+  explicit WorkSpaceSoAHeterogeneousDevice(cudaStream_t stream)
+      : PortableDeviceCollection<WorkSpaceSoAHeterogeneousLayout<>>(S, stream) {}
+};
+
+namespace gpuVertexFinder {
+  namespace WorkSpace {
+    using WorkSpaceSoADevice = WorkSpaceSoAHeterogeneousDevice<ZVertex::utilities::MAXTRACKS>;  // one slot per track
+  }
+}  // namespace gpuVertexFinder
+#endif
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h
new file mode 100644
index 0000000000000..5b893718a468d
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h
@@ -0,0 +1,24 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousHost_h
+#define RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousHost_h
+
+#include <cstdint>
+#include "WorkSpaceUtilities.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
+#include "CUDADataFormats/Common/interface/PortableHostCollection.h"
+
+// Vertex-finder workspace SoA stored in a PortableHostCollection; S is the size passed to the base class.
+template <int32_t S>
+class WorkSpaceSoAHeterogeneousHost : public cms::cuda::PortableHostCollection<WorkSpaceSoAHeterogeneousLayout<>> {
+public:
+  WorkSpaceSoAHeterogeneousHost() = default;
+  // Constructor which specifies the SoA size (S) and CUDA stream
+  explicit WorkSpaceSoAHeterogeneousHost(cudaStream_t stream)
+      : PortableHostCollection<WorkSpaceSoAHeterogeneousLayout<>>(S, stream) {}
+};
+
+namespace gpuVertexFinder {
+  namespace WorkSpace {
+    using WorkSpaceSoAHost = WorkSpaceSoAHeterogeneousHost<ZVertex::utilities::MAXTRACKS>;  // one slot per track
+  }
+}  // namespace gpuVertexFinder
+#endif
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h
new file mode 100644
index 0000000000000..a86ade097ec7c
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h
@@ -0,0 +1,36 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_WorkSpace_h
+#define RecoPixelVertexing_PixelVertexFinding_WorkSpace_h
+
+#include <cstdint>
+#include <cuda_runtime.h>
+#include "DataFormats/SoATemplate/interface/SoALayout.h"
+
+// Intermediate data used in the vertex reco algos (SoA replacement of the former WorkSpace struct)
+// For internal use only
+GENERATE_SOA_LAYOUT(WorkSpaceSoAHeterogeneousLayout,
+                    SOA_COLUMN(uint16_t, itrk),            // index of original track
+                    SOA_COLUMN(float, zt),                 // input track z at bs
+                    SOA_COLUMN(float, ezt2),               // input error^2 on the above
+                    SOA_COLUMN(float, ptt2),               // input track pt^2
+                    SOA_COLUMN(uint8_t, izt),              // integerized z-position of input tracks
+                    SOA_COLUMN(int32_t, iv),               // vertex index for each associated track
+                    SOA_SCALAR(uint32_t, ntrks),           // number of "selected tracks"
+                    SOA_SCALAR(uint32_t, nvIntermediate))  // the number of vertices after splitting, pruning, etc.
+
+// Type aliases and helper methods that operate on Views of the WorkSpaceSoAHeterogeneousLayout.
+namespace gpuVertexFinder {
+  namespace workSpace {
+    using WorkSpaceSoALayout = WorkSpaceSoAHeterogeneousLayout<>;
+    using WorkSpaceSoAView = WorkSpaceSoAHeterogeneousLayout<>::View;
+    using WorkSpaceSoAConstView = WorkSpaceSoAHeterogeneousLayout<>::ConstView;
+
+    namespace utilities {
+      __host__ __device__ inline void init(WorkSpaceSoAView &workspace_view) {  // zero both scalar counters
+        workspace_view.ntrks() = 0;
+        workspace_view.nvIntermediate() = 0;
+      }
+    }  // namespace utilities
+  }    // namespace workSpace
+}  // namespace gpuVertexFinder
+
+#endif
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
index d56d68470acd8..dfed3772dd2ec 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
@@ -8,43 +8,23 @@
 #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
+#include "WorkSpaceUtilities.h"
+#include "WorkSpaceSoAHeterogeneousHost.h"
+#include "WorkSpaceSoAHeterogeneousDevice.h"
 
 namespace gpuVertexFinder {
 
   using VtxSoAView = ZVertex::ZVertexSoAView;
   using TkSoAConstView = pixelTrack::TrackSoAConstView;
+  using WsSoAView = gpuVertexFinder::workSpace::WorkSpaceSoAView;
 
-  // workspace used in the vertex reco algos
-  struct WorkSpace {
-    static constexpr uint32_t MAXTRACKS = ZVertex::utilities::MAXTRACKS;
-    static constexpr uint32_t MAXVTX = ZVertex::utilities::MAXVTX;
-
-    uint32_t ntrks;            // number of "selected tracks"
-    uint16_t itrk[MAXTRACKS];  // index of original track
-    float zt[MAXTRACKS];       // input track z at bs
-    float ezt2[MAXTRACKS];     // input error^2 on the above
-    float ptt2[MAXTRACKS];     // input pt^2 on the above
-    uint8_t izt[MAXTRACKS];    // interized z-position of input tracks
-    int32_t iv[MAXTRACKS];     // vertex index for each associated track
-
-    uint32_t nvIntermediate;  // the number of vertices after splitting pruning etc.
-
-    __host__ __device__ void init() {
-      ntrks = 0;
-      nvIntermediate = 0;
-    }
-  };
-
-  __global__ void init(VtxSoAView pdata, WorkSpace* pws) {
+  __global__ void init(VtxSoAView pdata, WsSoAView pws) {  // delegates to the ZVertex and workSpace init helpers
     ZVertex::utilities::init(pdata);
-    pws->init();
+    gpuVertexFinder::workSpace::utilities::init(pws);
   }
 
   class Producer {
   public:
-    using VtxSoAView = ZVertex::ZVertexSoAView;
-    using WorkSpace = gpuVertexFinder::WorkSpace;
-
     Producer(bool oneKernel,
              bool useDensity,
              bool useDBSCAN,

From c1dfb684634190b48dd110728bfd3af1d3dd772e Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Fri, 4 Nov 2022 15:32:34 +0100
Subject: [PATCH 104/110] Adapt gpuVertexFinder.cc

---
 .../plugins/gpuVertexFinder.cc                | 81 +++++++++----------
 1 file changed, 40 insertions(+), 41 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
index b12926b95c707..baefe500d74d7 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
@@ -20,10 +20,8 @@ namespace gpuVertexFinder {
 
   // split vertices with a chi2/NDoF greater than this
   constexpr float maxChi2ForSplit = 9.f;
-  using TkSoAConstView = pixelTrack::TrackSoAConstView;
-  using VtxSoAView = ZVertex::ZVertexSoAView;
 
-  __global__ void loadTracks(TkSoAConstView tracks_view, VtxSoAView soa, WorkSpace* pws, float ptMin, float ptMax) {
+  __global__ void loadTracks(TkSoAConstView tracks_view, VtxSoAView soa, WsSoAView pws, float ptMin, float ptMax) {
     //TODO: check if there is a way to assert this
     //assert(soa);
     auto const* quality = pixelTrack::utilities::qualityData(tracks_view);
@@ -49,19 +47,19 @@ namespace gpuVertexFinder {
       // clamp pt
       pt = std::min(pt, ptMax);
 
-      auto& data = *pws;
-      auto it = atomicAdd(&data.ntrks, 1);
-      data.itrk[it] = idx;
-      data.zt[it] = pixelTrack::utilities::zip(tracks_view, idx);
-      data.ezt2[it] = tracks_view[idx].covariance()(14);
-      data.ptt2[it] = pt * pt;
+      auto& data = pws;
+      auto it = atomicAdd(&data.ntrks(), 1);
+      data[it].itrk() = idx;
+      data[it].zt() = pixelTrack::utilities::zip(tracks_view, idx);
+      data[it].ezt2() = tracks_view[idx].covariance()(14);
+      data[it].ptt2() = pt * pt;
     }
   }
 
 // #define THREE_KERNELS
 #ifndef THREE_KERNELS
   __global__ void vertexFinderOneKernel(VtxSoAView pdata,
-                                        gpuVertexFinder::WorkSpace* pws,
+                                        WsSoAView pws,
                                         int minT,      // min number of neighbours to be "seed"
                                         float eps,     // max absolute distance to cluster
                                         float errmax,  // max error to be "seed"
@@ -78,8 +76,8 @@ namespace gpuVertexFinder {
     sortByPt2(pdata, pws);
   }
 #else
-  __global__ void vertexFinderKernel1(gpuVertexFinder::VtxSoAView pdata,
-                                      gpuVertexFinder::WorkSpace* pws,
+  __global__ void vertexFinderKernel1(VtxSoAView pdata,
+                                      WsSoAView pws,
                                       int minT,      // min number of neighbours to be "seed"
                                       float eps,     // max absolute distance to cluster
                                       float errmax,  // max error to be "seed"
@@ -90,7 +88,7 @@ namespace gpuVertexFinder {
     fitVertices(pdata, pws, maxChi2ForFirstFit);
   }
 
-  __global__ void vertexFinderKernel2(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WorkSpace* pws) {
+  __global__ void vertexFinderKernel2(VtxSoAView pdata, WsSoAView pws) {
     fitVertices(pdata, pws, maxChi2ForFinalFit);
     __syncthreads();
     sortByPt2(pdata, pws);
@@ -99,9 +97,9 @@ namespace gpuVertexFinder {
 
 #ifdef __CUDACC__
   ZVertex::ZVertexSoADevice Producer::makeAsync(cudaStream_t stream,
-                                           TkSoAConstView tracks_view,
-                                           float ptMin,
-                                           float ptMax) const {
+                                                TkSoAConstView tracks_view,
+                                                float ptMin,
+                                                float ptMax) const {
 #ifdef PIXVERTEX_DEBUG_PRODUCE
     std::cout << "producing Vertices on GPU" << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
@@ -118,20 +116,20 @@ namespace gpuVertexFinder {
     //assert(soa);
 
 #ifdef __CUDACC__
-    auto ws_d = cms::cuda::make_device_unique<WorkSpace>(stream);
+    auto ws_d = gpuVertexing::workSpace::WorkSpaceSoAHeterogeneousDevice(stream);
 #else
-    auto ws_d = std::make_unique<WorkSpace>();
+    auto ws_d = gpuVertexing::workSpace::WorkSpaceSoAHeterogeneousHost(nullptr);
 #endif
 
 #ifdef __CUDACC__
-    init<<<1, 1, 0, stream>>>(soa, ws_d.get());
+    init<<<1, 1, 0, stream>>>(soa, ws_d.view());
     auto blockSize = 128;
     auto numberOfBlocks = (tracks_view.metadata().size() + blockSize - 1) / blockSize;
-    loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tracks_view, soa, ws_d.get(), ptMin, ptMax);
+    loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tracks_view, soa, ws_d.view(), ptMin, ptMax);
     cudaCheck(cudaGetLastError());
 #else
-    init(soa, ws_d.get());
-    loadTracks(tracks_view, soa, ws_d.get(), ptMin, ptMax);
+    init(soa, ws_d.view());
+    loadTracks(tracks_view, soa, ws_d.view(), ptMin, ptMax);
 #endif
 
 #ifdef __CUDACC__
@@ -143,50 +141,51 @@ namespace gpuVertexFinder {
     if (oneKernel_) {
       // implemented only for density clustesrs
 #ifndef THREE_KERNELS
-      vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max);
 #else
-      vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max);
       cudaCheck(cudaGetLastError());
       // one block per vertex...
-      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
+      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.view(), maxChi2ForSplit);
       cudaCheck(cudaGetLastError());
-      vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
+      vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view());
 #endif
     } else {  // five kernels
       if (useDensity_) {
-        clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+        clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(
+            soa, ws_d.view(), minT, eps, errmax, chi2max);
       } else if (useDBSCAN_) {
-        clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+        clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max);
       } else if (useIterative_) {
-        clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+        clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max);
       }
       cudaCheck(cudaGetLastError());
-      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit);
+      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), maxChi2ForFirstFit);
       cudaCheck(cudaGetLastError());
       // one block per vertex...
-      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
+      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.view(), maxChi2ForSplit);
       cudaCheck(cudaGetLastError());
-      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit);
+      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), maxChi2ForFinalFit);
       cudaCheck(cudaGetLastError());
-      sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
+      sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view());
     }
     cudaCheck(cudaGetLastError());
 #else  // __CUDACC__
     if (useDensity_) {
-      clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      clusterTracksByDensity(soa, ws_d.view(), minT, eps, errmax, chi2max);
     } else if (useDBSCAN_) {
-      clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      clusterTracksDBSCAN(soa, ws_d.view(), minT, eps, errmax, chi2max);
     } else if (useIterative_) {
-      clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      clusterTracksIterative(soa, ws_d.view(), minT, eps, errmax, chi2max);
     }
 #ifdef PIXVERTEX_DEBUG_PRODUCE
-    std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
+    std::cout << "found " << ws_d.view().nvIntermediate() << " vertices " << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
-    fitVertices(soa, ws_d.get(), maxChi2ForFirstFit);
+    fitVertices(soa, ws_d.view(), maxChi2ForFirstFit);
     // one block per vertex!
-    splitVertices(soa, ws_d.get(), maxChi2ForSplit);
-    fitVertices(soa, ws_d.get(), maxChi2ForFinalFit);
-    sortByPt2(soa, ws_d.get());
+    splitVertices(soa, ws_d.view(), maxChi2ForSplit);
+    fitVertices(soa, ws_d.view(), maxChi2ForFinalFit);
+    sortByPt2(soa, ws_d.view());
 #endif
 
     return vertices;

From b4d1dbb80f8cc1291bfa3cdc2d0fdb64ae7599d3 Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Fri, 4 Nov 2022 15:36:18 +0100
Subject: [PATCH 105/110] Changing WorkSpace into WsSoAView

---
 .../plugins/gpuClusterTracksByDensity.h        | 18 +++++++++---------
 .../plugins/gpuClusterTracksDBSCAN.h           | 16 ++++++++--------
 .../plugins/gpuClusterTracksIterative.h        | 16 ++++++++--------
 .../plugins/gpuFitVertices.h                   | 16 ++++++++--------
 .../PixelVertexFinding/plugins/gpuSortByPt2.h  | 14 +++++++-------
 .../plugins/gpuSplitVertices.h                 | 16 ++++++++--------
 6 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
index f920586117078..4124f80e017eb 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
@@ -18,7 +18,7 @@ namespace gpuVertexFinder {
   // based on Rodrighez&Laio algo
   //
   __device__ __forceinline__ void clusterTracksByDensity(VtxSoAView pdata,
-                                                         gpuVertexFinder::WorkSpace* pws,
+                                                         WsSoAView pws,
                                                          int minT,      // min number of neighbours to be "seed"
                                                          float eps,     // max absolute distance to cluster
                                                          float errmax,  // max error to be "seed"
@@ -33,17 +33,17 @@ namespace gpuVertexFinder {
     auto er2mx = errmax * errmax;
 
     auto& __restrict__ data = pdata;
-    auto& __restrict__ ws = *pws;
-    auto nt = ws.ntrks;
-    float const* __restrict__ zt = ws.zt;
-    float const* __restrict__ ezt2 = ws.ezt2;
+    auto& __restrict__ ws = pws;
+    auto nt = ws.ntrks();
+    float const* __restrict__ zt = ws.zt();
+    float const* __restrict__ ezt2 = ws.ezt2();
 
     uint32_t& nvFinal = data.nvFinal();
-    uint32_t& nvIntermediate = ws.nvIntermediate;
+    uint32_t& nvIntermediate = ws.nvIntermediate();
 
-    uint8_t* __restrict__ izt = ws.izt;
+    uint8_t* __restrict__ izt = ws.izt();
     int32_t* __restrict__ nn = data.ndof();
-    int32_t* __restrict__ iv = ws.iv;
+    int32_t* __restrict__ iv = ws.iv();
 
     //TODO: check if there is a way to assert this
     //assert(pdata);
@@ -221,7 +221,7 @@ namespace gpuVertexFinder {
   }
 
   __global__ void clusterTracksByDensityKernel(VtxSoAView pdata,
-                                               gpuVertexFinder::WorkSpace* pws,
+                                               WsSoAView pws,
                                                int minT,      // min number of neighbours to be "seed"
                                                float eps,     // max absolute distance to cluster
                                                float errmax,  // max error to be "seed"
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
index 0476cfbae5fef..43e420a4c0cbc 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
@@ -15,7 +15,7 @@ namespace gpuVertexFinder {
   // this algo does not really scale as it works in a single block...
   // enough for <10K tracks we have
   __global__ void clusterTracksDBSCAN(VtxSoAView pdata,
-                                      WorkSpace* pws,
+                                      WsSoAView pws,
                                       int minT,      // min number of neighbours to be "core"
                                       float eps,     // max absolute distance to cluster
                                       float errmax,  // max error to be "seed"
@@ -29,17 +29,17 @@ namespace gpuVertexFinder {
     auto er2mx = errmax * errmax;
 
     auto& __restrict__ data = pdata;
-    auto& __restrict__ ws = *pws;
-    auto nt = ws.ntrks;
-    float const* __restrict__ zt = ws.zt;
-    float const* __restrict__ ezt2 = ws.ezt2;
+    auto& __restrict__ ws = pws;
+    auto nt = ws.ntrks();
+    float const* __restrict__ zt = ws.zt();
+    float const* __restrict__ ezt2 = ws.ezt2();
 
     uint32_t& nvFinal = data.nvFinal();
-    uint32_t& nvIntermediate = ws.nvIntermediate;
+    uint32_t& nvIntermediate = ws.nvIntermediate();
 
-    uint8_t* __restrict__ izt = ws.izt;
+    uint8_t* __restrict__ izt = ws.izt();
     int32_t* __restrict__ nn = data.ndof();
-    int32_t* __restrict__ iv = ws.iv;
+    int32_t* __restrict__ iv = ws.iv();
 
     //TODO: check if there is a way to assert this
     //assert(pdata);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
index 230405c47366a..1b172cabf9318 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
@@ -15,7 +15,7 @@ namespace gpuVertexFinder {
   // this algo does not really scale as it works in a single block...
   // enough for <10K tracks we have
   __global__ void clusterTracksIterative(VtxSoAView pdata,
-                                         WorkSpace* pws,
+                                         WsSoAView pws,
                                          int minT,      // min number of neighbours to be "core"
                                          float eps,     // max absolute distance to cluster
                                          float errmax,  // max error to be "seed"
@@ -29,17 +29,17 @@ namespace gpuVertexFinder {
     auto er2mx = errmax * errmax;
 
     auto& __restrict__ data = pdata;
-    auto& __restrict__ ws = *pws;
-    auto nt = ws.ntrks;
-    float const* __restrict__ zt = ws.zt;
-    float const* __restrict__ ezt2 = ws.ezt2;
+    auto& __restrict__ ws = pws;
+    auto nt = ws.ntrks();
+    float const* __restrict__ zt = ws.zt();
+    float const* __restrict__ ezt2 = ws.ezt2();
 
     uint32_t& nvFinal = data.nvFinal();
-    uint32_t& nvIntermediate = ws.nvIntermediate;
+    uint32_t& nvIntermediate = ws.nvIntermediate();
 
-    uint8_t* __restrict__ izt = ws.izt;
+    uint8_t* __restrict__ izt = ws.izt();
     int32_t* __restrict__ nn = data.ndof();
-    int32_t* __restrict__ iv = ws.iv;
+    int32_t* __restrict__ iv = ws.iv();
 
     //TODO: check if there is a way to assert this
     //assert(pdata);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
index 51364e78ee92e..7b926023b4e19 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
@@ -13,24 +13,24 @@
 namespace gpuVertexFinder {
 
   __device__ __forceinline__ void fitVertices(VtxSoAView pdata,
-                                              WorkSpace* pws,
+                                              WsSoAView pws,
                                               float chi2Max  // for outlier rejection
   ) {
     constexpr bool verbose = false;  // in principle the compiler should optmize out if false
 
     auto& __restrict__ data = pdata;
-    auto& __restrict__ ws = *pws;
-    auto nt = ws.ntrks;
-    float const* __restrict__ zt = ws.zt;
-    float const* __restrict__ ezt2 = ws.ezt2;
+    auto& __restrict__ ws = pws;
+    auto nt = ws.ntrks();
+    float const* __restrict__ zt = ws.zt();
+    float const* __restrict__ ezt2 = ws.ezt2();
     float* __restrict__ zv = data.zv();
     float* __restrict__ wv = data.wv();
     float* __restrict__ chi2 = data.chi2();
     uint32_t& nvFinal = data.nvFinal();
-    uint32_t& nvIntermediate = ws.nvIntermediate;
+    uint32_t& nvIntermediate = ws.nvIntermediate();
 
     int32_t* __restrict__ nn = data.ndof();
-    int32_t* __restrict__ iv = ws.iv;
+    int32_t* __restrict__ iv = ws.iv();
 
     //TODO: check if there is a way to assert this
     //assert(pdata);
@@ -103,7 +103,7 @@ namespace gpuVertexFinder {
   }
 
   __global__ void fitVerticesKernel(VtxSoAView pdata,
-                                    WorkSpace* pws,
+                                    WsSoAView pws,
                                     float chi2Max  // for outlier rejection
   ) {
     fitVertices(pdata, pws, chi2Max);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
index c705fc1f4065e..38eeac91c5161 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
@@ -15,14 +15,14 @@
 
 namespace gpuVertexFinder {
 
-  __device__ __forceinline__ void sortByPt2(VtxSoAView pdata, WorkSpace* pws) {
+  __device__ __forceinline__ void sortByPt2(VtxSoAView pdata, WsSoAView pws) {
     auto& __restrict__ data = pdata;
-    auto& __restrict__ ws = *pws;
-    auto nt = ws.ntrks;
-    float const* __restrict__ ptt2 = ws.ptt2;
+    auto& __restrict__ ws = pws;
+    auto nt = ws.ntrks();
+    float const* __restrict__ ptt2 = ws.ptt2();
     uint32_t const& nvFinal = data.nvFinal();
 
-    int32_t const* __restrict__ iv = ws.iv;
+    int32_t const* __restrict__ iv = ws.iv();
     float* __restrict__ ptv2 = data.ptv2();
     uint16_t* __restrict__ sortInd = data.sortInd();
 
@@ -34,7 +34,7 @@ namespace gpuVertexFinder {
 
     // fill indexing
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
-      data[ws.itrk[i]].idv() = iv[i];
+      data[ws[i].itrk(i)].idv() = iv[i];
     }
 
     // can be done asynchronoisly at the end of previous event
@@ -66,7 +66,7 @@ namespace gpuVertexFinder {
 #endif
   }
 
-  __global__ void sortByPt2Kernel(VtxSoAView pdata, WorkSpace* pws) { sortByPt2(pdata, pws); }
+  __global__ void sortByPt2Kernel(VtxSoAView pdata, WsSoAView pws) { sortByPt2(pdata, pws); }
 
 }  // namespace gpuVertexFinder
 
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
index ad72c489ed67e..f90978811b839 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
@@ -12,21 +12,21 @@
 
 namespace gpuVertexFinder {
 
-  __device__ __forceinline__ void splitVertices(VtxSoAView pdata, WorkSpace* pws, float maxChi2) {
+  __device__ __forceinline__ void splitVertices(VtxSoAView pdata, WsSoAView pws, float maxChi2) {
     constexpr bool verbose = false;  // in principle the compiler should optmize out if false
 
     auto& __restrict__ data = pdata;
-    auto& __restrict__ ws = *pws;
-    auto nt = ws.ntrks;
-    float const* __restrict__ zt = ws.zt;
-    float const* __restrict__ ezt2 = ws.ezt2;
+    auto& __restrict__ ws = pws;
+    auto nt = ws.ntrks();
+    float const* __restrict__ zt = ws.zt();
+    float const* __restrict__ ezt2 = ws.ezt2();
     float* __restrict__ zv = data.zv();
     float* __restrict__ wv = data.wv();
     float const* __restrict__ chi2 = data.chi2();
     uint32_t& nvFinal = data.nvFinal();
 
     int32_t const* __restrict__ nn = data.ndof();
-    int32_t* __restrict__ iv = ws.iv;
+    int32_t* __restrict__ iv = ws.iv();
 
     //TODO: check if there is a way to assert this
     //assert(pdata);
@@ -121,7 +121,7 @@ namespace gpuVertexFinder {
       // get a new global vertex
       __shared__ uint32_t igv;
       if (0 == threadIdx.x)
-        igv = atomicAdd(&ws.nvIntermediate, 1);
+        igv = atomicAdd(&ws.nvIntermediate(), 1);
       __syncthreads();
       for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
         if (1 == newV[k])
@@ -131,7 +131,7 @@ namespace gpuVertexFinder {
     }  // loop on vertices
   }
 
-  __global__ void splitVerticesKernel(VtxSoAView pdata, WorkSpace* pws, float maxChi2) {
+  __global__ void splitVerticesKernel(VtxSoAView pdata, WsSoAView pws, float maxChi2) {
     splitVertices(pdata, pws, maxChi2);
   }
 

From 73d9392759d514c0b04550f6a35b479e6f5c3842 Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Fri, 4 Nov 2022 15:45:17 +0100
Subject: [PATCH 106/110] Updated tests to use new WorkSpace definition

---
 .../PixelVertexFinding/test/VertexFinder_t.h  | 82 ++++++++++---------
 1 file changed, 42 insertions(+), 40 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index cf6fccf04ffc0..ec392a1f4a8d8 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -9,6 +9,9 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h"
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h"
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h"
 
 #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"  // TODO: included in order to compile Eigen columns first!!!
 #include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
@@ -31,7 +34,7 @@
 #ifdef ONE_KERNEL
 #ifdef __CUDACC__
 __global__ void vertexFinderOneKernel(gpuVertexFinder::VtxSoAView pdata,
-                                      gpuVertexFinder::WorkSpace* pws,
+                                      gpuVertexFinder::WsSoAView pws,
                                       int minT,      // min number of neighbours to be "seed"
                                       float eps,     // max absolute distance to cluster
                                       float errmax,  // max error to be "seed"
@@ -108,12 +111,9 @@ struct ClusterGenerator {
   std::exponential_distribution<float> ptGen;
 };
 
-// a macro SORRY
-#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M))
-
-__global__ void print(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WorkSpace const* pws) {
-  auto const& __restrict__ ws = *pws;
-  printf("nt,nv %d %d,%d\n", ws.ntrks, pdata.nvFinal(), ws.nvIntermediate);
+__global__ void print(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WsSoAView pws) {
+  auto const& __restrict__ ws = pws;
+  printf("nt,nv %d %d,%d\n", ws.ntrks(), pdata.nvFinal(), ws.nvIntermediate());
 }
 
 int main() {
@@ -122,14 +122,13 @@ int main() {
   cms::cudatest::requireDevices();
   cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
-  // auto onGPU_d = cms::cuda::make_device_unique<gpuVertexFinder::ZVertices[]>(1, nullptr);
   ZVertex::ZVertexSoADevice onGPU_d(stream);
-  auto ws_d = cms::cuda::make_device_unique<gpuVertexFinder::WorkSpace[]>(1, nullptr);
+  gpuVertexing::workSpace::WorkSpaceSoAHeterogeneousDevice ws_d(stream);
 #else
   stream = nullptr;
-  // auto onGPU_d = std::make_unique<gpuVertexFinder::ZVertices>();
+
   ZVertex::ZVertexSoAHost onGPU_d(stream);
-  auto ws_d = std::make_unique<gpuVertexFinder::WorkSpace>();
+  gpuVertexing::workSpace::WorkSpaceSoAHeterogeneousHost ws_d(stream);
 #endif
 
   Event ev;
@@ -145,23 +144,26 @@ int main() {
       gen(ev);
 
 #ifdef __CUDACC__
-      gpuVertexFinder::init<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.get());
+      gpuVertexFinder::init<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view());
 #else
-      gpuVertexFinder::init(onGPU_d.view(), ws_d.get());
+      gpuVertexFinder::init(onGPU_d.view(), ws_d.view());
 #endif
 
       std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
       auto nt = ev.ztrack.size();
 #ifdef __CUDACC__
-      cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice));
-      cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice));
-      cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
-      cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
+      cudaCheck(cudaMemcpy(&ws_d.view().ntrks(), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice));
+      cudaCheck(
+          cudaMemcpy(ws_d.view().zt(), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice));
+      cudaCheck(
+          cudaMemcpy(ws_d.view().ezt2(), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
+      cudaCheck(
+          cudaMemcpy(ws_d.view().ptt2(), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
 #else
-      ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t));
-      ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
-      ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size());
-      ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size());
+      ::memcpy(&ws_d.view().ntrks(), &nt, sizeof(uint32_t));
+      ::memcpy(ws_d.view().zt(), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
+      ::memcpy(ws_d.view().ezt2(), ev.eztrack.data(), sizeof(float) * ev.eztrack.size());
+      ::memcpy(ws_d.view().ptt2(), ev.pttrack.data(), sizeof(float) * ev.eztrack.size());
 #endif
 
       std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl;
@@ -177,29 +179,29 @@ int main() {
 
       uint32_t nv = 0;
 #ifdef __CUDACC__
-      print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.get());
+      print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view());
       cudaCheck(cudaGetLastError());
       cudaDeviceSynchronize();
 
 #ifdef ONE_KERNEL
-      cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.view(), ws_d.get(), kk, par[0], par[1], par[2]);
+      cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]);
 #else
-      cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.view(), ws_d.get(), kk, par[0], par[1], par[2]);
+      cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]);
 #endif
-      print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.get());
+      print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view());
 
       cudaCheck(cudaGetLastError());
       cudaDeviceSynchronize();
 
-      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.get(), 50.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 50.f);
       cudaCheck(cudaGetLastError());
       cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
 #else
-      print(onGPU_d.view(), ws_d.get());
-      CLUSTERIZE(onGPU_d.view(), ws_d.get(), kk, par[0], par[1], par[2]);
-      print(onGPU_d.view(), ws_d.get());
-      fitVertices(onGPU_d.view(), ws_d.get(), 50.f);
+      print(onGPU_d.view(), ws_d.view());
+      CLUSTERIZE(onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]);
+      print(onGPU_d.view(), ws_d.view());
+      fitVertices(onGPU_d.view(), ws_d.view(), 50.f);
       nv = onGPU_d.view().nvFinal();
 #endif
 
@@ -253,12 +255,12 @@ int main() {
       }
 
 #ifdef __CUDACC__
-      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.get(), 50.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 50.f);
       cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
       cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
       cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost));
 #else
-      fitVertices(onGPU_d.view(), ws_d.get(), 50.f);
+      fitVertices(onGPU_d.view(), ws_d.view(), 50.f);
       nv = onGPU_d.view().nvFinal();
       memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float));
 #endif
@@ -273,24 +275,24 @@ int main() {
 
 #ifdef __CUDACC__
       // one vertex per block!!!
-      cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.view(), ws_d.get(), 9.f);
-      cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+      cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.view(), ws_d.view(), 9.f);
+      cudaCheck(cudaMemcpy(&nv, &ws_d.view().nvIntermediate(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      splitVertices(onGPU_d.view(), ws_d.get(), 9.f);
-      nv = ws_d->nvIntermediate;
+      splitVertices(onGPU_d.view(), ws_d.view(), 9.f);
+      nv = ws_d.view().nvIntermediate();
 #endif
       std::cout << "after split " << nv << std::endl;
 
 #ifdef __CUDACC__
-      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.get(), 5000.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 5000.f);
       cudaCheck(cudaGetLastError());
 
-      cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.view(), ws_d.get());
+      cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.view(), ws_d.view());
       cudaCheck(cudaGetLastError());
       cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      fitVertices(onGPU_d.view(), ws_d.get(), 5000.f);
-      sortByPt2(onGPU_d.view(), ws_d.get());
+      fitVertices(onGPU_d.view(), ws_d.view(), 5000.f);
+      sortByPt2(onGPU_d.view(), ws_d.view());
       nv = onGPU_d.view().nvFinal();
       memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float));
 #endif

From 3085f4944650e92019a05d732dbc1775e1d79d6c Mon Sep 17 00:00:00 2001
From: Breno Orzari <breno.orzari@hotmail.com>
Date: Fri, 4 Nov 2022 16:14:38 +0100
Subject: [PATCH 107/110] Fixing namespaces and some dataformats usage

---
 .../plugins/WorkSpaceSoAHeterogeneousDevice.h |  7 ++--
 .../plugins/WorkSpaceSoAHeterogeneousHost.h   |  7 ++--
 .../PixelVertexFinding/plugins/gpuSortByPt2.h |  2 +-
 .../plugins/gpuVertexFinder.cc                |  4 +--
 .../plugins/gpuVertexFinder.h                 |  2 +-
 .../PixelVertexFinding/test/VertexFinder_t.h  | 33 ++++++++++---------
 6 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h
index abe77cf84a777..1c704d1374ca7 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h
@@ -4,11 +4,12 @@
 #include <cstdint>
 #include "WorkSpaceUtilities.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
-#include "CUDADataFormats/Vertex/interface/WorkSpaceUtilities.h"
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h"
 #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
 
 template <int32_t S>
-class WorkSpaceSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection<WorksSpaceSoAHeterogeneousLayout> {
+class WorkSpaceSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection<WorkSpaceSoAHeterogeneousLayout<>> {
+public:
   WorkSpaceSoAHeterogeneousDevice() = default;
 
   // Constructor which specifies the SoA size and CUDA stream
@@ -17,7 +18,7 @@ class WorkSpaceSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollecti
 };
 
 namespace gpuVertexFinder {
-  namespace WorkSpace {
+  namespace workSpace {
     using WorkSpaceSoADevice = WorkSpaceSoAHeterogeneousDevice<ZVertex::utilities::MAXTRACKS>;
   }
 }  // namespace gpuVertexFinder
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h
index 5b893718a468d..1051da0bbcee8 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h
@@ -4,11 +4,12 @@
 #include <cstdint>
 #include "WorkSpaceUtilities.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
-#include "CUDADataFormats/Vertex/interface/WorkSpaceUtilities.h"
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h"
 #include "CUDADataFormats/Common/interface/PortableHostCollection.h"
 
 template <int32_t S>
-class WorkSpaceSoAHeterogeneousHost : public cms::cuda::PortableHostCollection<WorksSpaceSoAHeterogeneousLayout> {
+class WorkSpaceSoAHeterogeneousHost : public cms::cuda::PortableHostCollection<WorkSpaceSoAHeterogeneousLayout<>> {
+public:
   WorkSpaceSoAHeterogeneousHost() = default;
 
   // Constructor which specifies the SoA size and CUDA stream
@@ -17,7 +18,7 @@ class WorkSpaceSoAHeterogeneousHost : public cms::cuda::PortableHostCollection<W
 };
 
 namespace gpuVertexFinder {
-  namespace WorkSpace {
+  namespace workSpace {
     using WorkSpaceSoAHost = WorkSpaceSoAHeterogeneousHost<ZVertex::utilities::MAXTRACKS>;
   }
 }  // namespace gpuVertexFinder
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
index 38eeac91c5161..ff8cea612de47 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
@@ -34,7 +34,7 @@ namespace gpuVertexFinder {
 
     // fill indexing
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
-      data[ws[i].itrk(i)].idv() = iv[i];
+      data[ws[i].itrk()].idv() = iv[i];
     }
 
     // can be done asynchronoisly at the end of previous event
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
index baefe500d74d7..686f9899d8439 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
@@ -116,9 +116,9 @@ namespace gpuVertexFinder {
     //assert(soa);
 
 #ifdef __CUDACC__
-    auto ws_d = gpuVertexing::workSpace::WorkSpaceSoAHeterogeneousDevice(stream);
+    auto ws_d = gpuVertexFinder::workSpace::WorkSpaceSoADevice(stream);
 #else
-    auto ws_d = gpuVertexing::workSpace::WorkSpaceSoAHeterogeneousHost(nullptr);
+    auto ws_d = gpuVertexFinder::workSpace::WorkSpaceSoAHost(nullptr);
 #endif
 
 #ifdef __CUDACC__
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
index dfed3772dd2ec..cc8224521680c 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
@@ -18,7 +18,7 @@ namespace gpuVertexFinder {
   using TkSoAConstView = pixelTrack::TrackSoAConstView;
   using WsSoAView = gpuVertexFinder::workSpace::WorkSpaceSoAView;
 
-  __global__ void init(VtxSoAView pdata, WsSoAview pws) {
+  __global__ void init(VtxSoAView pdata, WsSoAView pws) {
     ZVertex::utilities::init(pdata);
     gpuVertexFinder::workSpace::utilities::init(pws);
   }
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index ec392a1f4a8d8..211b8c1b4d4c6 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -9,14 +9,15 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
-#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h"
-#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h"
-#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h"
 
 #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"  // TODO: included in order to compile Eigen columns first!!!
 #include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h"
+
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h"
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h"
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h"
 #ifdef USE_DBSCAN
 #include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h"
 #define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN
@@ -40,15 +41,15 @@ __global__ void vertexFinderOneKernel(gpuVertexFinder::VtxSoAView pdata,
                                       float errmax,  // max error to be "seed"
                                       float chi2max  // max normalized distance to cluster,
 ) {
-  clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
+  gpuVertexFinder::clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
   __syncthreads();
-  fitVertices(pdata, pws, 50.);
+  gpuVertexFinder::fitVertices(pdata, pws, 50.);
   __syncthreads();
-  splitVertices(pdata, pws, 9.f);
+  gpuVertexFinder::splitVertices(pdata, pws, 9.f);
   __syncthreads();
-  fitVertices(pdata, pws, 5000.);
+  gpuVertexFinder::fitVertices(pdata, pws, 5000.);
   __syncthreads();
-  sortByPt2(pdata, pws);
+  gpuVertexFinder::sortByPt2(pdata, pws);
 }
 #endif
 #endif
@@ -112,7 +113,7 @@ struct ClusterGenerator {
 };
 
 __global__ void print(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WsSoAView pws) {
-  auto const& __restrict__ ws = pws;
+  auto & __restrict__ ws = pws;
   printf("nt,nv %d %d,%d\n", ws.ntrks(), pdata.nvFinal(), ws.nvIntermediate());
 }
 
@@ -123,12 +124,12 @@ int main() {
   cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
   ZVertex::ZVertexSoADevice onGPU_d(stream);
-  gpuVertexing::workSpace::WorkSpaceSoAHeterogeneousDevice ws_d(stream);
+  gpuVertexFinder::workSpace::WorkSpaceSoADevice ws_d(stream);
 #else
   stream = nullptr;
 
   ZVertex::ZVertexSoAHost onGPU_d(stream);
-  gpuVertexing::workSpace::WorkSpaceSoAHeterogeneousHost ws_d(stream);
+  gpuVertexFinder::workSpace::WorkSpaceSoAHost ws_d(stream);
 #endif
 
   Event ev;
@@ -201,7 +202,7 @@ int main() {
       print(onGPU_d.view(), ws_d.view());
       CLUSTERIZE(onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]);
       print(onGPU_d.view(), ws_d.view());
-      fitVertices(onGPU_d.view(), ws_d.view(), 50.f);
+      gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 50.f);
       nv = onGPU_d.view().nvFinal();
 #endif
 
@@ -260,7 +261,7 @@ int main() {
       cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
       cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost));
 #else
-      fitVertices(onGPU_d.view(), ws_d.view(), 50.f);
+      gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 50.f);
       nv = onGPU_d.view().nvFinal();
       memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float));
 #endif
@@ -278,7 +279,7 @@ int main() {
       cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.view(), ws_d.view(), 9.f);
       cudaCheck(cudaMemcpy(&nv, &ws_d.view().nvIntermediate(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      splitVertices(onGPU_d.view(), ws_d.view(), 9.f);
+      gpuVertexFinder::splitVertices(onGPU_d.view(), ws_d.view(), 9.f);
       nv = ws_d.view().nvIntermediate();
 #endif
       std::cout << "after split " << nv << std::endl;
@@ -291,8 +292,8 @@ int main() {
       cudaCheck(cudaGetLastError());
       cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      fitVertices(onGPU_d.view(), ws_d.view(), 5000.f);
-      sortByPt2(onGPU_d.view(), ws_d.view());
+      gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 5000.f);
+      gpuVertexFinder::sortByPt2(onGPU_d.view(), ws_d.view());
       nv = onGPU_d.view().nvFinal();
       memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float));
 #endif

From 80b9c4d06244bd972a05db4cf4ea42a8ae66f0ed Mon Sep 17 00:00:00 2001
From: Dimitris Papagiannis <nothingface0@gmail.com>
Date: Fri, 4 Nov 2022 17:09:47 +0100
Subject: [PATCH 108/110] code-format

---
 CUDADataFormats/Track/interface/PixelTrackUtilities.h  |  4 +++-
 .../Track/interface/TrackSoAHeterogeneousDevice.h      |  1 -
 .../Track/test/TrackSoAHeterogeneous_test.cpp          |  3 ++-
 .../Vertex/interface/ZVertexSoAHeterogeneousDevice.h   |  3 +--
 .../PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc | 10 +++++++---
 .../plugins/PixelVertexProducerCUDA.cc                 |  2 +-
 .../plugins/PixelVertexSoAFromCUDA.cc                  | 10 +++++++---
 .../PixelVertexFinding/test/VertexFinder_t.h           |  2 +-
 .../TkSeedGenerator/plugins/SeedProducerFromSoA.cc     |  8 ++++----
 9 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/CUDADataFormats/Track/interface/PixelTrackUtilities.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h
index 4208dfe93f69c..ce3658f126a83 100644
--- a/CUDADataFormats/Track/interface/PixelTrackUtilities.h
+++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h
@@ -65,7 +65,9 @@ namespace pixelTrack {
 
     __host__ __device__ inline float zip(const TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(4); }
 
-    __host__ __device__ inline bool isTriplet(const TrackSoAConstView &tracks, int i) { return tracks[i].nLayers() == 3; }
+    __host__ __device__ inline bool isTriplet(const TrackSoAConstView &tracks, int i) {
+      return tracks[i].nLayers() == 3;
+    }
 
     template <typename V3, typename M3, typename V2, typename M2>
     __host__ __device__ inline void copyFromCircle(
diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
index fb1c45f331d19..71b8dc48b8b35 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h
@@ -16,7 +16,6 @@ class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection<T
   // Constructor which specifies the SoA size
   explicit TrackSoAHeterogeneousDevice(cudaStream_t stream)
       : PortableDeviceCollection<TrackSoAHeterogeneousLayout<>>(S, stream) {}
-
 };
 
 namespace pixelTrack {
diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
index 0647296b9ef40..572a84cdd2d73 100644
--- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
+++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp
@@ -43,7 +43,8 @@ int main() {
     // copied to from device.
     pixelTrack::TrackSoAHost tracks_h(stream);
     //tracks_d.copyToHost(tracks_h.buffer(), stream);
-    cudaCheck(cudaMemcpyAsync(tracks_h.buffer().get(), tracks_d.const_buffer().get(), tracks_d.bufferSize(), cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaMemcpyAsync(
+        tracks_h.buffer().get(), tracks_d.const_buffer().get(), tracks_d.bufferSize(), cudaMemcpyDeviceToHost, stream));
     cudaCheck(cudaGetLastError());
 
     // Print results
diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h
index b1b9779ddf400..ca97e2533b8d1 100644
--- a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h
+++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h
@@ -16,13 +16,12 @@ class ZVertexSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection
   // Constructor which specifies the SoA size
   explicit ZVertexSoAHeterogeneousDevice(cudaStream_t stream)
       : PortableDeviceCollection<ZVertexSoAHeterogeneousLayout<>>(S, stream) {}
-
 };
 
 namespace ZVertex {
 
   using ZVertexSoADevice = ZVertexSoAHeterogeneousDevice<ZVertex::utilities::MAXTRACKS>;
 
-}  // namespace pixelTrack
+}  // namespace ZVertex
 
 #endif  // CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
index 1dadeb9d0dcc1..191e9009f6d6e 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
@@ -57,9 +57,13 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
                                     edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
   cms::cuda::Product<pixelTrack::TrackSoADevice> const& inputDataWrapped = iEvent.get(tokenCUDA_);
   cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
-  auto const& tracks_d = ctx.get(inputDataWrapped);      // Tracks on device
-  tracks_h = pixelTrack::TrackSoAHost(ctx.stream());     // Create an instance of Tracks on Host, using the stream
-  cudaCheck(cudaMemcpyAsync(tracks_h.buffer().get(), tracks_d.const_buffer().get(), tracks_d.bufferSize(), cudaMemcpyDeviceToHost, ctx.stream())); // Copy data from Device to Host
+  auto const& tracks_d = ctx.get(inputDataWrapped);   // Tracks on device
+  tracks_h = pixelTrack::TrackSoAHost(ctx.stream());  // Create an instance of Tracks on Host, using the stream
+  cudaCheck(cudaMemcpyAsync(tracks_h.buffer().get(),
+                            tracks_d.const_buffer().get(),
+                            tracks_d.bufferSize(),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));  // Copy data from Device to Host
   cudaCheck(cudaGetLastError());
 }
 
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
index 45d1a9d52d99e..db60fb7ebf4bb 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
@@ -105,7 +105,7 @@ void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID,
   iEvent.getByToken(tokenGPUTrack_, hTracks);
 
   cms::cuda::ScopedContextProduce ctx{*hTracks};
-  auto &tracks = ctx.get(*hTracks);
+  auto& tracks = ctx.get(*hTracks);
 
   ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks.view(), ptMin_, ptMax_));
 }
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
index f373c95e02760..7dd714f22de6f 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
@@ -53,9 +53,13 @@ void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent,
                                      edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
   cms::cuda::Product<ZVertex::ZVertexSoADevice> const& inputDataWrapped = iEvent.get(tokenCUDA_);
   cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
-  auto const& zvertex_d = ctx.get(inputDataWrapped); // Tracks on device
-  zvertex_h = ZVertex::ZVertexSoAHost(ctx.stream()); // Create an instance of Tracks on Host, using the stream
-  cudaCheck(cudaMemcpyAsync(zvertex_h.buffer().get(), zvertex_d.const_buffer().get(), zvertex_d.bufferSize(), cudaMemcpyDeviceToHost, ctx.stream())); // Copy data from Device to Host
+  auto const& zvertex_d = ctx.get(inputDataWrapped);  // Tracks on device
+  zvertex_h = ZVertex::ZVertexSoAHost(ctx.stream());  // Create an instance of Tracks on Host, using the stream
+  cudaCheck(cudaMemcpyAsync(zvertex_h.buffer().get(),
+                            zvertex_d.const_buffer().get(),
+                            zvertex_d.bufferSize(),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));  // Copy data from Device to Host
   cudaCheck(cudaGetLastError());
 }
 
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index 211b8c1b4d4c6..c2cea5a9a1f13 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -113,7 +113,7 @@ struct ClusterGenerator {
 };
 
 __global__ void print(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WsSoAView pws) {
-  auto & __restrict__ ws = pws;
+  auto& __restrict__ ws = pws;
   printf("nt,nv %d %d,%d\n", ws.ntrks(), pdata.nvFinal(), ws.nvIntermediate());
 }
 
diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
index a5cc27c338ebe..4301749e441fe 100644
--- a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
+++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
@@ -91,7 +91,7 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co
   // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl;
   GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0());
 
-  auto & tsoa = iEvent.get(tokenTrack_);
+  auto& tsoa = iEvent.get(tokenTrack_);
 
   auto const* quality = pixelTrack::utilities::qualityData(tsoa.view());
   //auto const& fit = tsoa.stateAtBS;
@@ -100,7 +100,7 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co
 
   int32_t nt = 0;
   for (int32_t it = 0; it < maxTracks; ++it) {
-    auto nHits = pixelTrack::utilities::nHits(tsoa.view(),it);
+    auto nHits = pixelTrack::utilities::nHits(tsoa.view(), it);
     if (nHits == 0)
       break;  // this is a guard: maybe we need to move to nTracks...
 
@@ -122,11 +122,11 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co
 
     // mind: this values are respect the beamspot!
 
-    float phi = pixelTrack::utilities::phi(tsoa.view(),it);
+    float phi = pixelTrack::utilities::phi(tsoa.view(), it);
 
     riemannFit::Vector5d ipar, opar;
     riemannFit::Matrix5d icov, ocov;
-    pixelTrack::utilities::copyToDense(tsoa.view(),ipar, icov, it);
+    pixelTrack::utilities::copyToDense(tsoa.view(), ipar, icov, it);
     riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov);
 
     LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);

From 9b5af467d02c0695e9ee91bb4a83dad6c34f3c8e Mon Sep 17 00:00:00 2001
From: AdrianoDee <adriano.diflorio@ba.infn.it>
Date: Wed, 16 Nov 2022 12:01:16 +0100
Subject: [PATCH 109/110] Pixel hits portable (WIP)

---
 .../interface/TrackingRecHit2DHeterogeneous.h |  1 +
 .../interface/TrackingRecHitSoADevice.h       | 60 ++++++++++++++++
 .../interface/TrackingRecHitSoAHost.h         | 45 ++++++++++++
 .../interface/TrackingRecHitsUtilities.h      | 68 +++++++++++++++++++
 CUDADataFormats/TrackingRecHit/src/classes.h  |  2 +
 .../TrackingRecHit/src/classes_def.xml        |  4 ++
 .../TrackingRecHit/test/BuildFile.xml         |  9 ++-
 .../test/TrackingRecHitSoA_test.cpp           | 45 ++++++++++++
 .../test/TrackingRecHitSoA_test.cu            | 66 ++++++++++++++++++
 .../plugins/PixelRecHitGPUKernel.cu           | 19 +++---
 .../plugins/PixelRecHitGPUKernel.h            |  4 +-
 .../plugins/SiPixelRecHitCUDA.cc              |  6 +-
 .../plugins/SiPixelRecHitFromCUDA.cc          | 10 +--
 .../plugins/SiPixelRecHitSoAFromCUDA.cc       | 50 +++++++++-----
 .../plugins/SiPixelRecHitSoAFromLegacy.cc     | 33 +++++----
 .../SiPixelRecHits/plugins/gpuPixelRecHits.h  | 36 +++++-----
 .../python/SiPixelRecHits_cfi.py              |  2 +-
 17 files changed, 392 insertions(+), 68 deletions(-)
 create mode 100644 CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h
 create mode 100644 CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h
 create mode 100644 CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h
 create mode 100644 CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp
 create mode 100644 CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu

diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
index 8ce37f280ac6c..98112285fce13 100644
--- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
@@ -4,6 +4,7 @@
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
 #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
 #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 
 template <typename Traits>
 class TrackingRecHit2DHeterogeneous {
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h
new file mode 100644
index 0000000000000..fad70322a7c35
--- /dev/null
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h
@@ -0,0 +1,60 @@
+#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHitSoADevice_h
+#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHitSoADevice_h
+
+#include <cstdint>
+
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h"
+#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+// Device collection of pixel rec-hits (TrackingRecHitSoALayout) that also keeps host-side copies of the hit and module counts.
+class TrackingRecHitSoADevice : public cms::cuda::PortableDeviceCollection<TrackingRecHitSoALayout<>> {
+public:
+  TrackingRecHitSoADevice() = default;  // cms::cuda::Product needs this
+
+  // Allocates nHits rows on `stream` and enqueues copies of nHits, nMaxModules, hitsModuleStart (read as a host pointer) and offsetBPIX2.
+  explicit TrackingRecHitSoADevice(uint32_t nHits, bool isPhase2, int32_t offsetBPIX2, pixelCPEforGPU::ParamsOnGPU const* cpeParams, uint32_t const* hitsModuleStart, cudaStream_t stream)
+      : PortableDeviceCollection<TrackingRecHitSoALayout<>>(nHits, stream), nHits_(nHits), cpeParams_(cpeParams) {
+    nModules_ = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
+    phiBinner_ = &(view().phiBinner());
+    cudaCheck(cudaMemcpyAsync(&(view().nHits()), &nHits, sizeof(uint32_t), cudaMemcpyHostToDevice, stream));
+    cudaCheck(cudaMemcpyAsync(&(view().nMaxModules()), &nModules_, sizeof(uint32_t), cudaMemcpyHostToDevice, stream));
+    cudaCheck(cudaMemcpyAsync(&(view().hitsModuleStart()), hitsModuleStart, sizeof(uint32_t) * int(nModules_), cudaMemcpyHostToDevice, stream));
+    // cudaCheck(cudaMemcpyAsync(&(view().cpeParams()), cpeParams, int(sizeof(pixelCPEforGPU::ParamsOnGPU)),cudaMemcpyHostToDevice,stream));
+    cudaCheck(cudaMemcpyAsync(&(view().offsetBPIX2()), &offsetBPIX2, sizeof(int32_t), cudaMemcpyHostToDevice, stream));
+  }
+
+  uint32_t nHits() const { return nHits_; }
+  uint32_t nModules() const { return nModules_; }
+
+  // Returns a host buffer whose first four rows of nHits() floats are xLocal, yLocal, xerrLocal, yerrLocal (copies enqueued on `stream`).
+  cms::cuda::host::unique_ptr<float[]> localCoordToHostAsync(cudaStream_t stream) const {
+    auto ret = cms::cuda::make_host_unique<float[]>(5 * nHits(), stream);  // allocation size unchanged; only four rows are filled
+    size_t rowSize = sizeof(float) * nHits();  // bytes per row; destination offsets below are in float elements, not bytes
+    cudaCheck(cudaMemcpyAsync(ret.get(), view().xLocal(), rowSize, cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaMemcpyAsync(ret.get() + nHits(), view().yLocal(), rowSize, cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaMemcpyAsync(ret.get() + 2 * nHits(), view().xerrLocal(), rowSize, cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaMemcpyAsync(ret.get() + 3 * nHits(), view().yerrLocal(), rowSize, cudaMemcpyDeviceToHost, stream));
+    return ret;
+  }
+
+  // NOTE(review): reads nModules() + 1 entries while the constructor uploads only nModules_ of them - confirm the intended length.
+  cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const {
+    auto ret = cms::cuda::make_host_unique<uint32_t[]>(nModules() + 1, stream);
+    cudaCheck(cudaMemcpyAsync(ret.get(), view().hitsModuleStart().begin(), sizeof(uint32_t) * (nModules() + 1), cudaMemcpyDeviceToHost, stream));
+    return ret;
+  }
+
+  auto phiBinnerStorage() { return phiBinnerStorage_; }
+  auto phiBinner() { return phiBinner_; }
+
+private:
+  uint32_t nHits_ = 0;  // Needed for the host SoA size
+  pixelCPEforGPU::ParamsOnGPU const* cpeParams_ = nullptr;  // TODO: this is used not that much (only once in BrokenLineFit), would make sense to remove it from this class.
+  uint32_t nModules_ = 0;
+  trackingRecHitSoA::PhiBinnerStorageType* phiBinnerStorage_ = nullptr;  // never assigned by the constructors in this header
+  trackingRecHitSoA::PhiBinner* phiBinner_ = nullptr;
+};
+
+#endif  // CUDADataFormats_TrackingRecHit_interface_TrackingRecHitSoADevice_h
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h
new file mode 100644
index 0000000000000..cb76d538474da
--- /dev/null
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h
@@ -0,0 +1,45 @@
+#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHitSoAHost_h
+#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHitSoAHost_h
+
+#include <algorithm>
+#include <cstdint>
+
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h"
+#include "CUDADataFormats/Common/interface/PortableHostCollection.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+// Host collection of pixel rec-hits (TrackingRecHitSoALayout) that also caches the hit and module counts as plain members.
+class TrackingRecHitSoAHost : public cms::cuda::PortableHostCollection<TrackingRecHitSoALayout<>> {
+public:
+  TrackingRecHitSoAHost() = default;
+
+  // This SoA Host is used basically only for DQM
+  // so we just need a slim constructor (it records nHits; the module count stays 0)
+  explicit TrackingRecHitSoAHost(uint32_t nHits, cudaStream_t stream)
+      : PortableHostCollection<TrackingRecHitSoALayout<>>(nHits, stream), nHits_(nHits) {}
+
+  // Full constructor: allocates nHits rows and fills nHits, nMaxModules, hitsModuleStart and offsetBPIX2 in the view.
+  explicit TrackingRecHitSoAHost(uint32_t nHits, bool isPhase2, int32_t offsetBPIX2, pixelCPEforGPU::ParamsOnGPU const* cpeParams, uint32_t const* hitsModuleStart, cudaStream_t stream)
+      : PortableHostCollection<TrackingRecHitSoALayout<>>(nHits, stream), nHits_(nHits), cpeParams_(cpeParams) {
+    nModules_ = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
+
+    view().nHits() = nHits;
+    view().nMaxModules() = nModules_;
+    // NOTE(review): copies nModules_ entries into the fixed-size HitModuleStartArray - confirm it is large enough for Phase-2.
+    std::copy(hitsModuleStart, hitsModuleStart + nModules_, view().hitsModuleStart().begin());
+    view().offsetBPIX2() = offsetBPIX2;
+  }
+
+  uint32_t nHits() const { return nHits_; }
+  uint32_t nModules() const { return nModules_; }
+  auto phiBinnerStorage() { return phiBinnerStorage_; }
+
+private:
+  uint32_t nHits_ = 0;  // Needed for the host SoA size
+  pixelCPEforGPU::ParamsOnGPU const* cpeParams_ = nullptr;
+  uint32_t nModules_ = 0;
+  trackingRecHitSoA::PhiBinnerStorageType* phiBinnerStorage_ = nullptr;  // never assigned by the constructors in this header
+};
+
+#endif  // CUDADataFormats_TrackingRecHit_interface_TrackingRecHitSoAHost_h
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h
new file mode 100644
index 0000000000000..f9cc022e571e3
--- /dev/null
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h
@@ -0,0 +1,68 @@
+#ifndef CUDADataFormats_RecHits_TrackingRecHitsUtilities_h
+#define CUDADataFormats_RecHits_TrackingRecHitsUtilities_h
+
+#include <array>
+#include <cstdint>
+
+#include <Eigen/Dense>
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "DataFormats/SoATemplate/interface/SoALayout.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
+#include "SiPixelHitStatus.h"
+
+namespace trackingRecHitSoA {
+  // more information on bit fields : https://en.cppreference.com/w/cpp/language/bit_field
+  struct SiPixelHitStatusAndCharge {
+    SiPixelHitStatus status;
+    uint32_t charge : 24;
+  };
+
+  struct Test {  // NOTE(review): not referenced anywhere in this header - confirm whether it can be dropped
+    int a;
+  };
+
+  using hindex_type = uint32_t;  // if above is <=2^32
+  using PhiBinner = cms::cuda::
+      HistoContainer<int16_t, 256, -1, 8 * sizeof(int16_t), hindex_type, pixelTopology::maxLayers>;  //28 for phase2 geometry
+  using PhiBinnerStorageType = PhiBinner::index_type;
+
+  using AverageGeometry = pixelTopology::AverageGeometry;
+
+  using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPU;
+
+  // NOTE(review): the sizes 11 and 1856 are hard-coded - confirm they cover every geometry this layout is used with.
+  using HitLayerStartArray = std::array<uint32_t, 11>;
+  using HitModuleStartArray = std::array<uint32_t, 1856>;
+}  // namespace trackingRecHitSoA
+
+GENERATE_SOA_LAYOUT(TrackingRecHitSoALayout,
+                    SOA_COLUMN(float, xLocal),
+                    SOA_COLUMN(float, yLocal),
+                    SOA_COLUMN(float, xerrLocal),
+                    SOA_COLUMN(float, yerrLocal),
+                    SOA_COLUMN(float, xGlobal),
+                    SOA_COLUMN(float, yGlobal),
+                    SOA_COLUMN(float, zGlobal),
+                    SOA_COLUMN(float, rGlobal),
+                    SOA_COLUMN(int16_t, iphi),
+                    SOA_COLUMN(trackingRecHitSoA::SiPixelHitStatusAndCharge, chargeAndStatus),
+                    SOA_COLUMN(int16_t, clusterSizeX),
+                    SOA_COLUMN(int16_t, clusterSizeY),
+                    SOA_COLUMN(int16_t, detectorIndex),
+
+                    SOA_SCALAR(trackingRecHitSoA::ParamsOnGPU, cpeParams),
+                    SOA_SCALAR(trackingRecHitSoA::AverageGeometry, averageGeometry),
+                    SOA_SCALAR(trackingRecHitSoA::PhiBinner, phiBinner),
+                    SOA_SCALAR(trackingRecHitSoA::HitLayerStartArray, hitsLayerStart),
+                    SOA_SCALAR(trackingRecHitSoA::HitModuleStartArray, hitsModuleStart),
+                    SOA_SCALAR(uint32_t, nHits),
+                    SOA_SCALAR(int32_t, offsetBPIX2),
+                    SOA_SCALAR(uint32_t, nMaxModules))
+
+namespace trackingRecHitSoA {
+  using HitSoAView = TrackingRecHitSoALayout<>::View;
+  using HitSoAConstView = TrackingRecHitSoALayout<>::ConstView;
+}  // namespace trackingRecHitSoA
+
+#endif  // CUDADataFormats_RecHits_TrackingRecHitsUtilities_h
diff --git a/CUDADataFormats/TrackingRecHit/src/classes.h b/CUDADataFormats/TrackingRecHit/src/classes.h
index abecfb38797de..b43537c915e3d 100644
--- a/CUDADataFormats/TrackingRecHit/src/classes.h
+++ b/CUDADataFormats/TrackingRecHit/src/classes.h
@@ -4,6 +4,8 @@
 #include "CUDADataFormats/Common/interface/Product.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h"
 #include "DataFormats/Common/interface/Wrapper.h"
 
 #endif  // CUDADataFormats_SiPixelCluster_src_classes_h
diff --git a/CUDADataFormats/TrackingRecHit/src/classes_def.xml b/CUDADataFormats/TrackingRecHit/src/classes_def.xml
index f633d77c48ef7..fe92a1b6ea31e 100644
--- a/CUDADataFormats/TrackingRecHit/src/classes_def.xml
+++ b/CUDADataFormats/TrackingRecHit/src/classes_def.xml
@@ -7,4 +7,8 @@
   <class name="edm::Wrapper<cms::cuda::Product<TrackingRecHit2DGPU>>" persistent="false"/>
   <class name="TrackingRecHit2DReduced" persistent="false"/>
   <class name="edm::Wrapper<TrackingRecHit2DReduced>" persistent="false"/>
+  <class name="TrackingRecHitSoAHost" persistent="false"/>
+  <class name="edm::Wrapper<TrackingRecHitSoAHost>" persistent="false"/>
+  <class name="cms::cuda::Product<TrackingRecHitSoADevice>" persistent="false"/>
+  <class name="edm::Wrapper<cms::cuda::Product<TrackingRecHitSoADevice>>" persistent="false"/>
 </lcgdict>
diff --git a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml
index ce49c46fffba0..77626dbf724ff 100644
--- a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml
+++ b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml
@@ -1,5 +1,12 @@
 <use name="CUDADataFormats/TrackingRecHit"/>
 <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
+
 <iftool name="cuda-gcc-support">
-<bin file="TrackingRecHit2DCUDA_t.cpp TrackingRecHit2DCUDA_t.cu" name="TrackingRecHit2DCUDA_t"/>
+ <bin file="TrackingRecHit2DCUDA_t.cpp TrackingRecHit2DCUDA_t.cu" name="TrackingRecHit2DCUDA_t">
+ </bin>
+</iftool>
+
+<iftool name="cuda-gcc-support">
+  <bin file="TrackingRecHitSoA_test.cpp TrackingRecHitSoA_test.cu" name="TrackingRecHitSoA_test">
+  </bin>
 </iftool>
diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp
new file mode 100644
index 0000000000000..7f4308ebf1492
--- /dev/null
+++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp
@@ -0,0 +1,45 @@
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h"
+
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+
+namespace testTrackingRecHit2DNew {
+
+  void run(TrackingRecHitSoADevice& hits, cudaStream_t stream);
+
+}
+
+int main() {
+  cms::cudatest::requireDevices();
+
+  cudaStream_t stream;
+  cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+
+
+  // inner scope to deallocate memory before destroying the stream
+  {
+    uint32_t nHits = 2000;
+    int32_t offset = 100;
+    uint32_t moduleStart[1856];
+
+    for (size_t i = 0; i < 1856; i++) {
+      moduleStart[i] = i*2;
+    }
+
+    TrackingRecHitSoADevice tkhit(nHits,false,offset,nullptr,&moduleStart[0],stream);
+
+    testTrackingRecHit2DNew::run(tkhit,stream);
+
+    auto test = tkhit.localCoordToHostAsync(stream);
+    printf("tkhit hits %d \n",tkhit.nHits());
+  }
+
+  cudaCheck(cudaStreamDestroy(stream));
+
+  return 0;
+}
diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu
new file mode 100644
index 0000000000000..93f4dde061786
--- /dev/null
+++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu
@@ -0,0 +1,66 @@
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h"
+
+namespace testTrackingRecHit2DNew {
+
+  __global__ void fill(trackingRecHitSoA::HitSoAView soa) {
+    // assert(soa);
+
+    int i = threadIdx.x;
+    int j = blockIdx.x;
+    if(i==0 and j==0)
+    {
+
+      soa.offsetBPIX2() = 22;
+      soa[10].xLocal() =1.11;
+   }
+
+   soa[i].iphi() = i%10;
+   soa.hitsLayerStart()[j] = j;
+    //k = soa.test().a;
+
+  }
+
+  __global__ void show(trackingRecHitSoA::HitSoAView soa) {
+    // assert(soa);
+
+    int i = threadIdx.x;
+    int j = blockIdx.x;
+    if(i==0 and j==0)
+    {
+      printf("nbins = %d \n", soa.phiBinner().nbins());
+      printf("offsetBPIX %d ->%d \n",i,soa.offsetBPIX2());
+      printf("nHits %d ->%d \n",i,soa.nHits());
+      printf("hitsModuleStart %d ->%d \n",i,soa.hitsModuleStart().at(28));
+   }
+
+   if(i<soa.nHits())
+    printf("iPhi %d ->%d \n",i,soa[i].iphi());
+
+  if(j*blockDim.x+i < soa.phiBinner().nbins())
+   printf(">bin size %d ->%d \n",j*blockDim.x+i,soa.phiBinner().size(j*blockDim.x+i));
+
+  }
+
+
+
+  void run(TrackingRecHitSoADevice& hits, cudaStream_t stream) {
+    // assert(soa);
+    printf("RUN!\n");
+    int k = 0;
+    show<<<10, 100, 0, stream>>>(hits.view());
+    printf("k = %d\n",k);
+
+    cms::cuda::fillManyFromVector(hits.phiBinner(),
+                                  10,
+                                  hits.view().iphi(),
+                                  hits.view().hitsLayerStart().data(),
+                                  2000,
+                                  256,
+                                  hits.phiBinnerStorage(),
+                                  stream);
+
+    show<<<10, 1000, 0, stream>>>(hits.view());
+  }
+
+}  // namespace testTrackingRecHit2D
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu
index 135254fa6e9f2..6fd9a57a6cc72 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu
@@ -34,7 +34,7 @@ namespace {
 
 namespace pixelgpudetails {
 
-  TrackingRecHit2DGPU PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d,
+  TrackingRecHitSoADevice PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d,
                                                           SiPixelClustersCUDA const& clusters_d,
                                                           BeamSpotCUDA const& bs_d,
                                                           pixelCPEforGPU::ParamsOnGPU const* cpeParams,
@@ -42,10 +42,11 @@ namespace pixelgpudetails {
                                                           cudaStream_t stream) const {
     auto nHits = clusters_d.nClusters();
 
-    TrackingRecHit2DGPU hits_d(
-        nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream);
-    assert(hits_d.nMaxModules() == isPhase2 ? phase2PixelTopology::numberOfModules
-                                            : phase1PixelTopology::numberOfModules);
+    TrackingRecHitSoADevice hits_d(nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream);
+    // TrackingRecHit2DGPU hits_d(
+    //     nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream);
+    // assert(hits_d.nMaxModules() == isPhase2 ? phase2PixelTopology::numberOfModules
+    //                                         : phase1PixelTopology::numberOfModules);
 
     int activeModulesWithDigis = digis_d.nModules();
     // protect from empty events
@@ -65,13 +66,13 @@ namespace pixelgpudetails {
 
       // assuming full warp of threads is better than a smaller number...
       if (nHits) {
-        setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart());
+        setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.view().hitsLayerStart().data());
         cudaCheck(cudaGetLastError());
         auto nLayers = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers;
-        cms::cuda::fillManyFromVector(hits_d.phiBinner(),
+        cms::cuda::fillManyFromVector(&(hits_d.view().phiBinner()),
                                       nLayers,
-                                      hits_d.iphi(),
-                                      hits_d.hitsLayerStart(),
+                                      hits_d.view().iphi(),
+                                      hits_d.view().hitsLayerStart().data(),
                                       nHits,
                                       256,
                                       hits_d.phiBinnerStorage(),
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h
index 8289c8db7f2f4..5d55e713391e1 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h
@@ -8,7 +8,7 @@
 #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
 #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
 #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
-#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h"
 
 namespace pixelgpudetails {
 
@@ -22,7 +22,7 @@ namespace pixelgpudetails {
     PixelRecHitGPUKernel& operator=(const PixelRecHitGPUKernel&) = delete;
     PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete;
 
-    TrackingRecHit2DGPU makeHitsAsync(SiPixelDigisCUDA const& digis_d,
+    TrackingRecHitSoADevice makeHitsAsync(SiPixelDigisCUDA const& digis_d,
                                       SiPixelClustersCUDA const& clusters_d,
                                       BeamSpotCUDA const& bs_d,
                                       pixelCPEforGPU::ParamsOnGPU const* cpeParams,
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc
index 8112e9ebd19c8..1c050e037144e 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc
@@ -4,7 +4,7 @@
 #include "CUDADataFormats/Common/interface/Product.h"
 #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
 #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
-#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h"
 #include "DataFormats/Common/interface/Handle.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
@@ -37,7 +37,7 @@ class SiPixelRecHitCUDA : public edm::global::EDProducer<> {
   const edm::EDGetTokenT<cms::cuda::Product<BeamSpotCUDA>> tBeamSpot;
   const edm::EDGetTokenT<cms::cuda::Product<SiPixelClustersCUDA>> token_;
   const edm::EDGetTokenT<cms::cuda::Product<SiPixelDigisCUDA>> tokenDigi_;
-  const edm::EDPutTokenT<cms::cuda::Product<TrackingRecHit2DGPU>> tokenHit_;
+  const edm::EDPutTokenT<cms::cuda::Product<TrackingRecHitSoADevice>> tokenHit_;
 
   const pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_;
 };
@@ -47,7 +47,7 @@ SiPixelRecHitCUDA::SiPixelRecHitCUDA(const edm::ParameterSet& iConfig)
       tBeamSpot(consumes<cms::cuda::Product<BeamSpotCUDA>>(iConfig.getParameter<edm::InputTag>("beamSpot"))),
       token_(consumes<cms::cuda::Product<SiPixelClustersCUDA>>(iConfig.getParameter<edm::InputTag>("src"))),
       tokenDigi_(consumes<cms::cuda::Product<SiPixelDigisCUDA>>(iConfig.getParameter<edm::InputTag>("src"))),
-      tokenHit_(produces<cms::cuda::Product<TrackingRecHit2DGPU>>()) {}
+      tokenHit_(produces<cms::cuda::Product<TrackingRecHitSoADevice>>()) {}
 
 void SiPixelRecHitCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
   edm::ParameterSetDescription desc;
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc
index 7ff2da5552e6d..bc6c3fa370372 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc
@@ -24,6 +24,8 @@
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
 #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
 
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h"
+
 class SiPixelRecHitFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
   explicit SiPixelRecHitFromCUDA(const edm::ParameterSet& iConfig);
@@ -40,7 +42,7 @@ class SiPixelRecHitFromCUDA : public edm::stream::EDProducer<edm::ExternalWork>
   void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
 
   const edm::ESGetToken<TrackerGeometry, TrackerDigiGeometryRecord> geomToken_;
-  const edm::EDGetTokenT<cms::cuda::Product<TrackingRecHit2DGPU>> hitsToken_;  // CUDA hits
+  const edm::EDGetTokenT<cms::cuda::Product<TrackingRecHitSoADevice>> hitsToken_;  // CUDA hits
   const edm::EDGetTokenT<SiPixelClusterCollectionNew> clusterToken_;           // legacy clusters
   const edm::EDPutTokenT<SiPixelRecHitCollection> rechitsPutToken_;            // legacy rechits
   const edm::EDPutTokenT<HMSstorage> hostPutToken_;
@@ -54,7 +56,7 @@ class SiPixelRecHitFromCUDA : public edm::stream::EDProducer<edm::ExternalWork>
 SiPixelRecHitFromCUDA::SiPixelRecHitFromCUDA(const edm::ParameterSet& iConfig)
     : geomToken_(esConsumes()),
       hitsToken_(
-          consumes<cms::cuda::Product<TrackingRecHit2DGPU>>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"))),
+          consumes<cms::cuda::Product<TrackingRecHitSoADevice>>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"))),
       clusterToken_(consumes<SiPixelClusterCollectionNew>(iConfig.getParameter<edm::InputTag>("src"))),
       rechitsPutToken_(produces<SiPixelRecHitCollection>()),
       hostPutToken_(produces<HMSstorage>()) {}
@@ -69,12 +71,12 @@ void SiPixelRecHitFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& des
 void SiPixelRecHitFromCUDA::acquire(edm::Event const& iEvent,
                                     edm::EventSetup const& iSetup,
                                     edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  cms::cuda::Product<TrackingRecHit2DGPU> const& inputDataWrapped = iEvent.get(hitsToken_);
+  cms::cuda::Product<TrackingRecHitSoADevice> const& inputDataWrapped = iEvent.get(hitsToken_);
   cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
   auto const& inputData = ctx.get(inputDataWrapped);
 
   nHits_ = inputData.nHits();
-  nMaxModules_ = inputData.nMaxModules();
+  nMaxModules_ = inputData.nModules();
   LogDebug("SiPixelRecHitFromCUDA") << "converting " << nHits_ << " Hits";
 
   if (0 == nHits_)
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc
index 7532470ebd3d4..aedaf6955c747 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc
@@ -24,6 +24,9 @@
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
 #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
 
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h"
+
 class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
   explicit SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig);
@@ -38,22 +41,24 @@ class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWor
                edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
   void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
 
-  const edm::EDGetTokenT<cms::cuda::Product<TrackingRecHit2DGPU>> hitsTokenGPU_;  // CUDA hits
-  const edm::EDPutTokenT<TrackingRecHit2DCPU> hitsPutTokenCPU_;
+  const edm::EDGetTokenT<cms::cuda::Product<TrackingRecHitSoADevice>> hitsTokenGPU_;  // CUDA hits
+  const edm::EDPutTokenT<TrackingRecHitSoAHost> hitsPutTokenCPU_;
   const edm::EDPutTokenT<HMSstorage> hostPutToken_;
 
   uint32_t nHits_;
+  TrackingRecHitSoAHost hits_h_;
+
   uint32_t nMaxModules_;
 
-  cms::cuda::host::unique_ptr<float[]> store32_;
-  cms::cuda::host::unique_ptr<uint16_t[]> store16_;
-  cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStart_;
+  // cms::cuda::host::unique_ptr<float[]> store32_;
+  // cms::cuda::host::unique_ptr<uint16_t[]> store16_;
+  // cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStart_;
 };
 
 SiPixelRecHitSoAFromCUDA::SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig)
     : hitsTokenGPU_(
-          consumes<cms::cuda::Product<TrackingRecHit2DGPU>>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"))),
-      hitsPutTokenCPU_(produces<TrackingRecHit2DCPU>()),
+          consumes<cms::cuda::Product<TrackingRecHitSoADevice>>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"))),
+      hitsPutTokenCPU_(produces<TrackingRecHitSoAHost>()),
       hostPutToken_(produces<HMSstorage>()) {}
 
 void SiPixelRecHitSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
@@ -65,29 +70,42 @@ void SiPixelRecHitSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions&
 void SiPixelRecHitSoAFromCUDA::acquire(edm::Event const& iEvent,
                                        edm::EventSetup const& iSetup,
                                        edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  cms::cuda::Product<TrackingRecHit2DGPU> const& inputDataWrapped = iEvent.get(hitsTokenGPU_);
+  cms::cuda::Product<TrackingRecHitSoADevice> const& inputDataWrapped = iEvent.get(hitsTokenGPU_);
   cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
   auto const& inputData = ctx.get(inputDataWrapped);
 
+  // Take the sizes from the host-side accessors of the device collection: its view() refers to
+  // the device buffer, so the nHits/nMaxModules scalars must not be dereferenced on the host.
   nHits_ = inputData.nHits();
-  LogDebug("SiPixelRecHitSoAFromCUDA") << "copying to cpu SoA" << inputData.nHits() << " Hits";
+  // Set before the early return: produce() sizes its module-start buffer from nMaxModules_
+  // for every event, including empty ones.
+  nMaxModules_ = inputData.nModules();
 
   if (0 == nHits_)
     return;
-  nMaxModules_ = inputData.nMaxModules();
-  store32_ = inputData.store32ToHostAsync(ctx.stream());
-  store16_ = inputData.store16ToHostAsync(ctx.stream());
-  hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream());
+
+  // Allocate the host SoA and enqueue, on the context stream, a copy of the whole device buffer.
+  hits_h_ = TrackingRecHitSoAHost(nHits_,ctx.stream());
+  cudaCheck(cudaMemcpyAsync(hits_h_.buffer().get(),
+                            inputData.const_buffer().get(),
+                            inputData.bufferSize(),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));  // Copy data from Device to Host
+  cudaCheck(cudaGetLastError());
+
+  LogDebug("SiPixelRecHitSoAFromCUDA") << "copying to cpu SoA" << inputData.nHits() << " Hits";
 }
 
 void SiPixelRecHitSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& es) {
   auto hmsp = std::make_unique<uint32_t[]>(nMaxModules_ + 1);
 
   if (nHits_ > 0)
-    std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get());
+    // copy the whole hitsModuleStart array of the host SoA into the buffer emplaced below as HMSstorage
+    std::copy(hits_h_.view().hitsModuleStart().begin(),hits_h_.view().hitsModuleStart().end(),hmsp.get());
 
   iEvent.emplace(hostPutToken_, std::move(hmsp));
-  iEvent.emplace(hitsPutTokenCPU_, store32_.get(), store16_.get(), hitsModuleStart_.get(), nHits_);
+  iEvent.emplace(hitsPutTokenCPU_, std::move(hits_h_));  // NOTE(review): for nHits_ == 0 acquire() returns before assigning hits_h_ - confirm
 }
 
 DEFINE_FWK_MODULE(SiPixelRecHitSoAFromCUDA);
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc
index d23ecec66fea0..663674b2a4145 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc
@@ -3,7 +3,7 @@
 #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
 #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
 #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
-#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
 #include "CUDADataFormats/Common/interface/HostProduct.h"
 #include "DataFormats/BeamSpot/interface/BeamSpot.h"
 #include "DataFormats/Common/interface/DetSetVectorNew.h"
@@ -25,6 +25,8 @@
 #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h"
 #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h"
 
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h"
+
 #include "gpuPixelRecHits.h"
 
 class SiPixelRecHitSoAFromLegacy : public edm::global::EDProducer<> {
@@ -44,7 +46,7 @@ class SiPixelRecHitSoAFromLegacy : public edm::global::EDProducer<> {
   const edm::ESGetToken<PixelClusterParameterEstimator, TkPixelCPERecord> cpeToken_;
   const edm::EDGetTokenT<reco::BeamSpot> bsGetToken_;
   const edm::EDGetTokenT<SiPixelClusterCollectionNew> clusterToken_;  // Legacy Clusters
-  const edm::EDPutTokenT<TrackingRecHit2DCPU> tokenHit_;
+  const edm::EDPutTokenT<TrackingRecHitSoAHost> tokenHit_;
   const edm::EDPutTokenT<HMSstorage> tokenModuleStart_;
   const bool convert2Legacy_;
   const bool isPhase2_;
@@ -55,7 +57,7 @@ SiPixelRecHitSoAFromLegacy::SiPixelRecHitSoAFromLegacy(const edm::ParameterSet&
       cpeToken_(esConsumes(edm::ESInputTag("", iConfig.getParameter<std::string>("CPE")))),
       bsGetToken_{consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpot"))},
       clusterToken_{consumes<SiPixelClusterCollectionNew>(iConfig.getParameter<edm::InputTag>("src"))},
-      tokenHit_{produces<TrackingRecHit2DCPU>()},
+      tokenHit_{produces<TrackingRecHitSoAHost>()},
       tokenModuleStart_{produces<HMSstorage>()},
       convert2Legacy_(iConfig.getParameter<bool>("convertToLegacy")),
       isPhase2_(iConfig.getParameter<bool>("isPhase2")) {
@@ -156,9 +158,9 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv
   // output SoA
   // element 96 is the start of BPIX2 (i.e. the number of clusters in BPIX1)
 
-  auto output = std::make_unique<TrackingRecHit2DCPU>(
+  auto output = std::make_unique<TrackingRecHitSoAHost>(
       numberOfClusters, isPhase2_, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, nullptr);
-  assert(output->nMaxModules() == uint32_t(nMaxModules));
+  assert(output->nModules() == uint32_t(nMaxModules));
 
   if (0 == numberOfClusters) {
     iEvent.put(std::move(output));
@@ -239,9 +241,9 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv
     gpuPixelRecHits::getHits(&cpeView, &bsHost, digiView, ndigi, &clusterView, output->view());
     for (auto h = fc; h < lc; ++h)
       if (h - fc < maxHitsInModule)
-        assert(gind == output->view()->detectorIndex(h));
+        assert(gind == output->view()[h].detectorIndex());
       else
-        assert(gpuClustering::invalidModuleId == output->view()->detectorIndex(h));
+        assert(gpuClustering::invalidModuleId == output->view()[h].detectorIndex());
     if (convert2Legacy_) {
       SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(*legacyOutput, detid);
       for (auto h = fc; h < lc; ++h) {
@@ -250,8 +252,8 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv
         if (ih >= maxHitsInModule)
           break;
         assert(ih < clusterRef.size());
-        LocalPoint lp(output->view()->xLocal(h), output->view()->yLocal(h));
-        LocalError le(output->view()->xerrLocal(h), 0, output->view()->yerrLocal(h));
+        LocalPoint lp(output->view()[h].xLocal(), output->view()[h].yLocal());
+        LocalError le(output->view()[h].xerrLocal(), 0, output->view()[h].yerrLocal());
         SiPixelRecHitQuality::QualWordType rqw = 0;
         SiPixelRecHit hit(lp, le, rqw, *genericDet, clusterRef[ih]);
         recHitsOnDetUnit.push_back(hit);
@@ -264,20 +266,21 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv
   // fill data structure to support CA
   const auto nLayers = isPhase2_ ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers;
   for (auto i = 0U; i < nLayers + 1; ++i) {
-    output->hitsLayerStart()[i] = hitsModuleStart[cpeView.layerGeometry().layerStart[i]];
+    output->view().hitsLayerStart()[i] = hitsModuleStart[cpeView.layerGeometry().layerStart[i]];
     LogDebug("SiPixelRecHitSoAFromLegacy")
         << "Layer n." << i << " - starting at module: " << cpeView.layerGeometry().layerStart[i]
-        << " - starts ad cluster: " << output->hitsLayerStart()[i] << "\n";
+        << " - starts ad cluster: " << output->view().hitsLayerStart()[i] << "\n";  // scalar array: index it, not the SoA element
   }
 
-  cms::cuda::fillManyFromVector(output->phiBinner(),
+  cms::cuda::fillManyFromVector(&(output->view().phiBinner()),
                                 nLayers,
-                                output->iphi(),
-                                output->hitsLayerStart(),
-                                numberOfHits,
+                                output->view().iphi(),
+                                output->view().hitsLayerStart().data(),
+                                output->nHits(),
                                 256,
                                 output->phiBinnerStorage());
 
+
   LogDebug("SiPixelRecHitSoAFromLegacy") << "created HitSoa for " << numberOfClusters << " clusters in "
                                          << numberOfDetUnits << " Dets";
   iEvent.put(std::move(output));
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
index 5b862b2cf63b9..db0940f0f50f7 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
@@ -7,11 +7,12 @@
 
 #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
 #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
-#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
 #include "DataFormats/Math/interface/approx_atan2.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
 #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
 #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h"
 
 namespace gpuPixelRecHits {
 
@@ -20,14 +21,14 @@ namespace gpuPixelRecHits {
                           SiPixelDigisCUDASOAView const digis,
                           int numElements,
                           SiPixelClustersCUDA::SiPixelClustersCUDASOAView const* __restrict__ pclusters,
-                          TrackingRecHit2DSOAView* phits) {
+                          trackingRecHitSoA::HitSoAView hits) {
     // FIXME
     // the compiler seems NOT to optimize loads from views (even in a simple test case)
     // The whole gimnastic here of copying or not is a pure heuristic exercise that seems to produce the fastest code with the above signature
     // not using views (passing a gazzilion of array pointers) seems to produce the fastest code (but it is harder to mantain)
-    assert(phits);
+    // assert(phits);
     assert(cpeParams);
-    auto& hits = *phits;
+    // auto& hits = *phits;
 
     auto const& clusters = *pclusters;
     auto isPhase2 = cpeParams->commonParams().isPhase2;
@@ -175,18 +176,19 @@ namespace gpuPixelRecHits {
           pixelCPEforGPU::errorFromSize(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
 
         // store it
-        hits.setChargeAndStatus(h, clusParams.charge[ic], clusParams.status[ic]);
-        hits.detectorIndex(h) = me;
+        hits[h].chargeAndStatus().charge = clusParams.charge[ic];
+        hits[h].chargeAndStatus().status = clusParams.status[ic];
+        hits[h].detectorIndex() = me;
 
         float xl, yl;
-        hits.xLocal(h) = xl = clusParams.xpos[ic];
-        hits.yLocal(h) = yl = clusParams.ypos[ic];
+        hits[h].xLocal() = xl = clusParams.xpos[ic];
+        hits[h].yLocal() = yl = clusParams.ypos[ic];
 
-        hits.clusterSizeX(h) = clusParams.xsize[ic];
-        hits.clusterSizeY(h) = clusParams.ysize[ic];
+        hits[h].clusterSizeX() = clusParams.xsize[ic];
+        hits[h].clusterSizeY() = clusParams.ysize[ic];
 
-        hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX;
-        hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY;
+        hits[h].xerrLocal() = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX;
+        hits[h].yerrLocal() = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY;
 
         // keep it local for computations
         float xg, yg, zg;
@@ -197,12 +199,12 @@ namespace gpuPixelRecHits {
         yg -= bs->y;
         zg -= bs->z;
 
-        hits.xGlobal(h) = xg;
-        hits.yGlobal(h) = yg;
-        hits.zGlobal(h) = zg;
+        hits[h].xGlobal() = xg;
+        hits[h].yGlobal() = yg;
+        hits[h].zGlobal() = zg;
 
-        hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg);
-        hits.iphi(h) = unsafe_atan2s<7>(yg, xg);
+        hits[h].rGlobal() = std::sqrt(xg * xg + yg * yg);
+        hits[h].iphi() = unsafe_atan2s<7>(yg, xg);
       }
       __syncthreads();
     }  // end loop on batches
diff --git a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py
index 4af0238682abb..11a69fead8ad3 100644
--- a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py
+++ b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py
@@ -56,7 +56,7 @@
 siPixelRecHitsPreSplittingSoA = SwitchProducerCUDA(
     cpu = cms.EDAlias(
             siPixelRecHitsPreSplittingCPU = cms.VPSet(
-                 cms.PSet(type = cms.string("cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous")),
+                 cms.PSet(type = cms.string("TrackingRecHitSoAHost")),
                  cms.PSet(type = cms.string("uintAsHostProduct"))
              )),
 )

From 8016c995db6364391f8e955d52d7337016084671 Mon Sep 17 00:00:00 2001
From: AdrianoDee <adriano.diflorio@ba.infn.it>
Date: Fri, 18 Nov 2022 12:33:50 +0100
Subject: [PATCH 110/110] Pixel hits portable (WIP) - II

---
 .../interface/TrackingRecHitSoADevice.h       | 30 +++++++++--------
 .../interface/TrackingRecHitsUtilities.h      | 26 +++++++++++----
 .../test/TrackingRecHitSoA_test.cpp           | 18 +++++++++--
 .../test/TrackingRecHitSoA_test.cu            | 15 ++++++---
 .../plugins/PixelRecHitGPUKernel.cu           |  9 ++++--
 .../plugins/PixelRecHitGPUKernel.h            |  1 +
 .../plugins/SiPixelRecHitCUDA.cc              |  3 ++
 .../plugins/SiPixelRecHitFromCUDA.cc          | 32 +++++++++++++++----
 .../SiPixelRecHits/plugins/gpuPixelRecHits.h  |  1 +
 9 files changed, 99 insertions(+), 36 deletions(-)

diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h
index fad70322a7c35..104eab337af3f 100644
--- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h
@@ -14,10 +14,11 @@ class TrackingRecHitSoADevice : public cms::cuda::PortableDeviceCollection<Track
 
   // Constructor which specifies the SoA size
   explicit TrackingRecHitSoADevice(uint32_t nHits, bool isPhase2, int32_t offsetBPIX2, pixelCPEforGPU::ParamsOnGPU const* cpeParams, uint32_t const* hitsModuleStart, cudaStream_t stream)
-      : PortableDeviceCollection<TrackingRecHitSoALayout<>>(nHits, stream), nHits_(nHits), cpeParams_(cpeParams)
+      : PortableDeviceCollection<TrackingRecHitSoALayout<>>(nHits, stream), nHits_(nHits), cpeParams_(cpeParams), hitsModuleStart_(hitsModuleStart)
       {
         nModules_ = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
         phiBinner_ = &(view().phiBinner());
+        // phiBinner_ = cms::cuda::make_device_unique<TrackingRecHit2DSOAView::PhiBinner>(stream).get();
         cudaCheck(cudaMemcpyAsync(&(view().nHits()), &nHits, sizeof(uint32_t),cudaMemcpyHostToDevice,stream));
         cudaCheck(cudaMemcpyAsync(&(view().nMaxModules()), &nModules_, sizeof(uint32_t),cudaMemcpyHostToDevice,stream));
         cudaCheck(cudaMemcpyAsync(&(view().hitsModuleStart()), hitsModuleStart, sizeof(uint32_t) * int(nModules_),cudaMemcpyHostToDevice,stream));
@@ -26,34 +27,35 @@ class TrackingRecHitSoADevice : public cms::cuda::PortableDeviceCollection<Track
 
       }
 
-  uint32_t nHits() const { return nHits_; }
+  uint32_t nHits() const { return nHits_; } //go to size of view
   uint32_t nModules() const { return nModules_; }
 
   cms::cuda::host::unique_ptr<float[]> localCoordToHostAsync(cudaStream_t stream) const {
-    auto ret = cms::cuda::make_host_unique<float[]>(5 * nHits(), stream);
+    auto ret = cms::cuda::make_host_unique<float[]>(4 * nHits(), stream);
     size_t rowSize = sizeof(float) * nHits();
-    cudaCheck(cudaMemcpyAsync(ret.get(), view().xLocal() , rowSize, cudaMemcpyDeviceToHost, stream));
-    cudaCheck(cudaMemcpyAsync(ret.get() + rowSize , view().xLocal() , rowSize, cudaMemcpyDeviceToHost, stream));
-    cudaCheck(cudaMemcpyAsync(ret.get() + (rowSize * 2), view().xLocal() , rowSize, cudaMemcpyDeviceToHost, stream));
-    cudaCheck(cudaMemcpyAsync(ret.get() + (rowSize * 3) , view().xLocal() , rowSize, cudaMemcpyDeviceToHost, stream));
+    // Returns a host buffer of 4 * nHits() floats packed as xLocal | yLocal | xerrLocal | yerrLocal. Offsets into
+    // ret are in floats, not bytes; one copy per column, since SoA columns may be padded and need not be adjacent.
+    cudaCheck(cudaMemcpyAsync(ret.get(), view().xLocal(), rowSize, cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaMemcpyAsync(ret.get() + nHits(), view().yLocal(), rowSize, cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaMemcpyAsync(ret.get() + 2 * nHits(), view().xerrLocal(), rowSize, cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaMemcpyAsync(ret.get() + 3 * nHits(), view().yerrLocal(), rowSize, cudaMemcpyDeviceToHost, stream));
     return ret;
-  }
+  } //move to utilities
 
-  cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const {
-    auto ret = cms::cuda::make_host_unique<uint32_t[]>(nModules() + 1, stream);
-    cudaCheck(cudaMemcpyAsync(ret.get(), view().hitsModuleStart().begin(), sizeof(uint32_t) * (nModules() + 1), cudaMemcpyDeviceToHost, stream));
-    return ret;
-  }
 
   auto phiBinnerStorage() { return phiBinnerStorage_; }
+  auto hitsModuleStart() const { return hitsModuleStart_; }
   auto phiBinner() { return phiBinner_; }
 
   private:
     uint32_t nHits_; //Needed for the host SoA size
-    pixelCPEforGPU::ParamsOnGPU const* cpeParams_; //TODO: this is used not that much (only once in BrokenLineFit), would make sens to remove it from this class.
+    pixelCPEforGPU::ParamsOnGPU const* cpeParams_; //TODO: this is not used much from the hits (only once in BrokenLineFit), it would make sense to remove it from this class.
+    uint32_t const* hitsModuleStart_;
+
     uint32_t nModules_;
     trackingRecHitSoA::PhiBinnerStorageType* phiBinnerStorage_;
     trackingRecHitSoA::PhiBinner* phiBinner_;
+
 };
 
 
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h
index f9cc022e571e3..c37636d68a138 100644
--- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h
@@ -6,6 +6,7 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 #include "DataFormats/SoATemplate/interface/SoALayout.h"
 #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 #include "SiPixelHitStatus.h"
 
 namespace trackingRecHitSoA{
@@ -21,16 +22,15 @@ namespace trackingRecHitSoA{
   };
 
   using hindex_type = uint32_t;  // if above is <=2^32
-  using PhiBinner = cms::cuda::
-      HistoContainer<int16_t, 256, -1, 8 * sizeof(int16_t), hindex_type, pixelTopology::maxLayers>;  //28 for phase2 geometry
+  using PhiBinner = cms::cuda::HistoContainer<int16_t, 256, -1, 8 * sizeof(int16_t), hindex_type, phase1PixelTopology::numberOfLayers>;  //28 for phase2 geometry
   using PhiBinnerStorageType = PhiBinner::index_type;
 
   using AverageGeometry = pixelTopology::AverageGeometry;
 
   using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPU;
 
-  using HitLayerStartArray = std::array<uint32_t,11>;
-  using HitModuleStartArray = std::array<uint32_t,1856>;
+  using HitLayerStartArray = std::array<uint32_t,phase1PixelTopology::numberOfLayers+1>;
+  using HitModuleStartArray = std::array<uint32_t,phase1PixelTopology::numberOfModules>;
 
 }
 
@@ -49,20 +49,34 @@ GENERATE_SOA_LAYOUT(TrackingRecHitSoALayout,
                     SOA_COLUMN(int16_t, clusterSizeX),
                     SOA_COLUMN(int16_t, clusterSizeY),
                     SOA_COLUMN(int16_t, detectorIndex),
+                    SOA_COLUMN(trackingRecHitSoA::PhiBinnerStorageType, phiBinnerStorage),
+
+                    SOA_SCALAR(trackingRecHitSoA::HitModuleStartArray,hitsModuleStart),
+                    SOA_SCALAR(trackingRecHitSoA::HitLayerStartArray,hitsLayerStart),
 
                     SOA_SCALAR(trackingRecHitSoA::ParamsOnGPU, cpeParams),
                     SOA_SCALAR(trackingRecHitSoA::AverageGeometry, averageGeometry),
                     SOA_SCALAR(trackingRecHitSoA::PhiBinner, phiBinner),
-                    SOA_SCALAR(trackingRecHitSoA::HitLayerStartArray,hitsLayerStart),
-                    SOA_SCALAR(trackingRecHitSoA::HitModuleStartArray,hitsModuleStart),
+
                     SOA_SCALAR(uint32_t, nHits),
                     SOA_SCALAR(int32_t, offsetBPIX2),
                     SOA_SCALAR(uint32_t, nMaxModules))
 
 namespace trackingRecHitSoA
 {
+  // Mutable and read-only views over TrackingRecHitSoALayout with its default template arguments.
   using HitSoAView = TrackingRecHitSoALayout<>::View;
   using HitSoAConstView = TrackingRecHitSoALayout<>::ConstView;
 
+  // Sum of the element sizes of 8 float, 4 int16_t, one SiPixelHitStatusAndCharge and one PhiBinnerStorageType value.
+  // NOTE(review): presumably the bytes one hit takes across all columns; it ignores any column padding - confirm before use.
+  constexpr size_t columnsSizes = 8 * sizeof(float) + 4 * sizeof(int16_t) + sizeof(trackingRecHitSoA::SiPixelHitStatusAndCharge) + sizeof(trackingRecHitSoA::PhiBinnerStorageType);
+
+  // cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(HitSoAConstView& view, cudaStream_t stream) {
+  //   // printf("%d \n",nModules());
+  //   auto ret = cms::cuda::make_host_unique<uint32_t[]>(view.nMaxModules() + 1, stream);
+  //   cudaCheck(cudaMemcpyAsync(ret.get(), view.hitsModuleStart().data(), sizeof(uint32_t) * (view.nMaxModules() + 1), cudaMemcpyDeviceToHost, stream));
+  //   return ret;
+  // }
 }
 #endif
diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp
index 7f4308ebf1492..eda9a97c02859 100644
--- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp
+++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp
@@ -18,7 +18,7 @@ int main() {
   cms::cudatest::requireDevices();
 
   cudaStream_t stream;
-  cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+  cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamDefault));
 
 
   // inner scope to deallocate memory before destroying the stream
@@ -34,9 +34,23 @@ int main() {
     TrackingRecHitSoADevice tkhit(nHits,false,offset,nullptr,&moduleStart[0],stream);
 
     testTrackingRecHit2DNew::run(tkhit,stream);
+    printf("tkhit hits %d \n",tkhit.nHits());
 
     auto test = tkhit.localCoordToHostAsync(stream);
-    printf("tkhit hits %d \n",tkhit.nHits());
+    printf("test[9] %.2f\n",test[9]);
+
+    // auto mods = tkhit.hitsModuleStartToHostAsync(stream);
+    // auto ret = cms::cuda::make_host_unique<uint32_t[]>(tkhit.nModules() + 1, stream);
+    // uint32_t* ret;
+    // // cudaCheck(cudaMemcpyAsync(ret, &(tkhit.view().hitsModuleStart()), sizeof(uint32_t) * (tkhit.nModules() + 1), cudaMemcpyDeviceToHost, stream));
+    // size_t skipSize = int(trackingRecHitSoA::columnsSizes * nHits);
+    // cudaCheck(cudaMemcpyAsync(ret,
+    //                           tkhit.const_buffer().get() + skipSize,
+    //                           sizeof(uint32_t) * (1856 + 1),
+    //                           cudaMemcpyDeviceToHost,
+    //                           ctx.stream()));
+
+    printf("mods[9] %d\n",tkhit.hitsModuleStart()[9]);
   }
 
   cudaCheck(cudaStreamDestroy(stream));
diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu
index 93f4dde061786..eb042219d3a33 100644
--- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu
+++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu
@@ -18,7 +18,7 @@ namespace testTrackingRecHit2DNew {
    soa[i].iphi() = i%10;
    soa.hitsLayerStart()[j] = j;
     //k = soa.test().a;
-
+    __syncthreads();
   }
 
   __global__ void show(trackingRecHitSoA::HitSoAView soa) {
@@ -26,9 +26,11 @@ namespace testTrackingRecHit2DNew {
 
     int i = threadIdx.x;
     int j = blockIdx.x;
+
     if(i==0 and j==0)
     {
       printf("nbins = %d \n", soa.phiBinner().nbins());
+      printf("mMaxModules = %d \n", soa.nMaxModules());
       printf("offsetBPIX %d ->%d \n",i,soa.offsetBPIX2());
       printf("nHits %d ->%d \n",i,soa.nHits());
       printf("hitsModuleStart %d ->%d \n",i,soa.hitsModuleStart().at(28));
@@ -39,7 +41,7 @@ namespace testTrackingRecHit2DNew {
 
   if(j*blockDim.x+i < soa.phiBinner().nbins())
    printf(">bin size %d ->%d \n",j*blockDim.x+i,soa.phiBinner().size(j*blockDim.x+i));
-
+   __syncthreads();
   }
 
 
@@ -48,19 +50,22 @@ namespace testTrackingRecHit2DNew {
     // assert(soa);
     printf("RUN!\n");
     int k = 0;
-    show<<<10, 100, 0, stream>>>(hits.view());
+    fill<<<10, 100, 0, stream>>>(hits.view());
     printf("k = %d\n",k);
 
+    cudaCheck(cudaDeviceSynchronize());
+
     cms::cuda::fillManyFromVector(hits.phiBinner(),
                                   10,
                                   hits.view().iphi(),
                                   hits.view().hitsLayerStart().data(),
                                   2000,
                                   256,
-                                  hits.phiBinnerStorage(),
+                                  hits.view().phiBinnerStorage(),
                                   stream);
-
+    cudaCheck(cudaDeviceSynchronize());
     show<<<10, 1000, 0, stream>>>(hits.view());
+    cudaCheck(cudaDeviceSynchronize());
   }
 
 }  // namespace testTrackingRecHit2D
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu
index 6fd9a57a6cc72..0fea13b849ef1 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu
@@ -13,6 +13,7 @@
 #include "PixelRecHitGPUKernel.h"
 #include "gpuPixelRecHits.h"
 
+//#define GPU_DEBUG  // uncomment locally only: enables the GPU_DEBUG blocks below (device synchronisation and printout)
 namespace {
   __global__ void setHitsLayerStart(uint32_t const* __restrict__ hitsModuleStart,
                                     pixelCPEforGPU::ParamsOnGPU const* cpeParams,
@@ -69,13 +70,13 @@ namespace pixelgpudetails {
         setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.view().hitsLayerStart().data());
         cudaCheck(cudaGetLastError());
         auto nLayers = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers;
-        cms::cuda::fillManyFromVector(&(hits_d.view().phiBinner()),
+        cms::cuda::fillManyFromVector(hits_d.phiBinner(),
                                       nLayers,
                                       hits_d.view().iphi(),
                                       hits_d.view().hitsLayerStart().data(),
                                       nHits,
                                       256,
-                                      hits_d.phiBinnerStorage(),
+                                      hits_d.view().phiBinnerStorage(),
                                       stream);
         cudaCheck(cudaGetLastError());
 
@@ -84,6 +85,10 @@ namespace pixelgpudetails {
 #endif
       }
     }
+    #ifdef GPU_DEBUG
+    cudaCheck(cudaDeviceSynchronize());
+          std::cout << "DONE" << std::endl;
+    #endif
 
     return hits_d;
   }
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h
index 5d55e713391e1..ada509f57de0a 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h
@@ -10,6 +10,7 @@
 #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h"
 
+//#define GPU_DEBUG  // debugging switch; kept disabled so that including this header does not turn on debug code
 namespace pixelgpudetails {
 
   class PixelRecHitGPUKernel {
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc
index 1c050e037144e..2f5c5710b1586 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc
@@ -23,6 +23,8 @@
 
 #include "PixelRecHitGPUKernel.h"
 
+#define GPU_DEBUG
+
 class SiPixelRecHitCUDA : public edm::global::EDProducer<> {
 public:
   explicit SiPixelRecHitCUDA(const edm::ParameterSet& iConfig);
@@ -82,6 +84,7 @@ void SiPixelRecHitCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, cons
               tokenHit_,
               gpuAlgo_.makeHitsAsync(
                   digis, clusters, bs, fcpe->getGPUProductAsync(ctx.stream()), fcpe->isPhase2(), ctx.stream()));
+ std::cout << __LINE__<<std::endl;
 }
 
 DEFINE_FWK_MODULE(SiPixelRecHitCUDA);
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc
index bc6c3fa370372..cc4017ab0c91a 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc
@@ -50,6 +50,7 @@ class SiPixelRecHitFromCUDA : public edm::stream::EDProducer<edm::ExternalWork>
   uint32_t nHits_;
   uint32_t nMaxModules_;
   cms::cuda::host::unique_ptr<float[]> store32_;
+  // uint32_t* hitsModuleStart_;
   cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStart_;
 };
 
@@ -74,15 +75,32 @@ void SiPixelRecHitFromCUDA::acquire(edm::Event const& iEvent,
   cms::cuda::Product<TrackingRecHitSoADevice> const& inputDataWrapped = iEvent.get(hitsToken_);
   cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
   auto const& inputData = ctx.get(inputDataWrapped);
-
+  std::cout << __LINE__<<std::endl;
   nHits_ = inputData.nHits();
+  std::cout << __LINE__<<std::endl;
   nMaxModules_ = inputData.nModules();
-  LogDebug("SiPixelRecHitFromCUDA") << "converting " << nHits_ << " Hits";
+  std::cout << __LINE__<<std::endl;
+  // LogDebug("SiPixelRecHitFromCUDA")
+  std::cout << "SiPixelRecHitFromCUDA " << "converting " << nHits_ << " Hits" << std::endl;
 
+  std::cout << inputData.hitsModuleStart()[2] << std::endl;
   if (0 == nHits_)
     return;
   store32_ = inputData.localCoordToHostAsync(ctx.stream());
-  hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream());
+  std::cout << __LINE__<<std::endl;
+
+  // size_t skipSize = int(trackingRecHitSoA::columnsSizes * nHits_);
+  // cudaCheck(cudaMemcpyAsync(hitsModuleStart_,
+  //                           inputData.const_buffer().get() + skipSize,
+  //                           sizeof(uint32_t) * (nMaxModules_ + 1),
+  //                           cudaMemcpyDeviceToHost,
+  //                           ctx.stream()));  // Copy data from Device to Host
+  //
+  // cudaCheck(cudaMemcpyAsync(hitsModuleStart_, inputData.buffer() + int(trackingRecHitSoA::columnsSizes * nHits_), sizeof(uint32_t) * (nMaxModules_ + 1), cudaMemcpyDeviceToHost, ctx.stream()));
+
+  std::copy(inputData.hitsModuleStart(), inputData.hitsModuleStart() + nMaxModules_ + 1, hitsModuleStart_.get());
+// trackingRecHitSoA::hitsModuleStartToHostAsync(inputData.view(), ctx.stream());
+  std::cout << __LINE__ << std::endl;
 }
 
 void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& es) {
@@ -98,23 +116,23 @@ void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& e
     return;
   }
   output.reserve(nMaxModules_, nHits_);
-
+  std::cout << __LINE__ << std::endl;
   std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get());
   // wrap the buffer in a HostProduct, and move it to the Event, without reallocating the buffer or affecting hitsModuleStart
   iEvent.emplace(hostPutToken_, std::move(hmsp));
-
+  std::cout << __LINE__ << std::endl;
   auto xl = store32_.get();
   auto yl = xl + nHits_;
   auto xe = yl + nHits_;
   auto ye = xe + nHits_;
-
+  std::cout << __LINE__ << std::endl;
   const TrackerGeometry* geom = &es.getData(geomToken_);
 
   edm::Handle<SiPixelClusterCollectionNew> hclusters = iEvent.getHandle(clusterToken_);
   auto const& input = *hclusters;
 
   constexpr uint32_t maxHitsInModule = gpuClustering::maxHitsInModule();
-
+  std::cout << __LINE__ << std::endl;
   int numberOfDetUnits = 0;
   int numberOfClusters = 0;
   for (auto const& dsv : input) {
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
index db0940f0f50f7..9dbab6c900030 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
@@ -14,6 +14,7 @@
 #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h"
 
+//#define GPU_DEBUG  // debugging switch; kept disabled so that including this header does not turn on debug code
 namespace gpuPixelRecHits {
 
   __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams,