diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h deleted file mode 100644 index 3ee5af80353dd..0000000000000 --- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h -#define CUDADataFormats_Track_PixelTrackHeterogeneous_h - -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" - -using PixelTrackHeterogeneous = HeterogeneousSoA; - -#endif // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h \ No newline at end of file diff --git a/CUDADataFormats/Track/interface/PixelTrackUtilities.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h new file mode 100644 index 0000000000000..ce3658f126a83 --- /dev/null +++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h @@ -0,0 +1,146 @@ +#ifndef CUDADataFormats_Track_PixelTrackUtilities_h +#define CUDADataFormats_Track_PixelTrackUtilities_h + +#include +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +namespace pixelTrack { + enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality }; + constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)}; + const std::string qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"}; + inline Quality qualityByName(std::string const &name) { + auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName; + return static_cast(qp); + } + +#ifdef GPU_SMALL_EVENTS + // kept for testing and debugging + constexpr uint32_t maxNumber() { return 2 * 1024; } +#else + // tested on MC events with 55-75 pileup events + constexpr uint32_t maxNumber() { return 32 * 1024; } +#endif + + using HitContainer = cms::cuda::OneToManyAssoc; + +} // namespace pixelTrack + +// Aliases in order to not confuse the GENERATE_SOA_LAYOUT +// macro with weird colons and angled brackets. +using Vector5f = Eigen::Matrix; +using Vector15f = Eigen::Matrix; +using HitContainer = pixelTrack::HitContainer; + +GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousLayout, + SOA_COLUMN(uint8_t, quality), + SOA_COLUMN(float, chi2), // this is chi2/ndof as not necessarely all hits are used in the fit + SOA_COLUMN(int8_t, nLayers), + SOA_COLUMN(float, eta), + SOA_COLUMN(float, pt), + SOA_EIGEN_COLUMN(Vector5f, state), + SOA_EIGEN_COLUMN(Vector15f, covariance), + SOA_SCALAR(int, nTracks), + SOA_SCALAR(HitContainer, hitIndices), + SOA_SCALAR(HitContainer, detIndices)) + +// Previous TrajectoryStateSoAT class methods. +// They operate on View and ConstView of the TrackSoA. 
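For orientation, a minimal host-side usage sketch of the generated layout together with the free functions defined just below (illustrative only, not part of this patch; the names come from the headers added here, while the cudaStream_t stream and the loop are assumptions):

    pixelTrack::TrackSoAHost tracks(stream);              // PortableHostCollection over TrackSoAHeterogeneousLayout
    auto view = tracks.view();
    for (int i = 0; i < view.metadata().size(); ++i) {
      if (pixelTrack::utilities::nHits(view, i) == 0)
        break;                                             // guard on empty slots, as in the DQM plugins
      float pt  = view[i].pt();                            // direct column access through the view
      float phi = pixelTrack::utilities::phi(view, i);     // state(0) at the beam spot
    }
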
+namespace pixelTrack { + namespace utilities { + using TrackSoAView = TrackSoAHeterogeneousLayout<>::View; + using TrackSoAConstView = TrackSoAHeterogeneousLayout<>::ConstView; + using Quality = pixelTrack::Quality; + using hindex_type = uint32_t; + // State at the Beam spot + // phi,tip,1/pt,cotan(theta),zip + __host__ __device__ inline float charge(const TrackSoAConstView &tracks, int32_t i) { + return std::copysign(1.f, tracks[i].state()(2)); + } + + __host__ __device__ inline float phi(const TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(0); } + + __host__ __device__ inline float tip(const TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(1); } + + __host__ __device__ inline float zip(const TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(4); } + + __host__ __device__ inline bool isTriplet(const TrackSoAConstView &tracks, int i) { + return tracks[i].nLayers() == 3; + } + + template + __host__ __device__ inline void copyFromCircle( + TrackSoAView &tracks, V3 const &cp, M3 const &ccov, V2 const &lp, M2 const &lcov, float b, int32_t i) { + tracks[i].state() << cp.template cast(), lp.template cast(); + + tracks[i].state()(2) = tracks[i].state()(2) * b; + auto cov = tracks[i].covariance(); + cov(0) = ccov(0, 0); + cov(1) = ccov(0, 1); + cov(2) = b * float(ccov(0, 2)); + cov(4) = cov(3) = 0; + cov(5) = ccov(1, 1); + cov(6) = b * float(ccov(1, 2)); + cov(8) = cov(7) = 0; + cov(9) = b * b * float(ccov(2, 2)); + cov(11) = cov(10) = 0; + cov(12) = lcov(0, 0); + cov(13) = lcov(0, 1); + cov(14) = lcov(1, 1); + } + + template + __host__ __device__ inline void copyFromDense(TrackSoAView &tracks, V5 const &v, M5 const &cov, int32_t i) { + tracks[i].state() = v.template cast(); + for (int j = 0, ind = 0; j < 5; ++j) + for (auto k = j; k < 5; ++k) + tracks[i].covariance()(ind++) = cov(j, k); + } + + template + __host__ __device__ inline void copyToDense(const TrackSoAConstView &tracks, V5 &v, M5 &cov, int32_t i) { + v = tracks[i].state().template cast(); + for (int j = 0, ind = 0; j < 5; ++j) { + cov(j, j) = tracks[i].covariance()(ind++); + for (auto k = j + 1; k < 5; ++k) + cov(k, j) = cov(j, k) = tracks[i].covariance()(ind++); + } + } + + // TODO: Not using TrackSoAConstView due to weird bugs with HitContainer + __host__ __device__ inline int computeNumberOfLayers(TrackSoAView &tracks, int32_t i) { + auto pdet = tracks.detIndices().begin(i); + int nl = 1; + auto ol = phase1PixelTopology::getLayer(*pdet); + for (; pdet < tracks.detIndices().end(i); ++pdet) { + auto il = phase1PixelTopology::getLayer(*pdet); + if (il != ol) + ++nl; + ol = il; + } + return nl; + } + __host__ __device__ inline int nHits(const TrackSoAConstView &tracks, int i) { return tracks.detIndices().size(i); } + + // Casts quality SoA data (uint8_t) to pixelTrack::Quality. 
This is required + // to use the data as an enum instead of a plain uint8_t + __host__ __device__ inline const Quality *qualityData(const TrackSoAConstView &tracks) { + return reinterpret_cast(tracks.quality()); + } + __host__ __device__ inline Quality *qualityData(TrackSoAView tracks) { + return reinterpret_cast(tracks.quality()); + } + + } // namespace utilities +} // namespace pixelTrack + +namespace pixelTrack { + // Common types for both Host and Device code + using TrackSoALayout = TrackSoAHeterogeneousLayout<>; + using TrackSoAView = TrackSoAHeterogeneousLayout<>::View; + using TrackSoAConstView = TrackSoAHeterogeneousLayout<>::ConstView; + +} // namespace pixelTrack + +#endif diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h new file mode 100644 index 0000000000000..71b8dc48b8b35 --- /dev/null +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h @@ -0,0 +1,27 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousDevice_H +#define CUDADataFormats_Track_TrackHeterogeneousDevice_H + +#include + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +template +class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { +public: + TrackSoAHeterogeneousDevice() = default; // cms::cuda::Product needs this + + // Constructor which specifies the SoA size + explicit TrackSoAHeterogeneousDevice(cudaStream_t stream) + : PortableDeviceCollection>(S, stream) {} +}; + +namespace pixelTrack { + + using TrackSoADevice = TrackSoAHeterogeneousDevice; + +} // namespace pixelTrack + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h new file mode 100644 index 0000000000000..70427f2bfd559 --- /dev/null +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h @@ -0,0 +1,26 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousHost_H +#define CUDADataFormats_Track_TrackHeterogeneousHost_H + +#include + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +template +class TrackSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { +public: + TrackSoAHeterogeneousHost() = default; + + // Constructor which specifies the SoA size + explicit TrackSoAHeterogeneousHost(cudaStream_t stream) + : PortableHostCollection>(S, stream) {} +}; + +namespace pixelTrack { + + using TrackSoAHost = TrackSoAHeterogeneousHost; + +} // namespace pixelTrack + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h deleted file mode 100644 index 356ea3eddeb7f..0000000000000 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h +++ /dev/null @@ -1,107 +0,0 @@ -#ifndef CUDADataFormats_Track_TrackHeterogeneousT_H -#define CUDADataFormats_Track_TrackHeterogeneousT_H - -#include -#include - -#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include 
"HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" - -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" - -namespace pixelTrack { - enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality }; - constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)}; - const std::string qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"}; - inline Quality qualityByName(std::string const &name) { - auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName; - return static_cast(qp); - } -} // namespace pixelTrack - -template -class TrackSoAHeterogeneousT { -public: - static constexpr int32_t stride() { return S; } - - using Quality = pixelTrack::Quality; - using hindex_type = uint32_t; - using HitContainer = cms::cuda::OneToManyAssoc; - - // Always check quality is at least loose! - // CUDA does not support enums in __lgc ... -private: - eigenSoA::ScalarSoA quality_; - -public: - constexpr Quality quality(int32_t i) const { return (Quality)(quality_(i)); } - constexpr Quality &quality(int32_t i) { return (Quality &)(quality_(i)); } - constexpr Quality const *qualityData() const { return (Quality const *)(quality_.data()); } - constexpr Quality *qualityData() { return (Quality *)(quality_.data()); } - - // this is chi2/ndof as not necessarely all hits are used in the fit - eigenSoA::ScalarSoA chi2; - - eigenSoA::ScalarSoA nLayers; - - constexpr int nTracks() const { return nTracks_; } - constexpr void setNTracks(int n) { nTracks_ = n; } - - constexpr int nHits(int i) const { return detIndices.size(i); } - - constexpr bool isTriplet(int i) const { return nLayers(i) == 3; } - - constexpr int computeNumberOfLayers(int32_t i) const { - // layers are in order and we assume tracks are either forward or backward - auto pdet = detIndices.begin(i); - int nl = 1; - auto ol = phase1PixelTopology::getLayer(*pdet); - for (; pdet < detIndices.end(i); ++pdet) { - auto il = phase1PixelTopology::getLayer(*pdet); - if (il != ol) - ++nl; - ol = il; - } - return nl; - } - - // State at the Beam spot - // phi,tip,1/pt,cotan(theta),zip - TrajectoryStateSoAT stateAtBS; - eigenSoA::ScalarSoA eta; - eigenSoA::ScalarSoA pt; - constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } - constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } - constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } - constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } - - // state at the detector of the outermost hit - // representation to be decided... 
- // not yet filled on GPU - // TrajectoryStateSoA stateAtOuterDet; - - HitContainer hitIndices; - HitContainer detIndices; - -private: - int nTracks_; -}; - -namespace pixelTrack { - -#ifdef GPU_SMALL_EVENTS - // kept for testing and debugging - constexpr uint32_t maxNumber() { return 2 * 1024; } -#else - // tested on MC events with 55-75 pileup events - constexpr uint32_t maxNumber() { return 32 * 1024; } -#endif - - using TrackSoA = TrackSoAHeterogeneousT; - using TrajectoryState = TrajectoryStateSoAT; - using HitContainer = TrackSoA::HitContainer; - -} // namespace pixelTrack - -#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h deleted file mode 100644 index 64fcd573a6991..0000000000000 --- a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H -#define CUDADataFormats_Track_TrajectoryStateSOAT_H - -#include -#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" - -template -struct TrajectoryStateSoAT { - using Vector5f = Eigen::Matrix; - using Vector15f = Eigen::Matrix; - - using Vector5d = Eigen::Matrix; - using Matrix5d = Eigen::Matrix; - - static constexpr int32_t stride() { return S; } - - eigenSoA::MatrixSoA state; - eigenSoA::MatrixSoA covariance; - - template - __host__ __device__ inline void copyFromCircle( - V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) { - state(i) << cp.template cast(), lp.template cast(); - state(i)(2) *= b; - auto cov = covariance(i); - cov(0) = ccov(0, 0); - cov(1) = ccov(0, 1); - cov(2) = b * float(ccov(0, 2)); - cov(4) = cov(3) = 0; - cov(5) = ccov(1, 1); - cov(6) = b * float(ccov(1, 2)); - cov(8) = cov(7) = 0; - cov(9) = b * b * float(ccov(2, 2)); - cov(11) = cov(10) = 0; - cov(12) = lcov(0, 0); - cov(13) = lcov(0, 1); - cov(14) = lcov(1, 1); - } - - template - __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) { - state(i) = v.template cast(); - for (int j = 0, ind = 0; j < 5; ++j) - for (auto k = j; k < 5; ++k) - covariance(i)(ind++) = cov(j, k); - } - - template - __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const { - v = state(i).template cast(); - for (int j = 0, ind = 0; j < 5; ++j) { - cov(j, j) = covariance(i)(ind++); - for (auto k = j + 1; k < 5; ++k) - cov(k, j) = cov(j, k) = covariance(i)(ind++); - } - } -}; - -#endif // CUDADataFormats_Track_TrajectoryStateSOAT_H diff --git a/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc b/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc new file mode 100644 index 0000000000000..24792bb6350f8 --- /dev/null +++ b/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc @@ -0,0 +1,2 @@ +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h index 97c116f6c88d3..338598e28ebf5 100644 --- a/CUDADataFormats/Track/src/classes.h +++ b/CUDADataFormats/Track/src/classes.h @@ -3,7 +3,8 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" 
#include "DataFormats/Common/interface/Wrapper.h" #endif // CUDADataFormats_Track_src_classes_h diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml index 9c80ae91baf29..c4337b0b7ee06 100644 --- a/CUDADataFormats/Track/src/classes_def.xml +++ b/CUDADataFormats/Track/src/classes_def.xml @@ -1,6 +1,8 @@ - - - - + + + + + + diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml index fc78783db473b..e91df3fc785f7 100644 --- a/CUDADataFormats/Track/test/BuildFile.xml +++ b/CUDADataFormats/Track/test/BuildFile.xml @@ -1,19 +1,12 @@ - - - - - - + + - - - - - + + diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp index 9708b689dd05b..b3d62ffa810f6 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp @@ -1,4 +1,5 @@ -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" #include #include diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp new file mode 100644 index 0000000000000..572a84cdd2d73 --- /dev/null +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -0,0 +1,72 @@ +/** + Simple test for the pixelTrack::TrackSoA data structure + which inherits from PortableDeviceCollection. + + Creates an instance of the class (automatically allocates + memory on device), passes the view of the SoA data to + the CUDA kernels which: + - Fill the SoA with data. + - Verify that the data written is correct. + + Then, the SoA data are copied back to Host, where + a temporary host-side view (tmp_view) is created using + the same Layout to access the data on host and print it. + */ + +#include +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" + +namespace testTrackSoAHeterogeneousT { + + void runKernels(pixelTrack::TrackSoAView tracks_view, cudaStream_t stream); +} + +int main() { + cms::cudatest::requireDevices(); + + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + // Inner scope to deallocate memory before destroying the stream + { + // Instantiate tracks on device. PortableDeviceCollection allocates + // SoA on device automatically. + pixelTrack::TrackSoADevice tracks_d(stream); + testTrackSoAHeterogeneousT::runKernels(tracks_d.view(), stream); + + // Instantate tracks on host. This is where the data will be + // copied to from device. 
+ pixelTrack::TrackSoAHost tracks_h(stream); + //tracks_d.copyToHost(tracks_h.buffer(), stream); + cudaCheck(cudaMemcpyAsync( + tracks_h.buffer().get(), tracks_d.const_buffer().get(), tracks_d.bufferSize(), cudaMemcpyDeviceToHost, stream)); + cudaCheck(cudaGetLastError()); + + // Print results + std::cout << "pt" + << "\t" + << "eta" + << "\t" + << "chi2" + << "\t" + << "quality" + << "\t" + << "nLayers" + << "\t" + << "hitIndices off" << std::endl; + + for (int i = 0; i < 10; ++i) { + std::cout << tracks_h.view()[i].pt() << "\t" << tracks_h.view()[i].eta() << "\t" << tracks_h.view()[i].chi2() + << "\t" << (int)tracks_h.view()[i].quality() << "\t" << (int)tracks_h.view()[i].nLayers() << "\t" + << tracks_h.view().hitIndices().off[i] << std::endl; + } + } + cudaCheck(cudaStreamDestroy(stream)); + + return 0; +} diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu new file mode 100644 index 0000000000000..8273f011ace80 --- /dev/null +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -0,0 +1,52 @@ +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +namespace testTrackSoAHeterogeneousT { + + __global__ void fill(pixelTrack::TrackSoAView tracks_view) { + int i = threadIdx.x; + if (i == 0) { + tracks_view.nTracks() = 420; + } + + for (int j = i; j < tracks_view.metadata().size(); j += blockDim.x) { + tracks_view[j].pt() = (float)j; + tracks_view[j].eta() = (float)j; + tracks_view[j].chi2() = (float)j; + tracks_view[j].quality() = (uint8_t)j % 256; + tracks_view[j].nLayers() = j % 128; + tracks_view.hitIndices().off[j] = j; + } + } + + // TODO: Use TrackSoAConstView when https://github.com/cms-sw/cmssw/pull/39919 is merged + __global__ void verify(pixelTrack::TrackSoAView tracks_view) { + int i = threadIdx.x; + + if (i == 0) { + printf("SoA size: % d, block dims: % d\n", tracks_view.metadata().size(), blockDim.x); + assert(tracks_view.nTracks() == 420); + } + for (int j = i; j < tracks_view.metadata().size(); j += blockDim.x) { + assert(abs(tracks_view[j].pt() - (float)j) < .0001); + assert(abs(tracks_view[j].eta() - (float)j) < .0001); + assert(abs(tracks_view[j].chi2() - (float)j) < .0001); + assert(tracks_view[j].quality() == j % 256); + assert(tracks_view[j].nLayers() == j % 128); + assert(tracks_view.hitIndices().off[j] == j); + } + } + + void runKernels(pixelTrack::TrackSoAView tracks_view, cudaStream_t stream) { + fill<<<1, 1024, 0, stream>>>(tracks_view); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + verify<<<1, 1024, 0, stream>>>(tracks_view); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + } + +} // namespace testTrackSoAHeterogeneousT diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp deleted file mode 100644 index d6ff539a642b0..0000000000000 --- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "TrajectoryStateSOA_t.h" diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu deleted file mode 100644 index d6ff539a642b0..0000000000000 --- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu +++ /dev/null @@ -1 +0,0 @@ -#include 
"TrajectoryStateSOA_t.h" diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h deleted file mode 100644 index 97b88873c2613..0000000000000 --- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h +++ /dev/null @@ -1,75 +0,0 @@ -#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" - -using Vector5d = Eigen::Matrix; -using Matrix5d = Eigen::Matrix; - -__host__ __device__ Matrix5d loadCov(Vector5d const& e) { - Matrix5d cov; - for (int i = 0; i < 5; ++i) - cov(i, i) = e(i) * e(i); - for (int i = 0; i < 5; ++i) { - for (int j = 0; j < i; ++j) { - double v = 0.3 * std::sqrt(cov(i, i) * cov(j, j)); // this makes the matrix pos defined - cov(i, j) = (i + j) % 2 ? -0.4 * v : 0.1 * v; - cov(j, i) = cov(i, j); - } - } - return cov; -} - -using TS = TrajectoryStateSoAT<128>; - -__global__ void testTSSoA(TS* pts, int n) { - assert(n <= 128); - - Vector5d par0; - par0 << 0.2, 0.1, 3.5, 0.8, 0.1; - Vector5d e0; - e0 << 0.01, 0.01, 0.035, -0.03, -0.01; - auto cov0 = loadCov(e0); - - TS& ts = *pts; - - int first = threadIdx.x + blockIdx.x * blockDim.x; - - for (int i = first; i < n; i += blockDim.x * gridDim.x) { - ts.copyFromDense(par0, cov0, i); - Vector5d par1; - Matrix5d cov1; - ts.copyToDense(par1, cov1, i); - Vector5d delV = par1 - par0; - Matrix5d delM = cov1 - cov0; - for (int j = 0; j < 5; ++j) { - assert(std::abs(delV(j)) < 1.e-5); - for (auto k = j; k < 5; ++k) { - assert(cov0(k, j) == cov0(j, k)); - assert(cov1(k, j) == cov1(j, k)); - assert(std::abs(delM(k, j)) < 1.e-5); - } - } - } -} - -#ifdef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#endif - -int main() { -#ifdef __CUDACC__ - cms::cudatest::requireDevices(); -#endif - - TS ts; - -#ifdef __CUDACC__ - TS* ts_d; - cudaCheck(cudaMalloc(&ts_d, sizeof(TS))); - testTSSoA<<<1, 64>>>(ts_d, 128); - cudaCheck(cudaGetLastError()); - cudaCheck(cudaMemcpy(&ts, ts_d, sizeof(TS), cudaMemcpyDefault)); - cudaCheck(cudaDeviceSynchronize()); -#else - testTSSoA(&ts, 128); -#endif -} diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h index 8ce37f280ac6c..98112285fce13 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h @@ -4,6 +4,7 @@ #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" template class TrackingRecHit2DHeterogeneous { diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h new file mode 100644 index 0000000000000..104eab337af3f --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h @@ -0,0 +1,62 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousDevice_H +#define CUDADataFormats_Track_TrackHeterogeneousDevice_H + +#include + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include 
"HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +class TrackingRecHitSoADevice : public cms::cuda::PortableDeviceCollection> { +public: + TrackingRecHitSoADevice() = default; // cms::cuda::Product needs this + + // Constructor which specifies the SoA size + explicit TrackingRecHitSoADevice(uint32_t nHits, bool isPhase2, int32_t offsetBPIX2, pixelCPEforGPU::ParamsOnGPU const* cpeParams, uint32_t const* hitsModuleStart, cudaStream_t stream) + : PortableDeviceCollection>(nHits, stream), nHits_(nHits), cpeParams_(cpeParams), hitsModuleStart_(hitsModuleStart) + { + nModules_ = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + phiBinner_ = &(view().phiBinner()); + // phiBinner_ = cms::cuda::make_device_unique(stream).get(); + cudaCheck(cudaMemcpyAsync(&(view().nHits()), &nHits, sizeof(uint32_t),cudaMemcpyHostToDevice,stream)); + cudaCheck(cudaMemcpyAsync(&(view().nMaxModules()), &nModules_, sizeof(uint32_t),cudaMemcpyHostToDevice,stream)); + cudaCheck(cudaMemcpyAsync(&(view().hitsModuleStart()), hitsModuleStart, sizeof(uint32_t) * int(nModules_),cudaMemcpyHostToDevice,stream)); + // cudaCheck(cudaMemcpyAsync(&(view().cpeParams()), cpeParams, int(sizeof(pixelCPEforGPU::ParamsOnGPU)),cudaMemcpyHostToDevice,stream)); + cudaCheck(cudaMemcpyAsync(&(view().offsetBPIX2()), &offsetBPIX2, sizeof(int32_t),cudaMemcpyHostToDevice,stream)); + + } + + uint32_t nHits() const { return nHits_; } //go to size of view + uint32_t nModules() const { return nModules_; } + + cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(4 * nHits(), stream); + size_t rowSize = sizeof(float) * nHits(); + printf("%d \n",nModules()); + printf("%d \n",nHits()); + cudaCheck(cudaMemcpyAsync(ret.get(), view().xLocal() , rowSize * 4, cudaMemcpyDeviceToHost, stream)); + // cudaCheck(cudaMemcpyAsync(ret.get() + rowSize , view().yLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); + // cudaCheck(cudaMemcpyAsync(ret.get() + size_t(rowSize * 2), view().xerrLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); + // cudaCheck(cudaMemcpyAsync(ret.get() + size_t(rowSize * 3) , view().yerrLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); + return ret; + } //move to utilities + + + auto phiBinnerStorage() { return phiBinnerStorage_; } + auto hitsModuleStart() const { return hitsModuleStart_; } + auto phiBinner() { return phiBinner_; } + + private: + uint32_t nHits_; //Needed for the host SoA size + pixelCPEforGPU::ParamsOnGPU const* cpeParams_; //TODO: this is used not that much from the hits (only once in BrokenLineFit), would make sens to remove it from this class. 
+ uint32_t const* hitsModuleStart_; + + uint32_t nModules_; + trackingRecHitSoA::PhiBinnerStorageType* phiBinnerStorage_; + trackingRecHitSoA::PhiBinner* phiBinner_; + +}; + + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h new file mode 100644 index 0000000000000..cb76d538474da --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h @@ -0,0 +1,45 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousHost_H +#define CUDADataFormats_Track_TrackHeterogeneousHost_H + +#include + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +class TrackingRecHitSoAHost : public cms::cuda::PortableHostCollection> { +public: + TrackingRecHitSoAHost() = default; + + // This SoA Host is used basically only for DQM + // so we just need a slim constructor + explicit TrackingRecHitSoAHost(uint32_t nHits, cudaStream_t stream) + : PortableHostCollection>(nHits, stream) {} + + explicit TrackingRecHitSoAHost(uint32_t nHits, bool isPhase2, int32_t offsetBPIX2, pixelCPEforGPU::ParamsOnGPU const* cpeParams, uint32_t const* hitsModuleStart, cudaStream_t stream) + : PortableHostCollection>(nHits, stream), nHits_(nHits), cpeParams_(cpeParams) + { + nModules_ = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + + view().nHits() = nHits; + view().nMaxModules() = nModules_; + std::copy(hitsModuleStart,hitsModuleStart+nModules_,view().hitsModuleStart().begin()); + + view().offsetBPIX2() = offsetBPIX2; + + } + + uint32_t nHits() const { return nHits_; } + uint32_t nModules() const { return nModules_; } + auto phiBinnerStorage() { return phiBinnerStorage_; } + + private: + uint32_t nHits_; //Needed for the host SoA size + pixelCPEforGPU::ParamsOnGPU const* cpeParams_; + uint32_t nModules_; + trackingRecHitSoA::PhiBinnerStorageType* phiBinnerStorage_; +}; + + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h new file mode 100644 index 0000000000000..c37636d68a138 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h @@ -0,0 +1,82 @@ +#ifndef CUDADataFormats_RecHits_TrackingRecHitsUtilities_h +#define CUDADataFormats_RecHits_TrackingRecHitsUtilities_h + +#include +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "SiPixelHitStatus.h" + +namespace trackingRecHitSoA{ + + // more information on bit fields : https://en.cppreference.com/w/cpp/language/bit_field + struct SiPixelHitStatusAndCharge { + SiPixelHitStatus status; + uint32_t charge : 24; + }; + + struct Test { + int a; + }; + + using hindex_type = uint32_t; // if above is <=2^32 + using PhiBinner = cms::cuda::HistoContainer; //28 for phase2 geometry + using PhiBinnerStorageType = PhiBinner::index_type; + + using AverageGeometry = 
pixelTopology::AverageGeometry; + + using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPU; + + using HitLayerStartArray = std::array; + using HitModuleStartArray = std::array; + +} + + +GENERATE_SOA_LAYOUT(TrackingRecHitSoALayout, + SOA_COLUMN(float, xLocal), + SOA_COLUMN(float, yLocal), // this is chi2/ndof as not necessarely all hits are used in the fit + SOA_COLUMN(float, xerrLocal), + SOA_COLUMN(float, yerrLocal), + SOA_COLUMN(float, xGlobal), + SOA_COLUMN(float, yGlobal), + SOA_COLUMN(float, zGlobal), + SOA_COLUMN(float, rGlobal), + SOA_COLUMN(int16_t, iphi), + SOA_COLUMN(trackingRecHitSoA::SiPixelHitStatusAndCharge, chargeAndStatus), + SOA_COLUMN(int16_t, clusterSizeX), + SOA_COLUMN(int16_t, clusterSizeY), + SOA_COLUMN(int16_t, detectorIndex), + SOA_COLUMN(trackingRecHitSoA::PhiBinnerStorageType, phiBinnerStorage), + + SOA_SCALAR(trackingRecHitSoA::HitModuleStartArray,hitsModuleStart), + SOA_SCALAR(trackingRecHitSoA::HitLayerStartArray,hitsLayerStart), + + SOA_SCALAR(trackingRecHitSoA::ParamsOnGPU, cpeParams), + SOA_SCALAR(trackingRecHitSoA::AverageGeometry, averageGeometry), + SOA_SCALAR(trackingRecHitSoA::PhiBinner, phiBinner), + + SOA_SCALAR(uint32_t, nHits), + SOA_SCALAR(int32_t, offsetBPIX2), + SOA_SCALAR(uint32_t, nMaxModules)) + +namespace trackingRecHitSoA +{ + + using HitSoAView = TrackingRecHitSoALayout<>::View; + using HitSoAConstView = TrackingRecHitSoALayout<>::ConstView; + + constexpr size_t columnsSizes = 8 * sizeof(float) + 4 * sizeof(int16_t) + sizeof(trackingRecHitSoA::SiPixelHitStatusAndCharge) + sizeof(trackingRecHitSoA::PhiBinnerStorageType); + + // cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(HitSoAConstView& view, cudaStream_t stream) { + // // printf("%d \n",nModules()); + // auto ret = cms::cuda::make_host_unique(view.nMaxModules() + 1, stream); + // cudaCheck(cudaMemcpyAsync(ret.get(), view.hitsModuleStart().data(), sizeof(uint32_t) * (view.nMaxModules() + 1), cudaMemcpyDeviceToHost, stream)); + // return ret; + // } + + +} +#endif diff --git a/CUDADataFormats/TrackingRecHit/src/classes.h b/CUDADataFormats/TrackingRecHit/src/classes.h index abecfb38797de..b43537c915e3d 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes.h +++ b/CUDADataFormats/TrackingRecHit/src/classes.h @@ -4,6 +4,8 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/Common/interface/Wrapper.h" #endif // CUDADataFormats_SiPixelCluster_src_classes_h diff --git a/CUDADataFormats/TrackingRecHit/src/classes_def.xml b/CUDADataFormats/TrackingRecHit/src/classes_def.xml index f633d77c48ef7..fe92a1b6ea31e 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes_def.xml +++ b/CUDADataFormats/TrackingRecHit/src/classes_def.xml @@ -7,4 +7,8 @@ + + + + diff --git a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml index ce49c46fffba0..77626dbf724ff 100644 --- a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml +++ b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml @@ -1,5 +1,12 @@ + - + + + + + + + diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp new file mode 100644 index 0000000000000..eda9a97c02859 --- 
/dev/null +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp @@ -0,0 +1,59 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" + +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" + +namespace testTrackingRecHit2DNew { + + void run(TrackingRecHitSoADevice& hits, cudaStream_t stream); + +} + +int main() { + cms::cudatest::requireDevices(); + + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamDefault)); + + + // inner scope to deallocate memory before destroying the stream + { + uint32_t nHits = 2000; + int32_t offset = 100; + uint32_t moduleStart[1856]; + + for (size_t i = 0; i < 1856; i++) { + moduleStart[i] = i*2; + } + + TrackingRecHitSoADevice tkhit(nHits,false,offset,nullptr,&moduleStart[0],stream); + + testTrackingRecHit2DNew::run(tkhit,stream); + printf("tkhit hits %d \n",tkhit.nHits()); + + auto test = tkhit.localCoordToHostAsync(stream); + printf("test[9] %.2f\n",test[9]); + + // auto mods = tkhit.hitsModuleStartToHostAsync(stream); + // auto ret = cms::cuda::make_host_unique(tkhit.nModules() + 1, stream); + // uint32_t* ret; + // // cudaCheck(cudaMemcpyAsync(ret, &(tkhit.view().hitsModuleStart()), sizeof(uint32_t) * (tkhit.nModules() + 1), cudaMemcpyDeviceToHost, stream)); + // size_t skipSize = int(trackingRecHitSoA::columnsSizes * nHits); + // cudaCheck(cudaMemcpyAsync(ret, + // tkhit.const_buffer().get() + skipSize, + // sizeof(uint32_t) * (1856 + 1), + // cudaMemcpyDeviceToHost, + // ctx.stream())); + + printf("mods[9] %d\n",tkhit.hitsModuleStart()[9]); + } + + cudaCheck(cudaStreamDestroy(stream)); + + return 0; +} diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu new file mode 100644 index 0000000000000..eb042219d3a33 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu @@ -0,0 +1,71 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + +namespace testTrackingRecHit2DNew { + + __global__ void fill(trackingRecHitSoA::HitSoAView soa) { + // assert(soa); + + int i = threadIdx.x; + int j = blockIdx.x; + if(i==0 and j==0) + { + + soa.offsetBPIX2() = 22; + soa[10].xLocal() =1.11; + } + + soa[i].iphi() = i%10; + soa.hitsLayerStart()[j] = j; + //k = soa.test().a; + __syncthreads(); + } + + __global__ void show(trackingRecHitSoA::HitSoAView soa) { + // assert(soa); + + int i = threadIdx.x; + int j = blockIdx.x; + + if(i==0 and j==0) + { + printf("nbins = %d \n", soa.phiBinner().nbins()); + printf("mMaxModules = %d \n", soa.nMaxModules()); + printf("offsetBPIX %d ->%d \n",i,soa.offsetBPIX2()); + printf("nHits %d ->%d \n",i,soa.nHits()); + printf("hitsModuleStart %d ->%d \n",i,soa.hitsModuleStart().at(28)); + } + + if(i%d \n",i,soa[i].iphi()); + + if(j*blockDim.x+i < soa.phiBinner().nbins()) + printf(">bin size %d ->%d \n",j*blockDim.x+i,soa.phiBinner().size(j*blockDim.x+i)); + __syncthreads(); + } + + + + void run(TrackingRecHitSoADevice& hits, cudaStream_t stream) { + // assert(soa); + printf("RUN!\n"); + int k = 0; + 
fill<<<10, 100, 0, stream>>>(hits.view()); + printf("k = %d\n",k); + + cudaCheck(cudaDeviceSynchronize()); + + cms::cuda::fillManyFromVector(hits.phiBinner(), + 10, + hits.view().iphi(), + hits.view().hitsLayerStart().data(), + 2000, + 256, + hits.view().phiBinnerStorage(), + stream); + cudaCheck(cudaDeviceSynchronize()); + show<<<10, 1000, 0, stream>>>(hits.view()); + cudaCheck(cudaDeviceSynchronize()); + } + +} // namespace testTrackingRecHit2D diff --git a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h deleted file mode 100644 index 417a960951fb1..0000000000000 --- a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef CUDADataFormatsVertexZVertexHeterogeneous_H -#define CUDADataFormatsVertexZVertexHeterogeneous_H - -#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h" -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" - -using ZVertexHeterogeneous = HeterogeneousSoA; -#ifndef __CUDACC__ -#include "CUDADataFormats/Common/interface/Product.h" -using ZVertexCUDAProduct = cms::cuda::Product; -#endif - -#endif diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h deleted file mode 100644 index e31b87f30fa11..0000000000000 --- a/CUDADataFormats/Vertex/interface/ZVertexSoA.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef CUDADataFormats_Vertex_ZVertexSoA_h -#define CUDADataFormats_Vertex_ZVertexSoA_h - -#include -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" - -// SOA for vertices -// These vertices are clusterized and fitted only along the beam line (z) -// to obtain their global coordinate the beam spot position shall be added (eventually correcting for the beam angle as well) -struct ZVertexSoA { - static constexpr uint32_t MAXTRACKS = 32 * 1024; - static constexpr uint32_t MAXVTX = 1024; - - int16_t idv[MAXTRACKS]; // vertex index for each associated (original) track (-1 == not associate) - float zv[MAXVTX]; // output z-posistion of found vertices - float wv[MAXVTX]; // output weight (1/error^2) on the above - float chi2[MAXVTX]; // vertices chi2 - float ptv2[MAXVTX]; // vertices pt^2 - int32_t ndof[MAXTRACKS]; // vertices number of dof (reused as workspace for the number of nearest neighbours FIXME) - uint16_t sortInd[MAXVTX]; // sorted index (by pt2) ascending - uint32_t nvFinal; // the number of vertices - - __host__ __device__ void init() { nvFinal = 0; } -}; - -#endif // CUDADataFormats_Vertex_ZVertexSoA_h diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h new file mode 100644 index 0000000000000..ca97e2533b8d1 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h @@ -0,0 +1,27 @@ +#ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H +#define CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H + +#include + +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +template +class ZVertexSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { +public: + ZVertexSoAHeterogeneousDevice() = default; // cms::cuda::Product needs this + + // Constructor which specifies the SoA size + explicit ZVertexSoAHeterogeneousDevice(cudaStream_t stream) 
+ : PortableDeviceCollection>(S, stream) {} +}; + +namespace ZVertex { + + using ZVertexSoADevice = ZVertexSoAHeterogeneousDevice; + +} // namespace ZVertex + +#endif // CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h new file mode 100644 index 0000000000000..4c07bb3ffedb4 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h @@ -0,0 +1,25 @@ +#ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H +#define CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H + +#include + +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" + +template +class ZVertexSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { +public: + ZVertexSoAHeterogeneousHost() = default; + + // Constructor which specifies the SoA size and CUDA stream + explicit ZVertexSoAHeterogeneousHost(cudaStream_t stream) + : PortableHostCollection>(S, stream) {} +}; + +namespace ZVertex { + + using ZVertexSoAHost = ZVertexSoAHeterogeneousHost; + +} // namespace ZVertex + +#endif // CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H diff --git a/CUDADataFormats/Vertex/interface/ZVertexUtilities.h b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h new file mode 100644 index 0000000000000..d0614abee91c9 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h @@ -0,0 +1,35 @@ +#ifndef CUDADataFormats_Vertex_ZVertexUtilities_h +#define CUDADataFormats_Vertex_ZVertexUtilities_h + +#include +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +GENERATE_SOA_LAYOUT(ZVertexSoAHeterogeneousLayout, + SOA_COLUMN(int16_t, idv), + SOA_COLUMN(float, zv), + SOA_COLUMN(float, wv), + SOA_COLUMN(float, chi2), + SOA_COLUMN(float, ptv2), + SOA_COLUMN(int32_t, ndof), + SOA_COLUMN(uint16_t, sortInd), + SOA_SCALAR(uint32_t, nvFinal)) + +// Previous ZVertexSoA class methods. +// They operate on View and ConstView of the ZVertexSoA. 
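As for the track SoA above, a minimal sketch of the intended use of the vertex SoA (illustrative only, not part of this patch; it assumes an existing cudaStream_t stream):

    ZVertex::ZVertexSoAHost vertices(stream);      // PortableHostCollection over ZVertexSoAHeterogeneousLayout
    ZVertex::utilities::init(vertices.view());     // resets nvFinal() to 0, replacing the old ZVertexSoA::init()
    uint32_t nv = vertices.view().nvFinal();       // number of reconstructed vertices
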
+namespace ZVertex { + // Common types for both Host and Device code + using ZVertexSoALayout = ZVertexSoAHeterogeneousLayout<>; + using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View; + using ZVertexSoAConstView = ZVertexSoAHeterogeneousLayout<>::ConstView; + + namespace utilities { + + static constexpr uint32_t MAXTRACKS = 32 * 1024; + static constexpr uint32_t MAXVTX = 1024; + + __host__ __device__ inline void init(ZVertexSoAView &vertices) { vertices.nvFinal() = 0; } + + } // namespace utilities +} // namespace ZVertex + +#endif diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h index 7931beaa8f4bd..0340affffa06c 100644 --- a/CUDADataFormats/Vertex/src/classes.h +++ b/CUDADataFormats/Vertex/src/classes.h @@ -1,7 +1,8 @@ #ifndef CUDADataFormats_Vertex_src_classes_h #define CUDADataFormats_Vertex_src_classes_h -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "CUDADataFormats/Common/interface/Product.h" #include "DataFormats/Common/interface/Wrapper.h" diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml index ea633080af9af..58616cbb534fa 100644 --- a/CUDADataFormats/Vertex/src/classes_def.xml +++ b/CUDADataFormats/Vertex/src/classes_def.xml @@ -1,6 +1,8 @@ - - - - + + + + + + diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc index 7b12f694d4e8c..36c045582c942 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc @@ -2,7 +2,7 @@ // Package: SiPixelPhase1CompareTrackSoA // Class: SiPixelPhase1CompareTrackSoA // -/**\class SiPixelPhase1CompareTrackSoA SiPixelPhase1CompareTrackSoA.cc +/**\class SiPixelPhase1CompareTrackSoA SiPixelPhase1CompareTrackSoA.cc */ // // Author: Suvankar Roy Chowdhury @@ -20,7 +20,9 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" // for string manipulations #include @@ -71,8 +73,8 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - const edm::EDGetTokenT tokenSoATrackCPU_; - const edm::EDGetTokenT tokenSoATrackGPU_; + const edm::EDGetTokenT tokenSoATrackCPU_; + const edm::EDGetTokenT tokenSoATrackGPU_; const std::string topFolderName_; const bool useQualityCut_; const pixelTrack::Quality minQuality_; @@ -111,10 +113,12 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer { // // constructors // +// Note that the GPU TrackSoA is also of type TrackSoAHost, as the data have +// been copied from Device to Host SiPixelPhase1CompareTrackSoA::SiPixelPhase1CompareTrackSoA(const edm::ParameterSet& iConfig) - : tokenSoATrackCPU_(consumes(iConfig.getParameter("pixelTrackSrcCPU"))), - tokenSoATrackGPU_(consumes(iConfig.getParameter("pixelTrackSrcGPU"))), + : 
tokenSoATrackCPU_(consumes(iConfig.getParameter("pixelTrackSrcCPU"))), + tokenSoATrackGPU_(consumes(iConfig.getParameter("pixelTrackSrcGPU"))), topFolderName_(iConfig.getParameter("topFolderName")), useQualityCut_(iConfig.getParameter("useQualityCut")), minQuality_(pixelTrack::qualityByName(iConfig.getParameter("minQuality"))), @@ -138,12 +142,12 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: return; } - auto const& tsoaCPU = *tsoaHandleCPU->get(); - auto const& tsoaGPU = *tsoaHandleGPU->get(); - auto maxTracksCPU = tsoaCPU.stride(); //this should be same for both? - auto maxTracksGPU = tsoaGPU.stride(); //this should be same for both? - auto const* qualityCPU = tsoaCPU.qualityData(); - auto const* qualityGPU = tsoaGPU.qualityData(); + auto& tsoaCPU = *tsoaHandleCPU.product(); + auto& tsoaGPU = *tsoaHandleGPU.product(); + auto maxTracksCPU = tsoaCPU.view().metadata().size(); //this should be same for both? + auto maxTracksGPU = tsoaGPU.view().metadata().size(); //this should be same for both? + auto const* qualityCPU = pixelTrack::utilities::qualityData(tsoaCPU.view()); + auto const* qualityGPU = pixelTrack::utilities::qualityData(tsoaGPU.view()); int32_t nTracksCPU = 0; int32_t nTracksGPU = 0; int32_t nLooseAndAboveTracksCPU = 0; @@ -153,9 +157,9 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: //Loop over GPU tracks and store the indices of the loose tracks. Whats happens if useQualityCut_ is false? std::vector looseTrkidxGPU; for (int32_t jt = 0; jt < maxTracksGPU; ++jt) { - if (tsoaGPU.nHits(jt) == 0) + if (pixelTrack::utilities::nHits(tsoaGPU.view(), jt) == 0) break; // this is a guard - if (!(tsoaGPU.pt(jt) > 0.)) + if (!(tsoaGPU.view()[jt].pt() > 0.)) continue; nTracksGPU++; if (useQualityCut_ && qualityGPU[jt] < minQuality_) @@ -166,9 +170,18 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: //Now loop over CPU tracks//nested loop for loose gPU tracks for (int32_t it = 0; it < maxTracksCPU; ++it) { - if (tsoaCPU.nHits(it) == 0) + float chi2CPU = tsoaCPU.view()[it].chi2(); + int nHitsCPU = pixelTrack::utilities::nHits(tsoaCPU.view(), it); + int8_t nLayersCPU = tsoaCPU.view()[it].nLayers(); + float ptCPU = tsoaCPU.view()[it].pt(); + float etaCPU = tsoaCPU.view()[it].eta(); + float phiCPU = pixelTrack::utilities::phi(tsoaCPU.view(), it); + float zipCPU = pixelTrack::utilities::zip(tsoaCPU.view(), it); + float tipCPU = pixelTrack::utilities::tip(tsoaCPU.view(), it); + + if (nHitsCPU == 0) break; // this is a guard - if (!(tsoaCPU.pt(it) > 0.)) + if (!(ptCPU > 0.)) continue; nTracksCPU++; if (useQualityCut_ && qualityCPU[it] < minQuality_) @@ -178,12 +191,10 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: const int32_t notFound = -1; int32_t closestTkidx = notFound; float mindr2 = dr2cut_; - float etacpu = tsoaCPU.eta(it); - float phicpu = tsoaCPU.phi(it); for (auto gid : looseTrkidxGPU) { - float etagpu = tsoaGPU.eta(gid); - float phigpu = tsoaGPU.phi(gid); - float dr2 = reco::deltaR2(etacpu, phicpu, etagpu, phigpu); + float etaGPU = tsoaGPU.view()[gid].eta(); + float phiGPU = pixelTrack::utilities::phi(tsoaGPU.view(), gid); + float dr2 = reco::deltaR2(etaCPU, phiCPU, etaGPU, phiGPU); if (dr2 > dr2cut_) continue; // this is arbitrary if (mindr2 > dr2) { @@ -192,27 +203,36 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: } } - hpt_eta_tkAllCPU_->Fill(etacpu, tsoaCPU.pt(it)); //all CPU tk - 
hphi_z_tkAllCPU_->Fill(phicpu, tsoaCPU.zip(it)); + hpt_eta_tkAllCPU_->Fill(etaCPU, ptCPU); //all CPU tk + hphi_z_tkAllCPU_->Fill(phiCPU, zipCPU); if (closestTkidx == notFound) continue; nLooseAndAboveTracksCPU_matchedGPU++; - hchi2_->Fill(tsoaCPU.chi2(it), tsoaGPU.chi2(closestTkidx)); - hnHits_->Fill(tsoaCPU.nHits(it), tsoaGPU.nHits(closestTkidx)); - hnLayers_->Fill(tsoaCPU.nLayers(it), tsoaGPU.nLayers(closestTkidx)); - hpt_->Fill(tsoaCPU.pt(it), tsoaGPU.pt(closestTkidx)); - hptLogLog_->Fill(tsoaCPU.pt(it), tsoaGPU.pt(closestTkidx)); - heta_->Fill(etacpu, tsoaGPU.eta(closestTkidx)); - hphi_->Fill(phicpu, tsoaGPU.phi(closestTkidx)); - hz_->Fill(tsoaCPU.zip(it), tsoaGPU.zip(closestTkidx)); - htip_->Fill(tsoaCPU.tip(it), tsoaGPU.tip(closestTkidx)); - hptdiffMatched_->Fill(tsoaCPU.pt(it) - tsoaGPU.pt(closestTkidx)); - hetadiffMatched_->Fill(etacpu - tsoaGPU.eta(closestTkidx)); - hphidiffMatched_->Fill(reco::deltaPhi(phicpu, tsoaGPU.phi(closestTkidx))); - hzdiffMatched_->Fill(tsoaCPU.zip(it) - tsoaGPU.zip(closestTkidx)); - hpt_eta_tkAllCPUMatched_->Fill(etacpu, tsoaCPU.pt(it)); //matched to gpu - hphi_z_tkAllCPUMatched_->Fill(phicpu, tsoaCPU.zip(it)); + float chi2GPU = tsoaGPU.view()[closestTkidx].chi2(); + int nHitsGPU = pixelTrack::utilities::nHits(tsoaGPU.view(), closestTkidx); + int8_t nLayersGPU = tsoaGPU.view()[closestTkidx].nLayers(); + float ptGPU = tsoaGPU.view()[closestTkidx].pt(); + float etaGPU = tsoaGPU.view()[closestTkidx].eta(); + float phiGPU = pixelTrack::utilities::phi(tsoaGPU.view(), closestTkidx); + float zipGPU = pixelTrack::utilities::zip(tsoaGPU.view(), closestTkidx); + float tipGPU = pixelTrack::utilities::tip(tsoaGPU.view(), closestTkidx); + + hchi2_->Fill(chi2CPU, chi2GPU); + hnHits_->Fill(nHitsCPU, nHitsGPU); + hnLayers_->Fill(nLayersCPU, nLayersGPU); + hpt_->Fill(ptCPU, ptCPU); + hptLogLog_->Fill(ptCPU, ptGPU); + heta_->Fill(etaCPU, etaGPU); + hphi_->Fill(phiCPU, phiGPU); + hz_->Fill(zipCPU, zipGPU); + htip_->Fill(tipCPU, tipGPU); + hptdiffMatched_->Fill(ptCPU - ptGPU); + hetadiffMatched_->Fill(etaCPU - etaGPU); + hphidiffMatched_->Fill(reco::deltaPhi(phiCPU, phiGPU)); + hzdiffMatched_->Fill(zipCPU - zipGPU); + hpt_eta_tkAllCPUMatched_->Fill(etaCPU, ptCPU); //matched to gpu + hphi_z_tkAllCPUMatched_->Fill(phiCPU, zipCPU); } hnTracks_->Fill(nTracksCPU, nTracksGPU); hnLooseAndAboveTracks_->Fill(nLooseAndAboveTracksCPU, nLooseAndAboveTracksGPU); diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc index 0113ea50973d8..9172824631da2 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc @@ -18,7 +18,7 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer { @@ -31,8 +31,9 @@ class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - const edm::EDGetTokenT tokenSoAVertexCPU_; - const edm::EDGetTokenT tokenSoAVertexGPU_; + const edm::EDGetTokenT tokenSoAVertexCPU_; + // Note that this has been copied from device to host, 
hence is a HostCollection + const edm::EDGetTokenT tokenSoAVertexGPU_; const edm::EDGetTokenT tokenBeamSpot_; const std::string topFolderName_; const float dzCut_; @@ -54,8 +55,8 @@ class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer { // SiPixelPhase1CompareVertexSoA::SiPixelPhase1CompareVertexSoA(const edm::ParameterSet& iConfig) - : tokenSoAVertexCPU_(consumes(iConfig.getParameter("pixelVertexSrcCPU"))), - tokenSoAVertexGPU_(consumes(iConfig.getParameter("pixelVertexSrcGPU"))), + : tokenSoAVertexCPU_(consumes(iConfig.getParameter("pixelVertexSrcCPU"))), + tokenSoAVertexGPU_(consumes(iConfig.getParameter("pixelVertexSrcGPU"))), tokenBeamSpot_(consumes(iConfig.getParameter("beamSpotSrc"))), topFolderName_(iConfig.getParameter("topFolderName")), dzCut_(iConfig.getParameter("dzCut")) {} @@ -78,10 +79,10 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm: return; } - auto const& vsoaCPU = *vsoaHandleCPU->get(); - int nVerticesCPU = vsoaCPU.nvFinal; - auto const& vsoaGPU = *vsoaHandleGPU->get(); - int nVerticesGPU = vsoaGPU.nvFinal; + auto& vsoaCPU = *vsoaHandleCPU; + int nVerticesCPU = vsoaCPU.view().nvFinal(); + auto& vsoaGPU = *vsoaHandleGPU; + int nVerticesGPU = vsoaGPU.view().nvFinal(); auto bsHandle = iEvent.getHandle(tokenBeamSpot_); float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.; @@ -97,22 +98,22 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm: } for (int ivc = 0; ivc < nVerticesCPU; ivc++) { - auto sic = vsoaCPU.sortInd[ivc]; - auto zc = vsoaCPU.zv[sic]; + auto sic = vsoaCPU.view()[ivc].sortInd(); + auto zc = vsoaCPU.view()[sic].zv(); auto xc = x0 + dxdz * zc; auto yc = y0 + dydz * zc; zc += z0; - auto ndofCPU = vsoaCPU.ndof[sic]; - auto chi2CPU = vsoaCPU.chi2[sic]; + auto ndofCPU = vsoaCPU.view()[sic].ndof(); + auto chi2CPU = vsoaCPU.view()[sic].chi2(); const int32_t notFound = -1; int32_t closestVtxidx = notFound; float mindz = dzCut_; for (int ivg = 0; ivg < nVerticesGPU; ivg++) { - auto sig = vsoaGPU.sortInd[ivg]; - auto zgc = vsoaGPU.zv[sig] + z0; + auto sig = vsoaGPU.view()[ivg].sortInd(); + auto zgc = vsoaGPU.view()[sig].zv() + z0; auto zDist = std::abs(zc - zgc); //insert some matching condition if (zDist > dzCut_) @@ -125,12 +126,12 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm: if (closestVtxidx == notFound) continue; - auto zg = vsoaGPU.zv[closestVtxidx]; + auto zg = vsoaGPU.view()[closestVtxidx].zv(); auto xg = x0 + dxdz * zg; auto yg = y0 + dydz * zg; zg += z0; - auto ndofGPU = vsoaGPU.ndof[closestVtxidx]; - auto chi2GPU = vsoaGPU.chi2[closestVtxidx]; + auto ndofGPU = vsoaGPU.view()[closestVtxidx].ndof(); + auto chi2GPU = vsoaGPU.view()[closestVtxidx].chi2(); hx_->Fill(xc - x0, xg - x0); hy_->Fill(yc - y0, yg - y0); @@ -140,7 +141,7 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm: hzdiff_->Fill(zc - zg); hchi2_->Fill(chi2CPU, chi2GPU); hchi2oNdof_->Fill(chi2CPU / ndofCPU, chi2GPU / ndofGPU); - hptv2_->Fill(vsoaCPU.ptv2[sic], vsoaGPU.ptv2[closestVtxidx]); + hptv2_->Fill(vsoaCPU.view()[sic].ptv2(), vsoaGPU.view()[closestVtxidx].ptv2()); hntrks_->Fill(ndofCPU + 1, ndofGPU + 1); } hnVertex_->Fill(nVerticesCPU, nVerticesGPU); diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc index 622895ba07bcc..b4c996afc7055 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc +++ 
b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc @@ -21,7 +21,8 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" // for string manipulations #include @@ -34,7 +35,7 @@ class SiPixelPhase1MonitorTrackSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - edm::EDGetTokenT tokenSoATrack_; + edm::EDGetTokenT tokenSoATrack_; std::string topFolderName_; bool useQualityCut_; pixelTrack::Quality minQuality_; @@ -62,7 +63,7 @@ class SiPixelPhase1MonitorTrackSoA : public DQMEDAnalyzer { // SiPixelPhase1MonitorTrackSoA::SiPixelPhase1MonitorTrackSoA(const edm::ParameterSet& iConfig) { - tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); topFolderName_ = iConfig.getParameter("topFolderName"); //"SiPixelHeterogeneous/PixelTrackSoA"; useQualityCut_ = iConfig.getParameter("useQualityCut"); minQuality_ = pixelTrack::qualityByName(iConfig.getParameter("minQuality")); @@ -78,23 +79,24 @@ void SiPixelPhase1MonitorTrackSoA::analyze(const edm::Event& iEvent, const edm:: return; } - auto const& tsoa = *((tsoaHandle.product())->get()); - auto maxTracks = tsoa.stride(); - auto const* quality = tsoa.qualityData(); + auto& tsoa = *tsoaHandle.product(); + auto maxTracks = tsoa.view().metadata().size(); + auto const* quality = pixelTrack::utilities::qualityData(tsoa.const_view()); int32_t nTracks = 0; int32_t nLooseAndAboveTracks = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - auto nLayers = tsoa.nLayers(it); + auto nHits = pixelTrack::utilities::nHits(tsoa.const_view(), it); + auto nLayers = tsoa.view()[it].nLayers(); if (nHits == 0) break; // this is a guard - float pt = tsoa.pt(it); + float pt = tsoa.view()[it].pt(); if (!(pt > 0.)) continue; // fill the quality for all tracks - pixelTrack::Quality qual = tsoa.quality(it); + // pixelTrack::Quality qual = tsoa.view()[it].quality(); + pixelTrack::Quality qual = quality[it]; hquality->Fill(int(qual)); nTracks++; @@ -102,11 +104,11 @@ void SiPixelPhase1MonitorTrackSoA::analyze(const edm::Event& iEvent, const edm:: continue; // fill parameters only for quality >= loose - float chi2 = tsoa.chi2(it); - float phi = tsoa.phi(it); - float zip = tsoa.zip(it); - float eta = tsoa.eta(it); - float tip = tsoa.tip(it); + float chi2 = tsoa.view()[it].chi2(); + float phi = pixelTrack::utilities::phi(tsoa.const_view(), it); + float zip = pixelTrack::utilities::zip(tsoa.const_view(), it); + float eta = tsoa.view()[it].eta(); + float tip = pixelTrack::utilities::tip(tsoa.const_view(), it); hchi2->Fill(chi2); hChi2VsPhi->Fill(phi, chi2); diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc index af6c240a69172..27e0df36a17a4 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc @@ -21,7 +21,7 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include 
"CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer { @@ -34,7 +34,7 @@ class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - edm::EDGetTokenT tokenSoAVertex_; + edm::EDGetTokenT tokenSoAVertex_; edm::EDGetTokenT tokenBeamSpot_; std::string topFolderName_; MonitorElement* hnVertex; @@ -52,7 +52,7 @@ class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer { // SiPixelPhase1MonitorVertexSoA::SiPixelPhase1MonitorVertexSoA(const edm::ParameterSet& iConfig) { - tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); + tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); tokenBeamSpot_ = consumes(iConfig.getParameter("beamSpotSrc")); topFolderName_ = iConfig.getParameter("topFolderName"); } @@ -67,8 +67,8 @@ void SiPixelPhase1MonitorVertexSoA::analyze(const edm::Event& iEvent, const edm: return; } - auto const& vsoa = *((vsoaHandle.product())->get()); - int nVertices = vsoa.nvFinal; + auto& vsoa = *vsoaHandle.product(); + int nVertices = vsoa.view().nvFinal(); auto bsHandle = iEvent.getHandle(tokenBeamSpot_); float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.; if (!bsHandle.isValid()) { @@ -82,18 +82,18 @@ void SiPixelPhase1MonitorVertexSoA::analyze(const edm::Event& iEvent, const edm: dydz = bs.dydz(); } for (int iv = 0; iv < nVertices; iv++) { - auto si = vsoa.sortInd[iv]; - auto z = vsoa.zv[si]; + auto si = vsoa.view()[iv].sortInd(); + auto z = vsoa.view()[si].zv(); auto x = x0 + dxdz * z; auto y = y0 + dydz * z; z += z0; hx->Fill(x); hy->Fill(y); hz->Fill(z); - auto ndof = vsoa.ndof[si]; - hchi2->Fill(vsoa.chi2[si]); - hchi2oNdof->Fill(vsoa.chi2[si] / ndof); - hptv2->Fill(vsoa.ptv2[si]); + auto ndof = vsoa.view()[si].ndof(); + hchi2->Fill(vsoa.view()[si].chi2()); + hchi2oNdof->Fill(vsoa.view()[si].chi2() / ndof); + hptv2->Fill(vsoa.view()[si].ptv2()); hntrks->Fill(ndof + 1); } hnVertex->Fill(nVertices); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index 135254fa6e9f2..0fea13b849ef1 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -13,6 +13,7 @@ #include "PixelRecHitGPUKernel.h" #include "gpuPixelRecHits.h" +#define GPU_DEBUG namespace { __global__ void setHitsLayerStart(uint32_t const* __restrict__ hitsModuleStart, pixelCPEforGPU::ParamsOnGPU const* cpeParams, @@ -34,7 +35,7 @@ namespace { namespace pixelgpudetails { - TrackingRecHit2DGPU PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, + TrackingRecHitSoADevice PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, SiPixelClustersCUDA const& clusters_d, BeamSpotCUDA const& bs_d, pixelCPEforGPU::ParamsOnGPU const* cpeParams, @@ -42,10 +43,11 @@ namespace pixelgpudetails { cudaStream_t stream) const { auto nHits = clusters_d.nClusters(); - TrackingRecHit2DGPU hits_d( - nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); - assert(hits_d.nMaxModules() == isPhase2 ? 
phase2PixelTopology::numberOfModules - : phase1PixelTopology::numberOfModules); + TrackingRecHitSoADevice hits_d(nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); + // TrackingRecHit2DGPU hits_d( + // nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); + // assert(hits_d.nMaxModules() == isPhase2 ? phase2PixelTopology::numberOfModules + // : phase1PixelTopology::numberOfModules); int activeModulesWithDigis = digis_d.nModules(); // protect from empty events @@ -65,16 +67,16 @@ namespace pixelgpudetails { // assuming full warp of threads is better than a smaller number... if (nHits) { - setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); + setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.view().hitsLayerStart().data()); cudaCheck(cudaGetLastError()); auto nLayers = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; cms::cuda::fillManyFromVector(hits_d.phiBinner(), nLayers, - hits_d.iphi(), - hits_d.hitsLayerStart(), + hits_d.view().iphi(), + hits_d.view().hitsLayerStart().data(), nHits, 256, - hits_d.phiBinnerStorage(), + hits_d.view().phiBinnerStorage(), stream); cudaCheck(cudaGetLastError()); @@ -83,6 +85,10 @@ namespace pixelgpudetails { #endif } } + #ifdef GPU_DEBUG + cudaCheck(cudaDeviceSynchronize()); + std::cout << "DONE" << std::endl; + #endif return hits_d; } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h index 8289c8db7f2f4..ada509f57de0a 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h @@ -8,8 +8,9 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" +#define GPU_DEBUG namespace pixelgpudetails { class PixelRecHitGPUKernel { @@ -22,7 +23,7 @@ namespace pixelgpudetails { PixelRecHitGPUKernel& operator=(const PixelRecHitGPUKernel&) = delete; PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete; - TrackingRecHit2DGPU makeHitsAsync(SiPixelDigisCUDA const& digis_d, + TrackingRecHitSoADevice makeHitsAsync(SiPixelDigisCUDA const& digis_d, SiPixelClustersCUDA const& clusters_d, BeamSpotCUDA const& bs_d, pixelCPEforGPU::ParamsOnGPU const* cpeParams, diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc index 8112e9ebd19c8..2f5c5710b1586 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc @@ -4,7 +4,7 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" @@ 
-23,6 +23,8 @@ #include "PixelRecHitGPUKernel.h" +#define GPU_DEBUG + class SiPixelRecHitCUDA : public edm::global::EDProducer<> { public: explicit SiPixelRecHitCUDA(const edm::ParameterSet& iConfig); @@ -37,7 +39,7 @@ class SiPixelRecHitCUDA : public edm::global::EDProducer<> { const edm::EDGetTokenT> tBeamSpot; const edm::EDGetTokenT> token_; const edm::EDGetTokenT> tokenDigi_; - const edm::EDPutTokenT> tokenHit_; + const edm::EDPutTokenT> tokenHit_; const pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_; }; @@ -47,7 +49,7 @@ SiPixelRecHitCUDA::SiPixelRecHitCUDA(const edm::ParameterSet& iConfig) tBeamSpot(consumes>(iConfig.getParameter("beamSpot"))), token_(consumes>(iConfig.getParameter("src"))), tokenDigi_(consumes>(iConfig.getParameter("src"))), - tokenHit_(produces>()) {} + tokenHit_(produces>()) {} void SiPixelRecHitCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -82,6 +84,7 @@ void SiPixelRecHitCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, cons tokenHit_, gpuAlgo_.makeHitsAsync( digis, clusters, bs, fcpe->getGPUProductAsync(ctx.stream()), fcpe->isPhase2(), ctx.stream())); + std::cout << __LINE__< { public: explicit SiPixelRecHitFromCUDA(const edm::ParameterSet& iConfig); @@ -40,7 +42,7 @@ class SiPixelRecHitFromCUDA : public edm::stream::EDProducer void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; const edm::ESGetToken geomToken_; - const edm::EDGetTokenT> hitsToken_; // CUDA hits + const edm::EDGetTokenT> hitsToken_; // CUDA hits const edm::EDGetTokenT clusterToken_; // legacy clusters const edm::EDPutTokenT rechitsPutToken_; // legacy rechits const edm::EDPutTokenT hostPutToken_; @@ -48,13 +50,14 @@ class SiPixelRecHitFromCUDA : public edm::stream::EDProducer uint32_t nHits_; uint32_t nMaxModules_; cms::cuda::host::unique_ptr store32_; + // uint32_t* hitsModuleStart_; cms::cuda::host::unique_ptr hitsModuleStart_; }; SiPixelRecHitFromCUDA::SiPixelRecHitFromCUDA(const edm::ParameterSet& iConfig) : geomToken_(esConsumes()), hitsToken_( - consumes>(iConfig.getParameter("pixelRecHitSrc"))), + consumes>(iConfig.getParameter("pixelRecHitSrc"))), clusterToken_(consumes(iConfig.getParameter("src"))), rechitsPutToken_(produces()), hostPutToken_(produces()) {} @@ -69,18 +72,35 @@ void SiPixelRecHitFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& des void SiPixelRecHitFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsToken_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsToken_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); - + std::cout << __LINE__< hclusters = iEvent.getHandle(clusterToken_); auto const& input = *hclusters; constexpr uint32_t maxHitsInModule = gpuClustering::maxHitsInModule(); - + std::cout << __LINE__ << std::endl; int numberOfDetUnits = 0; int numberOfClusters = 0; for (auto const& dsv : input) { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc index 7532470ebd3d4..aedaf6955c747 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc @@ -24,6 +24,9 @@ #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include 
"RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer { public: explicit SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig); @@ -38,22 +41,24 @@ class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer> hitsTokenGPU_; // CUDA hits - const edm::EDPutTokenT hitsPutTokenCPU_; + const edm::EDGetTokenT> hitsTokenGPU_; // CUDA hits + const edm::EDPutTokenT hitsPutTokenCPU_; const edm::EDPutTokenT hostPutToken_; uint32_t nHits_; + TrackingRecHitSoAHost hits_h_; + uint32_t nMaxModules_; - cms::cuda::host::unique_ptr store32_; - cms::cuda::host::unique_ptr store16_; - cms::cuda::host::unique_ptr hitsModuleStart_; + // cms::cuda::host::unique_ptr store32_; + // cms::cuda::host::unique_ptr store16_; + // cms::cuda::host::unique_ptr hitsModuleStart_; }; SiPixelRecHitSoAFromCUDA::SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig) : hitsTokenGPU_( - consumes>(iConfig.getParameter("pixelRecHitSrc"))), - hitsPutTokenCPU_(produces()), + consumes>(iConfig.getParameter("pixelRecHitSrc"))), + hitsPutTokenCPU_(produces()), hostPutToken_(produces()) {} void SiPixelRecHitSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -65,29 +70,42 @@ void SiPixelRecHitSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& void SiPixelRecHitSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsTokenGPU_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsTokenGPU_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); - nHits_ = inputData.nHits(); - LogDebug("SiPixelRecHitSoAFromCUDA") << "copying to cpu SoA" << inputData.nHits() << " Hits"; + nHits_ = inputData.view().nHits(); if (0 == nHits_) return; - nMaxModules_ = inputData.nMaxModules(); - store32_ = inputData.store32ToHostAsync(ctx.stream()); - store16_ = inputData.store16ToHostAsync(ctx.stream()); - hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream()); + + nMaxModules_ = inputData.view().nMaxModules(); + + hits_h_ = TrackingRecHitSoAHost(nHits_,ctx.stream()); + cudaCheck(cudaMemcpyAsync(hits_h_.buffer().get(), + inputData.const_buffer().get(), + inputData.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); // Copy data from Device to Host + cudaCheck(cudaGetLastError()); + + + LogDebug("SiPixelRecHitSoAFromCUDA") << "copying to cpu SoA" << inputData.nHits() << " Hits"; + + // store32_ = inputData.store32ToHostAsync(ctx.stream()); + // store16_ = inputData.store16ToHostAsync(ctx.stream()); + // hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream()); } void SiPixelRecHitSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& es) { auto hmsp = std::make_unique(nMaxModules_ + 1); if (nHits_ > 0) - std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get()); + std::copy(hits_h_.view().hitsModuleStart().begin(),hits_h_.view().hitsModuleStart().end(),hmsp.get()); + // std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get()); iEvent.emplace(hostPutToken_, std::move(hmsp)); - iEvent.emplace(hitsPutTokenCPU_, store32_.get(), store16_.get(), 
hitsModuleStart_.get(), nHits_); + iEvent.emplace(hitsPutTokenCPU_, std::move(hits_h_));//store32_.get(), store16_.get(), hitsModuleStart_.get(), nHits_); } DEFINE_FWK_MODULE(SiPixelRecHitSoAFromCUDA); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index d23ecec66fea0..663674b2a4145 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -3,7 +3,7 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "CUDADataFormats/Common/interface/HostProduct.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" @@ -25,6 +25,8 @@ #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" + #include "gpuPixelRecHits.h" class SiPixelRecHitSoAFromLegacy : public edm::global::EDProducer<> { @@ -44,7 +46,7 @@ class SiPixelRecHitSoAFromLegacy : public edm::global::EDProducer<> { const edm::ESGetToken cpeToken_; const edm::EDGetTokenT bsGetToken_; const edm::EDGetTokenT clusterToken_; // Legacy Clusters - const edm::EDPutTokenT tokenHit_; + const edm::EDPutTokenT tokenHit_; const edm::EDPutTokenT tokenModuleStart_; const bool convert2Legacy_; const bool isPhase2_; @@ -55,7 +57,7 @@ SiPixelRecHitSoAFromLegacy::SiPixelRecHitSoAFromLegacy(const edm::ParameterSet& cpeToken_(esConsumes(edm::ESInputTag("", iConfig.getParameter("CPE")))), bsGetToken_{consumes(iConfig.getParameter("beamSpot"))}, clusterToken_{consumes(iConfig.getParameter("src"))}, - tokenHit_{produces()}, + tokenHit_{produces()}, tokenModuleStart_{produces()}, convert2Legacy_(iConfig.getParameter("convertToLegacy")), isPhase2_(iConfig.getParameter("isPhase2")) { @@ -156,9 +158,9 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv // output SoA // element 96 is the start of BPIX2 (i.e. 
the number of clusters in BPIX1) - auto output = std::make_unique( + auto output = std::make_unique( numberOfClusters, isPhase2_, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, nullptr); - assert(output->nMaxModules() == uint32_t(nMaxModules)); + assert(output->nModules() == uint32_t(nMaxModules)); if (0 == numberOfClusters) { iEvent.put(std::move(output)); @@ -239,9 +241,9 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv gpuPixelRecHits::getHits(&cpeView, &bsHost, digiView, ndigi, &clusterView, output->view()); for (auto h = fc; h < lc; ++h) if (h - fc < maxHitsInModule) - assert(gind == output->view()->detectorIndex(h)); + assert(gind == output->view()[h].detectorIndex()); else - assert(gpuClustering::invalidModuleId == output->view()->detectorIndex(h)); + assert(gpuClustering::invalidModuleId == output->view()[h].detectorIndex()); if (convert2Legacy_) { SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(*legacyOutput, detid); for (auto h = fc; h < lc; ++h) { @@ -250,8 +252,8 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv if (ih >= maxHitsInModule) break; assert(ih < clusterRef.size()); - LocalPoint lp(output->view()->xLocal(h), output->view()->yLocal(h)); - LocalError le(output->view()->xerrLocal(h), 0, output->view()->yerrLocal(h)); + LocalPoint lp(output->view()[h].xLocal(), output->view()[h].yLocal()); + LocalError le(output->view()[h].xerrLocal(), 0, output->view()[h].yerrLocal()); SiPixelRecHitQuality::QualWordType rqw = 0; SiPixelRecHit hit(lp, le, rqw, *genericDet, clusterRef[ih]); recHitsOnDetUnit.push_back(hit); @@ -264,20 +266,21 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv // fill data structure to support CA const auto nLayers = isPhase2_ ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; for (auto i = 0U; i < nLayers + 1; ++i) { - output->hitsLayerStart()[i] = hitsModuleStart[cpeView.layerGeometry().layerStart[i]]; + output->view().hitsLayerStart()[i] = hitsModuleStart[cpeView.layerGeometry().layerStart[i]]; LogDebug("SiPixelRecHitSoAFromLegacy") << "Layer n." 
<< i << " - starting at module: " << cpeView.layerGeometry().layerStart[i] - << " - starts ad cluster: " << output->hitsLayerStart()[i] << "\n"; + << " - starts ad cluster: " << output->view()[i].hitsLayerStart() << "\n"; } - cms::cuda::fillManyFromVector(output->phiBinner(), + cms::cuda::fillManyFromVector(&(output->view().phiBinner()), nLayers, - output->iphi(), - output->hitsLayerStart(), - numberOfHits, + output->view().iphi(), + output->view().hitsLayerStart().data(), + output->nHits(), 256, output->phiBinnerStorage()); + LogDebug("SiPixelRecHitSoAFromLegacy") << "created HitSoa for " << numberOfClusters << " clusters in " << numberOfDetUnits << " Dets"; iEvent.put(std::move(output)); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index 5b862b2cf63b9..9dbab6c900030 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -7,12 +7,14 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#define GPU_DEBUG namespace gpuPixelRecHits { __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams, @@ -20,14 +22,14 @@ namespace gpuPixelRecHits { SiPixelDigisCUDASOAView const digis, int numElements, SiPixelClustersCUDA::SiPixelClustersCUDASOAView const* __restrict__ pclusters, - TrackingRecHit2DSOAView* phits) { + trackingRecHitSoA::HitSoAView hits) { // FIXME // the compiler seems NOT to optimize loads from views (even in a simple test case) // The whole gimnastic here of copying or not is a pure heuristic exercise that seems to produce the fastest code with the above signature // not using views (passing a gazzilion of array pointers) seems to produce the fastest code (but it is harder to mantain) - assert(phits); + // assert(phits); assert(cpeParams); - auto& hits = *phits; + // auto& hits = *phits; auto const& clusters = *pclusters; auto isPhase2 = cpeParams->commonParams().isPhase2; @@ -175,18 +177,19 @@ namespace gpuPixelRecHits { pixelCPEforGPU::errorFromSize(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); // store it - hits.setChargeAndStatus(h, clusParams.charge[ic], clusParams.status[ic]); - hits.detectorIndex(h) = me; + hits[h].chargeAndStatus().charge = clusParams.charge[ic]; + hits[h].chargeAndStatus().status = clusParams.status[ic]; + hits[h].detectorIndex() = me; float xl, yl; - hits.xLocal(h) = xl = clusParams.xpos[ic]; - hits.yLocal(h) = yl = clusParams.ypos[ic]; + hits[h].xLocal() = xl = clusParams.xpos[ic]; + hits[h].yLocal() = yl = clusParams.ypos[ic]; - hits.clusterSizeX(h) = clusParams.xsize[ic]; - hits.clusterSizeY(h) = clusParams.ysize[ic]; + hits[h].clusterSizeX() = clusParams.xsize[ic]; + hits[h].clusterSizeY() = clusParams.ysize[ic]; - hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX; - hits.yerrLocal(h) = 
clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY; + hits[h].xerrLocal() = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX; + hits[h].yerrLocal() = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY; // keep it local for computations float xg, yg, zg; @@ -197,12 +200,12 @@ namespace gpuPixelRecHits { yg -= bs->y; zg -= bs->z; - hits.xGlobal(h) = xg; - hits.yGlobal(h) = yg; - hits.zGlobal(h) = zg; + hits[h].xGlobal() = xg; + hits[h].yGlobal() = yg; + hits[h].zGlobal() = zg; - hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg); - hits.iphi(h) = unsafe_atan2s<7>(yg, xg); + hits[h].rGlobal() = std::sqrt(xg * xg + yg * yg); + hits[h].iphi() = unsafe_atan2s<7>(yg, xg); } __syncthreads(); } // end loop on batches diff --git a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py index 4af0238682abb..11a69fead8ad3 100644 --- a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py +++ b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py @@ -56,7 +56,7 @@ siPixelRecHitsPreSplittingSoA = SwitchProducerCUDA( cpu = cms.EDAlias( siPixelRecHitsPreSplittingCPU = cms.VPSet( - cms.PSet(type = cms.string("cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous")), + cms.PSet(type = cms.string("TrackingRecHitSoAHost")), cms.PSet(type = cms.string("uintAsHostProduct")) )), ) diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc index f3d6022e21654..6bf47b7302da1 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc @@ -1,9 +1,11 @@ #include #include "CUDADataFormats/Common/interface/Product.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ConsumesCollector.h" #include "FWCore/Framework/interface/Event.h" @@ -30,22 +32,25 @@ class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> { private: void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override; const bool m_onGPU; - edm::EDGetTokenT> tokenGPUTrack_; - edm::EDGetTokenT> tokenGPUVertex_; - edm::EDGetTokenT tokenSoATrack_; - edm::EDGetTokenT tokenSoAVertex_; + // GPU + edm::EDGetTokenT> tokenGPUTrack_; + edm::EDGetTokenT> tokenGPUVertex_; + + // CPU + edm::EDGetTokenT tokenSoATrack_; + edm::EDGetTokenT tokenSoAVertex_; }; PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) : m_onGPU(iConfig.getParameter("onGPU")) { if (m_onGPU) { tokenGPUTrack_ = - consumes>(iConfig.getParameter("pixelTrackSrc")); + consumes>(iConfig.getParameter("pixelTrackSrc")); tokenGPUVertex_ = - consumes>(iConfig.getParameter("pixelVertexSrc")); + consumes>(iConfig.getParameter("pixelVertexSrc")); } else { - tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); - tokenSoAVertex_ = 
consumes(iConfig.getParameter("pixelVertexSrc")); + tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); } } @@ -66,19 +71,20 @@ void PixelTrackDumpCUDA::analyze(edm::StreamID streamID, cms::cuda::ScopedContextProduce ctx{hTracks}; auto const& tracks = ctx.get(hTracks); - auto const* tsoa = tracks.get(); + auto const* tsoa = &tracks; assert(tsoa); auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_)); - auto const* vsoa = vertices.get(); + //auto const* vsoa = vertices.get(); + auto const* vsoa = &vertices; assert(vsoa); } else { - auto const* tsoa = iEvent.get(tokenSoATrack_).get(); - assert(tsoa); + auto const& tsoa = iEvent.get(tokenSoATrack_); + assert(tsoa.buffer()); - auto const* vsoa = iEvent.get(tokenSoAVertex_).get(); - assert(vsoa); + auto const& vsoa = iEvent.get(tokenSoAVertex_); + assert(vsoa.buffer()); } } diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index 59ba877e9e626..36d3dd8c3dcc7 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -27,7 +27,9 @@ #include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "storeTracks.h" @@ -35,7 +37,7 @@ /** * This class creates "leagcy" reco::Track - * objects from the output of SoA CA. + * objects from the output of SoA CA. 
*/ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { public: @@ -54,7 +56,7 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { // Event Data tokens const edm::EDGetTokenT tBeamSpot_; - const edm::EDGetTokenT tokenTrack_; + const edm::EDGetTokenT tokenTrack_; const edm::EDGetTokenT cpuHits_; const edm::EDGetTokenT hmsToken_; // Event Setup tokens @@ -67,7 +69,7 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig) : tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), - tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), + tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), cpuHits_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), hmsToken_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), idealMagneticFieldToken_(esConsumes()), @@ -104,7 +106,6 @@ void PixelTrackProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const { - // enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity }; reco::TrackBase::TrackQuality recoQuality[] = {reco::TrackBase::undefQuality, reco::TrackBase::undefQuality, reco::TrackBase::discarded, @@ -152,12 +153,10 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, std::vector hits; hits.reserve(5); - const auto &tsoa = *iEvent.get(tokenTrack_); - - auto const *quality = tsoa.qualityData(); - auto const &fit = tsoa.stateAtBS; - auto const &hitIndices = tsoa.hitIndices; - auto nTracks = tsoa.nTracks(); + auto &tsoa = iEvent.get(tokenTrack_); + auto const *quality = pixelTrack::utilities::qualityData(tsoa.view()); + auto const hitIndices = tsoa.view().hitIndices(); + auto nTracks = tsoa.view().nTracks(); tracks.reserve(nTracks); @@ -166,13 +165,15 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, //sort index by pt std::vector sortIdxs(nTracks); std::iota(sortIdxs.begin(), sortIdxs.end(), 0); - std::sort( - sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) { return tsoa.pt(i1) > tsoa.pt(i2); }); + std::sort(sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) { + return tsoa.view()[i1].pt() > tsoa.view()[i2].pt(); + }); //store the index of the SoA: indToEdm[index_SoAtrack] -> index_edmTrack (if it exists) indToEdm.resize(sortIdxs.size(), -1); for (const auto &it : sortIdxs) { - auto nHits = tsoa.nHits(it); + auto nHits = pixelTrack::utilities::nHits(tsoa.view(), it); + assert(nHits >= 3); auto q = quality[it]; if (q < minQuality_) @@ -189,12 +190,13 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, // mind: this values are respect the beamspot! - float chi2 = tsoa.chi2(it); - float phi = tsoa.phi(it); + float chi2 = tsoa.view()[it].chi2(); + float phi = pixelTrack::utilities::phi(tsoa.view(), it); riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - fit.copyToDense(ipar, icov, it); + pixelTrack::utilities::copyToDense(tsoa.view(), ipar, icov, it); + riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); @@ -237,7 +239,6 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, // filter??? 
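// --- Editor's aside (not part of the patch): PixelTrackProducerFromSoA above orders tracks by pt
// without moving any SoA columns, by sorting an index array with std::iota/std::sort and then
// walking the SoA through those indices. A minimal, self-contained sketch of that pattern; a plain
// std::vector stands in for the SoA pt column and the function name is illustrative only:
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

std::vector<int32_t> sortIndicesByPtDescending(const std::vector<float> &pt) {
  std::vector<int32_t> idx(pt.size());
  std::iota(idx.begin(), idx.end(), 0);  // 0, 1, 2, ... one slot per track
  std::sort(idx.begin(), idx.end(),
            [&](int32_t i1, int32_t i2) { return pt[i1] > pt[i2]; });  // highest pt first
  return idx;  // idx[k] is the index of the k-th hardest track in the untouched SoA
}
// --- end aside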
tracks.emplace_back(track.release(), hits); } - // std::cout << "processed " << nt << " good tuples " << tracks.size() << "out of " << indToEdm.size() << std::endl; // store tracks storeTracks(iEvent, tracks, httopo); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 5cf4aac491901..191e9009f6d6e 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -2,7 +2,9 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -15,6 +17,7 @@ #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/InputTag.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" // Switch on to enable checks and printout for found tracks // #define PIXEL_DEBUG_PRODUCE @@ -32,15 +35,15 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - edm::EDGetTokenT> tokenCUDA_; - edm::EDPutTokenT tokenSOA_; + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; - cms::cuda::host::unique_ptr soa_; + pixelTrack::TrackSoAHost tracks_h; }; PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) - : tokenCUDA_(consumes>(iConfig.getParameter("src"))), - tokenSOA_(produces()) {} + : tokenCUDA_(consumes>(iConfig.getParameter("src"))), + tokenSOA_(produces()) {} void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -52,43 +55,45 @@ void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& des void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& inputData = ctx.get(inputDataWrapped); - - soa_ = inputData.toHostAsync(ctx.stream()); + auto const& tracks_d = ctx.get(inputDataWrapped); // Tracks on device + tracks_h = pixelTrack::TrackSoAHost(ctx.stream()); // Create an instance of Tracks on Host, using the stream + cudaCheck(cudaMemcpyAsync(tracks_h.buffer().get(), + tracks_d.const_buffer().get(), + tracks_d.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); // Copy data from Device to Host + cudaCheck(cudaGetLastError()); } void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { // check that the fixed-size SoA does not overflow - auto const& tsoa = *soa_; - auto maxTracks = tsoa.stride(); - auto nTracks = tsoa.nTracks(); + auto maxTracks = tracks_h.view().metadata().size(); + auto nTracks = 
tracks_h.view().nTracks(); assert(nTracks < maxTracks); + if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 << " candidates"; } #ifdef PIXEL_DEBUG_PRODUCE - std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl; - std::cout << "found " << nTracks << " tracks in cpu SoA at " << &tsoa << std::endl; + std::cout << " stride " << maxTracks << std::endl; + std::cout << "found " << nTracks << std::endl; int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - assert(nHits == int(tsoa.hitIndices.size(it))); + auto nHits = pixelTrack::utilities::nHits(tracks_h.view(), it); + assert(nHits == int(tracks_h.view().hitIndices().size(it))); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... nt++; } assert(nTracks == nt); #endif - // DO NOT make a copy (actually TWO....) - iEvent.emplace(tokenSOA_, std::move(soa_)); - - assert(!soa_); + iEvent.emplace(tokenSOA_, std::move(tracks_h)); } DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelVertexProducerCUDA.cc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index 6ec6afb83cba1..2b2d93cf7415a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -18,7 +18,7 @@ using HitsOnGPU = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; -using OutputSoA = pixelTrack::TrackSoA; +using OutputSoAView = pixelTrack::TrackSoAView; using tindex_type = caConstants::tindex_type; constexpr auto invalidTkId = std::numeric_limits::max(); @@ -169,12 +169,12 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, template __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, double bField, - OutputSoA *results, + OutputSoAView results_view, tindex_type const *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit) { - assert(results); + // assert(results_view); // TODO Find equivalent assertion for View assert(pfast_fit); // same as above... 
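// --- Editor's aside (not part of the patch): acquire() above moves the whole device-side track SoA
// to a host SoA of identical layout with a single cudaMemcpyAsync on the framework stream; the data
// are usable on the host only after the stream has completed (in the plugin the ExternalWork
// callback guarantees that before produce() runs). A stripped-down sketch of the same idea with
// hypothetical names and an explicit synchronization instead of the callback:
#include <cuda_runtime.h>
#include <cstddef>

void copyDeviceSoAToHost(void *hostBuf, const void *devBuf, std::size_t bytes, cudaStream_t stream) {
  // enqueue the device-to-host copy; it runs after the kernels already queued on `stream`
  // (for a truly asynchronous copy the host buffer should be pinned, as the framework allocators provide)
  cudaMemcpyAsync(hostBuf, devBuf, bytes, cudaMemcpyDeviceToHost, stream);
  // here we wait explicitly; the plugin instead defers to the WaitingTaskWithArenaHolder
  cudaStreamSynchronize(stream);
}
// --- end aside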
@@ -203,10 +203,11 @@ __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ brokenline::lineFit(hits_ge, fast_fit, bField, data, line); brokenline::circleFit(hits, hits_ge, fast_fit, bField, data, circle); - results->stateAtBS.copyFromCircle(circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); - results->pt(tkid) = float(bField) / float(std::abs(circle.par(2))); - results->eta(tkid) = asinhf(line.par(0)); - results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5); + pixelTrack::utilities::copyFromCircle( + results_view, circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); + results_view[tkid].pt() = float(bField) / float(std::abs(circle.par(2))); + results_view[tkid].eta() = asinhf(line.par(0)); + results_view[tkid].chi2() = (circle.chi2 + line.chi2) / (2 * N - 5); #ifdef BROKENLINE_DEBUG if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 72c482c6189db..2e48865b682bf 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -20,7 +20,8 @@ #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" #include "CAHitNtupletGeneratorOnGPU.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" class CAHitNtupletCUDA : public edm::global::EDProducer<> { @@ -39,10 +40,15 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { bool onGPU_; edm::ESGetToken tokenField_; + // GPU + // Produces a view on GPU, which is used by PixelTrackSoAFromCUDA edm::EDGetTokenT> tokenHitGPU_; - edm::EDPutTokenT> tokenTrackGPU_; + edm::EDPutTokenT> tokenTrackGPU_; + + // CPU + // Produces a view on CPU, which is used by PixelTrackProducerFromSoA edm::EDGetTokenT tokenHitCPU_; - edm::EDPutTokenT tokenTrackCPU_; + edm::EDPutTokenT tokenTrackCPU_; CAHitNtupletGeneratorOnGPU gpuAlgo_; }; @@ -52,10 +58,10 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) if (onGPU_) { tokenHitGPU_ = consumes>(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackGPU_ = produces>(); + tokenTrackGPU_ = produces>(); } else { tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackCPU_ = produces(); + tokenTrackCPU_ = produces(); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 66208debdc98d..65a3f3a8dff4c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -78,15 +78,11 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr } template <> -void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto *tuples_d = &tracks_d->hitIndices; - auto *detId_d = &tracks_d->detIndices; - auto *quality_d = tracks_d->qualityData(); - - assert(tuples_d && quality_d); - +void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, + TkSoAView tracks_view, + cudaStream_t cudaStream) { // zero tuples - 
cms::cuda::launchZero(tuples_d, cudaStream); + cms::cuda::launchZero(&tracks_view.hitIndices(), cudaStream); auto nhits = hh.nHits(); @@ -119,24 +115,22 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_theCells_.get(), device_nCells_, device_theCellTracks_.get(), - tuples_d, + tracks_view, device_hitTuple_apc_, - quality_d, params_.minHitsPerNtuplet_); if (params_.doStats_) kernel_mark_used(device_theCells_.get(), device_nCells_); - cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk(device_hitTuple_apc_, &tracks_view.hitIndices()); - kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d); - kernel_fillNLayers(tracks_d, device_hitTuple_apc_); + kernel_fillHitDetIndices(tracks_view, hh.view()); + kernel_fillNLayers(tracks_view, device_hitTuple_apc_); // remove duplicates (tracks that share a doublet) - kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); - - kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_); + kernel_countMultiplicity(tracks_view, device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + kernel_fillMultiplicity(tracks_view, device_tupleMultiplicity_.get()); if (nhits > 1 && params_.lateFishbone_) { gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true); @@ -144,14 +138,14 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * } template <> -void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, + TkSoAView tracks_view, + cudaStream_t cudaStream) { int32_t nhits = hh.nHits(); - auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = tracks_d->qualityData(); - + auto *quality_d = pixelTrack::utilities::qualityData(tracks_view); // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d, params_.cuts_, quality_d); + kernel_classifyTracks(tracks_view, quality_d, params_.cuts_); if (params_.lateFishbone_) { // apply fishbone cleaning to good tracks @@ -159,38 +153,34 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA } // remove duplicates (tracks that share a doublet) - kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, params_.dupPassThrough_); + kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_); // fill hit->track "map" if (params_.doSharedHitCut_ || params_.doStats_) { - kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + kernel_countHitInTracks(tracks_view, device_hitToTuple_.get()); cms::cuda::launchFinalize(hitToTupleView_, cudaStream); - kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + kernel_fillHitInTracks(tracks_view, device_hitToTuple_.get()); } // remove duplicates (tracks that share at least one hit) if (params_.doSharedHitCut_) { kernel_rejectDuplicate( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - - kernel_sharedHitCleaner(hh.view(), - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - 
params_.dupPassThrough_, - device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + + kernel_sharedHitCleaner( + hh.view(), tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); if (params_.useSimpleTripletCleaner_) { kernel_simpleTripletCleaner( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } else { kernel_tripletCleaner( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } } if (params_.doStats_) { std::lock_guard guard(lock_stat); - kernel_checkOverflows(tuples_d, + kernel_checkOverflows(tracks_view, device_tupleMultiplicity_.get(), device_hitToTuple_.get(), device_hitTuple_apc_, @@ -208,7 +198,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // counters (add flag???) std::lock_guard guard(lock_stat); kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_); - kernel_doStatsForTracks(tuples_d, quality_d, counters_); + kernel_doStatsForTracks(tracks_view, quality_d, counters_); } #ifdef DUMP_GPU_TK_TUPLES @@ -217,7 +207,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA { std::lock_guard guard(lock); ++iev; - kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 0, 1000000, iev); + kernel_print_found_ntuplets(hh.view(), tracks_view, device_hitToTuple_.get(), 0, 1000000, iev); } #endif } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 913b6d5a32d28..9cbdcae1a13d8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -2,14 +2,14 @@ #include template <> -void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, + TkSoAView tracks_view, + cudaStream_t cudaStream) { // these are pointer on GPU! 
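// --- Editor's aside (not part of the patch): the signature changes in these kernels replace a
// pointer to a device-resident SoA (TkSoA*, which every kernel had to assert and dereference) with
// a small view object passed by value; the view only carries pointers and sizes, so copying it into
// kernel parameter space is cheap and removes one indirection. A generic sketch of the idea with a
// toy view struct (the real view type is generated by GENERATE_SOA_LAYOUT and is much richer):
#include <cuda_runtime.h>
#include <cstdint>

struct ToyTrackView {  // illustrative only
  float *pt;
  int32_t nTracks;
};

__global__ void scalePt(ToyTrackView view, float factor) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < view.nTracks)
    view.pt[i] *= factor;  // no pointer-to-struct dereference: the view arrived by value
}
// --- end aside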
- auto *tuples_d = &tracks_d->hitIndices; - auto *detId_d = &tracks_d->detIndices; - auto *quality_d = tracks_d->qualityData(); + auto *quality_d = pixelTrack::utilities::qualityData(tracks_view); // zero tuples - cms::cuda::launchZero(tuples_d, cudaStream); + cms::cuda::launchZero(&(tracks_view.hitIndices()), cudaStream); int32_t nhits = hh.nHits(); @@ -70,9 +70,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_theCells_.get(), device_nCells_, device_theCellTracks_.get(), - tuples_d, + tracks_view, device_hitTuple_apc_, - quality_d, params_.minHitsPerNtuplet_); cudaCheck(cudaGetLastError()); @@ -87,26 +86,25 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * blockSize = 128; numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize; - cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, + &tracks_view.hitIndices()); - kernel_fillHitDetIndices<<>>(tuples_d, hh.view(), detId_d); + kernel_fillHitDetIndices<<>>(tracks_view, hh.view()); cudaCheck(cudaGetLastError()); - kernel_fillNLayers<<>>(tracks_d, device_hitTuple_apc_); + kernel_fillNLayers<<>>(tracks_view, device_hitTuple_apc_); cudaCheck(cudaGetLastError()); // remove duplicates (tracks that share a doublet) numberOfBlocks = nDoubletBlocks(blockSize); kernel_earlyDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); + device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_); cudaCheck(cudaGetLastError()); blockSize = 128; numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; - kernel_countMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); + kernel_countMultiplicity<<>>(tracks_view, device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); + kernel_fillMultiplicity<<>>(tracks_view, device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); // do not run the fishbone if there are hits only in BPIX1 @@ -222,10 +220,11 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr } template <> -void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, + TkSoAView tracks_view, + cudaStream_t cudaStream) { // these are pointer on GPU! 
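// --- Editor's aside (not part of the patch): the numberOfBlocks expressions above all use the usual
// ceiling-division launch configuration, so every element gets a thread even when the work size is
// not a multiple of the block size. Sketch (helper name is illustrative only):
#include <cstdint>

constexpr uint32_t ceilDiv(uint32_t n, uint32_t blockSize) { return (n + blockSize - 1) / blockSize; }
// e.g. kernel<<<ceilDiv(nItems, 128), 128, 0, stream>>>(...);  // launches just enough blocks
// --- end aside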
- auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = tracks_d->qualityData(); + auto *quality_d = pixelTrack::utilities::qualityData(tracks_view); int32_t nhits = hh.nHits(); @@ -233,7 +232,8 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // classify tracks based on kinematics auto numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_classifyTracks<<>>(tuples_d, tracks_d, params_.cuts_, quality_d); + kernel_classifyTracks<<>>(tracks_view, quality_d, params_.cuts_); + cudaCheck(cudaGetLastError()); if (params_.lateFishbone_) { @@ -247,7 +247,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // mark duplicates (tracks that share a doublet) numberOfBlocks = nDoubletBlocks(blockSize); kernel_fastDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tracks_d, params_.dupPassThrough_); + device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -257,14 +257,13 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // fill hit->track "map" assert(hitToTupleView_.offSize > nhits); numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_countHitInTracks<<>>( - tuples_d, quality_d, device_hitToTuple_.get()); + kernel_countHitInTracks<<>>(tracks_view, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); assert((hitToTupleView_.assoc == device_hitToTuple_.get()) && (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0)); cms::cuda::launchFinalize(hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); + kernel_fillHitInTracks<<>>(tracks_view, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -276,21 +275,17 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; kernel_rejectDuplicate<<>>( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - kernel_sharedHitCleaner<<>>(hh.view(), - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_sharedHitCleaner<<>>( + hh.view(), tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); if (params_.useSimpleTripletCleaner_) { kernel_simpleTripletCleaner<<>>( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } else { kernel_tripletCleaner<<>>( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -300,7 +295,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA if (params_.doStats_) { numberOfBlocks = (std::max(nhits, int(params_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize; - kernel_checkOverflows<<>>(tuples_d, + kernel_checkOverflows<<>>(tracks_view, device_tupleMultiplicity_.get(), 
device_hitToTuple_.get(), device_hitTuple_apc_, @@ -321,7 +316,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); cudaCheck(cudaGetLastError()); numberOfBlocks = (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; - kernel_doStatsForTracks<<>>(tuples_d, quality_d, counters_); + kernel_doStatsForTracks<<>>(tracks_view, quality_d, counters_); cudaCheck(cudaGetLastError()); } #ifdef GPU_DEBUG @@ -337,11 +332,11 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA ++iev; for (int k = 0; k < 20000; k += 500) { kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), k, k + 500, iev); + hh.view(), tracks_view, device_hitToTuple_.get(), k, k + 500, iev); cudaDeviceSynchronize(); } kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 20000, 1000000, iev); + hh.view(), tracks_view, device_hitToTuple_.get(), 20000, 1000000, iev); cudaDeviceSynchronize(); // cudaStreamSynchronize(cudaStream); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 8af1176fe92c6..529f7de99ea98 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -3,7 +3,8 @@ // #define GPU_DEBUG -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" + #include "GPUCACell.h" // #define DUMP_GPU_TK_TUPLES @@ -34,7 +35,8 @@ namespace cAHitNtupletGenerator { using TupleMultiplicity = caConstants::TupleMultiplicity; using Quality = pixelTrack::Quality; - using TkSoA = pixelTrack::TrackSoA; + using TkSoAView = pixelTrack::TrackSoAView; + using TkSoAConstView = pixelTrack::TrackSoAConstView; using HitContainer = pixelTrack::HitContainer; struct QualityCuts { @@ -173,7 +175,8 @@ class CAHitNtupletGeneratorKernels { using TupleMultiplicity = caConstants::TupleMultiplicity; using Quality = pixelTrack::Quality; - using TkSoA = pixelTrack::TrackSoA; + using TkSoAView = pixelTrack::TrackSoAView; + using TkSoAConstView = pixelTrack::TrackSoAConstView; using HitContainer = pixelTrack::HitContainer; CAHitNtupletGeneratorKernels(Params const& params) @@ -182,9 +185,9 @@ class CAHitNtupletGeneratorKernels { TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } - void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void launchKernels(HitsOnCPU const& hh, TkSoAView tracks_view, cudaStream_t cudaStream); - void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void classifyTuples(HitsOnCPU const& hh, TkSoAView tracks_view, cudaStream_t cudaStream); void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); void allocateOnGPU(int32_t nHits, cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index bbe5df891a735..75f52305ab39b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -14,6 +14,7 @@ #include 
"HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "CAConstants.h" #include "CAHitNtupletGeneratorKernels.h" @@ -28,7 +29,8 @@ using HitToTuple = caConstants::HitToTuple; using TupleMultiplicity = caConstants::TupleMultiplicity; using Quality = pixelTrack::Quality; -using TkSoA = pixelTrack::TrackSoA; +using TkSoAView = pixelTrack::TrackSoAView; +using TkSoAConstView = pixelTrack::TrackSoAConstView; using HitContainer = pixelTrack::HitContainer; namespace { @@ -39,7 +41,7 @@ namespace { } // namespace -__global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, +__global__ void kernel_checkOverflows(TkSoAView tracks_view, caConstants::TupleMultiplicity const *tupleMultiplicity, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple, cms::cuda::AtomicPairCounter *apc, @@ -72,16 +74,16 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, nHits, hitToTuple->totOnes()); if (apc->get().m < caConstants::maxNumberOfQuadruplets) { - assert(foundNtuplets->size(apc->get().m) == 0); - assert(foundNtuplets->size() == apc->get().n); + assert(tracks_view.hitIndices().size(apc->get().m) == 0); + assert(tracks_view.hitIndices().size() == apc->get().n); } } - for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { - if (foundNtuplets->size(idx) > 7) // current real limit - printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); - assert(foundNtuplets->size(idx) <= caConstants::maxHitsOnTrack); - for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) + for (int idx = first, nt = tracks_view.hitIndices().nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) > 7) // current real limit + printf("ERROR %d, %d\n", idx, tracks_view.hitIndices().size(idx)); + assert(tracks_view.hitIndices().size(idx) <= caConstants::maxHitsOnTrack); + for (auto ih = tracks_view.hitIndices().begin(idx); ih != tracks_view.hitIndices().end(idx); ++ih) assert(int(*ih) < nHits); } #endif @@ -139,13 +141,10 @@ __global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *_ // It does not seem to affect efficiency in any way! 
__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, uint32_t const *__restrict__ nCells, - TkSoA const *__restrict__ ptracks, - Quality *quality, + TkSoAView tracks_view, bool dupPassThrough) { // quality to mark rejected - constexpr auto reject = pixelTrack::Quality::edup; /// cannot be loose - - auto const &tracks = *ptracks; + constexpr auto reject = (uint8_t)pixelTrack::Quality::edup; /// cannot be loose assert(nCells); auto first = threadIdx.x + blockIdx.x * blockDim.x; @@ -159,7 +158,7 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, // find maxNl for (auto it : thisCell.tracks()) { - auto nl = tracks.nLayers(it); + auto nl = tracks_view[it].nLayers(); maxNl = std::max(nl, maxNl); } @@ -168,8 +167,8 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, // maxNl = std::min(4, maxNl); for (auto it : thisCell.tracks()) { - if (tracks.nLayers(it) < maxNl) - quality[it] = reject; //no race: simple assignment of the same constant + if (tracks_view[it].nLayers() < maxNl) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } } @@ -177,11 +176,11 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, // assume the above (so, short tracks already removed) __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, uint32_t const *__restrict__ nCells, - TkSoA *__restrict__ tracks, + TkSoAView tracks_view, bool dupPassThrough) { // quality to mark rejected - auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; - constexpr auto loose = pixelTrack::Quality::loose; + auto const reject = dupPassThrough ? (uint8_t)pixelTrack::Quality::loose : (uint8_t)pixelTrack::Quality::dup; + constexpr auto loose = (uint8_t)pixelTrack::Quality::loose; assert(nCells); @@ -196,42 +195,42 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, /* chi2 penalize higher-pt tracks (try rescale it?) auto score = [&](auto it) { - return tracks->nLayers(it) < 4 ? - std::abs(tracks->tip(it)) : // tip for triplets - tracks->chi2(it); //chi2 for quads + return tracks_view[it].nLayers() < 4 ? 
+ std::abs(pixelTrack::utilities::tip(tracks_view, it)) : // tip for triplets + tracks_view[it].chi2(it); //chi2 for quads }; */ - auto score = [&](auto it) { return std::abs(tracks->tip(it)); }; + auto score = [&](auto it) { return std::abs(pixelTrack::utilities::tip(tracks_view, it)); }; // full crazy combinatorics int ntr = thisCell.tracks().size(); for (int i = 0; i < ntr - 1; ++i) { auto it = thisCell.tracks()[i]; - auto qi = tracks->quality(it); + auto qi = tracks_view[it].quality(); if (qi <= reject) continue; - auto opi = tracks->stateAtBS.state(it)(2); - auto e2opi = tracks->stateAtBS.covariance(it)(9); - auto cti = tracks->stateAtBS.state(it)(3); - auto e2cti = tracks->stateAtBS.covariance(it)(12); + auto opi = tracks_view[it].state()(2); + auto e2opi = tracks_view[it].covariance()(9); + auto cti = tracks_view[it].state()(3); + auto e2cti = tracks_view[it].covariance()(12); for (auto j = i + 1; j < ntr; ++j) { auto jt = thisCell.tracks()[j]; - auto qj = tracks->quality(jt); + auto qj = tracks_view[jt].quality(); if (qj <= reject) continue; - auto opj = tracks->stateAtBS.state(jt)(2); - auto ctj = tracks->stateAtBS.state(jt)(3); - auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti); + auto opj = tracks_view[jt].state()(2); + auto ctj = tracks_view[jt].state()(3); + auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); if ((cti - ctj) * (cti - ctj) > dct) continue; - auto dop = nSigma2 * (tracks->stateAtBS.covariance(jt)(9) + e2opi); + auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); if ((opi - opj) * (opi - opj) > dop) continue; if ((qj < qi) || (qj == qi && score(it) < score(jt))) - tracks->quality(jt) = reject; + tracks_view[jt].quality() = reject; else { - tracks->quality(it) = reject; + tracks_view[it].quality() = reject; break; } } @@ -240,8 +239,8 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, // find maxQual auto maxQual = reject; // no duplicate! 
for (auto it : thisCell.tracks()) { - if (tracks->quality(it) > maxQual) - maxQual = tracks->quality(it); + if (tracks_view[it].quality() > maxQual) + maxQual = tracks_view[it].quality(); } if (maxQual <= loose) @@ -249,7 +248,7 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, // find min score for (auto it : thisCell.tracks()) { - if (tracks->quality(it) == maxQual && score(it) < mc) { + if (tracks_view[it].quality() == maxQual && score(it) < mc) { mc = score(it); im = it; } @@ -260,8 +259,8 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, // mark all other duplicates (not yet, keep it loose) for (auto it : thisCell.tracks()) { - if (tracks->quality(it) > loose && it != im) - tracks->quality(it) = loose; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > loose && it != im) + tracks_view[it].quality() = loose; //no race: simple assignment of the same constant } } } @@ -337,9 +336,8 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, GPUCACell *__restrict__ cells, uint32_t const *nCells, gpuPixelDoublets::CellTracksVector *cellTracks, - HitContainer *foundNtuplets, + TkSoAView tracks_view, cms::cuda::AtomicPairCounter *apc, - Quality *__restrict__ quality, unsigned int minHitsPerNtuplet) { // recursive: not obvious to widen auto const &hh = *hhp; @@ -357,8 +355,15 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, if (doit) { GPUCACell::TmpTuple stack; stack.reset(); - thisCell.find_ntuplets<6>( - hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3); + thisCell.find_ntuplets<6>(hh, + cells, + *cellTracks, + tracks_view.hitIndices(), + *apc, + pixelTrack::utilities::qualityData(tracks_view), + stack, + minHitsPerNtuplet, + pid < 3); assert(stack.empty()); // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); } @@ -374,17 +379,16 @@ __global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const * } } -__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, - caConstants::TupleMultiplicity *tupleMultiplicity) { +// TODO: change arg type to TkSoAConstview +__global__ void kernel_countMultiplicity(TkSoAView tracks_view, caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits < 3) continue; - if (quality[it] == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::edup) continue; - assert(quality[it] == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::bad); if (nhits > 7) // current limit printf("wrong mult %d %d\n", it, nhits); assert(nhits <= caConstants::maxHitsOnTrack); @@ -392,17 +396,16 @@ __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundN } } -__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, - caConstants::TupleMultiplicity *tupleMultiplicity) { +// TODO: change arg type to TkSoAConstview +__global__ void kernel_fillMultiplicity(TkSoAView tracks_view, 
caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits < 3) continue; - if (quality[it] == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::edup) continue; - assert(quality[it] == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::bad); if (nhits > 7) printf("wrong mult %d %d\n", it, nhits); assert(nhits <= caConstants::maxHitsOnTrack); @@ -410,13 +413,17 @@ __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNt } } -__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, - TkSoA const *__restrict__ tracks, - CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts, - Quality *__restrict__ quality) { +/* + Takes the TkSoAView directly; the separate Quality pointer aliases the view's + quality column (stored as uint8_t), obtained via pixelTrack::utilities::qualityData(). + */ +__global__ void kernel_classifyTracks(TkSoAView tracks_view, + Quality *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = tuples->size(it); + + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits == 0) break; // guard @@ -433,11 +440,11 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, // if the fit has any invalid parameters, mark it as bad bool isNaN = false; for (int i = 0; i < 5; ++i) { - isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); + isNaN |= std::isnan(tracks_view[it].state()(i)); } if (isNaN) { #ifdef NTUPLE_DEBUG - printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); + printf("NaN in fit %d size %d chi2 %f\n", it, tracks_view.hitIndices().size(it), tracks_view[it].chi2()); #endif continue; } @@ -468,16 +475,16 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, }; // (see CAHitNtupletGeneratorGPU.cc) - float pt = std::min(tracks->pt(it), cuts.chi2MaxPt); + float pt = std::min(tracks_view[it].pt(), cuts.chi2MaxPt); float chi2Cut = cuts.chi2Scale * (cuts.chi2Coeff[0] + roughLog(pt) * cuts.chi2Coeff[1]); - if (tracks->chi2(it) >= chi2Cut) { + if (tracks_view[it].chi2() >= chi2Cut) { #ifdef NTUPLE_FIT_DEBUG printf("Bad chi2 %d size %d pt %f eta %f chi2 %f\n", it, - tuples->size(it), - tracks->pt(it), - tracks->eta(it), - tracks->chi2(it)); + tracks_view.hitIndices().size(it), + tracks_view[it].pt(), + tracks_view[it].eta(), + tracks_view[it].chi2()); #endif continue; } @@ -490,20 +497,22 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm // (see CAHitNtupletGeneratorGPU.cc) auto const &region = (nhits > 3) ?
cuts.quadruplet : cuts.triplet; - bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and - (std::abs(tracks->zip(it)) < region.maxZip); + bool isOk = (std::abs(pixelTrack::utilities::tip(tracks_view, it)) < region.maxTip) and + (tracks_view[it].pt() > region.minPt) and + (std::abs(pixelTrack::utilities::zip(tracks_view, it)) < region.maxZip); - if (isOk) + if (isOk) { quality[it] = pixelTrack::Quality::highPurity; + } } } -__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, +__global__ void kernel_doStatsForTracks(TkSoAView tracks_view, Quality const *__restrict__ quality, CAHitNtupletGeneratorKernelsGPU::Counters *counters) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) == 0) break; //guard if (quality[idx] < pixelTrack::Quality::loose) continue; @@ -514,58 +523,56 @@ __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, } } -__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, +__global__ void kernel_countHitInTracks(TkSoAView tracks_view, CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) == 0) break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) hitToTuple->count(*h); } } -__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, +__global__ void kernel_fillHitInTracks(TkSoAView tracks_view, // TODO: Make ConstView CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) == 0) break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) hitToTuple->fill(*h, idx); } } -__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, - TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer *__restrict__ hitDetIndices) { +__global__ void kernel_fillHitDetIndices(TkSoAView tracks_view, TrackingRecHit2DSOAView const *__restrict__ hhp) { int first = blockDim.x * blockIdx.x + threadIdx.x; // copy offsets - for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - hitDetIndices->off[idx] = tuples->off[idx]; + for (int idx = first, ntot = tracks_view.hitIndices().totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + tracks_view.detIndices().off[idx] = tracks_view.hitIndices().off[idx]; } // fill hit indices auto const &hh = *hhp; auto nhits = 
hh.nHits(); - for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { - assert(tuples->content[idx] < nhits); - hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]); + for (int idx = first, ntot = tracks_view.hitIndices().size(); idx < ntot; idx += gridDim.x * blockDim.x) { + assert(tracks_view.hitIndices().content[idx] < nhits); + tracks_view.detIndices().content[idx] = hh.detectorIndex(tracks_view.hitIndices().content[idx]); } } -__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::AtomicPairCounter *apc) { - auto &tracks = *ptracks; +/* + Operates on the TkSoAView only; nHits() and computeNumberOfLayers() are free functions in + pixelTrack::utilities, and the SoA capacity comes from metadata().size() (formerly stride()). + */ +__global__ void kernel_fillNLayers(TkSoAView tracks_view, cms::cuda::AtomicPairCounter *apc) { auto first = blockIdx.x * blockDim.x + threadIdx.x; // clamp the number of tracks to the capacity of the SoA - auto ntracks = std::min(apc->get().m, tracks.stride() - 1); + auto ntracks = std::min(apc->get().m, tracks_view.metadata().size() - 1); if (0 == first) - tracks.setNTracks(ntracks); + tracks_view.nTracks() = ntracks; for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) { - auto nHits = tracks.nHits(idx); + auto nHits = pixelTrack::utilities::nHits(tracks_view, idx); assert(nHits >= 3); - tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx); + tracks_view[idx].nLayers() = pixelTrack::utilities::computeNumberOfLayers(tracks_view, idx); } } @@ -622,10 +629,8 @@ __global__ void kernel_markSharedHit(int const *__restrict__ nshared, HitContainer const *__restrict__ tuples, Quality *__restrict__ quality, bool dupPassThrough) { - // constexpr auto bad = pixelTrack::Quality::bad; constexpr auto dup = pixelTrack::Quality::dup; constexpr auto loose = pixelTrack::Quality::loose; - // constexpr auto strict = pixelTrack::Quality::strict; // quality to mark rejected auto const reject = dupPassThrough ? loose : dup; @@ -642,16 +647,14 @@ __global__ void kernel_markSharedHit(int const *__restrict__ nshared, } // mostly for very forward triplets..... -__global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, +__global__ void kernel_rejectDuplicate(TkSoAView tracks_view, uint16_t nmin, bool dupPassThrough, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected - auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; + auto const reject = dupPassThrough ? (uint8_t)pixelTrack::Quality::loose : (uint8_t)pixelTrack::Quality::dup; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -660,41 +663,41 @@ __global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks, /* chi2 is bad for large pt auto score = [&](auto it, auto nl) { - return nl < 4 ? std::abs(tracks.tip(it)) : // tip for triplets - tracks.chi2(it); //chi2 + return nl < 4 ?
std::abs(pixelTrack::utilities::tip(tracks_view, it)) : // tip for triplets + pixelTrack::utilities::chi2(tracks_view, it); //chi2 }; */ - auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); }; + auto score = [&](auto it, auto nl) { return std::abs(pixelTrack::utilities::tip(tracks_view, it)); }; // full combinatorics for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) { auto const it = *ip; - auto qi = quality[it]; + auto qi = tracks_view[it].quality(); if (qi <= reject) continue; - auto opi = tracks.stateAtBS.state(it)(2); - auto e2opi = tracks.stateAtBS.covariance(it)(9); - auto cti = tracks.stateAtBS.state(it)(3); - auto e2cti = tracks.stateAtBS.covariance(it)(12); - auto nli = tracks.nLayers(it); + auto opi = tracks_view[it].state()(2); + auto e2opi = tracks_view[it].covariance()(9); + auto cti = tracks_view[it].state()(3); + auto e2cti = tracks_view[it].covariance()(12); + auto nli = tracks_view[it].nLayers(); for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) { auto const jt = *jp; - auto qj = quality[jt]; + auto qj = tracks_view[jt].quality(); if (qj <= reject) continue; - auto opj = tracks.stateAtBS.state(jt)(2); - auto ctj = tracks.stateAtBS.state(jt)(3); - auto dct = nSigma2 * (tracks.stateAtBS.covariance(jt)(12) + e2cti); + auto opj = tracks_view[jt].state()(2); + auto ctj = tracks_view[jt].state()(3); + auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); if ((cti - ctj) * (cti - ctj) > dct) continue; - auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi); + auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); if ((opi - opj) * (opi - opj) > dop) continue; - auto nlj = tracks.nLayers(jt); + auto nlj = tracks_view[jt].nLayers(); if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj))))) - quality[jt] = reject; + tracks_view[jt].quality() = reject; else { - quality[it] = reject; + tracks_view[it].quality() = reject; break; } } @@ -703,18 +706,16 @@ __global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks, } __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp, - TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + TkSoAView tracks_view, int nmin, bool dupPassThrough, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected - auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; + auto const reject = dupPassThrough ? (uint8_t)pixelTrack::Quality::loose : (uint8_t)pixelTrack::Quality::dup; // quality of longest track - auto const longTqual = pixelTrack::Quality::highPurity; + auto const longTqual = (uint8_t)pixelTrack::Quality::highPurity; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; auto const &hh = *hhp; int l1end = hh.hitsLayerStart()[1]; @@ -728,10 +729,10 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric // find maxNl for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] < longTqual) + if (tracks_view[*it].quality() < longTqual) continue; - // if (tracks.nHits(*it)==3) continue; - auto nl = tracks.nLayers(*it); + // if (tracks_view[*it].nHits()==3) continue; + auto nl = tracks_view[*it].nLayers(); maxNl = std::max(nl, maxNl); } @@ -743,30 +744,28 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric // kill all tracks shorter than maxHl (only triplets??? 
for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - auto nl = tracks.nLayers(*it); + auto nl = tracks_view[*it].nLayers(); //checking if shared hit is on bpix1 and if the tuple is short enough if (idx < l1end and nl > nmin) continue; - if (nl < maxNl && quality[*it] > reject) - quality[*it] = reject; + if (nl < maxNl && tracks_view[*it].quality() > reject) + tracks_view[*it].quality() = reject; } } } -__global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, +__global__ void kernel_tripletCleaner(TkSoAView tracks_view, uint16_t nmin, bool dupPassThrough, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected - auto const reject = pixelTrack::Quality::loose; + auto const reject = (uint8_t)pixelTrack::Quality::loose; /// min quality of good - auto const good = pixelTrack::Quality::strict; + auto const good = (uint8_t)pixelTrack::Quality::strict; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -779,9 +778,9 @@ __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, // check if only triplets for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] <= good) + if (tracks_view[*it].quality() <= good) continue; - onlyTriplets &= tracks.isTriplet(*it); + onlyTriplets &= pixelTrack::utilities::isTriplet(tracks_view, *it); if (!onlyTriplets) break; } @@ -793,8 +792,8 @@ __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, // for triplets choose best tip! (should we first find best quality???) for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); + if (tracks_view[it].quality() >= good && std::abs(pixelTrack::utilities::tip(tracks_view, it)) < mc) { + mc = std::abs(pixelTrack::utilities::tip(tracks_view, it)); im = it; } } @@ -805,26 +804,24 @@ __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, // mark worse ambiguities for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] > reject && it != im) - quality[it] = reject; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > reject && it != im) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } // loop over hits } __global__ void kernel_simpleTripletCleaner( - TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + TkSoAView tracks_view, uint16_t nmin, bool dupPassThrough, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected - auto const reject = pixelTrack::Quality::loose; + auto const reject = (uint8_t)pixelTrack::Quality::loose; /// min quality of good - auto const good = pixelTrack::Quality::loose; + auto const good = (uint8_t)pixelTrack::Quality::loose; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -837,8 +834,8 @@ __global__ void kernel_simpleTripletCleaner( // choose best tip! (should we first find best quality???) 
for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); + if (tracks_view[it].quality() >= good && std::abs(pixelTrack::utilities::tip(tracks_view, it)) < mc) { + mc = std::abs(pixelTrack::utilities::tip(tracks_view, it)); im = it; } } @@ -849,52 +846,50 @@ __global__ void kernel_simpleTripletCleaner( // mark worse ambiguities for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] > reject && tracks.isTriplet(it) && it != im) - quality[it] = reject; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > reject && pixelTrack::utilities::isTriplet(tracks_view, it) && it != im) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } // loop over hits } __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, - TkSoA const *__restrict__ ptracks, - Quality const *__restrict__ quality, + TkSoAView tracks_view, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple, int32_t firstPrint, int32_t lastPrint, int iev) { - constexpr auto loose = pixelTrack::Quality::loose; + constexpr auto loose = (uint8_t)pixelTrack::Quality::loose; auto const &hh = *hhp; - auto const &foundNtuplets = *ptuples; - auto const &tracks = *ptracks; + // auto const &foundNtuplets = *ptuples; + int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) { - auto nh = foundNtuplets.size(i); + for (int i = first, np = std::min(lastPrint, tracks_view.hitIndices().nOnes()); i < np; i += blockDim.x * gridDim.x) { + auto nh = tracks_view.hitIndices().size(i); if (nh < 3) continue; - if (quality[i] < loose) + if (tracks_view[i].quality() < loose) continue; printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n", 10000 * iev + i, - int(quality[i]), + int(tracks_view[i].quality()), nh, - tracks.nLayers(i), - tracks.charge(i), - tracks.pt(i), - tracks.eta(i), - tracks.phi(i), - tracks.tip(i), - tracks.zip(i), + tracks_view[i].nLayers(), + pixelTrack::utilities::charge(tracks_view, i), + tracks_view[i].pt(), + tracks_view[i].eta(), + pixelTrack::utilities::phi(tracks_view, i), + pixelTrack::utilities::tip(tracks_view, i), + pixelTrack::utilities::zip(tracks_view, i), // asinhf(fit_results[i].par(3)), - tracks.chi2(i), - hh.zGlobal(*foundNtuplets.begin(i)), - hh.zGlobal(*(foundNtuplets.begin(i) + 1)), - hh.zGlobal(*(foundNtuplets.begin(i) + 2)), - nh > 3 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 3))) : 0, - nh > 4 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 4))) : 0, - nh > 5 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 5))) : 0, - nh > 6 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + nh - 1))) : 0); + tracks_view[i].chi2(), + hh.zGlobal(*tracks_view.hitIndices().begin(i)), + hh.zGlobal(*(tracks_view.hitIndices().begin(i) + 1)), + hh.zGlobal(*(tracks_view.hitIndices().begin(i) + 2)), + nh > 3 ? hh.zGlobal(int(*(tracks_view.hitIndices().begin(i) + 3))) : 0, + nh > 4 ? hh.zGlobal(int(*(tracks_view.hitIndices().begin(i) + 4))) : 0, + nh > 5 ? hh.zGlobal(int(*(tracks_view.hitIndices().begin(i) + 5))) : 0, + nh > 6 ? 
hh.zGlobal(int(*(tracks_view.hitIndices().begin(i) + nh - 1))) : 0); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index f650ca8ab2a08..4893ebdcc828f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -19,6 +19,8 @@ #include "FWCore/Utilities/interface/isFinite.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "TrackingTools/DetLayers/interface/BarrelDetLayer.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "CAHitNtupletGeneratorOnGPU.h" @@ -184,29 +186,27 @@ void CAHitNtupletGeneratorOnGPU::endJob() { } } -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, - float bfield, - cudaStream_t stream) const { - PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); - - auto* soa = tracks.get(); - assert(soa); +pixelTrack::TrackSoADevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, + float bfield, + cudaStream_t stream) const { + pixelTrack::TrackSoADevice tracks(stream); + auto* soa = &tracks; CAHitNtupletGeneratorKernelsGPU kernels(m_params); kernels.setCounters(m_counters); kernels.allocateOnGPU(hits_d.nHits(), stream); kernels.buildDoublets(hits_d, stream); - kernels.launchKernels(hits_d, soa, stream); + kernels.launchKernels(hits_d, soa->view(), stream); HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + fitter.allocateOnGPU(kernels.tupleMultiplicity(), soa->view()); if (m_params.useRiemannFit_) { fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); } else { fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); } - kernels.classifyTuples(hits_d, soa, stream); + kernels.classifyTuples(hits_d, soa->view(), stream); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -217,25 +217,22 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH return tracks; } -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - PixelTrackHeterogeneous tracks(std::make_unique()); - - auto* soa = tracks.get(); - assert(soa); +pixelTrack::TrackSoAHost CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { + pixelTrack::TrackSoAHost tracks(nullptr); CAHitNtupletGeneratorKernelsCPU kernels(m_params); kernels.setCounters(m_counters); kernels.allocateOnGPU(hits_d.nHits(), nullptr); kernels.buildDoublets(hits_d, nullptr); - kernels.launchKernels(hits_d, soa, nullptr); + kernels.launchKernels(hits_d, tracks.view(), nullptr); if (0 == hits_d.nHits()) return tracks; // now fit HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks.view()); if (m_params.useRiemannFit_) { fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); @@ -243,16 +240,16 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), 
hits_d.nHits(), caConstants::maxNumberOfQuadruplets); } - kernels.classifyTuples(hits_d, soa, nullptr); + kernels.classifyTuples(hits_d, tracks.view(), nullptr); #ifdef GPU_DEBUG std::cout << "finished building pixel tracks on CPU" << std::endl; #endif // check that the fixed-size SoA does not overflow - auto const& tsoa = *soa; - auto maxTracks = tsoa.stride(); - auto nTracks = tsoa.nTracks(); + + auto maxTracks = tracks.view().metadata().size(); + auto nTracks = tracks.view().nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index ae4576d883530..323ad0d071f0c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -3,7 +3,8 @@ #include #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" @@ -28,7 +29,8 @@ class CAHitNtupletGeneratorOnGPU { using hindex_type = TrackingRecHit2DSOAView::hindex_type; using Quality = pixelTrack::Quality; - using OutputSoA = pixelTrack::TrackSoA; + using OutputSoAHost = pixelTrack::TrackSoAHost; + using OutputSoADevice = pixelTrack::TrackSoADevice; using HitContainer = pixelTrack::HitContainer; using Tuple = HitContainer; @@ -47,9 +49,13 @@ class CAHitNtupletGeneratorOnGPU { void beginJob(); void endJob(); - PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; + // On GPU + pixelTrack::TrackSoADevice makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, + float bfield, + cudaStream_t stream) const; - PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; + // On CPU + pixelTrack::TrackSoAHost makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; private: void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 2e3747a2b6841..c430a86129173 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -14,7 +14,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" + #include "CAConstants.h" class GPUCACell { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index 880bdb47dfb5c..f757f574f6142 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -1,16 +1,14 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HelixFitOnGPU.h" -void 
HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, - TupleMultiplicity const *tupleMultiplicity, - OutputSoA *helix_fit_results) { - tuples_ = tuples; +void HelixFitOnGPU::allocateOnGPU(TupleMultiplicity const *tupleMultiplicity, OutputSoAView helix_fit_results) { + tuples_ = &helix_fit_results.hitIndices(); tupleMultiplicity_ = tupleMultiplicity; outputSoa_ = helix_fit_results; assert(tuples_); assert(tupleMultiplicity_); - assert(outputSoa_); + // assert(outputSoa_); // TODO find equivalent assertion for View } void HelixFitOnGPU::deallocateOnGPU() {} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 9a9c85970af33..9fd2112476c9d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -1,7 +1,7 @@ #ifndef RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h #define RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" @@ -36,7 +36,7 @@ class HelixFitOnGPU { using HitsView = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; - using OutputSoA = pixelTrack::TrackSoA; + using OutputSoAView = pixelTrack::TrackSoAView; using TupleMultiplicity = caConstants::TupleMultiplicity; @@ -50,7 +50,7 @@ class HelixFitOnGPU { void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); + void allocateOnGPU(TupleMultiplicity const *tupleMultiplicity, OutputSoAView outputSoA); void deallocateOnGPU(); private: @@ -59,7 +59,7 @@ class HelixFitOnGPU { // fowarded Tuples const *tuples_ = nullptr; TupleMultiplicity const *tupleMultiplicity_ = nullptr; - OutputSoA *outputSoa_; + OutputSoAView outputSoa_; float bField_; const bool fitNas4_; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index 926002d674b83..e511680bf76b7 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -16,7 +16,7 @@ using HitsOnGPU = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; -using OutputSoA = pixelTrack::TrackSoA; +using OutputSoAView = pixelTrack::TrackSoAView; template __global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, @@ -128,13 +128,13 @@ template __global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, double bField, - OutputSoA *results, + OutputSoAView results_view, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit_input, riemannFit::CircleFit *__restrict__ circle_fit, uint32_t offset) { - assert(results); + // assert(results); // TODO find equivalent for View assert(circle_fit); assert(N <= nHits); @@ -149,7 +149,7 @@ __global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict_ break; // get it for the ntuple container (one to one to helix) - auto tkid = *(tupleMultiplicity->begin(nHits) + 
tuple_idx); + int32_t tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); riemannFit::Map3xNd hits(phits + local_idx); riemannFit::Map4d fast_fit(pfast_fit_input + local_idx); @@ -159,11 +159,16 @@ __global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict_ riemannFit::fromCircleToPerigee(circle_fit[local_idx]); - results->stateAtBS.copyFromCircle( - circle_fit[local_idx].par, circle_fit[local_idx].cov, line_fit.par, line_fit.cov, 1.f / float(bField), tkid); - results->pt(tkid) = bField / std::abs(circle_fit[local_idx].par(2)); - results->eta(tkid) = asinhf(line_fit.par(0)); - results->chi2(tkid) = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); + pixelTrack::utilities::copyFromCircle(results_view, + circle_fit[local_idx].par, + circle_fit[local_idx].cov, + line_fit.par, + line_fit.cov, + 1.f / float(bField), + tkid); + results_view[tkid].pt() = bField / std::abs(circle_fit[local_idx].par(2)); + results_view[tkid].eta() = asinhf(line_fit.par(0)); + results_view[tkid].chi2() = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); #ifdef RIEMANN_DEBUG printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml index d480d7408b9e2..522b186f3351b 100644 --- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml @@ -26,4 +26,5 @@ + diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc index 34b0ed9e29fc1..db60fb7ebf4bb 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc @@ -16,6 +16,10 @@ #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/RunningAverage.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "gpuVertexFinder.h" @@ -35,10 +39,10 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> { bool onGPU_; - edm::EDGetTokenT> tokenGPUTrack_; - edm::EDPutTokenT tokenGPUVertex_; - edm::EDGetTokenT tokenCPUTrack_; - edm::EDPutTokenT tokenCPUVertex_; + edm::EDGetTokenT> tokenGPUTrack_; + edm::EDPutTokenT> tokenGPUVertex_; + edm::EDGetTokenT tokenCPUTrack_; + edm::EDPutTokenT tokenCPUVertex_; const gpuVertexFinder::Producer gpuAlgo_; @@ -62,11 +66,11 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf) { if (onGPU_) { tokenGPUTrack_ = - consumes>(conf.getParameter("pixelTrackSrc")); - tokenGPUVertex_ = produces(); + consumes>(conf.getParameter("pixelTrackSrc")); + tokenGPUVertex_ = produces>(); } else { - tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); - tokenCPUVertex_ = produces(); + tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); + tokenCPUVertex_ = produces(); } } @@ -97,40 +101,35 @@ void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& d void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - edm::Handle> hTracks; + 
edm::Handle> hTracks; iEvent.getByToken(tokenGPUTrack_, hTracks); cms::cuda::ScopedContextProduce ctx{*hTracks}; - auto const* tracks = ctx.get(*hTracks).get(); + auto& tracks = ctx.get(*hTracks); - assert(tracks); - - ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_, ptMax_)); + ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks.view(), ptMin_, ptMax_)); } void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - auto const* tracks = iEvent.get(tokenCPUTrack_).get(); - assert(tracks); + auto& tracks = iEvent.get(tokenCPUTrack_); #ifdef PIXVERTEX_DEBUG_PRODUCE - auto const& tsoa = *tracks; - auto maxTracks = tsoa.stride(); - std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl; + + auto maxTracks = tracks.view().metadata().size(); int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - assert(nHits == int(tsoa.hitIndices.size(it))); + auto nHits = pixelTrack::utilities::nHits(tracks.view(), it); + assert(nHits == int(tracks.view().hitIndices().size(it))); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... nt++; } - std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks, ptMin_, ptMax_)); + iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks.view(), ptMin_, ptMax_)); } void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc index 029c619b42e58..61ec3f9a6a5be 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc @@ -1,4 +1,5 @@ -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "DataFormats/Common/interface/OrphanHandle.h" #include "DataFormats/TrackReco/interface/Track.h" @@ -35,14 +36,14 @@ class PixelVertexProducerFromSoA : public edm::global::EDProducer<> { private: void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override; - edm::EDGetTokenT tokenVertex_; + edm::EDGetTokenT tokenVertex_; edm::EDGetTokenT tokenBeamSpot_; edm::EDGetTokenT tokenTracks_; edm::EDGetTokenT tokenIndToEdm_; }; PixelVertexProducerFromSoA::PixelVertexProducerFromSoA(const edm::ParameterSet &conf) - : tokenVertex_(consumes(conf.getParameter("src"))), + : tokenVertex_(consumes(conf.getParameter("src"))), tokenBeamSpot_(consumes(conf.getParameter("beamSpot"))), tokenTracks_(consumes(conf.getParameter("TrackCollection"))), tokenIndToEdm_(consumes(conf.getParameter("TrackCollection"))) { @@ -81,9 +82,9 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv dydz = bs.dydz(); } - auto const &soa = *(iEvent.get(tokenVertex_).get()); + auto const &soa = iEvent.get(tokenVertex_); - int nv = soa.nvFinal; + int nv = soa.view().nvFinal(); #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "converting " << nv << " vertices " @@ -92,20 +93,20 
@@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv std::set uind; // for verifing index consistency for (int j = nv - 1; j >= 0; --j) { - auto i = soa.sortInd[j]; // on gpu sorted in ascending order.... + auto i = soa.view()[j].sortInd(); // on gpu sorted in ascending order.... assert(i < nv); uind.insert(i); assert(itrk.empty()); - auto z = soa.zv[i]; + auto z = soa.view()[i].zv(); auto x = x0 + dxdz * z; auto y = y0 + dydz * z; z += z0; reco::Vertex::Error err; - err(2, 2) = 1.f / soa.wv[i]; + err(2, 2) = 1.f / soa.view()[i].wv(); err(2, 2) *= 2.; // artifically inflate error //Copy also the tracks (no intention to be efficient....) for (auto k = 0U; k < indToEdm.size(); ++k) { - if (soa.idv[k] == int16_t(i)) + if (soa.view()[k].idv() == int16_t(i)) itrk.push_back(k); } auto nt = itrk.size(); @@ -119,7 +120,7 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv itrk.clear(); continue; } // remove outliers - (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.chi2[i], soa.ndof[i], nt); + (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.view()[i].chi2(), soa.view()[i].ndof(), nt); auto &v = (*vertexes).back(); v.reserve(itrk.size()); for (auto it : itrk) { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc index dc125878b1058..7dd714f22de6f 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc @@ -2,7 +2,8 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/Framework/interface/Event.h" @@ -30,15 +31,15 @@ class PixelVertexSoAFromCUDA : public edm::stream::EDProducer edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - edm::EDGetTokenT> tokenCUDA_; - edm::EDPutTokenT tokenSOA_; + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; - cms::cuda::host::unique_ptr m_soa; + ZVertex::ZVertexSoAHost zvertex_h; }; PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig) - : tokenCUDA_(consumes>(iConfig.getParameter("src"))), - tokenSOA_(produces()) {} + : tokenCUDA_(consumes>(iConfig.getParameter("src"))), + tokenSOA_(produces()) {} void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -50,16 +51,21 @@ void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& de void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - auto const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& inputData = ctx.get(inputDataWrapped); - - m_soa = inputData.toHostAsync(ctx.stream()); + auto const& zvertex_d = ctx.get(inputDataWrapped); // Tracks on device + 
zvertex_h = ZVertex::ZVertexSoAHost(ctx.stream()); // Create an instance of Tracks on Host, using the stream + cudaCheck(cudaMemcpyAsync(zvertex_h.buffer().get(), + zvertex_d.const_buffer().get(), + zvertex_d.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); // Copy data from Device to Host + cudaCheck(cudaGetLastError()); } void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { // No copies.... - iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa))); + iEvent.emplace(tokenSOA_, std::move(zvertex_h)); } DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h new file mode 100644 index 0000000000000..1c704d1374ca7 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h @@ -0,0 +1,25 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousDevice_h +#define RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousDevice_h + +#include +#include "WorkSpaceUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" + +template +class WorkSpaceSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { +public: + WorkSpaceSoAHeterogeneousDevice() = default; + + // Constructor which specifies the SoA size and CUDA stream + explicit WorkSpaceSoAHeterogeneousDevice(cudaStream_t stream) + : PortableDeviceCollection>(S, stream) {} +}; + +namespace gpuVertexFinder { + namespace workSpace { + using WorkSpaceSoADevice = WorkSpaceSoAHeterogeneousDevice; + } +} // namespace gpuVertexFinder +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h new file mode 100644 index 0000000000000..1051da0bbcee8 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h @@ -0,0 +1,25 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousHost_h +#define RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousHost_h + +#include +#include "WorkSpaceUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" + +template +class WorkSpaceSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { +public: + WorkSpaceSoAHeterogeneousHost() = default; + + // Constructor which specifies the SoA size and CUDA stream + explicit WorkSpaceSoAHeterogeneousHost(cudaStream_t stream) + : PortableHostCollection>(S, stream) {} +}; + +namespace gpuVertexFinder { + namespace workSpace { + using WorkSpaceSoAHost = WorkSpaceSoAHeterogeneousHost; + } +} // namespace gpuVertexFinder +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h new file mode 100644 index 0000000000000..a86ade097ec7c --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h @@ -0,0 +1,36 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_WorkSpace_h +#define RecoPixelVertexing_PixelVertexFinding_WorkSpace_h + +#include +#include 
+#include "DataFormats/SoATemplate/interface/SoALayout.h" + +// Intermediate data used in the vertex reco algos +// For internal use only +GENERATE_SOA_LAYOUT(WorkSpaceSoAHeterogeneousLayout, + SOA_COLUMN(uint16_t, itrk), // index of original track + SOA_COLUMN(float, zt), // input track z at bs + SOA_COLUMN(float, ezt2), // input error^2 on the above + SOA_COLUMN(float, ptt2), // input pt^2 on the above + SOA_COLUMN(uint8_t, izt), // interized z-position of input tracks + SOA_COLUMN(int32_t, iv), // vertex index for each associated track + SOA_SCALAR(uint32_t, ntrks), // number of "selected tracks" + SOA_SCALAR(uint32_t, nvIntermediate)) // the number of vertices after splitting pruning etc. + +// Methods that operate on View and ConstView of the WorkSpaceSoALayout. +namespace gpuVertexFinder { + namespace workSpace { + using WorkSpaceSoALayout = WorkSpaceSoAHeterogeneousLayout<>; + using WorkSpaceSoAView = WorkSpaceSoAHeterogeneousLayout<>::View; + using WorkSpaceSoAConstView = WorkSpaceSoAHeterogeneousLayout<>::ConstView; + + namespace utilities { + __host__ __device__ inline void init(WorkSpaceSoAView &workspace_view) { + workspace_view.ntrks() = 0; + workspace_view.nvIntermediate() = 0; + } + } // namespace utilities + } // namespace workSpace +} // namespace gpuVertexFinder + +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h index f71aa56842a67..4124f80e017eb 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h @@ -17,8 +17,8 @@ namespace gpuVertexFinder { // // based on Rodrighez&Laio algo // - __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __device__ __forceinline__ void clusterTracksByDensity(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -32,20 +32,21 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); + //TODO: check if there is a way to assert this + //assert(pdata); assert(zt); using Hist = cms::cuda::HistoContainer; @@ -63,7 +64,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < ZVertices::MAXTRACKS); + assert(i < ZVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); @@ -197,7 +198,7 @@ namespace gpuVertexFinder { } __syncthreads(); - 
assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < ZVertex::utilities::MAXVTX); // propagate the negative id to all the tracks in the cluster. for (auto i = threadIdx.x; i < nt; i += blockDim.x) { @@ -219,8 +220,8 @@ namespace gpuVertexFinder { printf("found %d proto vertices\n", foundClusters); } - __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __global__ void clusterTracksByDensityKernel(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h index a11283a7b2065..43e420a4c0cbc 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h @@ -14,8 +14,8 @@ namespace gpuVertexFinder { // this algo does not really scale as it works in a single block... // enough for <10K tracks we have - __global__ void clusterTracksDBSCAN(ZVertices* pdata, - WorkSpace* pws, + __global__ void clusterTracksDBSCAN(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "core" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -28,20 +28,21 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); + //TODO: check if there is a way to assert this + //assert(pdata); assert(zt); using Hist = cms::cuda::HistoContainer; @@ -59,7 +60,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < ZVertices::MAXTRACKS); + assert(i < ZVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 iz = std::clamp(iz, INT8_MIN, INT8_MAX); izt[i] = iz - INT8_MIN; @@ -214,7 +215,7 @@ namespace gpuVertexFinder { } __syncthreads(); - assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < ZVertex::utilities::MAXVTX); // propagate the negative id to all the tracks in the cluster. for (auto i = threadIdx.x; i < nt; i += blockDim.x) { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h index 66d246fcfa4fa..1b172cabf9318 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h @@ -14,8 +14,8 @@ namespace gpuVertexFinder { // this algo does not really scale as it works in a single block... 
// enough for <10K tracks we have - __global__ void clusterTracksIterative(ZVertices* pdata, - WorkSpace* pws, + __global__ void clusterTracksIterative(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "core" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -28,20 +28,21 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); + //TODO: check if there is a way to assert this + //assert(pdata); assert(zt); using Hist = cms::cuda::HistoContainer; @@ -59,7 +60,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < ZVertices::MAXTRACKS); + assert(i < ZVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 iz = std::clamp(iz, INT8_MIN, INT8_MAX); izt[i] = iz - INT8_MIN; @@ -185,7 +186,7 @@ namespace gpuVertexFinder { } __syncthreads(); - assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < ZVertex::utilities::MAXVTX); // propagate the negative id to all the tracks in the cluster. 
for (auto i = threadIdx.x; i < nt; i += blockDim.x) { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h index 0acf67244528a..7b926023b4e19 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h @@ -12,27 +12,28 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void fitVertices(ZVertices* pdata, - WorkSpace* pws, + __device__ __forceinline__ void fitVertices(VtxSoAView pdata, + WsSoAView pws, float chi2Max // for outlier rejection ) { constexpr bool verbose = false; // in principle the compiler should optmize out if false - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; - float* __restrict__ zv = data.zv; - float* __restrict__ wv = data.wv; - float* __restrict__ chi2 = data.chi2; - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; - - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; - - assert(pdata); + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + float* __restrict__ zv = data.zv(); + float* __restrict__ wv = data.wv(); + float* __restrict__ chi2 = data.chi2(); + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); + + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); + + //TODO: check if there is a way to assert this + //assert(pdata); assert(zt); assert(nvFinal <= nvIntermediate); @@ -101,8 +102,8 @@ namespace gpuVertexFinder { printf("and %d noise\n", noise); } - __global__ void fitVerticesKernel(ZVertices* pdata, - WorkSpace* pws, + __global__ void fitVerticesKernel(VtxSoAView pdata, + WsSoAView pws, float chi2Max // for outlier rejection ) { fitVertices(pdata, pws, chi2Max); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h index 93f78d498b26f..ff8cea612de47 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h @@ -15,16 +15,16 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void sortByPt2(ZVertices* pdata, WorkSpace* pws) { - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ ptt2 = ws.ptt2; - uint32_t const& nvFinal = data.nvFinal; + __device__ __forceinline__ void sortByPt2(VtxSoAView pdata, WsSoAView pws) { + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ ptt2 = ws.ptt2(); + uint32_t const& nvFinal = data.nvFinal(); - int32_t const* __restrict__ iv = ws.iv; - float* __restrict__ ptv2 = data.ptv2; - uint16_t* __restrict__ sortInd = data.sortInd; + int32_t const* __restrict__ iv = ws.iv(); + float* __restrict__ ptv2 = data.ptv2(); + uint16_t* __restrict__ sortInd = data.sortInd(); // if (threadIdx.x == 0) // printf("sorting %d vertices\n",nvFinal); @@ -34,7 +34,7 @@ namespace gpuVertexFinder { // fill indexing for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - data.idv[ws.itrk[i]] = iv[i]; + data[ws[i].itrk()].idv() = iv[i]; } // can be done asynchronoisly at the end of previous event @@ -66,7 +66,7 @@ 
namespace gpuVertexFinder { #endif } - __global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) { sortByPt2(pdata, pws); } + __global__ void sortByPt2Kernel(VtxSoAView pdata, WsSoAView pws) { sortByPt2(pdata, pws); } } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h index 0fe8bd882dcc5..f90978811b839 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h @@ -12,23 +12,24 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) { + __device__ __forceinline__ void splitVertices(VtxSoAView pdata, WsSoAView pws, float maxChi2) { constexpr bool verbose = false; // in principle the compiler should optmize out if false - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; - float* __restrict__ zv = data.zv; - float* __restrict__ wv = data.wv; - float const* __restrict__ chi2 = data.chi2; - uint32_t& nvFinal = data.nvFinal; - - int32_t const* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; - - assert(pdata); + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + float* __restrict__ zv = data.zv(); + float* __restrict__ wv = data.wv(); + float const* __restrict__ chi2 = data.chi2(); + uint32_t& nvFinal = data.nvFinal(); + + int32_t const* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); + + //TODO: check if there is a way to assert this + //assert(pdata); assert(zt); // one vertex per block @@ -120,7 +121,7 @@ namespace gpuVertexFinder { // get a new global vertex __shared__ uint32_t igv; if (0 == threadIdx.x) - igv = atomicAdd(&ws.nvIntermediate, 1); + igv = atomicAdd(&ws.nvIntermediate(), 1); __syncthreads(); for (auto k = threadIdx.x; k < nq; k += blockDim.x) { if (1 == newV[k]) @@ -130,7 +131,7 @@ namespace gpuVertexFinder { } // loop on vertices } - __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) { + __global__ void splitVerticesKernel(VtxSoAView pdata, WsSoAView pws, float maxChi2) { splitVertices(pdata, pws, maxChi2); } diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index 20b007d2d029f..686f9899d8439 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -1,5 +1,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" + #include "gpuClusterTracksByDensity.h" #include "gpuClusterTracksDBSCAN.h" #include "gpuClusterTracksIterative.h" @@ -18,27 +21,25 @@ namespace gpuVertexFinder { // split vertices with a chi2/NDoF greater than this constexpr float maxChi2ForSplit = 9.f; - __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) { - assert(ptracks); - assert(soa); - auto const& tracks = *ptracks; - auto const& fit = tracks.stateAtBS; - auto const* quality = tracks.qualityData(); + __global__ void 
loadTracks(TkSoAConstView tracks_view, VtxSoAView soa, WsSoAView pws, float ptMin, float ptMax) { + //TODO: check if there is a way to assert this + //assert(soa); + auto const* quality = pixelTrack::utilities::qualityData(tracks_view); auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int idx = first, nt = tracks.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) { - auto nHits = tracks.nHits(idx); + for (int idx = first, nt = tracks_view.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) { + auto nHits = pixelTrack::utilities::nHits(tracks_view, idx); assert(nHits >= 3); // initialize soa... - soa->idv[idx] = -1; + soa[idx].idv() = -1; - if (tracks.isTriplet(idx)) + if (pixelTrack::utilities::isTriplet(tracks_view, idx)) continue; // no triplets if (quality[idx] < pixelTrack::Quality::highPurity) continue; - auto pt = tracks.pt(idx); + auto pt = tracks_view[idx].pt(); if (pt < ptMin) continue; @@ -46,19 +47,19 @@ namespace gpuVertexFinder { // clamp pt pt = std::min(pt, ptMax); - auto& data = *pws; - auto it = atomicAdd(&data.ntrks, 1); - data.itrk[it] = idx; - data.zt[it] = tracks.zip(idx); - data.ezt2[it] = fit.covariance(idx)(14); - data.ptt2[it] = pt * pt; + auto& data = pws; + auto it = atomicAdd(&data.ntrks(), 1); + data[it].itrk() = idx; + data[it].zt() = pixelTrack::utilities::zip(tracks_view, idx); + data[it].ezt2() = tracks_view[idx].covariance()(14); + data[it].ptt2() = pt * pt; } } // #define THREE_KERNELS #ifndef THREE_KERNELS - __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __global__ void vertexFinderOneKernel(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -75,8 +76,8 @@ namespace gpuVertexFinder { sortByPt2(pdata, pws); } #else - __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __global__ void vertexFinderKernel1(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -87,7 +88,7 @@ namespace gpuVertexFinder { fitVertices(pdata, pws, maxChi2ForFirstFit); } - __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) { + __global__ void vertexFinderKernel2(VtxSoAView pdata, WsSoAView pws) { fitVertices(pdata, pws, maxChi2ForFinalFit); __syncthreads(); sortByPt2(pdata, pws); @@ -95,37 +96,40 @@ namespace gpuVertexFinder { #endif #ifdef __CUDACC__ - ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const { + ZVertex::ZVertexSoADevice Producer::makeAsync(cudaStream_t stream, + TkSoAConstView tracks_view, + float ptMin, + float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on GPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); + ZVertex::ZVertexSoADevice vertices(stream); #else - ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin, float ptMax) const { + ZVertex::ZVertexSoAHost Producer::make(TkSoAConstView tracks_view, float ptMin, float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on CPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(std::make_unique()); + ZVertex::ZVertexSoAHost vertices(nullptr); #endif - 
assert(tksoa); - auto* soa = vertices.get(); - assert(soa); + auto soa = vertices.view(); + //TODO: check if there is a way to assert this + //assert(soa); #ifdef __CUDACC__ - auto ws_d = cms::cuda::make_device_unique(stream); + auto ws_d = gpuVertexFinder::workSpace::WorkSpaceSoADevice(stream); #else - auto ws_d = std::make_unique(); + auto ws_d = gpuVertexFinder::workSpace::WorkSpaceSoAHost(nullptr); #endif #ifdef __CUDACC__ - init<<<1, 1, 0, stream>>>(soa, ws_d.get()); + init<<<1, 1, 0, stream>>>(soa, ws_d.view()); auto blockSize = 128; - auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; - loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin, ptMax); + auto numberOfBlocks = (tracks_view.metadata().size() + blockSize - 1) / blockSize; + loadTracks<<>>(tracks_view, soa, ws_d.view(), ptMin, ptMax); cudaCheck(cudaGetLastError()); #else - init(soa, ws_d.get()); - loadTracks(tksoa, soa, ws_d.get(), ptMin, ptMax); + init(soa, ws_d.view()); + loadTracks(tracks_view, soa, ws_d.view(), ptMin, ptMax); #endif #ifdef __CUDACC__ @@ -137,50 +141,51 @@ namespace gpuVertexFinder { if (oneKernel_) { // implemented only for density clustesrs #ifndef THREE_KERNELS - vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); #else - vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); cudaCheck(cudaGetLastError()); // one block per vertex... - splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); + splitVerticesKernel<<>>(soa, ws_d.view(), maxChi2ForSplit); cudaCheck(cudaGetLastError()); - vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); + vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view()); #endif } else { // five kernels if (useDensity_) { - clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>( + soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { - clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useIterative_) { - clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); } cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), maxChi2ForFirstFit); cudaCheck(cudaGetLastError()); // one block per vertex... 
- splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); + splitVerticesKernel<<>>(soa, ws_d.view(), maxChi2ForSplit); cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), maxChi2ForFinalFit); cudaCheck(cudaGetLastError()); - sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); + sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view()); } cudaCheck(cudaGetLastError()); #else // __CUDACC__ if (useDensity_) { - clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksByDensity(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { - clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksDBSCAN(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useIterative_) { - clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksIterative(soa, ws_d.view(), minT, eps, errmax, chi2max); } #ifdef PIXVERTEX_DEBUG_PRODUCE - std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; + std::cout << "found " << ws_d.view().nvIntermediate() << " vertices " << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - fitVertices(soa, ws_d.get(), maxChi2ForFirstFit); + fitVertices(soa, ws_d.view(), maxChi2ForFirstFit); // one block per vertex! - splitVertices(soa, ws_d.get(), maxChi2ForSplit); - fitVertices(soa, ws_d.get(), maxChi2ForFinalFit); - sortByPt2(soa, ws_d.get()); + splitVertices(soa, ws_d.view(), maxChi2ForSplit); + fitVertices(soa, ws_d.view(), maxChi2ForFinalFit); + sortByPt2(soa, ws_d.view()); #endif return vertices; diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h index 2b6a8107d927f..cc8224521680c 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h @@ -4,46 +4,27 @@ #include #include -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "WorkSpaceUtilities.h" +#include "WorkSpaceSoAHeterogeneousHost.h" +#include "WorkSpaceSoAHeterogeneousDevice.h" namespace gpuVertexFinder { - using ZVertices = ZVertexSoA; - using TkSoA = pixelTrack::TrackSoA; + using VtxSoAView = ZVertex::ZVertexSoAView; + using TkSoAConstView = pixelTrack::TrackSoAConstView; + using WsSoAView = gpuVertexFinder::workSpace::WorkSpaceSoAView; - // workspace used in the vertex reco algos - struct WorkSpace { - static constexpr uint32_t MAXTRACKS = ZVertexSoA::MAXTRACKS; - static constexpr uint32_t MAXVTX = ZVertexSoA::MAXVTX; - - uint32_t ntrks; // number of "selected tracks" - uint16_t itrk[MAXTRACKS]; // index of original track - float zt[MAXTRACKS]; // input track z at bs - float ezt2[MAXTRACKS]; // input error^2 on the above - float ptt2[MAXTRACKS]; // input pt^2 on the above - uint8_t izt[MAXTRACKS]; // interized z-position of input tracks - int32_t iv[MAXTRACKS]; // vertex index for each associated track - - uint32_t nvIntermediate; // the number of vertices after splitting 
pruning etc. - - __host__ __device__ void init() { - ntrks = 0; - nvIntermediate = 0; - } - }; - - __global__ void init(ZVertexSoA* pdata, WorkSpace* pws) { - pdata->init(); - pws->init(); + __global__ void init(VtxSoAView pdata, WsSoAView pws) { + ZVertex::utilities::init(pdata); + gpuVertexFinder::workSpace::utilities::init(pws); } class Producer { public: - using ZVertices = ZVertexSoA; - using WorkSpace = gpuVertexFinder::WorkSpace; - using TkSoA = pixelTrack::TrackSoA; - Producer(bool oneKernel, bool useDensity, bool useDBSCAN, @@ -64,8 +45,8 @@ namespace gpuVertexFinder { ~Producer() = default; - ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const; - ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin, float ptMax) const; + ZVertex::ZVertexSoADevice makeAsync(cudaStream_t stream, TkSoAConstView tracks_view, float ptMin, float ptMax) const; + ZVertex::ZVertexSoAHost make(TkSoAConstView tracks_view, float ptMin, float ptMax) const; private: const bool oneKernel_; diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 5f8a0646c726a..c2cea5a9a1f13 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -7,6 +7,17 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" // TODO: included in order to compile Eigen columns first!!! 
+#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" + +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h" #ifdef USE_DBSCAN #include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h" #define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN @@ -23,22 +34,22 @@ #ifdef ONE_KERNEL #ifdef __CUDACC__ -__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, +__global__ void vertexFinderOneKernel(gpuVertexFinder::VtxSoAView pdata, + gpuVertexFinder::WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" float chi2max // max normalized distance to cluster, ) { - clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + gpuVertexFinder::clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); __syncthreads(); - fitVertices(pdata, pws, 50.); + gpuVertexFinder::fitVertices(pdata, pws, 50.); __syncthreads(); - splitVertices(pdata, pws, 9.f); + gpuVertexFinder::splitVertices(pdata, pws, 9.f); __syncthreads(); - fitVertices(pdata, pws, 5000.); + gpuVertexFinder::fitVertices(pdata, pws, 5000.); __syncthreads(); - sortByPt2(pdata, pws); + gpuVertexFinder::sortByPt2(pdata, pws); } #endif #endif @@ -101,25 +112,24 @@ struct ClusterGenerator { std::exponential_distribution ptGen; }; -// a macro SORRY -#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(gpuVertexFinder::ZVertices, M)) -#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M)) - -__global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder::WorkSpace const* pws) { - auto const& __restrict__ data = *pdata; - auto const& __restrict__ ws = *pws; - printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate); +__global__ void print(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WsSoAView pws) { + auto& __restrict__ ws = pws; + printf("nt,nv %d %d,%d\n", ws.ntrks(), pdata.nvFinal(), ws.nvIntermediate()); } int main() { + cudaStream_t stream; #ifdef __CUDACC__ cms::cudatest::requireDevices(); + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - auto onGPU_d = cms::cuda::make_device_unique(1, nullptr); - auto ws_d = cms::cuda::make_device_unique(1, nullptr); + ZVertex::ZVertexSoADevice onGPU_d(stream); + gpuVertexFinder::workSpace::WorkSpaceSoADevice ws_d(stream); #else - auto onGPU_d = std::make_unique(); - auto ws_d = std::make_unique(); + stream = nullptr; + + ZVertex::ZVertexSoAHost onGPU_d(stream); + gpuVertexFinder::workSpace::WorkSpaceSoAHost ws_d(stream); #endif Event ev; @@ -135,24 +145,26 @@ int main() { gen(ev); #ifdef __CUDACC__ - init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + gpuVertexFinder::init<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view()); #else - onGPU_d->init(); - ws_d->init(); + gpuVertexFinder::init(onGPU_d.view(), ws_d.view()); #endif std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; auto nt = ev.ztrack.size(); #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(zt), 
ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(&ws_d.view().ntrks(), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); + cudaCheck( + cudaMemcpy(ws_d.view().zt(), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck( + cudaMemcpy(ws_d.view().ezt2(), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck( + cudaMemcpy(ws_d.view().ptt2(), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); #else - ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); - ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); - ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); - ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); + ::memcpy(&ws_d.view().ntrks(), &nt, sizeof(uint32_t)); + ::memcpy(ws_d.view().zt(), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); + ::memcpy(ws_d.view().ezt2(), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); + ::memcpy(ws_d.view().ptt2(), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); #endif std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl; @@ -168,30 +180,30 @@ int main() { uint32_t nv = 0; #ifdef __CUDACC__ - print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view()); cudaCheck(cudaGetLastError()); cudaDeviceSynchronize(); #ifdef ONE_KERNEL - cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); #else - cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); #endif - print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view()); cudaCheck(cudaGetLastError()); cudaDeviceSynchronize(); - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 50.f); cudaCheck(cudaGetLastError()); - cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - print(onGPU_d.get(), ws_d.get()); - CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); - print(onGPU_d.get(), ws_d.get()); - fitVertices(onGPU_d.get(), ws_d.get(), 50.f); - nv = onGPU_d->nvFinal; + print(onGPU_d.view(), ws_d.view()); + CLUSTERIZE(onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); + print(onGPU_d.view(), ws_d.view()); + gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 50.f); + nv = onGPU_d.view().nvFinal(); #endif if (nv == 0) { @@ -221,18 +233,18 @@ int main() { nn = hnn; ind = hind; #else - zv = onGPU_d->zv; - wv = onGPU_d->wv; - ptv2 = onGPU_d->ptv2; - nn = onGPU_d->ndof; - ind = onGPU_d->sortInd; + zv = onGPU_d.view().zv(); + wv = onGPU_d.view().wv(); + ptv2 = onGPU_d.view().ptv2(); + nn = 
onGPU_d.view().ndof(); + ind = onGPU_d.view().sortInd(); #endif #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else - memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float)); #endif for (auto j = 0U; j < nv; ++j) @@ -244,14 +256,14 @@ int main() { } #ifdef __CUDACC__ - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); - cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 50.f); + cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else - fitVertices(onGPU_d.get(), ws_d.get(), 50.f); - nv = onGPU_d->nvFinal; - memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 50.f); + nv = onGPU_d.view().nvFinal(); + memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float)); #endif for (auto j = 0U; j < nv; ++j) @@ -264,26 +276,26 @@ int main() { #ifdef __CUDACC__ // one vertex per block!!! 
- cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); - cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.view(), ws_d.view(), 9.f); + cudaCheck(cudaMemcpy(&nv, &ws_d.view().nvIntermediate(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - splitVertices(onGPU_d.get(), ws_d.get(), 9.f); - nv = ws_d->nvIntermediate; + gpuVertexFinder::splitVertices(onGPU_d.view(), ws_d.view(), 9.f); + nv = ws_d.view().nvIntermediate(); #endif std::cout << "after split " << nv << std::endl; #ifdef __CUDACC__ - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 5000.f); cudaCheck(cudaGetLastError()); - cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get()); + cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.view(), ws_d.view()); cudaCheck(cudaGetLastError()); - cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - fitVertices(onGPU_d.get(), ws_d.get(), 5000.f); - sortByPt2(onGPU_d.get(), ws_d.get()); - nv = onGPU_d->nvFinal; - memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 5000.f); + gpuVertexFinder::sortByPt2(onGPU_d.view(), ws_d.view()); + nv = onGPU_d.view().nvFinal(); + memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float)); #endif if (nv == 0) { @@ -292,12 +304,12 @@ int main() { } #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(zv, onGPU_d.view().zv(), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(wv, onGPU_d.view().wv(), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ptv2, onGPU_d.view().ptv2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ind, onGPU_d.view().sortInd(), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); #endif for (auto j = 0U; j < nv; ++j) if (nn[j] > 0) diff --git a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc index 34e04b0f7aedb..aa8565e9aed1f 100644 --- a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc +++ b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc @@ -45,12 +45,14 @@ #include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" #include "DataFormats/GeometrySurface/interface/Plane.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include 
"MagneticField/Records/interface/IdealMagneticFieldRecord.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" -#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" namespace L2TauTagNNv1 { constexpr int nCellEta = 5; @@ -179,16 +181,16 @@ class L2TauNNProducer : public edm::stream::EDProducer& allTaus, - const pixelTrack::TrackSoA& patatracks_tsoa, - const ZVertexSoA& patavtx_soa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, + const ZVertex::ZVertexSoAHost& patavtx_soa, const reco::BeamSpot& beamspot, const MagneticField* magfi); - void selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, - const pixelTrack::TrackSoA& patatracks_tsoa, + void selectGoodTracksAndVertices(const ZVertex::ZVertexSoAHost& patavtx_soa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood); std::pair impactParameter(int it, - const pixelTrack::TrackSoA& patatracks_tsoa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, float patatrackPhi, const reco::BeamSpot& beamspot, const MagneticField* magfi); @@ -207,8 +209,8 @@ class L2TauNNProducer : public edm::stream::EDProducer eeToken_; const edm::ESGetToken geometryToken_; const edm::ESGetToken bFieldToken_; - const edm::EDGetTokenT pataVerticesToken_; - const edm::EDGetTokenT pataTracksToken_; + const edm::EDGetTokenT pataVerticesToken_; + const edm::EDGetTokenT pataTracksToken_; const edm::EDGetTokenT beamSpotToken_; const unsigned int maxVtx_; const float fractionSumPt2_; @@ -292,8 +294,8 @@ L2TauNNProducer::L2TauNNProducer(const edm::ParameterSet& cfg, const L2TauNNProd eeToken_(consumes(cfg.getParameter("eeInput"))), geometryToken_(esConsumes()), bFieldToken_(esConsumes()), - pataVerticesToken_(consumes(cfg.getParameter("pataVertices"))), - pataTracksToken_(consumes(cfg.getParameter("pataTracks"))), + pataVerticesToken_(consumes(cfg.getParameter("pataVertices"))), + pataTracksToken_(consumes(cfg.getParameter("pataTracks"))), beamSpotToken_(consumes(cfg.getParameter("BeamSpot"))), maxVtx_(cfg.getParameter("maxVtx")), fractionSumPt2_(cfg.getParameter("fractionSumPt2")), @@ -569,32 +571,32 @@ void L2TauNNProducer::fillCaloRecHits(tensorflow::Tensor& cellGridMatrix, } } -void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, - const pixelTrack::TrackSoA& patatracks_tsoa, +void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertex::ZVertexSoAHost& patavtx_soa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood) { - const auto maxTracks = patatracks_tsoa.stride(); - const int nv = patavtx_soa.nvFinal; + const auto maxTracks = patatracks_tsoa.view().metadata().size(); + const int nv = patavtx_soa.view().nvFinal(); trkGood.clear(); trkGood.reserve(maxTracks); vtxGood.clear(); vtxGood.reserve(nv); - auto const* quality = patatracks_tsoa.qualityData(); + auto const* quality = pixelTrack::utilities::qualityData(patatracks_tsoa.view()); // No need to sort either 
as the algorithms is just using the max (not even the location, just the max value of pt2sum). std::vector pTSquaredSum(nv, 0); std::vector nTrkAssociated(nv, 0); for (int32_t trk_idx = 0; trk_idx < maxTracks; ++trk_idx) { - auto nHits = patatracks_tsoa.nHits(trk_idx); + auto nHits = pixelTrack::utilities::nHits(patatracks_tsoa.view(), trk_idx); if (nHits == 0) { break; } - int vtx_ass_to_track = patavtx_soa.idv[trk_idx]; + int vtx_ass_to_track = patavtx_soa.view()[trk_idx].idv(); if (vtx_ass_to_track >= 0 && vtx_ass_to_track < nv) { - auto patatrackPt = patatracks_tsoa.pt[trk_idx]; + auto patatrackPt = patatracks_tsoa.view()[trk_idx].pt(); ++nTrkAssociated[vtx_ass_to_track]; - if (patatrackPt >= trackPtMin_ && patatracks_tsoa.chi2(trk_idx) <= trackChi2Max_) { + if (patatrackPt >= trackPtMin_ && patatracks_tsoa.const_view()[trk_idx].chi2() <= trackChi2Max_) { patatrackPt = std::min(patatrackPt, trackPtMax_); pTSquaredSum[vtx_ass_to_track] += patatrackPt * patatrackPt; } @@ -606,7 +608,7 @@ void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, if (nv > 0) { const auto minFOM_fromFrac = (*std::max_element(pTSquaredSum.begin(), pTSquaredSum.end())) * fractionSumPt2_; for (int j = nv - 1; j >= 0 && vtxGood.size() < maxVtx_; --j) { - auto vtx_idx = patavtx_soa.sortInd[j]; + auto vtx_idx = patavtx_soa.view()[j].sortInd(); assert(vtx_idx < nv); if (nTrkAssociated[vtx_idx] >= 2 && pTSquaredSum[vtx_idx] >= minFOM_fromFrac && pTSquaredSum[vtx_idx] > minSumPt2_) { @@ -617,15 +619,15 @@ void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, } std::pair L2TauNNProducer::impactParameter(int it, - const pixelTrack::TrackSoA& patatracks_tsoa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, float patatrackPhi, const reco::BeamSpot& beamspot, const MagneticField* magfi) { - auto const& fit = patatracks_tsoa.stateAtBS; + // auto const& fit = patatracks_tsoa.stateAtBS; /* dxy and dz */ riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - fit.copyToDense(ipar, icov, it); + pixelTrack::utilities::copyToDense(patatracks_tsoa.view(), ipar, icov, it); riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); float sp = std::sin(patatrackPhi); @@ -650,8 +652,8 @@ std::pair L2TauNNProducer::impactParameter(int it, void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix, const std::vector& allTaus, - const pixelTrack::TrackSoA& patatracks_tsoa, - const ZVertexSoA& patavtx_soa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, + const ZVertex::ZVertexSoAHost& patavtx_soa, const reco::BeamSpot& beamspot, const MagneticField* magfi) { using NNInputs = L2TauTagNNv1::NNInputs; @@ -675,19 +677,19 @@ void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix, const float tauPhi = allTaus[tau_idx]->phi(); for (const auto it : trkGood) { - const float patatrackPt = patatracks_tsoa.pt[it]; + const float patatrackPt = patatracks_tsoa.const_view()[it].pt(); if (patatrackPt <= 0) continue; - const float patatrackPhi = patatracks_tsoa.phi(it); - const float patatrackEta = patatracks_tsoa.eta(it); - const float patatrackCharge = patatracks_tsoa.charge(it); - const float patatrackChi2OverNdof = patatracks_tsoa.chi2(it); - const auto nHits = patatracks_tsoa.nHits(it); + const float patatrackPhi = pixelTrack::utilities::phi(patatracks_tsoa.const_view(), it); + const float patatrackEta = patatracks_tsoa.const_view()[it].eta(); + const float 
patatrackCharge = pixelTrack::utilities::charge(patatracks_tsoa.const_view(), it); + const float patatrackChi2OverNdof = patatracks_tsoa.view()[it].chi2(); + const auto nHits = pixelTrack::utilities::nHits(patatracks_tsoa.const_view(), it); if (nHits <= 0) continue; const int patatrackNdof = 2 * std::min(6, nHits) - 5; - const int vtx_idx_assTrk = patavtx_soa.idv[it]; + const int vtx_idx_assTrk = patavtx_soa.view()[it].idv(); if (reco::deltaR2(patatrackEta, patatrackPhi, tauEta, tauPhi) < dR2_max) { std::tie(deta, dphi, eta_idx, phi_idx) = getEtaPhiIndices(patatrackEta, patatrackPhi, allTaus[tau_idx]->polarP4()); @@ -763,8 +765,8 @@ void L2TauNNProducer::produce(edm::Event& event, const edm::EventSetup& eventset const auto eeCal = event.getHandle(eeToken_); const auto hbhe = event.getHandle(hbheToken_); const auto ho = event.getHandle(hoToken_); - const auto& patatracks_SoA = *event.get(pataTracksToken_); - const auto& vertices_SoA = *event.get(pataVerticesToken_); + auto& patatracks_SoA = event.get(pataTracksToken_); + auto& vertices_SoA = event.get(pataVerticesToken_); const auto bsHandle = event.getHandle(beamSpotToken_); auto const fieldESH = eventsetup.getHandle(bFieldToken_); diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc index 0e5823fc46c46..4301749e441fe 100644 --- a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc +++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc @@ -1,4 +1,6 @@ -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "DataFormats/GeometrySurface/interface/Plane.h" #include "DataFormats/TrackerCommon/interface/TrackerTopology.h" @@ -45,7 +47,7 @@ class SeedProducerFromSoA : public edm::global::EDProducer<> { // Event data tokens const edm::EDGetTokenT tBeamSpot_; - const edm::EDGetTokenT tokenTrack_; + const edm::EDGetTokenT tokenTrack_; // Event setup tokens const edm::ESGetToken idealMagneticFieldToken_; const edm::ESGetToken trackerDigiGeometryToken_; @@ -55,7 +57,7 @@ class SeedProducerFromSoA : public edm::global::EDProducer<> { SeedProducerFromSoA::SeedProducerFromSoA(const edm::ParameterSet& iConfig) : tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), - tokenTrack_(consumes(iConfig.getParameter("src"))), + tokenTrack_(consumes(iConfig.getParameter("src"))), idealMagneticFieldToken_(esConsumes()), trackerDigiGeometryToken_(esConsumes()), trackerPropagatorToken_(esConsumes(edm::ESInputTag("PropagatorWithMaterial"))), @@ -89,16 +91,16 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl; GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0()); - const auto& tsoa = *(iEvent.get(tokenTrack_)); + auto& tsoa = iEvent.get(tokenTrack_); - auto const* quality = tsoa.qualityData(); - auto const& fit = tsoa.stateAtBS; - auto const& detIndices = tsoa.detIndices; - auto maxTracks = tsoa.stride(); + auto const* quality = pixelTrack::utilities::qualityData(tsoa.view()); + //auto const& fit = tsoa.stateAtBS; + auto const& detIndices = tsoa.view().detIndices(); + auto maxTracks = tsoa.view().metadata().size(); int32_t nt = 0; for (int32_t it = 0; it < maxTracks; 
++it) { - auto nHits = tsoa.nHits(it); + auto nHits = pixelTrack::utilities::nHits(tsoa.view(), it); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... @@ -120,11 +122,11 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co // mind: this values are respect the beamspot! - float phi = tsoa.phi(it); + float phi = pixelTrack::utilities::phi(tsoa.view(), it); riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - fit.copyToDense(ipar, icov, it); + pixelTrack::utilities::copyToDense(tsoa.view(), ipar, icov, it); riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);
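As a side note to the migration above: the changes all follow one pattern. The hand-written ZVertexSoA / WorkSpace structs are replaced by layouts generated with GENERATE_SOA_LAYOUT, allocated through cms::cuda::PortableHostCollection / PortableDeviceCollection, and every field access goes through the generated View accessors: scalars as view.field(), per-element columns as view[i].field(), and whole columns as the pointer returned by view.field(). Below is a minimal host-side sketch of that access pattern only; it is not part of the patch, the ExampleLayout name, the fill() helper and the 32-element size are invented for illustration, and the headers are assumed to be the same ones used by the new WorkSpace files above.

#include <cstdint>
#include <cuda_runtime.h>

#include "DataFormats/SoATemplate/interface/SoALayout.h"
#include "CUDADataFormats/Common/interface/PortableHostCollection.h"

// Toy layout mirroring the shape of WorkSpaceSoAHeterogeneousLayout:
// one per-element column plus one scalar counter.
GENERATE_SOA_LAYOUT(ExampleLayout,
                    SOA_COLUMN(float, zt),        // per-element value
                    SOA_SCALAR(uint32_t, ntrks))  // single counter shared by the whole SoA

using ExampleSoAHost = cms::cuda::PortableHostCollection<ExampleLayout<>>;

void fill(cudaStream_t stream) {
  // Allocate pinned host memory for 32 elements, associated with the given stream.
  ExampleSoAHost collection(32, stream);
  auto view = collection.view();

  view.ntrks() = 0;  // scalar: accessed without an index
  for (uint32_t i = 0; i < 3; ++i) {
    view[i].zt() = 0.1f * i;  // column: accessed element by element...
    ++view.ntrks();
  }
  float* zt = view.zt();  // ...or as a raw pointer to the whole column
  (void)zt;
}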