diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml
index f6b68fe69b400..e4971bdf3ebbe 100644
--- a/CUDADataFormats/Common/BuildFile.xml
+++ b/CUDADataFormats/Common/BuildFile.xml
@@ -1,4 +1,5 @@
 <iftool name="cuda">
+  <use name="cuda"/>
   <use name="rootcore"/>
   <use name="DataFormats/Common"/>
   <use name="HeterogeneousCore/CUDAUtilities"/>
diff --git a/CUDADataFormats/Common/interface/PortableDeviceCollection.h b/CUDADataFormats/Common/interface/PortableDeviceCollection.h
new file mode 100644
index 0000000000000..78f72cb3d5437
--- /dev/null
+++ b/CUDADataFormats/Common/interface/PortableDeviceCollection.h
@@ -0,0 +1,67 @@
+#ifndef CUDADataFormats_Common_interface_PortableDeviceCollection_h
+#define CUDADataFormats_Common_interface_PortableDeviceCollection_h
+
+#include <cassert>
+#include <cstdlib>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+namespace cms::cuda {
+
+  // generic SoA-based product in device memory
+  template <typename T>
+  class PortableDeviceCollection {
+  public:
+    using Layout = T;
+    using View = typename Layout::View;
+    using ConstView = typename Layout::ConstView;
+    using Buffer = cms::cuda::device::unique_ptr<std::byte[]>;
+
+    PortableDeviceCollection() = default;
+
+    PortableDeviceCollection(int32_t elements, cudaStream_t stream)
+        : buffer_{cms::cuda::make_device_unique<std::byte[]>(Layout::computeDataSize(elements), stream)},
+          layout_{buffer_.get(), elements},
+          view_{layout_} {
+      // CUDA device memory uses a default alignment of at least 128 bytes
+      assert(reinterpret_cast<uintptr_t>(buffer_.get()) % Layout::alignment == 0);
+    }
+
+    // non-copyable
+    PortableDeviceCollection(PortableDeviceCollection const&) = delete;
+    PortableDeviceCollection& operator=(PortableDeviceCollection const&) = delete;
+
+    // movable
+    PortableDeviceCollection(PortableDeviceCollection&&) = default;
+    PortableDeviceCollection& operator=(PortableDeviceCollection&&) = default;
+
+    // default destructor
+    ~PortableDeviceCollection() = default;
+
+    // access the View
+    View& view() { return view_; }
+    ConstView const& view() const { return view_; }
+    ConstView const& const_view() const { return view_; }
+
+    View& operator*() { return view_; }
+    ConstView const& operator*() const { return view_; }
+
+    View* operator->() { return &view_; }
+    ConstView const* operator->() const { return &view_; }
+
+    // access the Buffer
+    Buffer& buffer() { return buffer_; }
+    Buffer const& buffer() const { return buffer_; }
+    Buffer const& const_buffer() const { return buffer_; }
+
+    size_t bufferSize() const { return layout_.metadata().byteSize(); }
+
+  private:
+    Buffer buffer_;  //!
+    Layout layout_;  //
+    View view_;      //!
+  };
+
+}  // namespace cms::cuda
+
+#endif  // CUDADataFormats_Common_interface_PortableDeviceCollection_h
diff --git a/CUDADataFormats/Common/interface/PortableHostCollection.h b/CUDADataFormats/Common/interface/PortableHostCollection.h
new file mode 100644
index 0000000000000..cfaf40c85b3bc
--- /dev/null
+++ b/CUDADataFormats/Common/interface/PortableHostCollection.h
@@ -0,0 +1,85 @@
+#ifndef CUDADataFormats_Common_interface_PortableHostCollection_h
+#define CUDADataFormats_Common_interface_PortableHostCollection_h
+
+#include <cassert>
+#include <cstdlib>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+
+namespace cms::cuda {
+
+  // generic SoA-based product in host memory
+  template <typename T>
+  class PortableHostCollection {
+  public:
+    using Layout = T;
+    using View = typename Layout::View;
+    using ConstView = typename Layout::ConstView;
+    using Buffer = cms::cuda::host::unique_ptr<std::byte[]>;
+
+    PortableHostCollection() = default;
+
+    PortableHostCollection(int32_t elements)
+        // allocate pageable host memory
+        : buffer_{cms::cuda::make_host_unique<std::byte[]>(Layout::computeDataSize(elements))},
+          layout_{buffer_.get(), elements},
+          view_{layout_} {
+      // make_host_unique for pageable host memory uses a default alignment of 128 bytes
+      assert(reinterpret_cast<uintptr_t>(buffer_.get()) % Layout::alignment == 0);
+    }
+
+    PortableHostCollection(int32_t elements, cudaStream_t stream)
+        // allocate pinned host memory, accessible by the current device
+        : buffer_{cms::cuda::make_host_unique<std::byte[]>(Layout::computeDataSize(elements), stream)},
+          layout_{buffer_.get(), elements},
+          view_{layout_} {
+      // CUDA pinned host memory uses a default alignment of at least 128 bytes
+      assert(reinterpret_cast<uintptr_t>(buffer_.get()) % Layout::alignment == 0);
+    }
+
+    // non-copyable
+    PortableHostCollection(PortableHostCollection const&) = delete;
+    PortableHostCollection& operator=(PortableHostCollection const&) = delete;
+
+    // movable
+    PortableHostCollection(PortableHostCollection&&) = default;
+    PortableHostCollection& operator=(PortableHostCollection&&) = default;
+
+    // default destructor
+    ~PortableHostCollection() = default;
+
+    // access the View
+    View& view() { return view_; }
+    ConstView const& view() const { return view_; }
+    ConstView const& const_view() const { return view_; }
+
+    View& operator*() { return view_; }
+    ConstView const& operator*() const { return view_; }
+
+    View* operator->() { return &view_; }
+    ConstView const* operator->() const { return &view_; }
+
+    // access the Buffer
+    Buffer& buffer() { return buffer_; }
+    Buffer const& buffer() const { return buffer_; }
+    Buffer const& const_buffer() const { return buffer_; }
+
+    size_t bufferSize() const { return layout_.metadata().byteSize(); }
+
+    // part of the ROOT read streamer
+    static void ROOTReadStreamer(PortableHostCollection* newObj, Layout const& layout) {
+      newObj->~PortableHostCollection();
+      // allocate pinned host memory using the legacy stream, that synchronises with all (blocking) streams
+      new (newObj) PortableHostCollection(layout.metadata().size());
+      newObj->layout_.ROOTReadStreamer(layout);
+    }
+
+  private:
+    Buffer buffer_;  //!
+    Layout layout_;  //
+    View view_;      //!
+  };
+
+}  // namespace cms::cuda
+
+#endif  // CUDADataFormats_Common_interface_PortableHostCollection_h
diff --git a/CUDADataFormats/PortableTestObjects/BuildFile.xml b/CUDADataFormats/PortableTestObjects/BuildFile.xml
new file mode 100644
index 0000000000000..595a743a6c4c5
--- /dev/null
+++ b/CUDADataFormats/PortableTestObjects/BuildFile.xml
@@ -0,0 +1,7 @@
+<use name="rootcore"/>
+<use name="CUDADataFormats/Common"/>
+<use name="DataFormats/Common"/>
+<use name="DataFormats/PortableTestObjects"/>
+<export>
+  <lib name="1"/>
+</export>
diff --git a/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h b/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h
new file mode 100644
index 0000000000000..c0dc803228646
--- /dev/null
+++ b/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h
@@ -0,0 +1,14 @@
+#ifndef CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
+#define CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
+
+#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
+#include "DataFormats/PortableTestObjects/interface/TestSoA.h"
+
+namespace cudatest {
+
+  // SoA with x, y, z, id fields in device global memory
+  using TestDeviceCollection = cms::cuda::PortableDeviceCollection<portabletest::TestSoA>;
+
+}  // namespace cudatest
+
+#endif  // CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
diff --git a/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h b/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h
new file mode 100644
index 0000000000000..3e2eea89488c7
--- /dev/null
+++ b/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h
@@ -0,0 +1,14 @@
+#ifndef CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
+#define CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
+
+#include "CUDADataFormats/Common/interface/PortableHostCollection.h"
+#include "DataFormats/PortableTestObjects/interface/TestSoA.h"
+
+namespace cudatest {
+
+  // SoA with x, y, z, id fields in host memory
+  using TestHostCollection = cms::cuda::PortableHostCollection<portabletest::TestSoA>;
+
+}  // namespace cudatest
+
+#endif  // CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
diff --git a/CUDADataFormats/PortableTestObjects/src/classes.h b/CUDADataFormats/PortableTestObjects/src/classes.h
new file mode 100644
index 0000000000000..6f26217bd6646
--- /dev/null
+++ b/CUDADataFormats/PortableTestObjects/src/classes.h
@@ -0,0 +1,5 @@
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/PortableTestObjects/interface/TestSoA.h"
diff --git a/CUDADataFormats/PortableTestObjects/src/classes_def.xml b/CUDADataFormats/PortableTestObjects/src/classes_def.xml
new file mode 100644
index 0000000000000..892bedcca23d6
--- /dev/null
+++ b/CUDADataFormats/PortableTestObjects/src/classes_def.xml
@@ -0,0 +1,21 @@
+<lcgdict>
+  <class name="cudatest::TestHostCollection"/>
+  <read
+    sourceClass="cudatest::TestHostCollection"
+    targetClass="cudatest::TestHostCollection"
+    version="[1-]"
+    source="portabletest::TestSoA layout_;"
+    target="buffer_"
+    embed="false">
+  <![CDATA[
+    cudatest::TestHostCollection::ROOTReadStreamer(newObj, onfile.layout_);
+  ]]>
+  </read>
+  <class name="edm::Wrapper<cudatest::TestHostCollection>" splitLevel="0"/>
+
+  <class name="cudatest::TestDeviceCollection" persistent="false"/>
+  <class name="edm::Wrapper<cudatest::TestDeviceCollection>" persistent="false"/>
+
+  <class name="cms::cuda::Product<cudatest::TestDeviceCollection>" persistent="false"/>
+  <class name="edm::Wrapper<cms::cuda::Product<cudatest::TestDeviceCollection>>" persistent="false"/>
+</lcgdict>
diff --git a/DataFormats/Portable/interface/PortableDeviceCollection.h b/DataFormats/Portable/interface/PortableDeviceCollection.h
index ce87b82ca69ea..6b3a4f51eb964 100644
--- a/DataFormats/Portable/interface/PortableDeviceCollection.h
+++ b/DataFormats/Portable/interface/PortableDeviceCollection.h
@@ -1,6 +1,7 @@
 #ifndef DataFormats_Portable_interface_PortableDeviceCollection_h
 #define DataFormats_Portable_interface_PortableDeviceCollection_h
 
+#include <cassert>
 #include <optional>
 #include <type_traits>
 
diff --git a/DataFormats/Portable/interface/PortableHostCollection.h b/DataFormats/Portable/interface/PortableHostCollection.h
index b505064965390..0784bb83e3256 100644
--- a/DataFormats/Portable/interface/PortableHostCollection.h
+++ b/DataFormats/Portable/interface/PortableHostCollection.h
@@ -1,6 +1,7 @@
 #ifndef DataFormats_Portable_interface_PortableHostCollection_h
 #define DataFormats_Portable_interface_PortableHostCollection_h
 
+#include <cassert>
 #include <optional>
 
 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
diff --git a/DataFormats/SoATemplate/test/SoALayoutAndView_t.cu b/DataFormats/SoATemplate/test/SoALayoutAndView_t.cu
index a4f9b0bc7d78f..00e3465a1304a 100644
--- a/DataFormats/SoATemplate/test/SoALayoutAndView_t.cu
+++ b/DataFormats/SoATemplate/test/SoALayoutAndView_t.cu
@@ -1,3 +1,4 @@
+#include <cassert>
 #include <cstdlib>
 #include <memory>
 
diff --git a/HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h b/HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h
index 3fc7fe8471e8d..f47e7637fa2de 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h
@@ -1,6 +1,7 @@
 #ifndef HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h
 #define HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h
 
+#include <cassert>
 #include <memory>
 
 #include "FWCore/Utilities/interface/thread_safety_macros.h"
diff --git a/HeterogeneousCore/AlpakaInterface/interface/host.h b/HeterogeneousCore/AlpakaInterface/interface/host.h
index 0303313df12be..66c8a121f2881 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/host.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/host.h
@@ -1,6 +1,8 @@
 #ifndef HeterogeneousCore_AlpakaInterface_interface_host_h
 #define HeterogeneousCore_AlpakaInterface_interface_host_h
 
+#include <cassert>
+
 namespace cms::alpakatools {
 
   namespace detail {
diff --git a/HeterogeneousCore/AlpakaTest/test/BuildFile.xml b/HeterogeneousCore/AlpakaTest/test/BuildFile.xml
index c53ee8acdb820..67c261e79d685 100644
--- a/HeterogeneousCore/AlpakaTest/test/BuildFile.xml
+++ b/HeterogeneousCore/AlpakaTest/test/BuildFile.xml
@@ -1,4 +1 @@
-<test
-  name="testHeterogeneousCoreAlpakaTestWriteRead"
-  command="cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/writer.py; cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/reader.py"
-/>
+<test name="testHeterogeneousCoreAlpakaTestWriteRead" command="testHeterogeneousCoreAlpakaTestWriteRead.sh"/>
diff --git a/HeterogeneousCore/AlpakaTest/test/testHeterogeneousCoreAlpakaTestWriteRead.sh b/HeterogeneousCore/AlpakaTest/test/testHeterogeneousCoreAlpakaTestWriteRead.sh
new file mode 100755
index 0000000000000..df4868aa04141
--- /dev/null
+++ b/HeterogeneousCore/AlpakaTest/test/testHeterogeneousCoreAlpakaTestWriteRead.sh
@@ -0,0 +1,24 @@
+ #! /bin/bash -e
+
+if ! [ "${LOCALTOP}" ]; then
+  export LOCALTOP=${CMSSW_BASE}
+  cd ${CMSSW_BASE}
+fi
+
+mkdir -p testHeterogeneousCoreAlpakaTestWriteRead
+cd testHeterogeneousCoreAlpakaTestWriteRead
+rm -f test.root
+echo "--------------------------------------------------------------------------------"
+echo "$ cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/writer.py"
+echo
+cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/writer.py
+echo
+echo "--------------------------------------------------------------------------------"
+echo "$ cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/reader.py"
+echo
+cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/reader.py
+echo
+echo "--------------------------------------------------------------------------------"
+echo "$ edmDumpEventContent test.root"
+echo
+edmDumpEventContent test.root
diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc
new file mode 100644
index 0000000000000..3f96aff87eedb
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc
@@ -0,0 +1,19 @@
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+
+#include "TestAlgo.h"
+
+namespace cudatest {
+
+  static void testAlgoKernel(cudatest::TestHostCollection::View view, int32_t size) {
+    view.r() = 1.;
+
+    for (auto i = 0; i < size; ++i) {
+      view[i] = {0., 0., 0., i};
+    }
+  }
+
+  void TestAlgo::fill(cudatest::TestHostCollection& collection) const {
+    testAlgoKernel(collection.view(), collection->metadata().size());
+  }
+
+}  // namespace cudatest
diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu
new file mode 100644
index 0000000000000..a2434de387b6e
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu
@@ -0,0 +1,30 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
+
+#include "TestAlgo.h"
+
+namespace cudatest {
+
+  static __global__ void testAlgoKernel(cudatest::TestDeviceCollection::View view, int32_t size) {
+    const int32_t thread = blockIdx.x * blockDim.x + threadIdx.x;
+    const int32_t stride = blockDim.x * gridDim.x;
+
+    if (thread == 0) {
+      view.r() = 1.;
+    }
+    for (auto i = thread; i < size; i += stride) {
+      view[i] = {0., 0., 0., i};
+    }
+  }
+
+  void TestAlgo::fill(cudatest::TestDeviceCollection& collection, cudaStream_t stream) const {
+    const uint32_t maxThreadsPerBlock = 1024;
+
+    uint32_t threadsPerBlock = maxThreadsPerBlock;
+    uint32_t blocksPerGrid = (collection->metadata().size() + threadsPerBlock - 1) / threadsPerBlock;
+
+    testAlgoKernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(collection.view(), collection->metadata().size());
+  }
+
+}  // namespace cudatest
diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.h b/HeterogeneousCore/CUDATest/plugins/TestAlgo.h
new file mode 100644
index 0000000000000..a91a773234f68
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestAlgo.h
@@ -0,0 +1,17 @@
+#ifndef HeterogeneousCore_CUDATest_plugins_TestAlgo_h
+#define HeterogeneousCore_CUDATest_plugins_TestAlgo_h
+
+#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+
+namespace cudatest {
+
+  class TestAlgo {
+  public:
+    void fill(cudatest::TestDeviceCollection& collection, cudaStream_t stream) const;
+    void fill(cudatest::TestHostCollection& collection) const;
+  };
+
+}  // namespace cudatest
+
+#endif  // HeterogeneousCore_CUDATest_plugins_TestAlgo_h
diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc
new file mode 100644
index 0000000000000..49ee3cb3b9721
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc
@@ -0,0 +1,62 @@
+#include <cassert>
+
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/stream/EDAnalyzer.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+
+class TestPortableAnalyzer : public edm::stream::EDAnalyzer<> {
+public:
+  TestPortableAnalyzer(edm::ParameterSet const& config)
+      : source_{config.getParameter<edm::InputTag>("source")}, token_{consumes(source_)} {}
+
+  void analyze(edm::Event const& event, edm::EventSetup const&) override {
+    cudatest::TestHostCollection const& product = event.get(token_);
+
+    auto const& view = product.const_view();
+    for (int32_t i = 0; i < view.metadata().size(); ++i) {
+      assert(view[i].id() == i);
+    }
+
+    edm::LogInfo msg("TestPortableAnalyzer");
+    msg << source_.encode() << ".size() = " << view.metadata().size() << '\n';
+    msg << "  data = " << product.buffer().get() << ",\n"
+        << "  x    = " << view.metadata().addressOf_x() << ",\n"
+        << "  y    = " << view.metadata().addressOf_y() << ",\n"
+        << "  z    = " << view.metadata().addressOf_z() << ",\n"
+        << "  id   = " << view.metadata().addressOf_id() << ",\n"
+        << "  r    = " << view.metadata().addressOf_r() << '\n';
+    msg << std::hex << "  [y - x] = 0x"
+        << reinterpret_cast<intptr_t>(view.metadata().addressOf_y()) -
+               reinterpret_cast<intptr_t>(view.metadata().addressOf_x())
+        << "  [z - y] = 0x"
+        << reinterpret_cast<intptr_t>(view.metadata().addressOf_z()) -
+               reinterpret_cast<intptr_t>(view.metadata().addressOf_y())
+        << "  [id - z] = 0x"
+        << reinterpret_cast<intptr_t>(view.metadata().addressOf_id()) -
+               reinterpret_cast<intptr_t>(view.metadata().addressOf_z())
+        << "  [r - id] = 0x"
+        << reinterpret_cast<intptr_t>(view.metadata().addressOf_r()) -
+               reinterpret_cast<intptr_t>(view.metadata().addressOf_id());
+  }
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+    edm::ParameterSetDescription desc;
+    desc.add<edm::InputTag>("source");
+    descriptions.addWithDefaultLabel(desc);
+  }
+
+private:
+  const edm::InputTag source_;
+  const edm::EDGetTokenT<cudatest::TestHostCollection> token_;
+};
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(TestPortableAnalyzer);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc
new file mode 100644
index 0000000000000..19d1611c6b899
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc
@@ -0,0 +1,45 @@
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Utilities/interface/StreamID.h"
+
+#include "TestAlgo.h"
+
+class TestPortableProducerCPU : public edm::stream::EDProducer<> {
+public:
+  TestPortableProducerCPU(edm::ParameterSet const& config)
+      : hostToken_{produces()}, size_{config.getParameter<int32_t>("size")} {}
+
+  void produce(edm::Event& event, edm::EventSetup const&) override {
+    // run the algorithm
+    cudatest::TestHostCollection hostProduct{size_};
+    algo_.fill(hostProduct);
+
+    // put the product into the event
+    event.emplace(hostToken_, std::move(hostProduct));
+  }
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+    edm::ParameterSetDescription desc;
+    desc.add<int32_t>("size");
+    descriptions.addWithDefaultLabel(desc);
+  }
+
+private:
+  const edm::EDPutTokenT<cudatest::TestHostCollection> hostToken_;
+  const int32_t size_;
+
+  // implementation of the algorithm
+  cudatest::TestAlgo algo_;
+};
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(TestPortableProducerCPU);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc
new file mode 100644
index 0000000000000..debafd6b874dd
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc
@@ -0,0 +1,49 @@
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Utilities/interface/StreamID.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+#include "TestAlgo.h"
+
+class TestPortableProducerCUDA : public edm::stream::EDProducer<> {
+public:
+  TestPortableProducerCUDA(edm::ParameterSet const& config)
+      : deviceToken_{produces()}, size_{config.getParameter<int32_t>("size")} {}
+
+  void produce(edm::Event& event, edm::EventSetup const&) override {
+    // create a context based on the EDM stream number
+    cms::cuda::ScopedContextProduce ctx(event.streamID());
+
+    // run the algorithm, potentially asynchronously
+    cudatest::TestDeviceCollection deviceProduct{size_, ctx.stream()};
+    algo_.fill(deviceProduct, ctx.stream());
+
+    // put the asynchronous product into the event without waiting
+    ctx.emplace(event, deviceToken_, std::move(deviceProduct));
+  }
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+    edm::ParameterSetDescription desc;
+    desc.add<int32_t>("size");
+    descriptions.addWithDefaultLabel(desc);
+  }
+
+private:
+  const edm::EDPutTokenT<cms::cuda::Product<cudatest::TestDeviceCollection>> deviceToken_;
+  const int32_t size_;
+
+  // implementation of the algorithm
+  cudatest::TestAlgo algo_;
+};
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(TestPortableProducerCUDA);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc
new file mode 100644
index 0000000000000..4b251ad720d0e
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc
@@ -0,0 +1,58 @@
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Utilities/interface/StreamID.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+
+class TestPortableTranscriber : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  TestPortableTranscriber(edm::ParameterSet const& config)
+      : deviceToken_{consumes(config.getParameter<edm::InputTag>("source"))}, hostToken_{produces()} {}
+
+  void acquire(edm::Event const& event, edm::EventSetup const& setup, edm::WaitingTaskWithArenaHolder task) override {
+    // create a context reusing the same device and queue as the producer of the input collection
+    auto const& input = event.get(deviceToken_);
+    cms::cuda::ScopedContextAcquire ctx{input, std::move(task)};
+
+    cudatest::TestDeviceCollection const& deviceProduct = ctx.get(input);
+
+    // allocate a host product based on the metadata of the device product
+    hostProduct_ = cudatest::TestHostCollection{deviceProduct->metadata().size(), ctx.stream()};
+
+    // copy the content of the device product to the host product
+    cms::cuda::copyAsync(hostProduct_.buffer(), deviceProduct.const_buffer(), deviceProduct.bufferSize(), ctx.stream());
+
+    // do not wait for the asynchronous operation to complete
+  }
+
+  void produce(edm::Event& event, edm::EventSetup const&) override {
+    // produce() is called once the asynchronous operation has completed, so there is no need for an explicit wait
+    event.emplace(hostToken_, std::move(hostProduct_));
+  }
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+    edm::ParameterSetDescription desc;
+    desc.add<edm::InputTag>("source");
+    descriptions.addWithDefaultLabel(desc);
+  }
+
+private:
+  const edm::EDGetTokenT<cms::cuda::Product<cudatest::TestDeviceCollection>> deviceToken_;
+  const edm::EDPutTokenT<cudatest::TestHostCollection> hostToken_;
+
+  // hold the output product between acquire() and produce()
+  cudatest::TestHostCollection hostProduct_;
+};
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(TestPortableTranscriber);
diff --git a/HeterogeneousCore/CUDATest/test/BuildFile.xml b/HeterogeneousCore/CUDATest/test/BuildFile.xml
index 60351fbfe9b88..0faf0245bbe78 100644
--- a/HeterogeneousCore/CUDATest/test/BuildFile.xml
+++ b/HeterogeneousCore/CUDATest/test/BuildFile.xml
@@ -11,4 +11,5 @@
     <test name="TestCUDATest" command="runtests.sh cpu"/>
   </ifrelease>
 
+  <test name="testHeterogeneousCoreCUDATestWriteRead" command="testHeterogeneousCoreCUDATestWriteRead.sh"/>
 </iftool>
diff --git a/HeterogeneousCore/CUDATest/test/reader.py b/HeterogeneousCore/CUDATest/test/reader.py
new file mode 100644
index 0000000000000..ba4111e4ab73c
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/test/reader.py
@@ -0,0 +1,27 @@
+import FWCore.ParameterSet.Config as cms
+
+process = cms.Process('Reader')
+
+# read the products from a 'test.root' file
+process.source = cms.Source('PoolSource',
+    fileNames = cms.untracked.vstring('file:test.root')
+)
+
+# enable logging for the TestPortableAnalyzer
+process.MessageLogger.TestPortableAnalyzer = cms.untracked.PSet()
+
+# analyse the first product
+process.testAnalyzer = cms.EDAnalyzer('TestPortableAnalyzer',
+    source = cms.InputTag('testProducer')
+)
+
+# analyse the second product
+process.testAnalyzerSerial = cms.EDAnalyzer('TestPortableAnalyzer',
+    source = cms.InputTag('testProducerSerial')
+)
+
+process.cuda_path = cms.Path(process.testAnalyzer)
+
+process.serial_path = cms.Path(process.testAnalyzerSerial)
+
+process.maxEvents.input = 10
diff --git a/HeterogeneousCore/CUDATest/test/testHeterogeneousCoreCUDATestWriteRead.sh b/HeterogeneousCore/CUDATest/test/testHeterogeneousCoreCUDATestWriteRead.sh
new file mode 100755
index 0000000000000..4402e119e1012
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/test/testHeterogeneousCoreCUDATestWriteRead.sh
@@ -0,0 +1,24 @@
+ #! /bin/bash -e
+
+if ! [ "${LOCALTOP}" ]; then
+  export LOCALTOP=${CMSSW_BASE}
+  cd ${CMSSW_BASE}
+fi
+
+mkdir -p testHeterogeneousCoreCUDATestWriteRead
+cd testHeterogeneousCoreCUDATestWriteRead
+rm -f test.root
+echo "--------------------------------------------------------------------------------"
+echo "$ cmsRun ${LOCALTOP}/src/HeterogeneousCore/CUDATest/test/writer.py"
+echo
+cmsRun ${LOCALTOP}/src/HeterogeneousCore/CUDATest/test/writer.py
+echo
+echo "--------------------------------------------------------------------------------"
+echo "$ cmsRun ${LOCALTOP}/src/HeterogeneousCore/CUDATest/test/reader.py"
+echo
+cmsRun ${LOCALTOP}/src/HeterogeneousCore/CUDATest/test/reader.py
+echo
+echo "--------------------------------------------------------------------------------"
+echo "$ edmDumpEventContent test.root"
+echo
+edmDumpEventContent test.root
diff --git a/HeterogeneousCore/CUDATest/test/writer.py b/HeterogeneousCore/CUDATest/test/writer.py
new file mode 100644
index 0000000000000..5c93a20bcf926
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/test/writer.py
@@ -0,0 +1,76 @@
+import FWCore.ParameterSet.Config as cms
+from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA
+
+process = cms.Process('Writer')
+
+process.source = cms.Source('EmptySource')
+
+process.load('Configuration.StandardSequences.Accelerators_cff')
+
+# enable logging for the TestPortableAnalyzer
+process.MessageLogger.TestPortableAnalyzer = cms.untracked.PSet()
+
+# run the producer on a CUDA gpu (if available)
+process.testProducerCuda = cms.EDProducer('TestPortableProducerCUDA',
+    size = cms.int32(42)
+)
+
+# copy the product from the gpu (if available) to the host
+process.testTranscriberFromCuda = cms.EDProducer('TestPortableTranscriber',
+    source = cms.InputTag('testProducerCuda')
+)
+
+# run the producer on the cpu
+process.testProducerCpu = cms.EDProducer('TestPortableProducerCPU',
+    size = cms.int32(42)
+)
+
+# either run the producer on a CUDA gpu (if available) and copy the product to the cpu, or run the producer directly on the cpu
+process.testProducer = SwitchProducerCUDA(
+    cpu = cms.EDAlias(
+        testProducerCpu = cms.VPSet(cms.PSet(type = cms.string('*')))
+    ),
+    cuda = cms.EDAlias(
+        testTranscriberFromCuda = cms.VPSet(cms.PSet(type = cms.string('*')))
+    )
+)
+
+# analyse the product
+process.testAnalyzer = cms.EDAnalyzer('TestPortableAnalyzer',
+    source = cms.InputTag('testProducer')
+)
+
+# run a second producer explicitly on the cpu
+process.testProducerSerial = cms.EDProducer('TestPortableProducerCPU',
+    size = cms.int32(99)
+)
+
+# analyse the second product
+process.testAnalyzerSerial = cms.EDAnalyzer('TestPortableAnalyzer',
+    source = cms.InputTag('testProducerSerial')
+)
+
+# write the two products to a 'test.root' file
+process.output = cms.OutputModule('PoolOutputModule',
+    fileName = cms.untracked.string('test.root'),
+    outputCommands = cms.untracked.vstring(
+        'drop *',
+        'keep *_testProducer_*_*',
+        'keep *_testProducerSerial_*_*',
+  )
+)
+
+process.producer_task = cms.Task(process.testProducerCuda, process.testTranscriberFromCuda, process.testProducerCpu)
+
+process.process_path = cms.Path(
+    process.testProducer +
+    process.testAnalyzer,
+    process.producer_task)
+
+process.serial_path = cms.Path(
+    process.testProducerSerial +
+    process.testAnalyzerSerial)
+
+process.output_path = cms.EndPath(process.output)
+
+process.maxEvents.input = 10
diff --git a/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h
index a64b7c9869d6c..ea89cc7490d85 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h
@@ -1,6 +1,7 @@
 #ifndef HeterogeneousCore_CUDAUtilities_interface_host_unique_ptr_h
 #define HeterogeneousCore_CUDAUtilities_interface_host_unique_ptr_h
 
+#include <cstdlib>
 #include <memory>
 #include <functional>
 
@@ -10,11 +11,32 @@ namespace cms {
   namespace cuda {
     namespace host {
       namespace impl {
-        // Additional layer of types to distinguish from host::unique_ptr
+
+        enum class MemoryType : bool {
+          kDefault = false,
+          kPinned = true,
+        };
+
+        // Custom deleter for host memory, with an internal state to distinguish pageable and pinned host memory
         class HostDeleter {
         public:
-          void operator()(void *ptr) { cms::cuda::free_host(ptr); }
+          // The default constructor is needed by the default constructor of unique_ptr<T, HostDeleter>,
+          // which is needed by the default constructor of HostProduct<T>, which is needed by the ROOT dictionary
+          HostDeleter() : type_{MemoryType::kDefault} {}
+          HostDeleter(MemoryType type) : type_{type} {}
+
+          void operator()(void *ptr) {
+            if (type_ == MemoryType::kPinned) {
+              cms::cuda::free_host(ptr);
+            } else {
+              std::free(ptr);
+            }
+          }
+
+        private:
+          MemoryType type_;
         };
+
       }  // namespace impl
 
       template <typename T>
@@ -36,24 +58,55 @@ namespace cms {
       }  // namespace impl
     }    // namespace host
 
+    // Allocate pageable host memory
+    template <typename T>
+    typename host::impl::make_host_unique_selector<T>::non_array make_host_unique() {
+      static_assert(std::is_trivially_constructible<T>::value,
+                    "Allocating with non-trivial constructor on the host memory is not supported");
+      // Allocate a buffer aligned to 128 bytes, to match the CUDA cache line size
+      const size_t alignment = 128;
+      // std::aligned_alloc() requires the size to be a multiple of the alignment
+      const size_t size = (sizeof(T) + alignment - 1) / alignment * alignment;
+      void *mem = std::aligned_alloc(alignment, size);
+      return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
+                                                                          host::impl::MemoryType::kDefault};
+    }
+
+    template <typename T>
+    typename host::impl::make_host_unique_selector<T>::unbounded_array make_host_unique(size_t n) {
+      using element_type = typename std::remove_extent<T>::type;
+      static_assert(std::is_trivially_constructible<element_type>::value,
+                    "Allocating with non-trivial constructor on the host memory is not supported");
+      // Allocate a buffer aligned to 128 bytes, to match the CUDA cache line size
+      const size_t alignment = 128;
+      // std::aligned_alloc() requires the size to be a multiple of the alignment
+      const size_t size = (n * sizeof(element_type) + alignment - 1) / alignment * alignment;
+      void *mem = std::aligned_alloc(alignment, size);
+      return typename host::impl::make_host_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem),
+                                                                                host::impl::MemoryType::kDefault};
+    }
+
     // Allocate pinned host memory
     template <typename T>
     typename host::impl::make_host_unique_selector<T>::non_array make_host_unique(cudaStream_t stream) {
       static_assert(std::is_trivially_constructible<T>::value,
-                    "Allocating with non-trivial constructor on the pinned host memory is not supported");
+                    "Allocating with non-trivial constructor on the host memory is not supported");
       void *mem = allocate_host(sizeof(T), stream);
-      return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem)};
+      return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),  //
+                                                                          host::impl::MemoryType::kPinned};
     }
 
     template <typename T>
     typename host::impl::make_host_unique_selector<T>::unbounded_array make_host_unique(size_t n, cudaStream_t stream) {
       using element_type = typename std::remove_extent<T>::type;
       static_assert(std::is_trivially_constructible<element_type>::value,
-                    "Allocating with non-trivial constructor on the pinned host memory is not supported");
+                    "Allocating with non-trivial constructor on the host memory is not supported");
       void *mem = allocate_host(n * sizeof(element_type), stream);
-      return typename host::impl::make_host_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem)};
+      return typename host::impl::make_host_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem),
+                                                                                host::impl::MemoryType::kPinned};
     }
 
+    // Arrays of known bounds are not supported by std::unique_ptr
     template <typename T, typename... Args>
     typename host::impl::make_host_unique_selector<T>::bounded_array make_host_unique(Args &&...) = delete;
 
@@ -61,7 +114,8 @@ namespace cms {
     template <typename T>
     typename host::impl::make_host_unique_selector<T>::non_array make_host_unique_uninitialized(cudaStream_t stream) {
       void *mem = allocate_host(sizeof(T), stream);
-      return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem)};
+      return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),  //
+                                                                          host::impl::MemoryType::kPinned};
     }
 
     template <typename T>
@@ -69,11 +123,14 @@ namespace cms {
         size_t n, cudaStream_t stream) {
       using element_type = typename std::remove_extent<T>::type;
       void *mem = allocate_host(n * sizeof(element_type), stream);
-      return typename host::impl::make_host_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem)};
+      return typename host::impl::make_host_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem),
+                                                                                host::impl::MemoryType::kPinned};
     }
 
+    // Arrays of known bounds are not supported by std::unique_ptr
     template <typename T, typename... Args>
     typename host::impl::make_host_unique_selector<T>::bounded_array make_host_unique_uninitialized(Args &&...) = delete;
+
   }  // namespace cuda
 }  // namespace cms