diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml
index f6b68fe69b400..e4971bdf3ebbe 100644
--- a/CUDADataFormats/Common/BuildFile.xml
+++ b/CUDADataFormats/Common/BuildFile.xml
@@ -1,4 +1,5 @@
+
diff --git a/CUDADataFormats/Common/interface/PortableDeviceCollection.h b/CUDADataFormats/Common/interface/PortableDeviceCollection.h
new file mode 100644
index 0000000000000..78f72cb3d5437
--- /dev/null
+++ b/CUDADataFormats/Common/interface/PortableDeviceCollection.h
@@ -0,0 +1,67 @@
+#ifndef CUDADataFormats_Common_interface_PortableDeviceCollection_h
+#define CUDADataFormats_Common_interface_PortableDeviceCollection_h
+
+#include <cassert>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+namespace cms::cuda {
+
+ // generic SoA-based product in device memory
+  template <typename T>
+ class PortableDeviceCollection {
+ public:
+ using Layout = T;
+ using View = typename Layout::View;
+ using ConstView = typename Layout::ConstView;
+    using Buffer = cms::cuda::device::unique_ptr<std::byte[]>;
+
+ PortableDeviceCollection() = default;
+
+    PortableDeviceCollection(int32_t elements, cudaStream_t stream)
+        : buffer_{cms::cuda::make_device_unique<std::byte[]>(Layout::computeDataSize(elements), stream)},
+          layout_{buffer_.get(), elements},
+          view_{layout_} {
+      // CUDA device memory uses a default alignment of at least 128 bytes
+      assert(reinterpret_cast<uintptr_t>(buffer_.get()) % Layout::alignment == 0);
+    }
+
+ // non-copyable
+ PortableDeviceCollection(PortableDeviceCollection const&) = delete;
+ PortableDeviceCollection& operator=(PortableDeviceCollection const&) = delete;
+
+ // movable
+ PortableDeviceCollection(PortableDeviceCollection&&) = default;
+ PortableDeviceCollection& operator=(PortableDeviceCollection&&) = default;
+
+ // default destructor
+ ~PortableDeviceCollection() = default;
+
+ // access the View
+ View& view() { return view_; }
+ ConstView const& view() const { return view_; }
+ ConstView const& const_view() const { return view_; }
+
+ View& operator*() { return view_; }
+ ConstView const& operator*() const { return view_; }
+
+ View* operator->() { return &view_; }
+ ConstView const* operator->() const { return &view_; }
+
+ // access the Buffer
+ Buffer& buffer() { return buffer_; }
+ Buffer const& buffer() const { return buffer_; }
+ Buffer const& const_buffer() const { return buffer_; }
+
+ size_t bufferSize() const { return layout_.metadata().byteSize(); }
+
+ private:
+ Buffer buffer_; //!
+ Layout layout_; //
+ View view_; //!
+ };
+
+} // namespace cms::cuda
+
+#endif // CUDADataFormats_Common_interface_PortableDeviceCollection_h
diff --git a/CUDADataFormats/Common/interface/PortableHostCollection.h b/CUDADataFormats/Common/interface/PortableHostCollection.h
new file mode 100644
index 0000000000000..cfaf40c85b3bc
--- /dev/null
+++ b/CUDADataFormats/Common/interface/PortableHostCollection.h
@@ -0,0 +1,85 @@
+#ifndef CUDADataFormats_Common_interface_PortableHostCollection_h
+#define CUDADataFormats_Common_interface_PortableHostCollection_h
+
+#include <cassert>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+
+namespace cms::cuda {
+
+ // generic SoA-based product in host memory
+  template <typename T>
+ class PortableHostCollection {
+ public:
+ using Layout = T;
+ using View = typename Layout::View;
+ using ConstView = typename Layout::ConstView;
+    using Buffer = cms::cuda::host::unique_ptr<std::byte[]>;
+
+ PortableHostCollection() = default;
+
+    PortableHostCollection(int32_t elements)
+        // allocate pageable host memory
+        : buffer_{cms::cuda::make_host_unique<std::byte[]>(Layout::computeDataSize(elements))},
+          layout_{buffer_.get(), elements},
+          view_{layout_} {
+      // make_host_unique for pageable host memory uses a default alignment of 128 bytes
+      assert(reinterpret_cast<uintptr_t>(buffer_.get()) % Layout::alignment == 0);
+    }
+
+    PortableHostCollection(int32_t elements, cudaStream_t stream)
+        // allocate pinned host memory, accessible by the current device
+        : buffer_{cms::cuda::make_host_unique<std::byte[]>(Layout::computeDataSize(elements), stream)},
+          layout_{buffer_.get(), elements},
+          view_{layout_} {
+      // CUDA pinned host memory uses a default alignment of at least 128 bytes
+      assert(reinterpret_cast<uintptr_t>(buffer_.get()) % Layout::alignment == 0);
+    }
+
+ // non-copyable
+ PortableHostCollection(PortableHostCollection const&) = delete;
+ PortableHostCollection& operator=(PortableHostCollection const&) = delete;
+
+ // movable
+ PortableHostCollection(PortableHostCollection&&) = default;
+ PortableHostCollection& operator=(PortableHostCollection&&) = default;
+
+ // default destructor
+ ~PortableHostCollection() = default;
+
+ // access the View
+ View& view() { return view_; }
+ ConstView const& view() const { return view_; }
+ ConstView const& const_view() const { return view_; }
+
+ View& operator*() { return view_; }
+ ConstView const& operator*() const { return view_; }
+
+ View* operator->() { return &view_; }
+ ConstView const* operator->() const { return &view_; }
+
+ // access the Buffer
+ Buffer& buffer() { return buffer_; }
+ Buffer const& buffer() const { return buffer_; }
+ Buffer const& const_buffer() const { return buffer_; }
+
+ size_t bufferSize() const { return layout_.metadata().byteSize(); }
+
+ // part of the ROOT read streamer
+ static void ROOTReadStreamer(PortableHostCollection* newObj, Layout const& layout) {
+ newObj->~PortableHostCollection();
+      // allocate pageable host memory via the one-argument constructor; no CUDA stream is involved
+ new (newObj) PortableHostCollection(layout.metadata().size());
+ newObj->layout_.ROOTReadStreamer(layout);
+ }
+
+ private:
+ Buffer buffer_; //!
+ Layout layout_; //
+ View view_; //!
+ };
+
+} // namespace cms::cuda
+
+#endif // CUDADataFormats_Common_interface_PortableHostCollection_h
diff --git a/CUDADataFormats/PortableTestObjects/BuildFile.xml b/CUDADataFormats/PortableTestObjects/BuildFile.xml
new file mode 100644
index 0000000000000..595a743a6c4c5
--- /dev/null
+++ b/CUDADataFormats/PortableTestObjects/BuildFile.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h b/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h
new file mode 100644
index 0000000000000..c0dc803228646
--- /dev/null
+++ b/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h
@@ -0,0 +1,14 @@
+#ifndef CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
+#define CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
+
+#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
+#include "DataFormats/PortableTestObjects/interface/TestSoA.h"
+
+namespace cudatest {
+
+ // SoA with x, y, z, id fields in device global memory
+ using TestDeviceCollection = cms::cuda::PortableDeviceCollection;
+
+} // namespace cudatest
+
+#endif // CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
diff --git a/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h b/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h
new file mode 100644
index 0000000000000..3e2eea89488c7
--- /dev/null
+++ b/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h
@@ -0,0 +1,14 @@
+#ifndef CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
+#define CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
+
+#include "CUDADataFormats/Common/interface/PortableHostCollection.h"
+#include "DataFormats/PortableTestObjects/interface/TestSoA.h"
+
+namespace cudatest {
+
+ // SoA with x, y, z, id fields in host memory
+ using TestHostCollection = cms::cuda::PortableHostCollection;
+
+} // namespace cudatest
+
+#endif // CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
diff --git a/CUDADataFormats/PortableTestObjects/src/classes.h b/CUDADataFormats/PortableTestObjects/src/classes.h
new file mode 100644
index 0000000000000..6f26217bd6646
--- /dev/null
+++ b/CUDADataFormats/PortableTestObjects/src/classes.h
@@ -0,0 +1,5 @@
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/PortableTestObjects/interface/TestSoA.h"
diff --git a/CUDADataFormats/PortableTestObjects/src/classes_def.xml b/CUDADataFormats/PortableTestObjects/src/classes_def.xml
new file mode 100644
index 0000000000000..892bedcca23d6
--- /dev/null
+++ b/CUDADataFormats/PortableTestObjects/src/classes_def.xml
@@ -0,0 +1,21 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/DataFormats/Portable/interface/PortableDeviceCollection.h b/DataFormats/Portable/interface/PortableDeviceCollection.h
index ce87b82ca69ea..6b3a4f51eb964 100644
--- a/DataFormats/Portable/interface/PortableDeviceCollection.h
+++ b/DataFormats/Portable/interface/PortableDeviceCollection.h
@@ -1,6 +1,7 @@
#ifndef DataFormats_Portable_interface_PortableDeviceCollection_h
#define DataFormats_Portable_interface_PortableDeviceCollection_h
+#include
#include
#include
diff --git a/DataFormats/Portable/interface/PortableHostCollection.h b/DataFormats/Portable/interface/PortableHostCollection.h
index b505064965390..0784bb83e3256 100644
--- a/DataFormats/Portable/interface/PortableHostCollection.h
+++ b/DataFormats/Portable/interface/PortableHostCollection.h
@@ -1,6 +1,7 @@
#ifndef DataFormats_Portable_interface_PortableHostCollection_h
#define DataFormats_Portable_interface_PortableHostCollection_h
+#include
#include
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
diff --git a/DataFormats/SoATemplate/test/SoALayoutAndView_t.cu b/DataFormats/SoATemplate/test/SoALayoutAndView_t.cu
index a4f9b0bc7d78f..00e3465a1304a 100644
--- a/DataFormats/SoATemplate/test/SoALayoutAndView_t.cu
+++ b/DataFormats/SoATemplate/test/SoALayoutAndView_t.cu
@@ -1,3 +1,4 @@
+#include
#include
#include
diff --git a/HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h b/HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h
index 3fc7fe8471e8d..f47e7637fa2de 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h
@@ -1,6 +1,7 @@
#ifndef HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h
#define HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h
+#include
#include
#include "FWCore/Utilities/interface/thread_safety_macros.h"
diff --git a/HeterogeneousCore/AlpakaInterface/interface/host.h b/HeterogeneousCore/AlpakaInterface/interface/host.h
index 0303313df12be..66c8a121f2881 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/host.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/host.h
@@ -1,6 +1,8 @@
#ifndef HeterogeneousCore_AlpakaInterface_interface_host_h
#define HeterogeneousCore_AlpakaInterface_interface_host_h
+#include
+
namespace cms::alpakatools {
namespace detail {
diff --git a/HeterogeneousCore/AlpakaTest/test/BuildFile.xml b/HeterogeneousCore/AlpakaTest/test/BuildFile.xml
index c53ee8acdb820..67c261e79d685 100644
--- a/HeterogeneousCore/AlpakaTest/test/BuildFile.xml
+++ b/HeterogeneousCore/AlpakaTest/test/BuildFile.xml
@@ -1,4 +1 @@
-
+
diff --git a/HeterogeneousCore/AlpakaTest/test/testHeterogeneousCoreAlpakaTestWriteRead.sh b/HeterogeneousCore/AlpakaTest/test/testHeterogeneousCoreAlpakaTestWriteRead.sh
new file mode 100755
index 0000000000000..df4868aa04141
--- /dev/null
+++ b/HeterogeneousCore/AlpakaTest/test/testHeterogeneousCoreAlpakaTestWriteRead.sh
@@ -0,0 +1,24 @@
+#! /bin/bash -e
+
+if ! [ "${LOCALTOP}" ]; then
+ export LOCALTOP=${CMSSW_BASE}
+ cd ${CMSSW_BASE}
+fi
+
+mkdir -p testHeterogeneousCoreAlpakaTestWriteRead
+cd testHeterogeneousCoreAlpakaTestWriteRead
+rm -f test.root
+echo "--------------------------------------------------------------------------------"
+echo "$ cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/writer.py"
+echo
+cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/writer.py
+echo
+echo "--------------------------------------------------------------------------------"
+echo "$ cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/reader.py"
+echo
+cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/reader.py
+echo
+echo "--------------------------------------------------------------------------------"
+echo "$ edmDumpEventContent test.root"
+echo
+edmDumpEventContent test.root
diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc
new file mode 100644
index 0000000000000..3f96aff87eedb
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc
@@ -0,0 +1,19 @@
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+
+#include "TestAlgo.h"
+
+namespace cudatest {
+
+ static void testAlgoKernel(cudatest::TestHostCollection::View view, int32_t size) {
+ view.r() = 1.;
+
+ for (auto i = 0; i < size; ++i) {
+ view[i] = {0., 0., 0., i};
+ }
+ }
+
+ void TestAlgo::fill(cudatest::TestHostCollection& collection) const {
+ testAlgoKernel(collection.view(), collection->metadata().size());
+ }
+
+} // namespace cudatest
diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu
new file mode 100644
index 0000000000000..a2434de387b6e
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu
@@ -0,0 +1,30 @@
+#include
+
+#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
+
+#include "TestAlgo.h"
+
+namespace cudatest {
+
+ static __global__ void testAlgoKernel(cudatest::TestDeviceCollection::View view, int32_t size) {
+ const int32_t thread = blockIdx.x * blockDim.x + threadIdx.x;
+ const int32_t stride = blockDim.x * gridDim.x;
+
+ if (thread == 0) {
+ view.r() = 1.;
+ }
+ for (auto i = thread; i < size; i += stride) {
+ view[i] = {0., 0., 0., i};
+ }
+ }
+
+ void TestAlgo::fill(cudatest::TestDeviceCollection& collection, cudaStream_t stream) const {
+ const uint32_t maxThreadsPerBlock = 1024;
+
+ uint32_t threadsPerBlock = maxThreadsPerBlock;
+ uint32_t blocksPerGrid = (collection->metadata().size() + threadsPerBlock - 1) / threadsPerBlock;
+
+ testAlgoKernel<<>>(collection.view(), collection->metadata().size());
+ }
+
+} // namespace cudatest
diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.h b/HeterogeneousCore/CUDATest/plugins/TestAlgo.h
new file mode 100644
index 0000000000000..a91a773234f68
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestAlgo.h
@@ -0,0 +1,17 @@
+#ifndef HeterogeneousCore_CUDATest_plugins_TestAlgo_h
+#define HeterogeneousCore_CUDATest_plugins_TestAlgo_h
+
+#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+
+namespace cudatest {
+
+ class TestAlgo {
+ public:
+ void fill(cudatest::TestDeviceCollection& collection, cudaStream_t stream) const;
+ void fill(cudatest::TestHostCollection& collection) const;
+ };
+
+} // namespace cudatest
+
+#endif // HeterogeneousCore_CUDATest_plugins_TestAlgo_h
diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc
new file mode 100644
index 0000000000000..49ee3cb3b9721
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc
@@ -0,0 +1,62 @@
+#include
+
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/stream/EDAnalyzer.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+
+class TestPortableAnalyzer : public edm::stream::EDAnalyzer<> {
+public:
+ TestPortableAnalyzer(edm::ParameterSet const& config)
+ : source_{config.getParameter("source")}, token_{consumes(source_)} {}
+
+ void analyze(edm::Event const& event, edm::EventSetup const&) override {
+ cudatest::TestHostCollection const& product = event.get(token_);
+
+ auto const& view = product.const_view();
+ for (int32_t i = 0; i < view.metadata().size(); ++i) {
+ assert(view[i].id() == i);
+ }
+
+ edm::LogInfo msg("TestPortableAnalyzer");
+ msg << source_.encode() << ".size() = " << view.metadata().size() << '\n';
+ msg << " data = " << product.buffer().get() << ",\n"
+ << " x = " << view.metadata().addressOf_x() << ",\n"
+ << " y = " << view.metadata().addressOf_y() << ",\n"
+ << " z = " << view.metadata().addressOf_z() << ",\n"
+ << " id = " << view.metadata().addressOf_id() << ",\n"
+ << " r = " << view.metadata().addressOf_r() << '\n';
+ msg << std::hex << " [y - x] = 0x"
+ << reinterpret_cast(view.metadata().addressOf_y()) -
+ reinterpret_cast(view.metadata().addressOf_x())
+ << " [z - y] = 0x"
+ << reinterpret_cast(view.metadata().addressOf_z()) -
+ reinterpret_cast(view.metadata().addressOf_y())
+ << " [id - z] = 0x"
+ << reinterpret_cast(view.metadata().addressOf_id()) -
+ reinterpret_cast(view.metadata().addressOf_z())
+ << " [r - id] = 0x"
+ << reinterpret_cast(view.metadata().addressOf_r()) -
+ reinterpret_cast(view.metadata().addressOf_id());
+ }
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+ desc.add("source");
+ descriptions.addWithDefaultLabel(desc);
+ }
+
+private:
+ const edm::InputTag source_;
+ const edm::EDGetTokenT token_;
+};
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(TestPortableAnalyzer);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc
new file mode 100644
index 0000000000000..19d1611c6b899
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc
@@ -0,0 +1,45 @@
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Utilities/interface/StreamID.h"
+
+#include "TestAlgo.h"
+
+class TestPortableProducerCPU : public edm::stream::EDProducer<> {
+public:
+ TestPortableProducerCPU(edm::ParameterSet const& config)
+ : hostToken_{produces()}, size_{config.getParameter("size")} {}
+
+ void produce(edm::Event& event, edm::EventSetup const&) override {
+ // run the algorithm
+ cudatest::TestHostCollection hostProduct{size_};
+ algo_.fill(hostProduct);
+
+ // put the product into the event
+ event.emplace(hostToken_, std::move(hostProduct));
+ }
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+ desc.add("size");
+ descriptions.addWithDefaultLabel(desc);
+ }
+
+private:
+ const edm::EDPutTokenT hostToken_;
+ const int32_t size_;
+
+ // implementation of the algorithm
+ cudatest::TestAlgo algo_;
+};
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(TestPortableProducerCPU);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc
new file mode 100644
index 0000000000000..debafd6b874dd
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc
@@ -0,0 +1,49 @@
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Utilities/interface/StreamID.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+#include "TestAlgo.h"
+
+class TestPortableProducerCUDA : public edm::stream::EDProducer<> {
+public:
+ TestPortableProducerCUDA(edm::ParameterSet const& config)
+ : deviceToken_{produces()}, size_{config.getParameter("size")} {}
+
+ void produce(edm::Event& event, edm::EventSetup const&) override {
+ // create a context based on the EDM stream number
+ cms::cuda::ScopedContextProduce ctx(event.streamID());
+
+ // run the algorithm, potentially asynchronously
+ cudatest::TestDeviceCollection deviceProduct{size_, ctx.stream()};
+ algo_.fill(deviceProduct, ctx.stream());
+
+ // put the asynchronous product into the event without waiting
+ ctx.emplace(event, deviceToken_, std::move(deviceProduct));
+ }
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+ desc.add("size");
+ descriptions.addWithDefaultLabel(desc);
+ }
+
+private:
+ const edm::EDPutTokenT> deviceToken_;
+ const int32_t size_;
+
+ // implementation of the algorithm
+ cudatest::TestAlgo algo_;
+};
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(TestPortableProducerCUDA);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc
new file mode 100644
index 0000000000000..4b251ad720d0e
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc
@@ -0,0 +1,58 @@
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Utilities/interface/StreamID.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+
+class TestPortableTranscriber : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+ TestPortableTranscriber(edm::ParameterSet const& config)
+ : deviceToken_{consumes(config.getParameter("source"))}, hostToken_{produces()} {}
+
+ void acquire(edm::Event const& event, edm::EventSetup const& setup, edm::WaitingTaskWithArenaHolder task) override {
+ // create a context reusing the same device and queue as the producer of the input collection
+ auto const& input = event.get(deviceToken_);
+ cms::cuda::ScopedContextAcquire ctx{input, std::move(task)};
+
+ cudatest::TestDeviceCollection const& deviceProduct = ctx.get(input);
+
+ // allocate a host product based on the metadata of the device product
+ hostProduct_ = cudatest::TestHostCollection{deviceProduct->metadata().size(), ctx.stream()};
+
+ // copy the content of the device product to the host product
+ cms::cuda::copyAsync(hostProduct_.buffer(), deviceProduct.const_buffer(), deviceProduct.bufferSize(), ctx.stream());
+
+ // do not wait for the asynchronous operation to complete
+ }
+
+ void produce(edm::Event& event, edm::EventSetup const&) override {
+ // produce() is called once the asynchronous operation has completed, so there is no need for an explicit wait
+ event.emplace(hostToken_, std::move(hostProduct_));
+ }
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+ desc.add("source");
+ descriptions.addWithDefaultLabel(desc);
+ }
+
+private:
+ const edm::EDGetTokenT> deviceToken_;
+ const edm::EDPutTokenT hostToken_;
+
+ // hold the output product between acquire() and produce()
+ cudatest::TestHostCollection hostProduct_;
+};
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(TestPortableTranscriber);
diff --git a/HeterogeneousCore/CUDATest/test/BuildFile.xml b/HeterogeneousCore/CUDATest/test/BuildFile.xml
index 60351fbfe9b88..0faf0245bbe78 100644
--- a/HeterogeneousCore/CUDATest/test/BuildFile.xml
+++ b/HeterogeneousCore/CUDATest/test/BuildFile.xml
@@ -11,4 +11,5 @@
+
diff --git a/HeterogeneousCore/CUDATest/test/reader.py b/HeterogeneousCore/CUDATest/test/reader.py
new file mode 100644
index 0000000000000..ba4111e4ab73c
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/test/reader.py
@@ -0,0 +1,27 @@
+import FWCore.ParameterSet.Config as cms
+
+process = cms.Process('Reader')
+
+# read the products from a 'test.root' file
+process.source = cms.Source('PoolSource',
+ fileNames = cms.untracked.vstring('file:test.root')
+)
+
+# enable logging for the TestPortableAnalyzer
+process.MessageLogger.TestPortableAnalyzer = cms.untracked.PSet()
+
+# analyse the first product
+process.testAnalyzer = cms.EDAnalyzer('TestPortableAnalyzer',
+ source = cms.InputTag('testProducer')
+)
+
+# analyse the second product
+process.testAnalyzerSerial = cms.EDAnalyzer('TestPortableAnalyzer',
+ source = cms.InputTag('testProducerSerial')
+)
+
+process.cuda_path = cms.Path(process.testAnalyzer)
+
+process.serial_path = cms.Path(process.testAnalyzerSerial)
+
+process.maxEvents.input = 10
diff --git a/HeterogeneousCore/CUDATest/test/testHeterogeneousCoreCUDATestWriteRead.sh b/HeterogeneousCore/CUDATest/test/testHeterogeneousCoreCUDATestWriteRead.sh
new file mode 100755
index 0000000000000..4402e119e1012
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/test/testHeterogeneousCoreCUDATestWriteRead.sh
@@ -0,0 +1,24 @@
+#! /bin/bash -e
+
+if ! [ "${LOCALTOP}" ]; then
+ export LOCALTOP=${CMSSW_BASE}
+ cd ${CMSSW_BASE}
+fi
+
+mkdir -p testHeterogeneousCoreCUDATestWriteRead
+cd testHeterogeneousCoreCUDATestWriteRead
+rm -f test.root
+echo "--------------------------------------------------------------------------------"
+echo "$ cmsRun ${LOCALTOP}/src/HeterogeneousCore/CUDATest/test/writer.py"
+echo
+cmsRun ${LOCALTOP}/src/HeterogeneousCore/CUDATest/test/writer.py
+echo
+echo "--------------------------------------------------------------------------------"
+echo "$ cmsRun ${LOCALTOP}/src/HeterogeneousCore/CUDATest/test/reader.py"
+echo
+cmsRun ${LOCALTOP}/src/HeterogeneousCore/CUDATest/test/reader.py
+echo
+echo "--------------------------------------------------------------------------------"
+echo "$ edmDumpEventContent test.root"
+echo
+edmDumpEventContent test.root
diff --git a/HeterogeneousCore/CUDATest/test/writer.py b/HeterogeneousCore/CUDATest/test/writer.py
new file mode 100644
index 0000000000000..5c93a20bcf926
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/test/writer.py
@@ -0,0 +1,76 @@
+import FWCore.ParameterSet.Config as cms
+from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA
+
+process = cms.Process('Writer')
+
+process.source = cms.Source('EmptySource')
+
+process.load('Configuration.StandardSequences.Accelerators_cff')
+
+# enable logging for the TestPortableAnalyzer
+process.MessageLogger.TestPortableAnalyzer = cms.untracked.PSet()
+
+# run the producer on a CUDA gpu (if available)
+process.testProducerCuda = cms.EDProducer('TestPortableProducerCUDA',
+ size = cms.int32(42)
+)
+
+# copy the product from the gpu (if available) to the host
+process.testTranscriberFromCuda = cms.EDProducer('TestPortableTranscriber',
+ source = cms.InputTag('testProducerCuda')
+)
+
+# run the producer on the cpu
+process.testProducerCpu = cms.EDProducer('TestPortableProducerCPU',
+ size = cms.int32(42)
+)
+
+# either run the producer on a CUDA gpu (if available) and copy the product to the cpu, or run the producer directly on the cpu
+process.testProducer = SwitchProducerCUDA(
+ cpu = cms.EDAlias(
+ testProducerCpu = cms.VPSet(cms.PSet(type = cms.string('*')))
+ ),
+ cuda = cms.EDAlias(
+ testTranscriberFromCuda = cms.VPSet(cms.PSet(type = cms.string('*')))
+ )
+)
+
+# analyse the product
+process.testAnalyzer = cms.EDAnalyzer('TestPortableAnalyzer',
+ source = cms.InputTag('testProducer')
+)
+
+# run a second producer explicitly on the cpu
+process.testProducerSerial = cms.EDProducer('TestPortableProducerCPU',
+ size = cms.int32(99)
+)
+
+# analyse the second product
+process.testAnalyzerSerial = cms.EDAnalyzer('TestPortableAnalyzer',
+ source = cms.InputTag('testProducerSerial')
+)
+
+# write the two products to a 'test.root' file
+process.output = cms.OutputModule('PoolOutputModule',
+ fileName = cms.untracked.string('test.root'),
+ outputCommands = cms.untracked.vstring(
+ 'drop *',
+ 'keep *_testProducer_*_*',
+ 'keep *_testProducerSerial_*_*',
+ )
+)
+
+process.producer_task = cms.Task(process.testProducerCuda, process.testTranscriberFromCuda, process.testProducerCpu)
+
+process.process_path = cms.Path(
+ process.testProducer +
+ process.testAnalyzer,
+ process.producer_task)
+
+process.serial_path = cms.Path(
+ process.testProducerSerial +
+ process.testAnalyzerSerial)
+
+process.output_path = cms.EndPath(process.output)
+
+process.maxEvents.input = 10
diff --git a/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h
index a64b7c9869d6c..ea89cc7490d85 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h
@@ -1,6 +1,7 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_host_unique_ptr_h
#define HeterogeneousCore_CUDAUtilities_interface_host_unique_ptr_h
+#include <cstdlib>
#include
#include
@@ -10,11 +11,32 @@ namespace cms {
namespace cuda {
namespace host {
namespace impl {
- // Additional layer of types to distinguish from host::unique_ptr
+
+ enum class MemoryType : bool {
+ kDefault = false,
+ kPinned = true,
+ };
+
+ // Custom deleter for host memory, with an internal state to distinguish pageable and pinned host memory
class HostDeleter {
public:
- void operator()(void *ptr) { cms::cuda::free_host(ptr); }
+ // The default constructor is needed by the default constructor of unique_ptr,
+ // which is needed by the default constructor of HostProduct, which is needed by the ROOT dictionary
+ HostDeleter() : type_{MemoryType::kDefault} {}
+ HostDeleter(MemoryType type) : type_{type} {}
+
+ void operator()(void *ptr) {
+ if (type_ == MemoryType::kPinned) {
+ cms::cuda::free_host(ptr);
+ } else {
+ std::free(ptr);
+ }
+ }
+
+ private:
+ MemoryType type_;
};
+
} // namespace impl
template
@@ -36,24 +58,55 @@ namespace cms {
} // namespace impl
} // namespace host
+ // Allocate pageable host memory
+    template <typename T>
+    typename host::impl::make_host_unique_selector<T>::non_array make_host_unique() {
+      static_assert(std::is_trivially_constructible<T>::value,
+                    "Allocating with non-trivial constructor on the host memory is not supported");
+      // Allocate a buffer aligned to 128 bytes, to match the CUDA cache line size
+      const size_t alignment = 128;
+      // std::aligned_alloc() requires the size to be a multiple of the alignment
+      const size_t size = (sizeof(T) + alignment - 1) / alignment * alignment;
+      void *mem = std::aligned_alloc(alignment, size);
+      return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
+                                                                          host::impl::MemoryType::kDefault};
+    }
+
+    template <typename T>
+    typename host::impl::make_host_unique_selector<T>::unbounded_array make_host_unique(size_t n) {
+      using element_type = typename std::remove_extent<T>::type;
+      static_assert(std::is_trivially_constructible<element_type>::value,
+                    "Allocating with non-trivial constructor on the host memory is not supported");
+      // Allocate a buffer aligned to 128 bytes, to match the CUDA cache line size
+      const size_t alignment = 128;
+      // std::aligned_alloc() requires the size to be a multiple of the alignment
+      const size_t size = (n * sizeof(element_type) + alignment - 1) / alignment * alignment;
+      void *mem = std::aligned_alloc(alignment, size);
+      return typename host::impl::make_host_unique_selector<T>::unbounded_array{
+          reinterpret_cast<element_type *>(mem), host::impl::MemoryType::kDefault};
+    }
+
// Allocate pinned host memory
template
typename host::impl::make_host_unique_selector::non_array make_host_unique(cudaStream_t stream) {
static_assert(std::is_trivially_constructible::value,
- "Allocating with non-trivial constructor on the pinned host memory is not supported");
+ "Allocating with non-trivial constructor on the host memory is not supported");
void *mem = allocate_host(sizeof(T), stream);
- return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem)};
+ return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem), //
+ host::impl::MemoryType::kPinned};
}
template
typename host::impl::make_host_unique_selector::unbounded_array make_host_unique(size_t n, cudaStream_t stream) {
using element_type = typename std::remove_extent::type;
static_assert(std::is_trivially_constructible::value,
- "Allocating with non-trivial constructor on the pinned host memory is not supported");
+ "Allocating with non-trivial constructor on the host memory is not supported");
void *mem = allocate_host(n * sizeof(element_type), stream);
- return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem)};
+ return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem),
+ host::impl::MemoryType::kPinned};
}
+ // Arrays of known bounds are not supported by std::unique_ptr
template
typename host::impl::make_host_unique_selector::bounded_array make_host_unique(Args &&...) = delete;
@@ -61,7 +114,8 @@ namespace cms {
template
typename host::impl::make_host_unique_selector::non_array make_host_unique_uninitialized(cudaStream_t stream) {
void *mem = allocate_host(sizeof(T), stream);
- return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem)};
+ return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem), //
+ host::impl::MemoryType::kPinned};
}
template
@@ -69,11 +123,14 @@ namespace cms {
size_t n, cudaStream_t stream) {
using element_type = typename std::remove_extent::type;
void *mem = allocate_host(n * sizeof(element_type), stream);
- return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem)};
+ return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem),
+ host::impl::MemoryType::kPinned};
}
+ // Arrays of known bounds are not supported by std::unique_ptr
template
typename host::impl::make_host_unique_selector::bounded_array make_host_unique_uninitialized(Args &&...) = delete;
+
} // namespace cuda
} // namespace cms