diff --git a/CUDADataFormats/PortableTestObjects/BuildFile.xml b/CUDADataFormats/PortableTestObjects/BuildFile.xml
new file mode 100644
index 0000000000000..595a743a6c4c5
--- /dev/null
+++ b/CUDADataFormats/PortableTestObjects/BuildFile.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h b/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h
new file mode 100644
index 0000000000000..c0dc803228646
--- /dev/null
+++ b/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h
@@ -0,0 +1,14 @@
+#ifndef CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
+#define CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
+
+#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
+#include "DataFormats/PortableTestObjects/interface/TestSoA.h"
+
+namespace cudatest {
+
+ // SoA with x, y, z, id fields in device global memory
+ using TestDeviceCollection = cms::cuda::PortableDeviceCollection;
+
+} // namespace cudatest
+
+#endif // CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
diff --git a/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h b/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h
new file mode 100644
index 0000000000000..3e2eea89488c7
--- /dev/null
+++ b/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h
@@ -0,0 +1,14 @@
+#ifndef CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
+#define CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
+
+#include "CUDADataFormats/Common/interface/PortableHostCollection.h"
+#include "DataFormats/PortableTestObjects/interface/TestSoA.h"
+
+namespace cudatest {
+
+ // SoA with x, y, z, id fields in host memory
+ using TestHostCollection = cms::cuda::PortableHostCollection;
+
+} // namespace cudatest
+
+#endif // CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
diff --git a/CUDADataFormats/PortableTestObjects/src/classes.h b/CUDADataFormats/PortableTestObjects/src/classes.h
new file mode 100644
index 0000000000000..6f26217bd6646
--- /dev/null
+++ b/CUDADataFormats/PortableTestObjects/src/classes.h
@@ -0,0 +1,5 @@
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/PortableTestObjects/interface/TestSoA.h"
diff --git a/CUDADataFormats/PortableTestObjects/src/classes_def.xml b/CUDADataFormats/PortableTestObjects/src/classes_def.xml
new file mode 100644
index 0000000000000..892bedcca23d6
--- /dev/null
+++ b/CUDADataFormats/PortableTestObjects/src/classes_def.xml
@@ -0,0 +1,21 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc
new file mode 100644
index 0000000000000..3f96aff87eedb
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc
@@ -0,0 +1,19 @@
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+
+#include "TestAlgo.h"
+
+namespace cudatest {
+
+ static void testAlgoKernel(cudatest::TestHostCollection::View view, int32_t size) {
+ view.r() = 1.;
+
+ for (auto i = 0; i < size; ++i) {
+ view[i] = {0., 0., 0., i};
+ }
+ }
+
+ void TestAlgo::fill(cudatest::TestHostCollection& collection) const {
+ testAlgoKernel(collection.view(), collection->metadata().size());
+ }
+
+} // namespace cudatest
diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu
new file mode 100644
index 0000000000000..a2434de387b6e
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu
@@ -0,0 +1,30 @@
+#include
+
+#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
+
+#include "TestAlgo.h"
+
+namespace cudatest {
+
+ static __global__ void testAlgoKernel(cudatest::TestDeviceCollection::View view, int32_t size) {
+ const int32_t thread = blockIdx.x * blockDim.x + threadIdx.x;
+ const int32_t stride = blockDim.x * gridDim.x;
+
+ if (thread == 0) {
+ view.r() = 1.;
+ }
+ for (auto i = thread; i < size; i += stride) {
+ view[i] = {0., 0., 0., i};
+ }
+ }
+
+ void TestAlgo::fill(cudatest::TestDeviceCollection& collection, cudaStream_t stream) const {
+ const uint32_t maxThreadsPerBlock = 1024;
+
+ uint32_t threadsPerBlock = maxThreadsPerBlock;
+ uint32_t blocksPerGrid = (collection->metadata().size() + threadsPerBlock - 1) / threadsPerBlock;
+
+ testAlgoKernel<<>>(collection.view(), collection->metadata().size());
+ }
+
+} // namespace cudatest
diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.h b/HeterogeneousCore/CUDATest/plugins/TestAlgo.h
new file mode 100644
index 0000000000000..a91a773234f68
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestAlgo.h
@@ -0,0 +1,17 @@
+#ifndef HeterogeneousCore_CUDATest_plugins_TestAlgo_h
+#define HeterogeneousCore_CUDATest_plugins_TestAlgo_h
+
+#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+
+namespace cudatest {
+
+ class TestAlgo {
+ public:
+ void fill(cudatest::TestDeviceCollection& collection, cudaStream_t stream) const;
+ void fill(cudatest::TestHostCollection& collection) const;
+ };
+
+} // namespace cudatest
+
+#endif // HeterogeneousCore_CUDATest_plugins_TestAlgo_h
diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc
new file mode 100644
index 0000000000000..49ee3cb3b9721
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc
@@ -0,0 +1,62 @@
+#include
+
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/stream/EDAnalyzer.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+
+class TestPortableAnalyzer : public edm::stream::EDAnalyzer<> {
+public:
+ TestPortableAnalyzer(edm::ParameterSet const& config)
+ : source_{config.getParameter("source")}, token_{consumes(source_)} {}
+
+ void analyze(edm::Event const& event, edm::EventSetup const&) override {
+ cudatest::TestHostCollection const& product = event.get(token_);
+
+ auto const& view = product.const_view();
+ for (int32_t i = 0; i < view.metadata().size(); ++i) {
+ assert(view[i].id() == i);
+ }
+
+ edm::LogInfo msg("TestPortableAnalyzer");
+ msg << source_.encode() << ".size() = " << view.metadata().size() << '\n';
+ msg << " data = " << product.buffer().get() << ",\n"
+ << " x = " << view.metadata().addressOf_x() << ",\n"
+ << " y = " << view.metadata().addressOf_y() << ",\n"
+ << " z = " << view.metadata().addressOf_z() << ",\n"
+ << " id = " << view.metadata().addressOf_id() << ",\n"
+ << " r = " << view.metadata().addressOf_r() << '\n';
+ msg << std::hex << " [y - x] = 0x"
+ << reinterpret_cast(view.metadata().addressOf_y()) -
+ reinterpret_cast(view.metadata().addressOf_x())
+ << " [z - y] = 0x"
+ << reinterpret_cast(view.metadata().addressOf_z()) -
+ reinterpret_cast(view.metadata().addressOf_y())
+ << " [id - z] = 0x"
+ << reinterpret_cast(view.metadata().addressOf_id()) -
+ reinterpret_cast(view.metadata().addressOf_z())
+ << " [r - id] = 0x"
+ << reinterpret_cast(view.metadata().addressOf_r()) -
+ reinterpret_cast(view.metadata().addressOf_id());
+ }
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+ desc.add("source");
+ descriptions.addWithDefaultLabel(desc);
+ }
+
+private:
+ const edm::InputTag source_;
+ const edm::EDGetTokenT token_;
+};
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(TestPortableAnalyzer);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc
new file mode 100644
index 0000000000000..19d1611c6b899
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc
@@ -0,0 +1,45 @@
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Utilities/interface/StreamID.h"
+
+#include "TestAlgo.h"
+
+class TestPortableProducerCPU : public edm::stream::EDProducer<> {
+public:
+ TestPortableProducerCPU(edm::ParameterSet const& config)
+ : hostToken_{produces()}, size_{config.getParameter("size")} {}
+
+ void produce(edm::Event& event, edm::EventSetup const&) override {
+ // run the algorithm
+ cudatest::TestHostCollection hostProduct{size_};
+ algo_.fill(hostProduct);
+
+ // put the product into the event
+ event.emplace(hostToken_, std::move(hostProduct));
+ }
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+ desc.add("size");
+ descriptions.addWithDefaultLabel(desc);
+ }
+
+private:
+ const edm::EDPutTokenT hostToken_;
+ const int32_t size_;
+
+ // implementation of the algorithm
+ cudatest::TestAlgo algo_;
+};
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(TestPortableProducerCPU);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc
new file mode 100644
index 0000000000000..bf229135eee01
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc
@@ -0,0 +1,58 @@
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Utilities/interface/StreamID.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+#include "TestAlgo.h"
+
+class TestPortableProducerCUDA : public edm::stream::EDProducer<> {
+public:
+ TestPortableProducerCUDA(edm::ParameterSet const& config)
+ : deviceToken_{produces()}, size_{config.getParameter("size")} {}
+
+ void beginStream(edm::StreamID) override {
+ edm::Service service;
+ if (not service->enabled()) {
+ throw cms::Exception("Configuration") << "CUDAService is disabled.";
+ }
+ }
+
+ void produce(edm::Event& event, edm::EventSetup const&) override {
+ // create a context based on the EDM stream number
+ cms::cuda::ScopedContextProduce ctx(event.streamID());
+
+ // run the algorithm, potentially asynchronously
+ cudatest::TestDeviceCollection deviceProduct{size_, ctx.stream()};
+ algo_.fill(deviceProduct, ctx.stream());
+
+ // put the asynchronous product into the event without waiting
+ ctx.emplace(event, deviceToken_, std::move(deviceProduct));
+ }
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+ desc.add("size");
+ descriptions.addWithDefaultLabel(desc);
+ }
+
+private:
+ const edm::EDPutTokenT> deviceToken_;
+ const int32_t size_;
+
+ // implementation of the algorithm
+ cudatest::TestAlgo algo_;
+};
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(TestPortableProducerCUDA);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc
new file mode 100644
index 0000000000000..1b026f275b8a1
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc
@@ -0,0 +1,67 @@
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
+#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Utilities/interface/StreamID.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+class TestPortableTranscriber : public edm::stream::EDProducer {
+public:
+ TestPortableTranscriber(edm::ParameterSet const& config)
+ : deviceToken_{consumes(config.getParameter("source"))}, hostToken_{produces()} {}
+
+ void beginStream(edm::StreamID) override {
+ edm::Service service;
+ if (not service->enabled()) {
+ throw cms::Exception("Configuration") << "CUDAService is disabled.";
+ }
+ }
+
+ void acquire(edm::Event const& event, edm::EventSetup const& setup, edm::WaitingTaskWithArenaHolder task) override {
+ // create a context reusing the same device and queue as the producer of the input collection
+ auto const& input = event.get(deviceToken_);
+ cms::cuda::ScopedContextAcquire ctx{input, std::move(task)};
+
+ cudatest::TestDeviceCollection const& deviceProduct = ctx.get(input);
+
+ // allocate a host product based on the metadata of the device product
+ hostProduct_ = cudatest::TestHostCollection{deviceProduct->metadata().size(), ctx.stream()};
+
+ // copy the content of the device product to the host product
+ cms::cuda::copyAsync(hostProduct_.buffer(), deviceProduct.const_buffer(), deviceProduct.bufferSize(), ctx.stream());
+
+ // do not wait for the asynchronous operation to complete
+ }
+
+ void produce(edm::Event& event, edm::EventSetup const&) override {
+ // produce() is called once the asynchronous operation has completed, so there is no need for an explicit wait
+ event.emplace(hostToken_, std::move(hostProduct_));
+ }
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+ desc.add("source");
+ descriptions.addWithDefaultLabel(desc);
+ }
+
+private:
+ const edm::EDGetTokenT> deviceToken_;
+ const edm::EDPutTokenT hostToken_;
+
+ // hold the output product between acquire() and produce()
+ cudatest::TestHostCollection hostProduct_;
+};
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(TestPortableTranscriber);
diff --git a/HeterogeneousCore/CUDATest/python/reader.py b/HeterogeneousCore/CUDATest/python/reader.py
new file mode 100644
index 0000000000000..ba4111e4ab73c
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/python/reader.py
@@ -0,0 +1,27 @@
+import FWCore.ParameterSet.Config as cms
+
+process = cms.Process('Reader')
+
+# read the products from a 'test.root' file
+process.source = cms.Source('PoolSource',
+ fileNames = cms.untracked.vstring('file:test.root')
+)
+
+# enable logging for the TestPortableAnalyzer
+process.MessageLogger.TestPortableAnalyzer = cms.untracked.PSet()
+
+# analyse the first product
+process.testAnalyzer = cms.EDAnalyzer('TestPortableAnalyzer',
+ source = cms.InputTag('testProducer')
+)
+
+# analyse the second product
+process.testAnalyzerSerial = cms.EDAnalyzer('TestPortableAnalyzer',
+ source = cms.InputTag('testProducerSerial')
+)
+
+process.cuda_path = cms.Path(process.testAnalyzer)
+
+process.serial_path = cms.Path(process.testAnalyzerSerial)
+
+process.maxEvents.input = 10
diff --git a/HeterogeneousCore/CUDATest/python/writer.py b/HeterogeneousCore/CUDATest/python/writer.py
new file mode 100644
index 0000000000000..5c93a20bcf926
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/python/writer.py
@@ -0,0 +1,76 @@
+import FWCore.ParameterSet.Config as cms
+from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA
+
+process = cms.Process('Writer')
+
+process.source = cms.Source('EmptySource')
+
+process.load('Configuration.StandardSequences.Accelerators_cff')
+
+# enable logging for the TestPortableAnalyzer
+process.MessageLogger.TestPortableAnalyzer = cms.untracked.PSet()
+
+# run the producer on a CUDA gpu (if available)
+process.testProducerCuda = cms.EDProducer('TestPortableProducerCUDA',
+ size = cms.int32(42)
+)
+
+# copy the product from the gpu (if available) to the host
+process.testTranscriberFromCuda = cms.EDProducer('TestPortableTranscriber',
+ source = cms.InputTag('testProducerCuda')
+)
+
+# run the producer on the cpu
+process.testProducerCpu = cms.EDProducer('TestPortableProducerCPU',
+ size = cms.int32(42)
+)
+
+# either run the producer on a CUDA gpu (if available) and copy the product to the cpu, or run the producer directly on the cpu
+process.testProducer = SwitchProducerCUDA(
+ cpu = cms.EDAlias(
+ testProducerCpu = cms.VPSet(cms.PSet(type = cms.string('*')))
+ ),
+ cuda = cms.EDAlias(
+ testTranscriberFromCuda = cms.VPSet(cms.PSet(type = cms.string('*')))
+ )
+)
+
+# analyse the product
+process.testAnalyzer = cms.EDAnalyzer('TestPortableAnalyzer',
+ source = cms.InputTag('testProducer')
+)
+
+# run a second producer explicitly on the cpu
+process.testProducerSerial = cms.EDProducer('TestPortableProducerCPU',
+ size = cms.int32(99)
+)
+
+# analyse the second product
+process.testAnalyzerSerial = cms.EDAnalyzer('TestPortableAnalyzer',
+ source = cms.InputTag('testProducerSerial')
+)
+
+# write the two products to a 'test.root' file
+process.output = cms.OutputModule('PoolOutputModule',
+ fileName = cms.untracked.string('test.root'),
+ outputCommands = cms.untracked.vstring(
+ 'drop *',
+ 'keep *_testProducer_*_*',
+ 'keep *_testProducerSerial_*_*',
+ )
+)
+
+process.producer_task = cms.Task(process.testProducerCuda, process.testTranscriberFromCuda, process.testProducerCpu)
+
+process.process_path = cms.Path(
+ process.testProducer +
+ process.testAnalyzer,
+ process.producer_task)
+
+process.serial_path = cms.Path(
+ process.testProducerSerial +
+ process.testAnalyzerSerial)
+
+process.output_path = cms.EndPath(process.output)
+
+process.maxEvents.input = 10