From c1f6600e53e7d9bf7b396108f005943758d4234c Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 6 Sep 2022 17:14:39 +0200 Subject: [PATCH] Reimplement in CUDA the PortableTestObjects tests --- .../PortableTestObjects/BuildFile.xml | 7 ++ .../interface/TestDeviceCollection.h | 14 ++++ .../interface/TestHostCollection.h | 14 ++++ .../PortableTestObjects/src/classes.h | 5 ++ .../PortableTestObjects/src/classes_def.xml | 21 +++++ .../CUDATest/plugins/TestAlgo.cc | 19 +++++ .../CUDATest/plugins/TestAlgo.cu | 30 ++++++++ HeterogeneousCore/CUDATest/plugins/TestAlgo.h | 13 ++++ .../CUDATest/plugins/TestPortableAnalyzer.cc | 62 +++++++++++++++ .../plugins/TestPortableProducerCPU.cc | 45 +++++++++++ .../plugins/TestPortableProducerCUDA.cc | 58 ++++++++++++++ .../plugins/TestPortableTranscriber.cc | 67 ++++++++++++++++ HeterogeneousCore/CUDATest/python/reader.py | 27 +++++++ HeterogeneousCore/CUDATest/python/writer.py | 76 +++++++++++++++++++ 14 files changed, 458 insertions(+) create mode 100644 CUDADataFormats/PortableTestObjects/BuildFile.xml create mode 100644 CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h create mode 100644 CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h create mode 100644 CUDADataFormats/PortableTestObjects/src/classes.h create mode 100644 CUDADataFormats/PortableTestObjects/src/classes_def.xml create mode 100644 HeterogeneousCore/CUDATest/plugins/TestAlgo.cc create mode 100644 HeterogeneousCore/CUDATest/plugins/TestAlgo.cu create mode 100644 HeterogeneousCore/CUDATest/plugins/TestAlgo.h create mode 100644 HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc create mode 100644 HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc create mode 100644 HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc create mode 100644 HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc create mode 100644 HeterogeneousCore/CUDATest/python/reader.py create mode 100644 HeterogeneousCore/CUDATest/python/writer.py diff --git a/CUDADataFormats/PortableTestObjects/BuildFile.xml b/CUDADataFormats/PortableTestObjects/BuildFile.xml new file mode 100644 index 0000000000000..595a743a6c4c5 --- /dev/null +++ b/CUDADataFormats/PortableTestObjects/BuildFile.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h b/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h new file mode 100644 index 0000000000000..c0dc803228646 --- /dev/null +++ b/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h @@ -0,0 +1,14 @@ +#ifndef CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h +#define CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h + +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "DataFormats/PortableTestObjects/interface/TestSoA.h" + +namespace cudatest { + + // SoA with x, y, z, id fields in device global memory + using TestDeviceCollection = cms::cuda::PortableDeviceCollection; + +} // namespace cudatest + +#endif // CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h diff --git a/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h b/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h new file mode 100644 index 0000000000000..3e2eea89488c7 --- /dev/null +++ b/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h @@ -0,0 +1,14 @@ +#ifndef CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h +#define CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h + +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" +#include "DataFormats/PortableTestObjects/interface/TestSoA.h" + +namespace cudatest { + + // SoA with x, y, z, id fields in host memory + using TestHostCollection = cms::cuda::PortableHostCollection; + +} // namespace cudatest + +#endif // CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h diff --git a/CUDADataFormats/PortableTestObjects/src/classes.h b/CUDADataFormats/PortableTestObjects/src/classes.h new file mode 100644 index 0000000000000..6f26217bd6646 --- /dev/null +++ b/CUDADataFormats/PortableTestObjects/src/classes.h @@ -0,0 +1,5 @@ +#include "CUDADataFormats/Common/interface/Product.h" +#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h" +#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h" +#include "DataFormats/Common/interface/Wrapper.h" +#include "DataFormats/PortableTestObjects/interface/TestSoA.h" diff --git a/CUDADataFormats/PortableTestObjects/src/classes_def.xml b/CUDADataFormats/PortableTestObjects/src/classes_def.xml new file mode 100644 index 0000000000000..892bedcca23d6 --- /dev/null +++ b/CUDADataFormats/PortableTestObjects/src/classes_def.xml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc new file mode 100644 index 0000000000000..0faa430eab2d1 --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc @@ -0,0 +1,19 @@ +#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h" + +#include "TestAlgo.h" + +namespace { + + void testAlgoKernel(cudatest::TestHostCollection::View view, int32_t size) { + view.r() = 1.; + + for (auto i = 0; i < size; ++i) { + view[i] = {0., 0., 0., i}; + } + } + +} // namespace + +void TestAlgo::fill(cudatest::TestHostCollection& collection) const { + testAlgoKernel(collection.view(), collection->metadata().size()); +} diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu new file mode 100644 index 0000000000000..d8a7dbe247978 --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu @@ -0,0 +1,30 @@ +#include + +#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h" + +#include "TestAlgo.h" + +namespace { + + __global__ void testAlgoKernel(cudatest::TestDeviceCollection::View view, int32_t size) { + const int32_t thread = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t stride = blockDim.x * gridDim.x; + + if (thread == 0) { + view.r() = 1.; + } + for (auto i = thread; i < size; i += stride) { + view[i] = {0., 0., 0., i}; + } + } + +} // namespace + +void TestAlgo::fill(cudatest::TestDeviceCollection& collection, cudaStream_t stream) const { + const uint32_t maxThreadsPerBlock = 1024; + + uint32_t threadsPerBlock = maxThreadsPerBlock; + uint32_t blocksPerGrid = (collection->metadata().size() + threadsPerBlock - 1) / threadsPerBlock; + + testAlgoKernel<<>>(collection.view(), collection->metadata().size()); +} diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.h b/HeterogeneousCore/CUDATest/plugins/TestAlgo.h new file mode 100644 index 0000000000000..5e8dac24841f7 --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestAlgo.h @@ -0,0 +1,13 @@ +#ifndef HeterogeneousCore_CUDATest_plugins_TestAlgo_h +#define HeterogeneousCore_CUDATest_plugins_TestAlgo_h + +#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h" +#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h" + +class TestAlgo { +public: + void fill(cudatest::TestDeviceCollection& collection, cudaStream_t stream) const; + void fill(cudatest::TestHostCollection& collection) const; +}; + +#endif // HeterogeneousCore_CUDATest_plugins_TestAlgo_h diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc new file mode 100644 index 0000000000000..49ee3cb3b9721 --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc @@ -0,0 +1,62 @@ +#include + +#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/stream/EDAnalyzer.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" + +class TestPortableAnalyzer : public edm::stream::EDAnalyzer<> { +public: + TestPortableAnalyzer(edm::ParameterSet const& config) + : source_{config.getParameter("source")}, token_{consumes(source_)} {} + + void analyze(edm::Event const& event, edm::EventSetup const&) override { + cudatest::TestHostCollection const& product = event.get(token_); + + auto const& view = product.const_view(); + for (int32_t i = 0; i < view.metadata().size(); ++i) { + assert(view[i].id() == i); + } + + edm::LogInfo msg("TestPortableAnalyzer"); + msg << source_.encode() << ".size() = " << view.metadata().size() << '\n'; + msg << " data = " << product.buffer().get() << ",\n" + << " x = " << view.metadata().addressOf_x() << ",\n" + << " y = " << view.metadata().addressOf_y() << ",\n" + << " z = " << view.metadata().addressOf_z() << ",\n" + << " id = " << view.metadata().addressOf_id() << ",\n" + << " r = " << view.metadata().addressOf_r() << '\n'; + msg << std::hex << " [y - x] = 0x" + << reinterpret_cast(view.metadata().addressOf_y()) - + reinterpret_cast(view.metadata().addressOf_x()) + << " [z - y] = 0x" + << reinterpret_cast(view.metadata().addressOf_z()) - + reinterpret_cast(view.metadata().addressOf_y()) + << " [id - z] = 0x" + << reinterpret_cast(view.metadata().addressOf_id()) - + reinterpret_cast(view.metadata().addressOf_z()) + << " [r - id] = 0x" + << reinterpret_cast(view.metadata().addressOf_r()) - + reinterpret_cast(view.metadata().addressOf_id()); + } + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("source"); + descriptions.addWithDefaultLabel(desc); + } + +private: + const edm::InputTag source_; + const edm::EDGetTokenT token_; +}; + +#include "FWCore/Framework/interface/MakerMacros.h" +DEFINE_FWK_MODULE(TestPortableAnalyzer); diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc new file mode 100644 index 0000000000000..456c4f75837ed --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc @@ -0,0 +1,45 @@ +#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/Utilities/interface/StreamID.h" + +#include "TestAlgo.h" + +class TestPortableProducerCPU : public edm::stream::EDProducer<> { +public: + TestPortableProducerCPU(edm::ParameterSet const& config) + : hostToken_{produces()}, size_{config.getParameter("size")} {} + + void produce(edm::Event& event, edm::EventSetup const&) override { + // run the algorithm + cudatest::TestHostCollection hostProduct{size_}; + algo_.fill(hostProduct); + + // put the product into the event + event.emplace(hostToken_, std::move(hostProduct)); + } + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("size"); + descriptions.addWithDefaultLabel(desc); + } + +private: + const edm::EDPutTokenT hostToken_; + const int32_t size_; + + // implementation of the algorithm + TestAlgo algo_; +}; + +#include "FWCore/Framework/interface/MakerMacros.h" +DEFINE_FWK_MODULE(TestPortableProducerCPU); diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc new file mode 100644 index 0000000000000..e96e875ffdaef --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc @@ -0,0 +1,58 @@ +#include "CUDADataFormats/Common/interface/Product.h" +#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/Utilities/interface/StreamID.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +#include "TestAlgo.h" + +class TestPortableProducerCUDA : public edm::stream::EDProducer<> { +public: + TestPortableProducerCUDA(edm::ParameterSet const& config) + : deviceToken_{produces()}, size_{config.getParameter("size")} {} + + void beginStream(edm::StreamID) override { + edm::Service service; + if (not service->enabled()) { + throw cms::Exception("Configuration") << "CUDAService is disabled."; + } + } + + void produce(edm::Event& event, edm::EventSetup const&) override { + // create a context based on the EDM stream number + cms::cuda::ScopedContextProduce ctx(event.streamID()); + + // run the algorithm, potentially asynchronously + cudatest::TestDeviceCollection deviceProduct{size_, ctx.stream()}; + algo_.fill(deviceProduct, ctx.stream()); + + // put the asynchronous product into the event without waiting + ctx.emplace(event, deviceToken_, std::move(deviceProduct)); + } + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("size"); + descriptions.addWithDefaultLabel(desc); + } + +private: + const edm::EDPutTokenT> deviceToken_; + const int32_t size_; + + // implementation of the algorithm + TestAlgo algo_; +}; + +#include "FWCore/Framework/interface/MakerMacros.h" +DEFINE_FWK_MODULE(TestPortableProducerCUDA); diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc new file mode 100644 index 0000000000000..1b026f275b8a1 --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc @@ -0,0 +1,67 @@ +#include "CUDADataFormats/Common/interface/Product.h" +#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h" +#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/Utilities/interface/StreamID.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +class TestPortableTranscriber : public edm::stream::EDProducer { +public: + TestPortableTranscriber(edm::ParameterSet const& config) + : deviceToken_{consumes(config.getParameter("source"))}, hostToken_{produces()} {} + + void beginStream(edm::StreamID) override { + edm::Service service; + if (not service->enabled()) { + throw cms::Exception("Configuration") << "CUDAService is disabled."; + } + } + + void acquire(edm::Event const& event, edm::EventSetup const& setup, edm::WaitingTaskWithArenaHolder task) override { + // create a context reusing the same device and queue as the producer of the input collection + auto const& input = event.get(deviceToken_); + cms::cuda::ScopedContextAcquire ctx{input, std::move(task)}; + + cudatest::TestDeviceCollection const& deviceProduct = ctx.get(input); + + // allocate a host product based on the metadata of the device product + hostProduct_ = cudatest::TestHostCollection{deviceProduct->metadata().size(), ctx.stream()}; + + // copy the content of the device product to the host product + cms::cuda::copyAsync(hostProduct_.buffer(), deviceProduct.const_buffer(), deviceProduct.bufferSize(), ctx.stream()); + + // do not wait for the asynchronous operation to complete + } + + void produce(edm::Event& event, edm::EventSetup const&) override { + // produce() is called once the asynchronous operation has completed, so there is no need for an explicit wait + event.emplace(hostToken_, std::move(hostProduct_)); + } + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("source"); + descriptions.addWithDefaultLabel(desc); + } + +private: + const edm::EDGetTokenT> deviceToken_; + const edm::EDPutTokenT hostToken_; + + // hold the output product between acquire() and produce() + cudatest::TestHostCollection hostProduct_; +}; + +#include "FWCore/Framework/interface/MakerMacros.h" +DEFINE_FWK_MODULE(TestPortableTranscriber); diff --git a/HeterogeneousCore/CUDATest/python/reader.py b/HeterogeneousCore/CUDATest/python/reader.py new file mode 100644 index 0000000000000..ba4111e4ab73c --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/reader.py @@ -0,0 +1,27 @@ +import FWCore.ParameterSet.Config as cms + +process = cms.Process('Reader') + +# read the products from a 'test.root' file +process.source = cms.Source('PoolSource', + fileNames = cms.untracked.vstring('file:test.root') +) + +# enable logging for the TestPortableAnalyzer +process.MessageLogger.TestPortableAnalyzer = cms.untracked.PSet() + +# analyse the first product +process.testAnalyzer = cms.EDAnalyzer('TestPortableAnalyzer', + source = cms.InputTag('testProducer') +) + +# analyse the second product +process.testAnalyzerSerial = cms.EDAnalyzer('TestPortableAnalyzer', + source = cms.InputTag('testProducerSerial') +) + +process.cuda_path = cms.Path(process.testAnalyzer) + +process.serial_path = cms.Path(process.testAnalyzerSerial) + +process.maxEvents.input = 10 diff --git a/HeterogeneousCore/CUDATest/python/writer.py b/HeterogeneousCore/CUDATest/python/writer.py new file mode 100644 index 0000000000000..5c93a20bcf926 --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/writer.py @@ -0,0 +1,76 @@ +import FWCore.ParameterSet.Config as cms +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA + +process = cms.Process('Writer') + +process.source = cms.Source('EmptySource') + +process.load('Configuration.StandardSequences.Accelerators_cff') + +# enable logging for the TestPortableAnalyzer +process.MessageLogger.TestPortableAnalyzer = cms.untracked.PSet() + +# run the producer on a CUDA gpu (if available) +process.testProducerCuda = cms.EDProducer('TestPortableProducerCUDA', + size = cms.int32(42) +) + +# copy the product from the gpu (if available) to the host +process.testTranscriberFromCuda = cms.EDProducer('TestPortableTranscriber', + source = cms.InputTag('testProducerCuda') +) + +# run the producer on the cpu +process.testProducerCpu = cms.EDProducer('TestPortableProducerCPU', + size = cms.int32(42) +) + +# either run the producer on a CUDA gpu (if available) and copy the product to the cpu, or run the producer directly on the cpu +process.testProducer = SwitchProducerCUDA( + cpu = cms.EDAlias( + testProducerCpu = cms.VPSet(cms.PSet(type = cms.string('*'))) + ), + cuda = cms.EDAlias( + testTranscriberFromCuda = cms.VPSet(cms.PSet(type = cms.string('*'))) + ) +) + +# analyse the product +process.testAnalyzer = cms.EDAnalyzer('TestPortableAnalyzer', + source = cms.InputTag('testProducer') +) + +# run a second producer explicitly on the cpu +process.testProducerSerial = cms.EDProducer('TestPortableProducerCPU', + size = cms.int32(99) +) + +# analyse the second product +process.testAnalyzerSerial = cms.EDAnalyzer('TestPortableAnalyzer', + source = cms.InputTag('testProducerSerial') +) + +# write the two products to a 'test.root' file +process.output = cms.OutputModule('PoolOutputModule', + fileName = cms.untracked.string('test.root'), + outputCommands = cms.untracked.vstring( + 'drop *', + 'keep *_testProducer_*_*', + 'keep *_testProducerSerial_*_*', + ) +) + +process.producer_task = cms.Task(process.testProducerCuda, process.testTranscriberFromCuda, process.testProducerCpu) + +process.process_path = cms.Path( + process.testProducer + + process.testAnalyzer, + process.producer_task) + +process.serial_path = cms.Path( + process.testProducerSerial + + process.testAnalyzerSerial) + +process.output_path = cms.EndPath(process.output) + +process.maxEvents.input = 10