diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml index f6b68fe69b400..e4971bdf3ebbe 100644 --- a/CUDADataFormats/Common/BuildFile.xml +++ b/CUDADataFormats/Common/BuildFile.xml @@ -1,4 +1,5 @@ + diff --git a/CUDADataFormats/Common/interface/PortableDeviceCollection.h b/CUDADataFormats/Common/interface/PortableDeviceCollection.h new file mode 100644 index 0000000000000..78f72cb3d5437 --- /dev/null +++ b/CUDADataFormats/Common/interface/PortableDeviceCollection.h @@ -0,0 +1,67 @@ +#ifndef CUDADataFormats_Common_interface_PortableDeviceCollection_h +#define CUDADataFormats_Common_interface_PortableDeviceCollection_h + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" + +namespace cms::cuda { + + // generic SoA-based product in device memory + template + class PortableDeviceCollection { + public: + using Layout = T; + using View = typename Layout::View; + using ConstView = typename Layout::ConstView; + using Buffer = cms::cuda::device::unique_ptr; + + PortableDeviceCollection() = default; + + PortableDeviceCollection(int32_t elements, cudaStream_t stream) + : buffer_{cms::cuda::make_device_unique(Layout::computeDataSize(elements), stream)}, + layout_{buffer_.get(), elements}, + view_{layout_} { + // CUDA device memory uses a default alignment of at least 128 bytes + assert(reinterpret_cast(buffer_.get()) % Layout::alignment == 0); + } + + // non-copyable + PortableDeviceCollection(PortableDeviceCollection const&) = delete; + PortableDeviceCollection& operator=(PortableDeviceCollection const&) = delete; + + // movable + PortableDeviceCollection(PortableDeviceCollection&&) = default; + PortableDeviceCollection& operator=(PortableDeviceCollection&&) = default; + + // default destructor + ~PortableDeviceCollection() = default; + + // access the View + View& view() { return view_; } + ConstView const& view() const { return view_; } + ConstView const& const_view() const { return view_; } + + View& operator*() { return view_; } + ConstView const& operator*() const { return view_; } + + View* operator->() { return &view_; } + ConstView const* operator->() const { return &view_; } + + // access the Buffer + Buffer& buffer() { return buffer_; } + Buffer const& buffer() const { return buffer_; } + Buffer const& const_buffer() const { return buffer_; } + + size_t bufferSize() const { return layout_.metadata().byteSize(); } + + private: + Buffer buffer_; //! + Layout layout_; // + View view_; //! + }; + +} // namespace cms::cuda + +#endif // CUDADataFormats_Common_interface_PortableDeviceCollection_h diff --git a/CUDADataFormats/Common/interface/PortableHostCollection.h b/CUDADataFormats/Common/interface/PortableHostCollection.h new file mode 100644 index 0000000000000..cfaf40c85b3bc --- /dev/null +++ b/CUDADataFormats/Common/interface/PortableHostCollection.h @@ -0,0 +1,85 @@ +#ifndef CUDADataFormats_Common_interface_PortableHostCollection_h +#define CUDADataFormats_Common_interface_PortableHostCollection_h + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + +namespace cms::cuda { + + // generic SoA-based product in host memory + template + class PortableHostCollection { + public: + using Layout = T; + using View = typename Layout::View; + using ConstView = typename Layout::ConstView; + using Buffer = cms::cuda::host::unique_ptr; + + PortableHostCollection() = default; + + PortableHostCollection(int32_t elements) + // allocate pageable host memory + : buffer_{cms::cuda::make_host_unique(Layout::computeDataSize(elements))}, + layout_{buffer_.get(), elements}, + view_{layout_} { + // make_host_unique for pageable host memory uses a default alignment of 128 bytes + assert(reinterpret_cast(buffer_.get()) % Layout::alignment == 0); + } + + PortableHostCollection(int32_t elements, cudaStream_t stream) + // allocate pinned host memory, accessible by the current device + : buffer_{cms::cuda::make_host_unique(Layout::computeDataSize(elements), stream)}, + layout_{buffer_.get(), elements}, + view_{layout_} { + // CUDA pinned host memory uses a default alignment of at least 128 bytes + assert(reinterpret_cast(buffer_.get()) % Layout::alignment == 0); + } + + // non-copyable + PortableHostCollection(PortableHostCollection const&) = delete; + PortableHostCollection& operator=(PortableHostCollection const&) = delete; + + // movable + PortableHostCollection(PortableHostCollection&&) = default; + PortableHostCollection& operator=(PortableHostCollection&&) = default; + + // default destructor + ~PortableHostCollection() = default; + + // access the View + View& view() { return view_; } + ConstView const& view() const { return view_; } + ConstView const& const_view() const { return view_; } + + View& operator*() { return view_; } + ConstView const& operator*() const { return view_; } + + View* operator->() { return &view_; } + ConstView const* operator->() const { return &view_; } + + // access the Buffer + Buffer& buffer() { return buffer_; } + Buffer const& buffer() const { return buffer_; } + Buffer const& const_buffer() const { return buffer_; } + + size_t bufferSize() const { return layout_.metadata().byteSize(); } + + // part of the ROOT read streamer + static void ROOTReadStreamer(PortableHostCollection* newObj, Layout const& layout) { + newObj->~PortableHostCollection(); + // allocate pinned host memory using the legacy stream, that synchronises with all (blocking) streams + new (newObj) PortableHostCollection(layout.metadata().size()); + newObj->layout_.ROOTReadStreamer(layout); + } + + private: + Buffer buffer_; //! + Layout layout_; // + View view_; //! + }; + +} // namespace cms::cuda + +#endif // CUDADataFormats_Common_interface_PortableHostCollection_h diff --git a/CUDADataFormats/PortableTestObjects/BuildFile.xml b/CUDADataFormats/PortableTestObjects/BuildFile.xml new file mode 100644 index 0000000000000..595a743a6c4c5 --- /dev/null +++ b/CUDADataFormats/PortableTestObjects/BuildFile.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h b/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h new file mode 100644 index 0000000000000..c0dc803228646 --- /dev/null +++ b/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h @@ -0,0 +1,14 @@ +#ifndef CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h +#define CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h + +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "DataFormats/PortableTestObjects/interface/TestSoA.h" + +namespace cudatest { + + // SoA with x, y, z, id fields in device global memory + using TestDeviceCollection = cms::cuda::PortableDeviceCollection; + +} // namespace cudatest + +#endif // CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h diff --git a/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h b/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h new file mode 100644 index 0000000000000..3e2eea89488c7 --- /dev/null +++ b/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h @@ -0,0 +1,14 @@ +#ifndef CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h +#define CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h + +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" +#include "DataFormats/PortableTestObjects/interface/TestSoA.h" + +namespace cudatest { + + // SoA with x, y, z, id fields in host memory + using TestHostCollection = cms::cuda::PortableHostCollection; + +} // namespace cudatest + +#endif // CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h diff --git a/CUDADataFormats/PortableTestObjects/src/classes.h b/CUDADataFormats/PortableTestObjects/src/classes.h new file mode 100644 index 0000000000000..6f26217bd6646 --- /dev/null +++ b/CUDADataFormats/PortableTestObjects/src/classes.h @@ -0,0 +1,5 @@ +#include "CUDADataFormats/Common/interface/Product.h" +#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h" +#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h" +#include "DataFormats/Common/interface/Wrapper.h" +#include "DataFormats/PortableTestObjects/interface/TestSoA.h" diff --git a/CUDADataFormats/PortableTestObjects/src/classes_def.xml b/CUDADataFormats/PortableTestObjects/src/classes_def.xml new file mode 100644 index 0000000000000..892bedcca23d6 --- /dev/null +++ b/CUDADataFormats/PortableTestObjects/src/classes_def.xml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + diff --git a/DataFormats/Portable/interface/PortableDeviceCollection.h b/DataFormats/Portable/interface/PortableDeviceCollection.h index ce87b82ca69ea..6b3a4f51eb964 100644 --- a/DataFormats/Portable/interface/PortableDeviceCollection.h +++ b/DataFormats/Portable/interface/PortableDeviceCollection.h @@ -1,6 +1,7 @@ #ifndef DataFormats_Portable_interface_PortableDeviceCollection_h #define DataFormats_Portable_interface_PortableDeviceCollection_h +#include #include #include diff --git a/DataFormats/Portable/interface/PortableHostCollection.h b/DataFormats/Portable/interface/PortableHostCollection.h index b505064965390..0784bb83e3256 100644 --- a/DataFormats/Portable/interface/PortableHostCollection.h +++ b/DataFormats/Portable/interface/PortableHostCollection.h @@ -1,6 +1,7 @@ #ifndef DataFormats_Portable_interface_PortableHostCollection_h #define DataFormats_Portable_interface_PortableHostCollection_h +#include #include #include "HeterogeneousCore/AlpakaInterface/interface/config.h" diff --git a/DataFormats/SoATemplate/test/SoALayoutAndView_t.cu b/DataFormats/SoATemplate/test/SoALayoutAndView_t.cu index a4f9b0bc7d78f..00e3465a1304a 100644 --- a/DataFormats/SoATemplate/test/SoALayoutAndView_t.cu +++ b/DataFormats/SoATemplate/test/SoALayoutAndView_t.cu @@ -1,3 +1,4 @@ +#include #include #include diff --git a/HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h b/HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h index 3fc7fe8471e8d..f47e7637fa2de 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h +++ b/HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h @@ -1,6 +1,7 @@ #ifndef HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h #define HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h +#include #include #include "FWCore/Utilities/interface/thread_safety_macros.h" diff --git a/HeterogeneousCore/AlpakaInterface/interface/host.h b/HeterogeneousCore/AlpakaInterface/interface/host.h index 0303313df12be..66c8a121f2881 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/host.h +++ b/HeterogeneousCore/AlpakaInterface/interface/host.h @@ -1,6 +1,8 @@ #ifndef HeterogeneousCore_AlpakaInterface_interface_host_h #define HeterogeneousCore_AlpakaInterface_interface_host_h +#include + namespace cms::alpakatools { namespace detail { diff --git a/HeterogeneousCore/AlpakaTest/test/BuildFile.xml b/HeterogeneousCore/AlpakaTest/test/BuildFile.xml index c53ee8acdb820..67c261e79d685 100644 --- a/HeterogeneousCore/AlpakaTest/test/BuildFile.xml +++ b/HeterogeneousCore/AlpakaTest/test/BuildFile.xml @@ -1,4 +1 @@ - + diff --git a/HeterogeneousCore/AlpakaTest/test/testHeterogeneousCoreAlpakaTestWriteRead.sh b/HeterogeneousCore/AlpakaTest/test/testHeterogeneousCoreAlpakaTestWriteRead.sh new file mode 100755 index 0000000000000..df4868aa04141 --- /dev/null +++ b/HeterogeneousCore/AlpakaTest/test/testHeterogeneousCoreAlpakaTestWriteRead.sh @@ -0,0 +1,24 @@ + #! /bin/bash -e + +if ! [ "${LOCALTOP}" ]; then + export LOCALTOP=${CMSSW_BASE} + cd ${CMSSW_BASE} +fi + +mkdir -p testHeterogeneousCoreAlpakaTestWriteRead +cd testHeterogeneousCoreAlpakaTestWriteRead +rm -f test.root +echo "--------------------------------------------------------------------------------" +echo "$ cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/writer.py" +echo +cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/writer.py +echo +echo "--------------------------------------------------------------------------------" +echo "$ cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/reader.py" +echo +cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/reader.py +echo +echo "--------------------------------------------------------------------------------" +echo "$ edmDumpEventContent test.root" +echo +edmDumpEventContent test.root diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc new file mode 100644 index 0000000000000..3f96aff87eedb --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc @@ -0,0 +1,19 @@ +#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h" + +#include "TestAlgo.h" + +namespace cudatest { + + static void testAlgoKernel(cudatest::TestHostCollection::View view, int32_t size) { + view.r() = 1.; + + for (auto i = 0; i < size; ++i) { + view[i] = {0., 0., 0., i}; + } + } + + void TestAlgo::fill(cudatest::TestHostCollection& collection) const { + testAlgoKernel(collection.view(), collection->metadata().size()); + } + +} // namespace cudatest diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu new file mode 100644 index 0000000000000..a2434de387b6e --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu @@ -0,0 +1,30 @@ +#include + +#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h" + +#include "TestAlgo.h" + +namespace cudatest { + + static __global__ void testAlgoKernel(cudatest::TestDeviceCollection::View view, int32_t size) { + const int32_t thread = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t stride = blockDim.x * gridDim.x; + + if (thread == 0) { + view.r() = 1.; + } + for (auto i = thread; i < size; i += stride) { + view[i] = {0., 0., 0., i}; + } + } + + void TestAlgo::fill(cudatest::TestDeviceCollection& collection, cudaStream_t stream) const { + const uint32_t maxThreadsPerBlock = 1024; + + uint32_t threadsPerBlock = maxThreadsPerBlock; + uint32_t blocksPerGrid = (collection->metadata().size() + threadsPerBlock - 1) / threadsPerBlock; + + testAlgoKernel<<>>(collection.view(), collection->metadata().size()); + } + +} // namespace cudatest diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.h b/HeterogeneousCore/CUDATest/plugins/TestAlgo.h new file mode 100644 index 0000000000000..a91a773234f68 --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestAlgo.h @@ -0,0 +1,17 @@ +#ifndef HeterogeneousCore_CUDATest_plugins_TestAlgo_h +#define HeterogeneousCore_CUDATest_plugins_TestAlgo_h + +#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h" +#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h" + +namespace cudatest { + + class TestAlgo { + public: + void fill(cudatest::TestDeviceCollection& collection, cudaStream_t stream) const; + void fill(cudatest::TestHostCollection& collection) const; + }; + +} // namespace cudatest + +#endif // HeterogeneousCore_CUDATest_plugins_TestAlgo_h diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc new file mode 100644 index 0000000000000..49ee3cb3b9721 --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc @@ -0,0 +1,62 @@ +#include + +#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/stream/EDAnalyzer.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" + +class TestPortableAnalyzer : public edm::stream::EDAnalyzer<> { +public: + TestPortableAnalyzer(edm::ParameterSet const& config) + : source_{config.getParameter("source")}, token_{consumes(source_)} {} + + void analyze(edm::Event const& event, edm::EventSetup const&) override { + cudatest::TestHostCollection const& product = event.get(token_); + + auto const& view = product.const_view(); + for (int32_t i = 0; i < view.metadata().size(); ++i) { + assert(view[i].id() == i); + } + + edm::LogInfo msg("TestPortableAnalyzer"); + msg << source_.encode() << ".size() = " << view.metadata().size() << '\n'; + msg << " data = " << product.buffer().get() << ",\n" + << " x = " << view.metadata().addressOf_x() << ",\n" + << " y = " << view.metadata().addressOf_y() << ",\n" + << " z = " << view.metadata().addressOf_z() << ",\n" + << " id = " << view.metadata().addressOf_id() << ",\n" + << " r = " << view.metadata().addressOf_r() << '\n'; + msg << std::hex << " [y - x] = 0x" + << reinterpret_cast(view.metadata().addressOf_y()) - + reinterpret_cast(view.metadata().addressOf_x()) + << " [z - y] = 0x" + << reinterpret_cast(view.metadata().addressOf_z()) - + reinterpret_cast(view.metadata().addressOf_y()) + << " [id - z] = 0x" + << reinterpret_cast(view.metadata().addressOf_id()) - + reinterpret_cast(view.metadata().addressOf_z()) + << " [r - id] = 0x" + << reinterpret_cast(view.metadata().addressOf_r()) - + reinterpret_cast(view.metadata().addressOf_id()); + } + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("source"); + descriptions.addWithDefaultLabel(desc); + } + +private: + const edm::InputTag source_; + const edm::EDGetTokenT token_; +}; + +#include "FWCore/Framework/interface/MakerMacros.h" +DEFINE_FWK_MODULE(TestPortableAnalyzer); diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc new file mode 100644 index 0000000000000..19d1611c6b899 --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc @@ -0,0 +1,45 @@ +#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/Utilities/interface/StreamID.h" + +#include "TestAlgo.h" + +class TestPortableProducerCPU : public edm::stream::EDProducer<> { +public: + TestPortableProducerCPU(edm::ParameterSet const& config) + : hostToken_{produces()}, size_{config.getParameter("size")} {} + + void produce(edm::Event& event, edm::EventSetup const&) override { + // run the algorithm + cudatest::TestHostCollection hostProduct{size_}; + algo_.fill(hostProduct); + + // put the product into the event + event.emplace(hostToken_, std::move(hostProduct)); + } + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("size"); + descriptions.addWithDefaultLabel(desc); + } + +private: + const edm::EDPutTokenT hostToken_; + const int32_t size_; + + // implementation of the algorithm + cudatest::TestAlgo algo_; +}; + +#include "FWCore/Framework/interface/MakerMacros.h" +DEFINE_FWK_MODULE(TestPortableProducerCPU); diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc new file mode 100644 index 0000000000000..debafd6b874dd --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc @@ -0,0 +1,49 @@ +#include "CUDADataFormats/Common/interface/Product.h" +#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/Utilities/interface/StreamID.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" + +#include "TestAlgo.h" + +class TestPortableProducerCUDA : public edm::stream::EDProducer<> { +public: + TestPortableProducerCUDA(edm::ParameterSet const& config) + : deviceToken_{produces()}, size_{config.getParameter("size")} {} + + void produce(edm::Event& event, edm::EventSetup const&) override { + // create a context based on the EDM stream number + cms::cuda::ScopedContextProduce ctx(event.streamID()); + + // run the algorithm, potentially asynchronously + cudatest::TestDeviceCollection deviceProduct{size_, ctx.stream()}; + algo_.fill(deviceProduct, ctx.stream()); + + // put the asynchronous product into the event without waiting + ctx.emplace(event, deviceToken_, std::move(deviceProduct)); + } + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("size"); + descriptions.addWithDefaultLabel(desc); + } + +private: + const edm::EDPutTokenT> deviceToken_; + const int32_t size_; + + // implementation of the algorithm + cudatest::TestAlgo algo_; +}; + +#include "FWCore/Framework/interface/MakerMacros.h" +DEFINE_FWK_MODULE(TestPortableProducerCUDA); diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc new file mode 100644 index 0000000000000..4b251ad720d0e --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc @@ -0,0 +1,58 @@ +#include "CUDADataFormats/Common/interface/Product.h" +#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h" +#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/Utilities/interface/StreamID.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" + +class TestPortableTranscriber : public edm::stream::EDProducer { +public: + TestPortableTranscriber(edm::ParameterSet const& config) + : deviceToken_{consumes(config.getParameter("source"))}, hostToken_{produces()} {} + + void acquire(edm::Event const& event, edm::EventSetup const& setup, edm::WaitingTaskWithArenaHolder task) override { + // create a context reusing the same device and queue as the producer of the input collection + auto const& input = event.get(deviceToken_); + cms::cuda::ScopedContextAcquire ctx{input, std::move(task)}; + + cudatest::TestDeviceCollection const& deviceProduct = ctx.get(input); + + // allocate a host product based on the metadata of the device product + hostProduct_ = cudatest::TestHostCollection{deviceProduct->metadata().size(), ctx.stream()}; + + // copy the content of the device product to the host product + cms::cuda::copyAsync(hostProduct_.buffer(), deviceProduct.const_buffer(), deviceProduct.bufferSize(), ctx.stream()); + + // do not wait for the asynchronous operation to complete + } + + void produce(edm::Event& event, edm::EventSetup const&) override { + // produce() is called once the asynchronous operation has completed, so there is no need for an explicit wait + event.emplace(hostToken_, std::move(hostProduct_)); + } + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("source"); + descriptions.addWithDefaultLabel(desc); + } + +private: + const edm::EDGetTokenT> deviceToken_; + const edm::EDPutTokenT hostToken_; + + // hold the output product between acquire() and produce() + cudatest::TestHostCollection hostProduct_; +}; + +#include "FWCore/Framework/interface/MakerMacros.h" +DEFINE_FWK_MODULE(TestPortableTranscriber); diff --git a/HeterogeneousCore/CUDATest/test/BuildFile.xml b/HeterogeneousCore/CUDATest/test/BuildFile.xml index 60351fbfe9b88..0faf0245bbe78 100644 --- a/HeterogeneousCore/CUDATest/test/BuildFile.xml +++ b/HeterogeneousCore/CUDATest/test/BuildFile.xml @@ -11,4 +11,5 @@ + diff --git a/HeterogeneousCore/CUDATest/test/reader.py b/HeterogeneousCore/CUDATest/test/reader.py new file mode 100644 index 0000000000000..ba4111e4ab73c --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/reader.py @@ -0,0 +1,27 @@ +import FWCore.ParameterSet.Config as cms + +process = cms.Process('Reader') + +# read the products from a 'test.root' file +process.source = cms.Source('PoolSource', + fileNames = cms.untracked.vstring('file:test.root') +) + +# enable logging for the TestPortableAnalyzer +process.MessageLogger.TestPortableAnalyzer = cms.untracked.PSet() + +# analyse the first product +process.testAnalyzer = cms.EDAnalyzer('TestPortableAnalyzer', + source = cms.InputTag('testProducer') +) + +# analyse the second product +process.testAnalyzerSerial = cms.EDAnalyzer('TestPortableAnalyzer', + source = cms.InputTag('testProducerSerial') +) + +process.cuda_path = cms.Path(process.testAnalyzer) + +process.serial_path = cms.Path(process.testAnalyzerSerial) + +process.maxEvents.input = 10 diff --git a/HeterogeneousCore/CUDATest/test/testHeterogeneousCoreCUDATestWriteRead.sh b/HeterogeneousCore/CUDATest/test/testHeterogeneousCoreCUDATestWriteRead.sh new file mode 100755 index 0000000000000..4402e119e1012 --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/testHeterogeneousCoreCUDATestWriteRead.sh @@ -0,0 +1,24 @@ + #! /bin/bash -e + +if ! [ "${LOCALTOP}" ]; then + export LOCALTOP=${CMSSW_BASE} + cd ${CMSSW_BASE} +fi + +mkdir -p testHeterogeneousCoreCUDATestWriteRead +cd testHeterogeneousCoreCUDATestWriteRead +rm -f test.root +echo "--------------------------------------------------------------------------------" +echo "$ cmsRun ${LOCALTOP}/src/HeterogeneousCore/CUDATest/test/writer.py" +echo +cmsRun ${LOCALTOP}/src/HeterogeneousCore/CUDATest/test/writer.py +echo +echo "--------------------------------------------------------------------------------" +echo "$ cmsRun ${LOCALTOP}/src/HeterogeneousCore/CUDATest/test/reader.py" +echo +cmsRun ${LOCALTOP}/src/HeterogeneousCore/CUDATest/test/reader.py +echo +echo "--------------------------------------------------------------------------------" +echo "$ edmDumpEventContent test.root" +echo +edmDumpEventContent test.root diff --git a/HeterogeneousCore/CUDATest/test/writer.py b/HeterogeneousCore/CUDATest/test/writer.py new file mode 100644 index 0000000000000..5c93a20bcf926 --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/writer.py @@ -0,0 +1,76 @@ +import FWCore.ParameterSet.Config as cms +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA + +process = cms.Process('Writer') + +process.source = cms.Source('EmptySource') + +process.load('Configuration.StandardSequences.Accelerators_cff') + +# enable logging for the TestPortableAnalyzer +process.MessageLogger.TestPortableAnalyzer = cms.untracked.PSet() + +# run the producer on a CUDA gpu (if available) +process.testProducerCuda = cms.EDProducer('TestPortableProducerCUDA', + size = cms.int32(42) +) + +# copy the product from the gpu (if available) to the host +process.testTranscriberFromCuda = cms.EDProducer('TestPortableTranscriber', + source = cms.InputTag('testProducerCuda') +) + +# run the producer on the cpu +process.testProducerCpu = cms.EDProducer('TestPortableProducerCPU', + size = cms.int32(42) +) + +# either run the producer on a CUDA gpu (if available) and copy the product to the cpu, or run the producer directly on the cpu +process.testProducer = SwitchProducerCUDA( + cpu = cms.EDAlias( + testProducerCpu = cms.VPSet(cms.PSet(type = cms.string('*'))) + ), + cuda = cms.EDAlias( + testTranscriberFromCuda = cms.VPSet(cms.PSet(type = cms.string('*'))) + ) +) + +# analyse the product +process.testAnalyzer = cms.EDAnalyzer('TestPortableAnalyzer', + source = cms.InputTag('testProducer') +) + +# run a second producer explicitly on the cpu +process.testProducerSerial = cms.EDProducer('TestPortableProducerCPU', + size = cms.int32(99) +) + +# analyse the second product +process.testAnalyzerSerial = cms.EDAnalyzer('TestPortableAnalyzer', + source = cms.InputTag('testProducerSerial') +) + +# write the two products to a 'test.root' file +process.output = cms.OutputModule('PoolOutputModule', + fileName = cms.untracked.string('test.root'), + outputCommands = cms.untracked.vstring( + 'drop *', + 'keep *_testProducer_*_*', + 'keep *_testProducerSerial_*_*', + ) +) + +process.producer_task = cms.Task(process.testProducerCuda, process.testTranscriberFromCuda, process.testProducerCpu) + +process.process_path = cms.Path( + process.testProducer + + process.testAnalyzer, + process.producer_task) + +process.serial_path = cms.Path( + process.testProducerSerial + + process.testAnalyzerSerial) + +process.output_path = cms.EndPath(process.output) + +process.maxEvents.input = 10 diff --git a/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h index a64b7c9869d6c..ea89cc7490d85 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h +++ b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h @@ -1,6 +1,7 @@ #ifndef HeterogeneousCore_CUDAUtilities_interface_host_unique_ptr_h #define HeterogeneousCore_CUDAUtilities_interface_host_unique_ptr_h +#include #include #include @@ -10,11 +11,32 @@ namespace cms { namespace cuda { namespace host { namespace impl { - // Additional layer of types to distinguish from host::unique_ptr + + enum class MemoryType : bool { + kDefault = false, + kPinned = true, + }; + + // Custom deleter for host memory, with an internal state to distinguish pageable and pinned host memory class HostDeleter { public: - void operator()(void *ptr) { cms::cuda::free_host(ptr); } + // The default constructor is needed by the default constructor of unique_ptr, + // which is needed by the default constructor of HostProduct, which is needed by the ROOT dictionary + HostDeleter() : type_{MemoryType::kDefault} {} + HostDeleter(MemoryType type) : type_{type} {} + + void operator()(void *ptr) { + if (type_ == MemoryType::kPinned) { + cms::cuda::free_host(ptr); + } else { + std::free(ptr); + } + } + + private: + MemoryType type_; }; + } // namespace impl template @@ -36,24 +58,55 @@ namespace cms { } // namespace impl } // namespace host + // Allocate pageable host memory + template + typename host::impl::make_host_unique_selector::non_array make_host_unique() { + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the host memory is not supported"); + // Allocate a buffer aligned to 128 bytes, to match the CUDA cache line size + const size_t alignment = 128; + // std::aligned_alloc() requires the size to be a multiple of the alignment + const size_t size = (sizeof(T) + alignment - 1) / alignment * alignment; + void *mem = std::aligned_alloc(alignment, size); + return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem), + host::impl::MemoryType::kDefault}; + } + + template + typename host::impl::make_host_unique_selector::unbounded_array make_host_unique(size_t n) { + using element_type = typename std::remove_extent::type; + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the host memory is not supported"); + // Allocate a buffer aligned to 128 bytes, to match the CUDA cache line size + const size_t alignment = 128; + // std::aligned_alloc() requires the size to be a multiple of the alignment + const size_t size = (n * sizeof(element_type) + alignment - 1) / alignment * alignment; + void *mem = std::aligned_alloc(alignment, size); + return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem), + host::impl::MemoryType::kDefault}; + } + // Allocate pinned host memory template typename host::impl::make_host_unique_selector::non_array make_host_unique(cudaStream_t stream) { static_assert(std::is_trivially_constructible::value, - "Allocating with non-trivial constructor on the pinned host memory is not supported"); + "Allocating with non-trivial constructor on the host memory is not supported"); void *mem = allocate_host(sizeof(T), stream); - return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem)}; + return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem), // + host::impl::MemoryType::kPinned}; } template typename host::impl::make_host_unique_selector::unbounded_array make_host_unique(size_t n, cudaStream_t stream) { using element_type = typename std::remove_extent::type; static_assert(std::is_trivially_constructible::value, - "Allocating with non-trivial constructor on the pinned host memory is not supported"); + "Allocating with non-trivial constructor on the host memory is not supported"); void *mem = allocate_host(n * sizeof(element_type), stream); - return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem)}; + return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem), + host::impl::MemoryType::kPinned}; } + // Arrays of known bounds are not supported by std::unique_ptr template typename host::impl::make_host_unique_selector::bounded_array make_host_unique(Args &&...) = delete; @@ -61,7 +114,8 @@ namespace cms { template typename host::impl::make_host_unique_selector::non_array make_host_unique_uninitialized(cudaStream_t stream) { void *mem = allocate_host(sizeof(T), stream); - return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem)}; + return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem), // + host::impl::MemoryType::kPinned}; } template @@ -69,11 +123,14 @@ namespace cms { size_t n, cudaStream_t stream) { using element_type = typename std::remove_extent::type; void *mem = allocate_host(n * sizeof(element_type), stream); - return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem)}; + return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem), + host::impl::MemoryType::kPinned}; } + // Arrays of known bounds are not supported by std::unique_ptr template typename host::impl::make_host_unique_selector::bounded_array make_host_unique_uninitialized(Args &&...) = delete; + } // namespace cuda } // namespace cms