Move BeamSpot transfer to GPU to its own producer (cms-sw#318)

Implement a non-caching host allocator, useful for host-to-device copy buffers: - not bound to any CUDA stream to allow use in EDM beginStream(); - with the possibility to pass flags to cudaHostAlloc(), e.g. cudaHostAllocWriteCombined. Add perfect forwarding overload for CUDAProduct constructor, enabling the use of CUDAScopedContext::emplace() in BeamSpotToCUDA::produce(). Move the BeamSpot host-to-device transfer to its own EDProducer, making use of beginStream()-allocated write-combined memory for the transfer.
makortel · Apr 23, 2019 · fcbb820 · fcbb820
1 parent 9407e0e
commit fcbb820
Show file tree

Hide file tree

Showing 23 changed files with 300 additions and 35 deletions.
diff --git a/CUDADataFormats/BeamSpot/BuildFile.xml b/CUDADataFormats/BeamSpot/BuildFile.xml
@@ -0,0 +1,8 @@
+<use name="FWCore/ServiceRegistry"/>
+<use name="HeterogeneousCore/CUDAServices"/>
+<use name="cuda-api-wrappers"/>
+<use name="rootcore"/>
+
+<export>
+    <lib name="1"/>
+</export>
diff --git a/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h b/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h
@@ -0,0 +1,32 @@
+#ifndef CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h
+#define CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+#include <cuda/api_wrappers.h>
+
+class BeamSpotCUDA {
+public:
+  // alignas(128) doesn't really make sense as there is only one
+  // beamspot per event? 
+  struct Data {
+    float x,y,z;   // position
+    // TODO: add covariance matrix
+
+    float sigmaZ;
+    float beamWidthX, beamWidthY;
+    float dxdz, dydz;
+    float emittanceX, emittanceY;
+    float betaStar;
+  };
+
+  BeamSpotCUDA() = default;
+  BeamSpotCUDA(Data const* data_h, cuda::stream_t<>& stream);
+
+  Data const* data() const { return data_d_.get(); }
+
+private:
+  cudautils::device::unique_ptr<Data> data_d_;
+};
+
+#endif
diff --git a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
@@ -0,0 +1,11 @@
+#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
+
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cuda::stream_t<>& stream) {
+  edm::Service<CUDAService> cs;
+
+  data_d_ = cs->make_device_unique<Data>(stream);
+  cuda::memory::async::copy(data_d_.get(), data_h, sizeof(Data), stream.id());
+}
diff --git a/CUDADataFormats/BeamSpot/src/classes.h b/CUDADataFormats/BeamSpot/src/classes.h
@@ -0,0 +1,8 @@
+#ifndef CUDADataFormats_BeamSpot_classes_h
+#define CUDADataFormats_BeamSpot_classes_h
+
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+
+#endif
diff --git a/CUDADataFormats/BeamSpot/src/classes_def.xml b/CUDADataFormats/BeamSpot/src/classes_def.xml
@@ -0,0 +1,4 @@
+<lcgdict>
+  <class name="CUDAProduct<BeamSpotCUDA>" persistent="false"/>
+  <class name="edm::Wrapper<CUDAProduct<BeamSpotCUDA>>" persistent="false"/>
+</lcgdict>
diff --git a/CUDADataFormats/Common/interface/CUDAProduct.h b/CUDADataFormats/Common/interface/CUDAProduct.h
@@ -45,6 +45,12 @@ class CUDAProduct: public CUDAProductBase {
     data_(std::move(data))
   {}
 
+  template <typename... Args>
+  explicit CUDAProduct(int device, std::shared_ptr<cuda::stream_t<>> stream, std::shared_ptr<cuda::event_t> event, Args&&... args):
+    CUDAProductBase(device, std::move(stream), std::move(event)),
+    data_(std::forward<Args>(args)...)
+  {}
+
   T data_; //!
 };
 

diff --git a/Configuration/StandardSequences/python/Reconstruction_cff.py b/Configuration/StandardSequences/python/Reconstruction_cff.py
@@ -198,9 +198,9 @@
 reconstruction_trackingOnly = cms.Sequence(localreco*globalreco_tracking)
 reconstruction_pixelTrackingOnly = cms.Sequence(
     pixeltrackerlocalreco*
-    offlineBeamSpot*
     siPixelClusterShapeCachePreSplitting*
-    recopixelvertexing
+    recopixelvertexing,
+    offlineBeamSpotTask
 )
 
 #need a fully expanded sequence copy

diff --git a/HeterogeneousCore/CUDAUtilities/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/BuildFile.xml
@@ -1,5 +1,6 @@
 <use name="cub"/>
 <use name="cuda"/>
+<use name="cuda-api-wrappers"/>
 
 <export>
     <lib name="1"/>

diff --git a/HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h
@@ -0,0 +1,65 @@
+#ifndef HeterogeneousCore_CUDAUtilities_interface_host_noncached_unique_ptr_h
+#define HeterogeneousCore_CUDAUtilities_interface_host_noncached_unique_ptr_h
+
+#include <memory>
+
+#include <cuda/api_wrappers.h>
+#include <cuda_runtime.h>
+
+namespace cudautils {
+  namespace host {
+    namespace noncached {
+      namespace impl {
+        // Additional layer of types to distinguish from host::unique_ptr
+        class HostDeleter {
+        public:
+          void operator()(void *ptr) {
+            cuda::throw_if_error(cudaFreeHost(ptr));
+          }
+        };
+      }
+
+      template <typename T>
+      using unique_ptr = std::unique_ptr<T, impl::HostDeleter>;
+
+      namespace impl {
+        template <typename T>
+        struct make_host_unique_selector { using non_array = cudautils::host::noncached::unique_ptr<T>; };
+        template <typename T>
+        struct make_host_unique_selector<T[]> { using unbounded_array = cudautils::host::noncached::unique_ptr<T[]>; };
+        template <typename T, size_t N>
+        struct make_host_unique_selector<T[N]> { struct bounded_array {}; };
+      }
+    }
+  }
+
+  /**
+   * The difference wrt. CUDAService::make_host_unique is that these
+   * do not cache, so they should not be called per-event.
+   */
+  template <typename T>
+  typename host::noncached::impl::make_host_unique_selector<T>::non_array
+  make_host_noncached_unique(unsigned int flags = cudaHostAllocDefault) {
+    static_assert(std::is_trivially_constructible<T>::value, "Allocating with non-trivial constructor on the pinned host memory is not supported");
+    void *mem;
+    cuda::throw_if_error(cudaHostAlloc(&mem, sizeof(T), flags));
+    return typename cudautils::host::noncached::impl::make_host_unique_selector<T>::non_array(reinterpret_cast<T *>(mem));
+  }
+
+  template <typename T>
+  typename host::noncached::impl::make_host_unique_selector<T>::unbounded_array
+  make_host_noncached_unique(size_t n, unsigned int flags = cudaHostAllocDefault) {
+    using element_type = typename std::remove_extent<T>::type;
+    static_assert(std::is_trivially_constructible<element_type>::value, "Allocating with non-trivial constructor on the pinned host memory is not supported");
+    void *mem;
+    cuda::throw_if_error(cudaHostAlloc(&mem, n*sizeof(element_type), flags));
+    return typename cudautils::host::noncached::impl::make_host_unique_selector<T>::unbounded_array(reinterpret_cast<element_type *>(mem));
+  }
+
+  template <typename T, typename ...Args>
+  typename cudautils::host::noncached::impl::make_host_unique_selector<T>::bounded_array
+  make_host_noncached_unique(Args&&...) = delete;
+}
+
+#endif
+
diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
@@ -68,7 +68,7 @@
   <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
 </bin>
 
-<bin file="testCatch2Main.cpp,copyAsync_t.cpp,memsetAsync_t.cpp" name="cudaMemUtils_t">
+<bin file="testCatch2Main.cpp,copyAsync_t.cpp,memsetAsync_t.cpp,host_noncached_unique_ptr_t.cpp" name="cudaMemUtils_t">
   <use name="HeterogeneousCore/CUDAServices"/>
   <use name="catch2"/>
 </bin>
diff --git a/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp
@@ -0,0 +1,22 @@
+#include "catch.hpp"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+
+TEST_CASE("host_noncached_unique_ptr", "[cudaMemTools]") {
+  exitSansCUDADevices();
+
+  SECTION("Single element") {
+    auto ptr1 = cudautils::make_host_noncached_unique<int>();
+    REQUIRE(ptr1 != nullptr);
+    auto ptr2 = cudautils::make_host_noncached_unique<int>(cudaHostAllocPortable | cudaHostAllocWriteCombined);
+    REQUIRE(ptr2 != nullptr);
+  }
+
+  SECTION("Multiple elements") {
+    auto ptr1 = cudautils::make_host_noncached_unique<int[]>(10);
+    REQUIRE(ptr1 != nullptr);
+    auto ptr2 = cudautils::make_host_noncached_unique<int[]>(10, cudaHostAllocPortable | cudaHostAllocWriteCombined);
+    REQUIRE(ptr2 != nullptr);
+  }
+}
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc
@@ -24,7 +24,9 @@
 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
 #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h"
 #include "RecoTracker/Record/interface/CkfComponentsRecord.h"
 
@@ -62,6 +64,7 @@ class SiPixelRawToClusterCUDA: public edm::stream::EDProducer<edm::ExternalWork>
   std::unique_ptr<PixelUnpackingRegions> regions_;
 
   pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_;
+  std::unique_ptr<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender> wordFedAppender_;
   PixelDataFormatter::Errors errors_;
 
   const bool includeErrors_;
@@ -88,6 +91,11 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfi
   }
 
   if(usePilotBlade_) edm::LogInfo("SiPixelRawToCluster")  << " Use pilot blade data (FED 40)";
+
+  edm::Service<CUDAService> cs;
+  if(cs->enabled()) {
+    wordFedAppender_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender>();
+  }
 }
 
 void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
@@ -161,7 +169,6 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event
 
   // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData()
   ErrorChecker errorcheck;
-  auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(ctx.stream());
   for(int fedId: fedIds_) {
     if (!usePilotBlade_ && (fedId==40) ) continue; // skip pilot blade data
     if (regions_ && !regions_->mayUnpackFED(fedId)) continue;
@@ -209,13 +216,13 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event
     const cms_uint32_t * ew = (const cms_uint32_t *)(trailer);
 
     assert(0 == (ew-bw)%2);
-    wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw));
+    wordFedAppender_->initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw));
     wordCounterGPU+=(ew-bw);
 
   } // end of for loop
 
   gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack, gpuGains,
-                             wordFedAppender,
+                             *wordFedAppender_,
                              std::move(errors_),
                              wordCounterGPU, fedCounter, 
                              useQuality_, includeErrors_,

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -45,10 +45,9 @@ namespace pixelgpudetails {
   // number of words for all the FEDs
   constexpr uint32_t MAX_FED_WORDS   = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;
 
-  SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(cuda::stream_t<>& cudaStream) {
-    edm::Service<CUDAService> cs;
-    word_ = cs->make_host_unique<unsigned int[]>(MAX_FED_WORDS, cudaStream);
-    fedId_ = cs->make_host_unique<unsigned char[]>(MAX_FED_WORDS, cudaStream);
+  SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender() {
+    word_ = cudautils::make_host_noncached_unique<unsigned int[]>(MAX_FED_WORDS, cudaHostAllocWriteCombined);
+    fedId_ = cudautils::make_host_noncached_unique<unsigned char[]>(MAX_FED_WORDS, cudaHostAllocWriteCombined);
   }
 
   void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -11,6 +11,7 @@
 #include "FWCore/Utilities/interface/typedefs.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
 #include "DataFormats/SiPixelDigi/interface/PixelErrors.h"
 
 struct SiPixelFedCablingMapGPU;
@@ -159,7 +160,7 @@ namespace pixelgpudetails {
   public:
     class WordFedAppender {
     public:
-      WordFedAppender(cuda::stream_t<>& cudaStream);
+      WordFedAppender();
       ~WordFedAppender() = default;
 
       void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length);
@@ -168,8 +169,8 @@ namespace pixelgpudetails {
       const unsigned char *fedId() const { return fedId_.get(); }
 
     private:
-      cudautils::host::unique_ptr<unsigned int[]> word_;
-      cudautils::host::unique_ptr<unsigned char[]> fedId_;
+      cudautils::host::noncached::unique_ptr<unsigned int[]> word_;
+      cudautils::host::noncached::unique_ptr<unsigned char[]> fedId_;
     };
 
     SiPixelRawToClusterGPUKernel() = default;

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml
@@ -1,3 +1,4 @@
+<use name="CUDADataFormats/BeamSpot"/>
 <use name="DataFormats/TrackerCommon"/>
 <use name="HeterogeneousCore/CUDACore"/>
 <use name="HeterogeneousCore/Producer"/>

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -36,7 +36,6 @@ namespace pixelgpudetails {
 
     constexpr auto MAX_HITS = siPixelRecHitsHeterogeneousProduct::maxHits();
 
-    cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float)));
     cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t)));
 
     // Coalesce all 32bit and 16bit arrays to two big blobs
@@ -111,7 +110,6 @@ namespace pixelgpudetails {
 #endif
   }
   PixelRecHitGPUKernel::~PixelRecHitGPUKernel() {
-    cudaCheck(cudaFree(gpu_.bs_d));
     cudaCheck(cudaFree(gpu_.hitsLayerStart_d));
     cudaCheck(cudaFree(gpu_.owner_32bit_));
     cudaCheck(cudaFree(gpu_.owner_16bit_));
@@ -131,11 +129,10 @@ namespace pixelgpudetails {
 
   void PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d,
                                            SiPixelClustersCUDA const& clusters_d,
-                                           float const * bs,
+                                           BeamSpotCUDA const& bs_d,
                                            pixelCPEforGPU::ParamsOnGPU const * cpeParams,
                                            bool transferToCPU,
                                            cuda::stream_t<>& stream) {
-    cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id()));
     gpu_.hitsModuleStart_d = clusters_d.clusModuleStart();
     gpu_.cpeParams = cpeParams; // copy it for use in clients
     cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id()));
@@ -148,7 +145,7 @@ namespace pixelgpudetails {
 #endif
     gpuPixelRecHits::getHits<<<blocks, threadsPerBlock, 0, stream.id()>>>(
       cpeParams,
-      gpu_.bs_d,
+      bs_d.data(),
       digis_d.moduleInd(),
       digis_d.xx(), digis_d.yy(), digis_d.adc(),
       clusters_d.moduleStart(),

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
@@ -1,6 +1,7 @@
 #ifndef RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h
 #define RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h
 
+#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
 #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
 #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h"
@@ -34,7 +35,7 @@ namespace pixelgpudetails {
 
     void makeHitsAsync(SiPixelDigisCUDA const& digis_d,
                        SiPixelClustersCUDA const& clusters_d,
-                       float const * bs,
+                       BeamSpotCUDA const& bs_d,
                        pixelCPEforGPU::ParamsOnGPU const * cpeParams,
                        bool transferToCPU,
                        cuda::stream_t<>& stream);