Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RFC] Reduce calls to cudaEventRecord() via the caching allocators #412

Open
wants to merge 5 commits into
base: CMSSW_11_0_X_Patatrack
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"

BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cudaStream_t stream) {
data_d_ = cms::cuda::make_device_unique<Data>(stream);
data_d_ = cms::cuda::make_device_unique<Data>();
cudaCheck(cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream));
}
78 changes: 41 additions & 37 deletions CUDADataFormats/Common/interface/HeterogeneousSoA.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,81 +56,86 @@ namespace cudaCompat {
template <typename T>
using unique_ptr = cms::cuda::device::unique_ptr<T>;

template <typename T>
static auto make_unique(cudaStream_t stream) {
return cms::cuda::make_device_unique<T>(stream);
}

template <typename T>
static auto make_unique(size_t size, cudaStream_t stream) {
return cms::cuda::make_device_unique<T>(size, stream);
template <typename T, typename... Args>
static auto make_unique(Args &&... args) {
return cms::cuda::make_device_unique<T>(std::forward<Args>(args)...);
}

template <typename T>
static auto make_host_unique(cudaStream_t stream) {
return cms::cuda::make_host_unique<T>(stream);
template <typename T, typename... Args>
static auto make_host_unique(Args &&... args) {
return cms::cuda::make_host_unique<T>(std::forward<Args>(args)...);
}

template <typename T>
static auto make_device_unique(cudaStream_t stream) {
return cms::cuda::make_device_unique<T>(stream);
}

template <typename T>
static auto make_device_unique(size_t size, cudaStream_t stream) {
return cms::cuda::make_device_unique<T>(size, stream);
template <typename T, typename... Args>
static auto make_device_unique(Args &&... args) {
return cms::cuda::make_device_unique<T>(std::forward<Args>(args)...);
}
};

struct HostTraits {
template <typename T>
using unique_ptr = cms::cuda::host::unique_ptr<T>;

template <typename T>
static auto make_unique(cudaStream_t stream) {
return cms::cuda::make_host_unique<T>(stream);
template <typename T, typename... Args>
static auto make_unique(Args &&... args) {
return cms::cuda::make_host_unique<T>(std::forward<Args>(args)...);
}

template <typename T>
static auto make_host_unique(cudaStream_t stream) {
return cms::cuda::make_host_unique<T>(stream);
template <typename T, typename... Args>
static auto make_host_unique(Args &&... args) {
return cms::cuda::make_host_unique<T>(std::forward<Args>(args)...);
}

template <typename T>
static auto make_device_unique(cudaStream_t stream) {
return cms::cuda::make_device_unique<T>(stream);
}

template <typename T>
static auto make_device_unique(size_t size, cudaStream_t stream) {
return cms::cuda::make_device_unique<T>(size, stream);
template <typename T, typename... Args>
static auto make_device_unique(Args &&... args) {
return cms::cuda::make_device_unique<T>(std::forward<Args>(args)...);
}
};

struct CPUTraits {
template <typename T>
using unique_ptr = std::unique_ptr<T>;

template <typename T>
static auto make_unique() {
return std::make_unique<T>();
}
template <typename T>
static auto make_unique(cudaStream_t) {
return std::make_unique<T>();
}

template <typename T>
static auto make_unique(size_t size) {
return std::make_unique<T>(size);
}
template <typename T>
static auto make_unique(size_t size, cudaStream_t) {
return std::make_unique<T>(size);
}

template <typename T>
static auto make_host_unique() {
return std::make_unique<T>();
}
template <typename T>
static auto make_host_unique(cudaStream_t) {
return std::make_unique<T>();
}

template <typename T>
static auto make_device_unique() {
return std::make_unique<T>();
}
template <typename T>
static auto make_device_unique(cudaStream_t) {
return std::make_unique<T>();
}

template <typename T>
static auto make_device_unique(size_t size) {
return std::make_unique<T>(size);
}
template <typename T>
static auto make_device_unique(size_t size, cudaStream_t) {
return std::make_unique<T>(size);
Expand All @@ -146,13 +151,12 @@ class HeterogeneousSoAImpl {
template <typename V>
using unique_ptr = typename Traits::template unique_ptr<V>;

HeterogeneousSoAImpl() = default; // make root happy
HeterogeneousSoAImpl();
~HeterogeneousSoAImpl() = default;
HeterogeneousSoAImpl(HeterogeneousSoAImpl &&) = default;
HeterogeneousSoAImpl &operator=(HeterogeneousSoAImpl &&) = default;

explicit HeterogeneousSoAImpl(unique_ptr<T> &&p) : m_ptr(std::move(p)) {}
explicit HeterogeneousSoAImpl(cudaStream_t stream);

T const *get() const { return m_ptr.get(); }

Expand All @@ -165,8 +169,8 @@ class HeterogeneousSoAImpl {
};

template <typename T, typename Traits>
HeterogeneousSoAImpl<T, Traits>::HeterogeneousSoAImpl(cudaStream_t stream) {
m_ptr = Traits::template make_unique<T>(stream);
HeterogeneousSoAImpl<T, Traits>::HeterogeneousSoAImpl() {
m_ptr = Traits::template make_unique<T>();
}

// in reality valid only for GPU version...
Expand Down
12 changes: 7 additions & 5 deletions CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,19 @@
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"

SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxClusters, cudaStream_t stream) {
moduleStart_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters + 1, stream);
clusInModule_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
moduleId_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
clusModuleStart_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters + 1, stream);
moduleStart_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters + 1);
clusInModule_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters);
moduleId_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters);
clusModuleStart_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters + 1);

// device-side ownership to guarantee that the host memory is alive
// until the copy finishes
auto view = cms::cuda::make_host_unique<DeviceConstView>(stream);
view->moduleStart_ = moduleStart_d.get();
view->clusInModule_ = clusInModule_d.get();
view->moduleId_ = moduleId_d.get();
view->clusModuleStart_ = clusModuleStart_d.get();

view_d = cms::cuda::make_device_unique<DeviceConstView>(stream);
view_d = cms::cuda::make_device_unique<DeviceConstView>();
cms::cuda::copyAsync(view_d, view, stream);
}
6 changes: 4 additions & 2 deletions CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@

SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cudaStream_t stream)
: formatterErrors_h(std::move(errors)) {
error_d = cms::cuda::make_device_unique<GPU::SimpleVector<PixelErrorCompact>>(stream);
data_d = cms::cuda::make_device_unique<PixelErrorCompact[]>(maxFedWords, stream);
error_d = cms::cuda::make_device_unique<GPU::SimpleVector<PixelErrorCompact>>();
data_d = cms::cuda::make_device_unique<PixelErrorCompact[]>(maxFedWords);

cms::cuda::memsetAsync(data_d, 0x00, maxFedWords, stream);

// device-side ownership to guarantee that the host memory is alive
// until the copy finishes
error_h = cms::cuda::make_host_unique<GPU::SimpleVector<PixelErrorCompact>>(stream);
GPU::make_SimpleVector(error_h.get(), maxFedWords, data_d.get());
assert(error_h->empty());
Expand Down
18 changes: 10 additions & 8 deletions CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,25 @@
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"

SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) {
xx_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream);
yy_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream);
adc_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream);
moduleInd_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream);
clus_d = cms::cuda::make_device_unique<int32_t[]>(maxFedWords, stream);
xx_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords);
yy_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords);
adc_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords);
moduleInd_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords);
clus_d = cms::cuda::make_device_unique<int32_t[]>(maxFedWords);

pdigi_d = cms::cuda::make_device_unique<uint32_t[]>(maxFedWords, stream);
rawIdArr_d = cms::cuda::make_device_unique<uint32_t[]>(maxFedWords, stream);
pdigi_d = cms::cuda::make_device_unique<uint32_t[]>(maxFedWords);
rawIdArr_d = cms::cuda::make_device_unique<uint32_t[]>(maxFedWords);

// device-side ownership to guarantee that the host memory is alive
// until the copy finishes
auto view = cms::cuda::make_host_unique<DeviceConstView>(stream);
view->xx_ = xx_d.get();
view->yy_ = yy_d.get();
view->adc_ = adc_d.get();
view->moduleInd_ = moduleInd_d.get();
view->clus_ = clus_d.get();

view_d = cms::cuda::make_device_unique<DeviceConstView>(stream);
view_d = cms::cuda::make_device_unique<DeviceConstView>();
cms::cuda::copyAsync(view_d, view, stream);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,13 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
uint32_t const* hitsModuleStart,
cudaStream_t stream)
: m_nHits(nHits), m_hitsModuleStart(hitsModuleStart) {
// device-side ownership to guarantee that the host memory is alive
// until the copy finishes
auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);

view->m_nHits = nHits;
m_view = Traits::template make_device_unique<TrackingRecHit2DSOAView>(stream);
m_AverageGeometryStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
m_view = Traits::template make_device_unique<TrackingRecHit2DSOAView>();
m_AverageGeometryStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::AverageGeometry>();
view->m_averageGeometry = m_AverageGeometryStore.get();
view->m_cpeParams = cpeParams;
view->m_hitsModuleStart = hitsModuleStart;
Expand All @@ -101,9 +103,9 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
// if ordering is relevant they may have to be stored phi-ordered by layer or so
// this will break 1to1 correspondence with cluster and module locality
// so unless proven VERY inefficient we keep it ordered as generated
m_store16 = Traits::template make_device_unique<uint16_t[]>(nHits * n16, stream);
m_store32 = Traits::template make_device_unique<float[]>(nHits * n32 + 11, stream);
m_HistStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::Hist>(stream);
m_store16 = Traits::template make_device_unique<uint16_t[]>(nHits * n16);
m_store32 = Traits::template make_device_unique<float[]>(nHits * n32 + 11);
m_HistStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::Hist>();

auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
auto get32 = [&](int i) { return m_store32.get() + i * nHits; };
Expand Down
8 changes: 8 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/allocate_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@
namespace cms {
namespace cuda {
// Allocate device memory
// This variant does not create device-side ownership
void *allocate_device(int dev, size_t nbytes);

// Allocate device memory
// This variant creates device-side ownership. When freed, all work
// in the stream up to the freeing point must be finished for the
// memory block to be considered free (except for new allocation in
// the same stream)
void *allocate_device(int dev, size_t nbytes, cudaStream_t stream);

// Free device memory (to be called from unique_ptr)
Expand Down
8 changes: 8 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/allocate_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@
namespace cms {
namespace cuda {
// Allocate pinned host memory (to be called from unique_ptr)
// This variant does not create device-side ownership
void *allocate_host(size_t nbytes);

// Allocate pinned host memory (to be called from unique_ptr)
// This variant creates device-side ownership. When freed, all work
// in the stream up to the freeing point must be finished for the
// memory block to be considered free (except for new allocation in
// the same stream)
void *allocate_host(size_t nbytes, cudaStream_t stream);

// Free pinned host memory (to be called from unique_ptr)
Expand Down
38 changes: 38 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,16 @@ namespace cms {
} // namespace impl
} // namespace device

// Allocate a single, trivially-constructible T in device memory on the
// current device, without associating a CUDA stream. Per allocate_device(),
// this variant creates no device-side ownership: the memory is considered
// free as soon as the returned unique_ptr is destroyed, so the caller must
// ensure no asynchronous work still uses it at that point.
template <typename T>
typename device::impl::make_device_unique_selector<T>::non_array make_device_unique() {
// device code cannot run constructors for this allocation path
static_assert(std::is_trivially_constructible<T>::value,
"Allocating with non-trivial constructor on the device memory is not supported");
int dev = currentDevice();
void *mem = allocate_device(dev, sizeof(T));
// the deleter remembers the device so the free targets the right one
return typename device::impl::make_device_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
device::impl::DeviceDeleter{dev}};
}

template <typename T>
typename device::impl::make_device_unique_selector<T>::non_array make_device_unique(cudaStream_t stream) {
static_assert(std::is_trivially_constructible<T>::value,
Expand All @@ -58,6 +68,17 @@ namespace cms {
device::impl::DeviceDeleter{dev}};
}

// Allocate an unbounded array T[] of n trivially-constructible elements in
// device memory on the current device, without associating a CUDA stream.
// As with the scalar overload, no device-side ownership is created: the
// block is considered free immediately when the unique_ptr is destroyed.
template <typename T>
typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique(size_t n) {
using element_type = typename std::remove_extent<T>::type;
// device code cannot run constructors for this allocation path
static_assert(std::is_trivially_constructible<element_type>::value,
"Allocating with non-trivial constructor on the device memory is not supported");
int dev = currentDevice();
void *mem = allocate_device(dev, n * sizeof(element_type));
return typename device::impl::make_device_unique_selector<T>::unbounded_array{
reinterpret_cast<element_type *>(mem), device::impl::DeviceDeleter{dev}};
}

template <typename T>
typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique(size_t n,
cudaStream_t stream) {
Expand All @@ -74,6 +95,14 @@ namespace cms {
typename device::impl::make_device_unique_selector<T>::bounded_array make_device_unique(Args &&...) = delete;

// No check for the trivial constructor, make it clear in the interface
// Streamless variant: like make_device_unique<T>(), but deliberately skips
// the trivially-constructible static_assert — the "_uninitialized" name
// signals that the memory is raw and the caller takes responsibility for
// initializing it (e.g. via cudaMemcpyAsync or a kernel).
template <typename T>
typename device::impl::make_device_unique_selector<T>::non_array make_device_unique_uninitialized() {
int dev = currentDevice();
void *mem = allocate_device(dev, sizeof(T));
return typename device::impl::make_device_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
device::impl::DeviceDeleter{dev}};
}

template <typename T>
typename device::impl::make_device_unique_selector<T>::non_array make_device_unique_uninitialized(
cudaStream_t stream) {
Expand All @@ -83,6 +112,15 @@ namespace cms {
device::impl::DeviceDeleter{dev}};
}

// Streamless array variant of make_device_unique_uninitialized: allocates
// n elements of raw device memory on the current device with no
// trivially-constructible check and no device-side ownership. The caller
// must initialize the memory before use and ensure no asynchronous work
// still references it when the unique_ptr is destroyed.
template <typename T>
typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique_uninitialized(size_t n) {
using element_type = typename std::remove_extent<T>::type;
int dev = currentDevice();
void *mem = allocate_device(dev, n * sizeof(element_type));
return typename device::impl::make_device_unique_selector<T>::unbounded_array{
reinterpret_cast<element_type *>(mem), device::impl::DeviceDeleter{dev}};
}

template <typename T>
typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique_uninitialized(
size_t n, cudaStream_t stream) {
Expand Down
Loading