
Commit

change name
VinInn committed May 5, 2022
1 parent c429d13 commit 6b050bd
Showing 6 changed files with 94 additions and 81 deletions.
CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
@@ -2,10 +2,12 @@
#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h

#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"

template <typename Traits>
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

template <memoryPool::Where where>
class TrackingRecHit2DHeterogeneous {
public:
enum class Storage32 {
@@ -30,7 +32,7 @@ class TrackingRecHit2DHeterogeneous {
};

template <typename T>
using unique_ptr = typename Traits::template unique_ptr<T>;
using unique_ptr = typename memoryPool::unique_ptr<T>;

using PhiBinner = TrackingRecHit2DSOAView::PhiBinner;

@@ -43,7 +45,7 @@ class TrackingRecHit2DHeterogeneous {
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream,
TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input = nullptr);
TrackingRecHit2DHeterogeneous<memoryPool::onDevice> const* input = nullptr);

explicit TrackingRecHit2DHeterogeneous(
float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream = nullptr);
@@ -67,22 +69,22 @@ class TrackingRecHit2DHeterogeneous {
auto phiBinnerStorage() { return m_phiBinnerStorage; }
auto iphi() { return m_iphi; }

cms::cuda::host::unique_ptr<float[]> localCoordToHostAsync(cudaStream_t stream) const;
unique_ptr<float> localCoordToHostAsync(cudaStream_t stream) const;

cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const;
unique_ptr<uint32_t> hitsModuleStartToHostAsync(cudaStream_t stream) const;

cms::cuda::host::unique_ptr<uint16_t[]> store16ToHostAsync(cudaStream_t stream) const;
cms::cuda::host::unique_ptr<float[]> store32ToHostAsync(cudaStream_t stream) const;
unique_ptr<uint16_t> store16ToHostAsync(cudaStream_t stream) const;
unique_ptr<float> store32ToHostAsync(cudaStream_t stream) const;

// needs specialization for Host
void copyFromGPU(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input, cudaStream_t stream);
void copyFromGPU(TrackingRecHit2DHeterogeneous<memoryPool::onDevice> const* input, cudaStream_t stream);

private:
static constexpr uint32_t n16 = 4; // number of elements in m_store16
static constexpr uint32_t n32 = 10; // number of elements in m_store32
static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious
static_assert(n32 == static_cast<uint32_t>(Storage32::kLayers));
unique_ptr<uint16_t[]> m_store16; //!
unique_ptr<uint16_t> m_store16; //!
unique_ptr<float[]> m_store32; //!

unique_ptr<TrackingRecHit2DSOAView::PhiBinner> m_PhiBinnerStore; //!
@@ -103,39 +105,38 @@ class TrackingRecHit2DHeterogeneous {
int16_t* m_iphi;
};

using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::CPUTraits>;
using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous<cms::cudacompat::HostTraits>;
using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous<memoryPool::onDevice>;
using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous<memoryPool::onCPU>;
using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous<memoryPool::onHost>;

#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

template <typename Traits>
TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
template <memoryPool::Where where>
TrackingRecHit2DHeterogeneous<where>::TrackingRecHit2DHeterogeneous(
uint32_t nHits,
bool isPhase2,
int32_t offsetBPIX2,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream,
TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input)
TrackingRecHit2DHeterogeneous<memoryPool::onDevice> const* input)
: m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) {
auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);
using namespace memoryPool::cuda;
auto view = make_unique<TrackingRecHit2DSOAView>(1, stream, memoryPool::onCPU == where ? memoryPool::onCPU : memoryPool::onHost);

m_nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;

view->m_nHits = nHits;
view->m_nMaxModules = m_nMaxModules;
m_view = Traits::template make_unique<TrackingRecHit2DSOAView>(stream); // leave it on host and pass it by value?
m_AverageGeometryStore = Traits::template make_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
m_view = make_unique<TrackingRecHit2DSOAView>(1, stream, where);  // leave it on host and pass it by value?
m_AverageGeometryStore = make_unique<TrackingRecHit2DSOAView::AverageGeometry>(1, stream, where);
view->m_averageGeometry = m_AverageGeometryStore.get();
view->m_cpeParams = cpeParams;
view->m_hitsModuleStart = hitsModuleStart;

// if empty do not bother
if (0 == nHits) {
if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
cms::cuda::copyAsync(m_view, view, stream);
if constexpr (memoryPool::onDevice == where) {
cudaCheck(cudaMemcpyAsync(m_view.get(), view.get(), sizeof(TrackingRecHit2DSOAView), cudaMemcpyHostToDevice, stream));
} else {
m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version
}
@@ -149,17 +150,17 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
// so unless proven VERY inefficient we keep it ordered as generated

// host copy is "reduced" (to be reviewed at some point)
if constexpr (std::is_same_v<Traits, cms::cudacompat::HostTraits>) {
if constexpr (memoryPool::onHost == where) {
// it has to compile for ALL cases
copyFromGPU(input, stream);
} else {
assert(input == nullptr);

auto nL = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers;

m_store16 = Traits::template make_unique<uint16_t[]>(nHits * n16, stream);
m_store32 = Traits::template make_unique<float[]>(nHits * n32 + nL + 1, stream);
m_PhiBinnerStore = Traits::template make_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);
m_store16 = make_unique<uint16_t>(nHits * n16, stream, where);
m_store32 = make_unique<float>(nHits * n32 + nL + 1, stream, where);
m_PhiBinnerStore = make_unique<TrackingRecHit2DSOAView::PhiBinner>(1, stream, where);
}

static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
@@ -178,7 +179,7 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
view->m_yerr = get32(Storage32::kYerror);
view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(Storage32::kCharge));

if constexpr (!std::is_same_v<Traits, cms::cudacompat::HostTraits>) {
if constexpr (memoryPool::onHost != where) {
assert(input == nullptr);
view->m_xg = get32(Storage32::kXGlobal);
view->m_yg = get32(Storage32::kYGlobal);
@@ -197,45 +198,46 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
}

// transfer view
if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
cms::cuda::copyAsync(m_view, view, stream);
if constexpr (memoryPool::onDevice == where) {
cudaCheck(cudaMemcpyAsync(m_view.get(), view.get(), sizeof(TrackingRecHit2DSOAView), cudaMemcpyHostToDevice, stream));
} else {
m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version
}
}

//this is intended to be used only for CPU SoA but doesn't hurt to have it for all cases
template <typename Traits>
TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
template <memoryPool::Where where>
TrackingRecHit2DHeterogeneous<where>::TrackingRecHit2DHeterogeneous(
float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream)
: m_nHits(nHits), m_hitsModuleStart(modules) {
auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);

m_view = Traits::template make_unique<TrackingRecHit2DSOAView>(stream);
auto view = make_unique<TrackingRecHit2DSOAView>(1, stream, memoryPool::onCPU == where ? memoryPool::onCPU : memoryPool::onHost);

m_view = make_unique<TrackingRecHit2DSOAView>(1, stream, where);

view->m_nHits = nHits;

if (0 == nHits) {
if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
cms::cuda::copyAsync(m_view, view, stream);
if constexpr (memoryPool::onDevice == where) {
cudaCheck(cudaMemcpyAsync(m_view.get(), view.get(), sizeof(TrackingRecHit2DSOAView), cudaMemcpyHostToDevice, stream));
} else {
m_view = std::move(view);
}
return;
}

m_store16 = Traits::template make_unique<uint16_t[]>(nHits * n16, stream);
m_store32 = Traits::template make_unique<float[]>(nHits * n32, stream);
m_PhiBinnerStore = Traits::template make_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);
m_AverageGeometryStore = Traits::template make_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
m_store16 = make_unique<uint16_t>(nHits * n16, stream, where);
m_store32 = make_unique<float>(nHits * n32, stream, where);
m_PhiBinnerStore = make_unique<TrackingRecHit2DSOAView::PhiBinner>(1, stream, where);
m_AverageGeometryStore = make_unique<TrackingRecHit2DSOAView::AverageGeometry>(1, stream, where);

view->m_averageGeometry = m_AverageGeometryStore.get();
view->m_hitsModuleStart = m_hitsModuleStart;

//store transfer
if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
cms::cuda::copyAsync(m_store16, store16, stream);
cms::cuda::copyAsync(m_store32, store32, stream);
if constexpr (memoryPool::onDevice == where) {
cudaCheck(cudaMemcpyAsync(m_store32.get(), store32, sizeof(float) * nHits * n32, cudaMemcpyHostToDevice, stream));
cudaCheck(cudaMemcpyAsync(m_store16.get(), store16, sizeof(uint16_t) * nHits * n16, cudaMemcpyHostToDevice, stream));
} else {
std::copy(store32, store32 + nHits * n32, m_store32.get()); // want to copy it
std::copy(store16, store16 + nHits * n16, m_store16.get());
@@ -267,10 +269,10 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));

// transfer view
if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
cms::cuda::copyAsync(m_view, view, stream);
if constexpr (memoryPool::onDevice == where) {
cudaCheck(cudaMemcpyAsync(m_view.get(), view.get(), sizeof(TrackingRecHit2DSOAView), cudaMemcpyHostToDevice, stream));
} else {
m_view = std::move(view);
}
}
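For readers new to this pattern: the constructors above stage a host-side copy of the SoA view, fill it with device pointers, and then transfer the whole struct to the device with one cudaMemcpyAsync. Below is a minimal standalone sketch of that pattern using plain CUDA runtime calls instead of the memory pool; MiniView is an illustrative stand-in for TrackingRecHit2DSOAView, not part of this code base.

#include <cuda_runtime.h>

struct MiniView {
  float* data;         // device pointer stored inside the view
  unsigned int nHits;  // payload size
};

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  constexpr unsigned int nHits = 20;

  // device storage the view will point to
  float* d_data = nullptr;
  cudaMalloc(&d_data, sizeof(float) * nHits);

  // host-side staging view (pinned, so the async copy does not block)
  MiniView* h_view = nullptr;
  cudaMallocHost(&h_view, sizeof(MiniView));
  h_view->data = d_data;
  h_view->nHits = nHits;

  // device-side view, filled with a single async copy of the whole struct
  MiniView* d_view = nullptr;
  cudaMalloc(&d_view, sizeof(MiniView));
  cudaMemcpyAsync(d_view, h_view, sizeof(MiniView), cudaMemcpyHostToDevice, stream);

  // the staging view must outlive the copy
  cudaStreamSynchronize(stream);

  cudaFree(d_view);
  cudaFreeHost(h_view);
  cudaFree(d_data);
  cudaStreamDestroy(stream);
  return 0;
}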

CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
@@ -4,6 +4,7 @@
#include <cuda_runtime.h>

#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
@@ -25,7 +26,7 @@ class TrackingRecHit2DSOAView {

using AverageGeometry = pixelTopology::AverageGeometry;

template <typename>
template <memoryPool::Where>
friend class TrackingRecHit2DHeterogeneous;
friend class TrackingRecHit2DReduced;

31 changes: 17 additions & 14 deletions CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc
@@ -1,33 +1,36 @@
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

using namespace memoryPool::cuda;

template <>
cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<float[]>(5 * nHits(), stream);
cms::cuda::copyAsync(ret, m_store32, 5 * nHits(), stream);
memoryPool::unique_ptr<float> TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_t stream) const {
auto ret = make_unique<float>(5 * nHits(), stream, onHost);
cudaCheck(
cudaMemcpyAsync(ret.get(), m_store32.get(), sizeof(float) * 5 * nHits(), cudaMemcpyDefault, stream));
return ret;
}

template <>
cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<float[]>(static_cast<int>(n32) * nHits(), stream);
cms::cuda::copyAsync(ret, m_store32, static_cast<int>(n32) * nHits(), stream);
memoryPool::unique_ptr<float> TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const {
auto ret = make_unique<float>(static_cast<int>(n32) * nHits(), stream, onHost);
cudaCheck(
cudaMemcpyAsync(ret.get(), m_store32.get(), sizeof(float) * static_cast<int>(n32) * nHits(), cudaMemcpyDefault, stream));
return ret;
}

template <>
cms::cuda::host::unique_ptr<uint16_t[]> TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<uint16_t[]>(static_cast<int>(n16) * nHits(), stream);
cms::cuda::copyAsync(ret, m_store16, static_cast<int>(n16) * nHits(), stream);
memoryPool::unique_ptr<uint16_t> TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const {
auto ret = make_unique<uint16_t>(static_cast<int>(n16) * nHits(), stream, onHost);
cudaCheck(
cudaMemcpyAsync(ret.get(), m_store16.get(), sizeof(uint16_t) * static_cast<int>(n16) * nHits(), cudaMemcpyDefault, stream));
return ret;
}

template <>
cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<uint32_t[]>(nMaxModules() + 1, stream);
memoryPool::unique_ptr<uint32_t> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
auto ret = make_unique<uint32_t>(nMaxModules() + 1, stream, onHost);
cudaCheck(
cudaMemcpyAsync(ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (nMaxModules() + 1), cudaMemcpyDefault, stream));
return ret;
17 changes: 11 additions & 6 deletions HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h
@@ -18,6 +18,11 @@ namespace memoryPool {
// schedule free
void free(cudaStream_t stream, std::vector<int> buckets, SimplePoolAllocator &pool);

template<typename T>
auto copy(buffer<T> & dst,buffer<T> const & src, uint64_t size, cudaStream_t stream) {
return cudaMemcpyAsync(dst.get(), src.get(), sizeof(T)*size, cudaMemcpyDefault, stream);
}

struct CudaDeleterBase : public DeleterBase {
CudaDeleterBase(cudaStream_t const &stream, Where where) : DeleterBase(getPool(where)), m_stream(stream) {}

@@ -46,24 +51,24 @@ namespace memoryPool {
};

template <typename T>
unique_ptr<T> make_unique(uint64_t size, Deleter del) {
buffer<T> make_buffer(uint64_t size, Deleter del) {
auto ret = alloc(sizeof(T) * size, *del.pool());
if (ret.second < 0)
throw std::bad_alloc();
del.setBucket(ret.second);
return unique_ptr<T>((T *)(ret.first), del);
return buffer<T>((T *)(ret.first), del);
}

template <typename T>
unique_ptr<T> make_unique(uint64_t size, cudaStream_t const &stream, Where where) {
return make_unique<T>(sizeof(T) * size, Deleter(std::make_shared<DeleteOne>(stream, getPool(where))));
buffer<T> make_buffer(uint64_t size, cudaStream_t const &stream, Where where) {
return make_buffer<T>(sizeof(T) * size, Deleter(std::make_shared<DeleteOne>(stream, getPool(where))));
}

/*
template< class T, class... Args >
memoryPool::unique_ptr<T> make_unique( Args&&... args );
memoryPool::buffer<T> make_buffer( Args&&... args );
template< class T, class... Args >
memoryPool::unique_ptr<T> make_unique(Deleter del, Args&&... args );
memoryPool::buffer<T> make_buffer(Deleter del, Args&&... args );
*/

} // namespace cuda
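A sketch of how the new make_buffer()/copy() helpers fit together; it mirrors the calls in the updated testPoolUI.cu below and assumes a CMSSW build environment in which the device and host pools have already been initialised. The function name transferExample is illustrative only.

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

void transferExample(cudaStream_t stream) {
  // one 20-element buffer in device memory, one in pinned host memory
  auto dev = memoryPool::cuda::make_buffer<int>(20, stream, memoryPool::onDevice);
  auto host = memoryPool::cuda::make_buffer<int>(20, stream, memoryPool::onHost);

  for (int i = 0; i < 20; ++i)
    host.get()[i] = i;

  // copy() wraps cudaMemcpyAsync with cudaMemcpyDefault and multiplies the element count by sizeof(T)
  cudaCheck(memoryPool::cuda::copy(dev, host, 20, stream));
  cudaCheck(memoryPool::cuda::copy(host, dev, 20, stream));
  cudaCheck(cudaStreamSynchronize(stream));
}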
2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDAUtilities/interface/memoryPool.h
@@ -42,6 +42,6 @@ namespace memoryPool {
};

template <typename T>
using unique_ptr = std::unique_ptr<T, Deleter>;
using buffer = std::unique_ptr<T, Deleter>;

} // namespace memoryPool
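For context, buffer<T> is simply a std::unique_ptr whose stateful deleter returns the allocation to a pool instead of freeing it. A toy, self-contained illustration of that idea follows; ToyPool, ToyDeleter and the other names are illustrative only, not the CMSSW classes, and the toy ignores block sizes on reuse (the real pool buckets by size).

#include <cstdlib>
#include <iostream>
#include <memory>
#include <vector>

struct ToyPool {
  std::vector<void*> freeList;  // blocks ready for reuse
  void* allocate(std::size_t bytes) {
    if (!freeList.empty()) {  // reuse a returned block if one is available
      void* p = freeList.back();
      freeList.pop_back();
      return p;
    }
    return std::malloc(bytes);
  }
  void release(void* p) { freeList.push_back(p); }  // hand the block back to the pool
  ~ToyPool() {
    for (void* p : freeList)
      std::free(p);
  }
};

struct ToyDeleter {
  ToyPool* pool = nullptr;
  void operator()(void* p) const { pool->release(p); }
};

template <typename T>
using toy_buffer = std::unique_ptr<T, ToyDeleter>;  // same shape as memoryPool::buffer<T>

template <typename T>
toy_buffer<T> make_toy_buffer(std::size_t n, ToyPool& pool) {
  return toy_buffer<T>(static_cast<T*>(pool.allocate(sizeof(T) * n)), ToyDeleter{&pool});
}

int main() {
  ToyPool pool;
  {
    auto b = make_toy_buffer<int>(20, pool);
    b.get()[0] = 42;
  }  // the deleter returns the block to the pool here instead of freeing it
  std::cout << "blocks available for reuse: " << pool.freeList.size() << "\n";  // prints 1
  return 0;
}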
28 changes: 15 additions & 13 deletions HeterogeneousCore/CUDAUtilities/test/testPoolUI.cu
@@ -3,7 +3,7 @@

template <memoryPool::Where where>
struct dataProducer {
static auto operator()(cudaStream_t stream) { return memoryPool::cuda::make_unique<int>(20, stream, where); }
auto operator()(cudaStream_t stream) { return memoryPool::cuda::make_buffer<int>(20, stream, where); }
};

int main() {
@@ -33,28 +33,30 @@ int main() {
auto& stream = streams[0];

{
auto pd = memoryPool::cuda::make_unique<int>(20, stream, memoryPool::onDevice);
auto ph = memoryPool::cuda::make_unique<int>(20, stream, memoryPool::onHost);
auto pc = memoryPool::cuda::make_unique<int>(20, stream, memoryPool::onCPU);
auto pd = memoryPool::cuda::make_buffer<int>(20, stream, memoryPool::onDevice);
auto ph = memoryPool::cuda::make_buffer<int>(20, stream, memoryPool::onHost);
auto pc = memoryPool::cuda::make_buffer<int>(20, stream, memoryPool::onCPU);

auto dp = dataProducer<onDevice>(stream);
auto dp = dataProducer<memoryPool::onDevice>()(stream);

memoryPool::cuda::copy(ph, pd, 20, stream);
memoryPool::cuda::copy(pd, ph, 20, stream);
memoryPool::cuda::dumpStat();
}

{
memoryPool::Deleter devDeleter(std::make_shared<memoryPool::cuda::BundleDelete>(stream, memoryPool::onDevice));
memoryPool::Deleter hosDeleter(std::make_shared<memoryPool::cuda::BundleDelete>(stream, memoryPool::onHost));

auto p0 = memoryPool::cuda::make_unique<int>(20, devDeleter);
auto p1 = memoryPool::cuda::make_unique<double>(20, devDeleter);
auto p2 = memoryPool::cuda::make_unique<bool>(20, devDeleter);
auto p3 = memoryPool::cuda::make_unique<int>(20, devDeleter);
auto p0 = memoryPool::cuda::make_buffer<int>(20, devDeleter);
auto p1 = memoryPool::cuda::make_buffer<double>(20, devDeleter);
auto p2 = memoryPool::cuda::make_buffer<bool>(20, devDeleter);
auto p3 = memoryPool::cuda::make_buffer<int>(20, devDeleter);

auto hp0 = memoryPool::cuda::make_unique<int>(20, hosDeleter);
auto hp1 = memoryPool::cuda::make_unique<double>(20, hosDeleter);
auto hp2 = memoryPool::cuda::make_unique<bool>(20, hosDeleter);
auto hp3 = memoryPool::cuda::make_unique<int>(20, hosDeleter);
auto hp0 = memoryPool::cuda::make_buffer<int>(20, hosDeleter);
auto hp1 = memoryPool::cuda::make_buffer<double>(20, hosDeleter);
auto hp2 = memoryPool::cuda::make_buffer<bool>(20, hosDeleter);
auto hp3 = memoryPool::cuda::make_buffer<int>(20, hosDeleter);

memoryPool::cuda::dumpStat();
}
