migrate more to new pool

cms-sw · May 7, 2022 · 59bcb2b · 59bcb2b
1 parent 1a43ba7
commit 59bcb2b
Show file tree

Hide file tree

Showing 9 changed files with 302 additions and 46 deletions.
diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPoolImpl.h b/HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPoolImpl.h
@@ -13,7 +13,7 @@ namespace {
 
   //  free callback
   void CUDART_CB freeCallback(void *p) {
-    // std::cout << "free callaback" << std::endl;
+    std::cout << "free callaback" << std::endl;
     auto payload = (memoryPool::Payload *)(p);
     memoryPool::scheduleFree(payload);
   }
@@ -22,7 +22,7 @@ namespace {
 
 struct CudaAlloc {
   static void scheduleFree(memoryPool::Payload *payload, cudaStream_t stream) {
-    // std::cout    << "schedule free" << std::endl;
+    std::cout    << "schedule free" << std::endl;
     if (stream)
       cudaCheck(cudaLaunchHostFunc(stream, freeCallback, payload));
     else
@@ -36,9 +36,10 @@ struct CudaDeviceAlloc : public CudaAlloc {
   static Pointer alloc(size_t size) {
     Pointer p = nullptr;
     auto err = cudaMalloc(&p, size);
+    std::cout << "alloc " << size << ((err == cudaSuccess) ? " ok" : " err") << std::endl;
     return err == cudaSuccess ? p : nullptr;
   }
-  static void free(Pointer ptr) { cudaFree(ptr); }
+  static void free(Pointer ptr) { auto err = cudaFree(ptr); std::cout << "free" << ((err == cudaSuccess) ? " ok" : " err") <<std::endl;}
 };
 
 struct CudaHostAlloc : public CudaAlloc {
@@ -47,6 +48,7 @@ struct CudaHostAlloc : public CudaAlloc {
   static Pointer alloc(size_t size) {
     Pointer p = nullptr;
     auto err = cudaMallocHost(&p, size);
+    std::cout << "alloc H " << size << ((err == cudaSuccess) ? " ok" : " err") << std::endl;
     return err == cudaSuccess ? p : nullptr;
   }
   static void free(Pointer ptr) { cudaFreeHost(ptr); }

diff --git a/HeterogeneousCore/CUDAUtilities/interface/memoryPool.h b/HeterogeneousCore/CUDAUtilities/interface/memoryPool.h
@@ -2,6 +2,7 @@
 #include <memory>
 #include <new>
 
+#include<iostream>
 class SimplePoolAllocator;
 
 namespace memoryPool {
@@ -32,6 +33,8 @@ namespace memoryPool {
     void operator()(void* p) {
       if (!me)
         throw std::bad_alloc();
+      if(!p) std::cout << "delete null pointer!!! " << m_bucket << std::endl;
+      // assert(p == pool()->pointer(m_bucket));
       (*me)(m_bucket);
     }
 

diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
@@ -131,8 +131,14 @@
   </bin>
 
   <bin file="testPoolUI.cu" name="testPoolUI">
-    <flags CUDA_FLAGS="-g"/>
-    <flags CXXFLAGS="-g"/>
+    <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
+    <flags CXXFLAGS="-g -DGPU_DEBUG"/>
+  </bin>
+
+
+  <bin file="testPoolUImt.cu" name="testPoolUImt">
+    <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
+    <flags CXXFLAGS="-g -DGPU_DEBUG"/>
   </bin>
 
   <bin file="testSimplePoolAllocator.cu" name="testSimplePoolAllocatorGPU">

diff --git a/HeterogeneousCore/CUDAUtilities/test/testPoolUImt.cu b/HeterogeneousCore/CUDAUtilities/test/testPoolUImt.cu
@@ -0,0 +1,243 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
+
+#include <cmath>
+#include <unistd.h>
+
+#include <random>
+#include <limits>
+
+#include <atomic>
+#include <thread>
+#include <mutex>
+
+typedef std::thread Thread;
+typedef std::vector<std::thread> ThreadGroup;
+typedef std::mutex Mutex;
+typedef std::lock_guard<std::mutex> Lock;
+
+struct Node {
+  int it = -1;
+  int i = -1;
+  void *p = nullptr;
+#ifdef __CUDACC__
+  int c = 0;
+#else
+  std::atomic<int> c = 0;
+#endif
+};
+
+#ifdef __CUDACC__
+
+// generic callback
+template <typename F>
+void CUDART_CB myCallback(void *fun) {
+  (*(F *)(fun))();
+}
+
+__global__ void kernel_set(int s, Node ** p, int me) {
+  int first = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = first; i < s; i += gridDim.x * blockDim.x) {
+    assert(p[i]);
+    auto n = p[i];
+    n->it = me;
+    n->i = i;
+    n->p = p[i];
+    n->c = 1;
+  }
+}
+
+__global__ void kernel_test(int s, Node **p, int me) {
+  int first = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = first; i < s; i += gridDim.x * blockDim.x) {
+    assert(p[i]);
+    auto n = p[i];
+    atomicSub(&(n->c), 1);
+    assert(n->it == me);
+    assert(n->i == i);
+    assert(n->p == p[i]);
+    assert(0 == n->c);
+  }
+}
+#endif
+
+template <memoryPool::Where where>
+void go() {
+  auto start = std::chrono::high_resolution_clock::now();
+
+  const int NUMTHREADS = 24;
+
+#ifdef __CUDACC__
+  printf("Using CUDA %d\n", CUDART_VERSION);
+  int cuda_device = 0;
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, cuda_device);
+  printf("CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor);
+
+  cudaStream_t streams[NUMTHREADS];
+
+  for (int i = 0; i < NUMTHREADS; i++) {
+    cudaStreamCreate(&(streams[i]));
+  }
+
+#endif
+
+
+  bool stop = false;
+  bool bin24 = false;
+  Thread monitor([&] {
+    int n = 10;
+    while (n--) {
+      sleep(5);
+      memoryPool::cuda::dumpStat();
+      if (5 == n)
+        bin24 = true;
+    }
+    std::cout << "\nstop\n" << std::endl;
+    stop = true;
+  });
+
+  int s = 40;
+  {
+  std::cout << "try to allocate " << s << std::endl;
+  auto stream = streams[0];
+  {
+    auto pd = memoryPool::cuda::make_buffer<int>(s, stream, where);
+    assert(pd.get());
+    memoryPool::cuda::dumpStat();
+    pd = memoryPool::cuda::make_buffer<int>(s, stream, where);
+    memoryPool::cuda::dumpStat();
+  }
+  cudaStreamSynchronize(stream);
+  memoryPool::cuda::dumpStat();
+
+  }
+  std::atomic<int> nt = 0;
+
+  auto test = [&] {
+    int const me = nt++;
+    auto delta = std::chrono::high_resolution_clock::now() - start;
+
+    std::mt19937 eng(me + std::chrono::duration_cast<std::chrono::milliseconds>(delta).count());
+    std::uniform_int_distribution<int> rgen1(1, 100);
+    std::uniform_int_distribution<int> rgen20(3, 20);
+    std::uniform_int_distribution<int> rgen24(3, 24);
+    std::cout << "first RN " << rgen1(eng) << " at "
+              << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " in " << me << std::endl;
+
+#ifdef __CUDACC__
+    Node **dp = nullptr;
+    Node **hp = nullptr;
+    cudaMalloc(&dp, 100 * sizeof(void *));
+    assert(dp);
+    cudaMallocHost(&hp, 100 * sizeof(void *));
+    assert(hp);
+#endif
+
+    int iter = 0;
+    while (true) {
+      if (stop)
+        break;
+      iter++;
+      auto &stream = streams[me];
+
+      memoryPool::Deleter devDeleter(std::make_shared<memoryPool::cuda::BundleDelete>(stream,where));
+      auto n = rgen1(eng);
+      bool large = 0 == (iter % (128 + me));
+      for (int k = 0; k < n; ++k) {
+        int b = bin24 ? rgen24(eng) : rgen20(eng);
+        // once in while let's allocate 2GB
+        if (large) {
+          b = 31;
+          large = false;
+        }
+        uint64_t s = 1LL << b;
+        assert(s > 0);
+        auto p0 = memoryPool::cuda::make_buffer<Node>(s/sizeof(Node) + sizeof(Node), devDeleter);
+        if (!p0.get()) {
+          std::cout << "\n\n!!!Failed " << me << " at " << iter << std::endl;
+          memoryPool::cuda::dumpStat();
+          return;
+        }
+        auto p = p0.get();
+        if (nullptr == p) {
+          std::cout << "error not detected??? " << b << ' ' << std::endl;
+          memoryPool::cuda::dumpStat();
+        }
+        assert(p);
+        hp[k] = p;
+      }
+#ifdef __CUDACC__
+      assert(n <= 100);
+      // do something???
+      cudaMemcpyAsync(dp, hp, n * sizeof(void *), cudaMemcpyHostToDevice, stream);
+      kernel_set<<<1, 128, 0, stream>>>(n, dp, me);
+      kernel_test<<<1, 128, 0, stream>>>(n, dp, me);
+
+      // better sync each "event"
+      cudaStreamSynchronize(stream);
+#else
+      // do something???
+      for (int k = 0; k < n; ++k) {
+        auto p = hp[k];
+        assert(p);
+        auto n = p;
+        n->it = me;
+        n->i = i;
+        n->p = p;
+        n->c = 1;
+      }
+      for (int k = 0; k < n; ++k) {
+        auto p = hp[k];
+        assert(p);
+        auto n = p;
+        n->c--;
+        assert(n->it == me);
+        assert(n->i == i);
+        assert(n->p == p);
+        assert(0 == n->c);
+      }
+#endif
+    }
+  };
+
+  ThreadGroup threads;
+  threads.reserve(NUMTHREADS);
+
+  for (int i = 0; i < NUMTHREADS; ++i) {
+    threads.emplace_back(test);
+  }
+
+  for (auto &t : threads)
+    t.join();
+
+  threads.clear();
+  monitor.join();
+  std::cout << "\nfinished\n" << std::endl;
+  memoryPool::cuda::dumpStat();
+}
+
+#ifdef __CUDACC__
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+
+#endif
+
+int main() {
+#ifdef __CUDACC__
+  {
+    int devices = 0;
+    auto status = cudaGetDeviceCount(&devices);
+    if (status != cudaSuccess || 0 == devices)
+      return 0;
+    std::cout << "found " << devices << " cuda devices" << std::endl;
+  }
+
+  std::cout << "\ntesting cuda device" << std::endl;
+  go<memoryPool::onDevice>();
+#else
+  std::cout << "testing posix" << std::endl;
+  go<memoryPool::onCPU>();
+#endif
+
+  return 0;
+}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu
@@ -1,5 +1,6 @@
+
 #include "BrokenLineFitOnGPU.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
 
 void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv,
                                             uint32_t hitsInFit,
@@ -11,13 +12,11 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv,
   auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize;
 
   //  Fit internals
-  auto tkidGPU = cms::cuda::make_device_unique<caConstants::tindex_type[]>(maxNumberOfConcurrentFits_, stream);
-  auto hitsGPU = cms::cuda::make_device_unique<double[]>(
-      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double), stream);
-  auto hits_geGPU = cms::cuda::make_device_unique<float[]>(
-      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<6>) / sizeof(float), stream);
-  auto fast_fit_resultsGPU = cms::cuda::make_device_unique<double[]>(
-      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream);
+  memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared<memoryPool::cuda::BundleDelete>(stream, memoryPool::onDevice));
+  auto tkidGPU = memoryPool::cuda::make_buffer<caConstants::tindex_type>(maxNumberOfConcurrentFits_,deleter);
+  auto hitsGPU = memoryPool::cuda::make_buffer<double>(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double), deleter);
+  auto hits_geGPU = memoryPool::cuda::make_buffer<float>(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<6>) / sizeof(float), deleter);
+  auto fast_fit_resultsGPU = memoryPool::cuda::make_buffer<double>(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), deleter);
 
   for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) {
     // fit triplets

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -22,18 +22,15 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
 #endif
 
   // use "nhits" to heuristically dimension the workspace
-
-  // no need to use the Traits allocations, since we know this is being compiled for the CPU
-  //device_isOuterHitOfCell_ = Traits::template make_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits), stream);
-  device_isOuterHitOfCell_ = std::make_unique<GPUCACell::OuterHitOfCellContainer[]>(std::max(1U, nhits));
+  memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared<memoryPool::cuda::BundleDelete>(nullptr, memoryPool::onCPU));
+  device_isOuterHitOfCell_ = memoryPool::cuda::make_buffer<GPUCACell::OuterHitOfCellContainer>(std::max(1U, nhits),deleter);
   assert(device_isOuterHitOfCell_.get());
   isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()};
 
   auto cellStorageSize = caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) +
                          caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks);
   // no need to use the Traits allocations, since we know this is being compiled for the CPU
-  //cellStorage_ = Traits::template make_unique<unsigned char[]>(cellStorageSize, stream);
-  cellStorage_ = std::make_unique<unsigned char[]>(cellStorageSize);
+  cellStorage_ = memoryPool::cuda::make_buffer<unsigned char>(cellStorageSize,deleter);
   device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get();
   device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets *
                                                                                       sizeof(GPUCACell::CellNeighbors));
@@ -45,9 +42,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
                                  device_theCellTracks_.get(),
                                  device_theCellTracksContainer_);
 
-  // no need to use the Traits allocations, since we know this is being compiled for the CPU
-  //device_theCells_ = Traits::template make_unique<GPUCACell[]>(params_.maxNumberOfDoublets_, stream);
-  device_theCells_ = std::make_unique<GPUCACell[]>(params_.maxNumberOfDoublets_);
+  device_theCells_ = memoryPool::cuda::make_buffer<GPUCACell>(params_.maxNumberOfDoublets_,deleter);
   if (0 == nhits)
     return;  // protect against empty events