Commit: move inline
VinInn committed May 6, 2022
1 parent 21a646e commit 5291489
Showing 3 changed files with 367 additions and 0 deletions.
@@ -0,0 +1,222 @@
// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"

#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"


TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(
uint32_t nHits,
bool isPhase2,
int32_t offsetBPIX2,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
memoryPool::Where where,
cudaStream_t stream,
TrackingRecHit2DHeterogeneous const* input)
: m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) {

using namespace memoryPool::cuda;

memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared<memoryPool::cuda::BundleDelete>(stream,where));
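// (one shared deleter: every buffer allocated with it below is bundled and handed back to the pool together, with the release tied to `stream`)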
assert(deleter.pool());
auto view = make_buffer<TrackingRecHit2DSOAView>(1,stream, memoryPool::onCPU==where ? memoryPool::onCPU : memoryPool::onHost);
assert(view.get());
m_nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
view->m_nHits = nHits;
view->m_nMaxModules = m_nMaxModules;
m_view = make_buffer<TrackingRecHit2DSOAView>(1, deleter);  // leave it on host and pass it by value?
assert(m_view.get());
m_AverageGeometryStore = make_buffer<TrackingRecHit2DSOAView::AverageGeometry>(1,deleter);
view->m_averageGeometry = m_AverageGeometryStore.get();
view->m_cpeParams = cpeParams;
view->m_hitsModuleStart = hitsModuleStart;

// if empty, do not bother
if (0 == nHits) {
if (memoryPool::onDevice == where) {
cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream));
} else {
memoryPool::cuda::swapBuffer(m_view,view);
}
return;
}

// the single arrays are not 128-bit aligned...
// the hits are actually accessed in order only during building;
// if the ordering ever becomes relevant they may have to be stored phi-ordered by layer or so,
// but that would break the 1-to-1 correspondence with clusters and the module locality,
// so unless proven VERY inefficient we keep them ordered as generated

// host copy is "reduced" (to be reviewed at some point)
if (memoryPool::onHost == where) {
// it has to compile for ALL cases
copyFromGPU(input, stream);
} else {
assert(input == nullptr);

auto nL = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers;
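// note: the 32-bit store below is oversized by nL + 1 words so that the per-layer hit-start offsets (Storage32::kLayers) can live at its end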

m_store16 = make_buffer<uint16_t>(nHits * n16, deleter);
m_store32 = make_buffer<float>(nHits * n32 + nL + 1, deleter);
m_PhiBinnerStore = make_buffer<TrackingRecHit2DSOAView::PhiBinner>(1,deleter);
}

static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type));

auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };

// copy all the pointers
m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
m_phiBinnerStorage = view->m_phiBinnerStorage =
reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));

view->m_xl = get32(Storage32::kXLocal);
view->m_yl = get32(Storage32::kYLocal);
view->m_xerr = get32(Storage32::kXerror);
view->m_yerr = get32(Storage32::kYerror);
view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(Storage32::kCharge));

if (memoryPool::onHost != where) {
assert(input == nullptr);
view->m_xg = get32(Storage32::kXGlobal);
view->m_yg = get32(Storage32::kYGlobal);
view->m_zg = get32(Storage32::kZGlobal);
view->m_rg = get32(Storage32::kRGlobal);

auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));

view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
view->m_detInd = get16(Storage16::kDetId);

m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(Storage32::kLayers));
}

// transfer view
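// the SoA view was filled on the host above: for device storage copy it over asynchronously, otherwise the host staging buffer itself simply becomes the view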
if (memoryPool::onDevice == where) {
cudaCheck(cudaMemcpyAsync(m_view.get(), view.get(),sizeof(TrackingRecHit2DSOAView), cudaMemcpyHostToDevice, stream));
// cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream));
} else {
memoryPool::cuda::swapBuffer(m_view,view);
}
}

// this is intended to be used only for the CPU SoA, but it doesn't hurt to have it for all cases
TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(
float* store32, uint16_t* store16, uint32_t* modules, int nHits, memoryPool::Where where, cudaStream_t stream)
: m_nHits(nHits), m_hitsModuleStart(modules) {

using namespace memoryPool::cuda;
auto view = make_buffer<TrackingRecHit2DSOAView>(1,stream,memoryPool::onCPU==where? memoryPool::onCPU : memoryPool::onHost);

m_view = make_buffer<TrackingRecHit2DSOAView>(1,stream,where);

view->m_nHits = nHits;

if (0 == nHits) {
if (memoryPool::onDevice == where) {
cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream));
} else {
m_view = std::move(view);
}
return;
}

m_store16 = make_buffer<uint16_t>(nHits * n16, stream,where);
m_store32 = make_buffer<float>(nHits * n32, stream,where);
m_PhiBinnerStore = make_buffer<TrackingRecHit2DSOAView::PhiBinner>(1,stream,where);
m_AverageGeometryStore = make_buffer<TrackingRecHit2DSOAView::AverageGeometry>(1,stream,where);

view->m_averageGeometry = m_AverageGeometryStore.get();
view->m_hitsModuleStart = m_hitsModuleStart;

//store transfer
if (memoryPool::onDevice == where) {
cudaCheck(cudaMemcpyAsync(m_store32.get(), store32, nHits * n32 * sizeof(float), cudaMemcpyHostToDevice, stream));
cudaCheck(cudaMemcpyAsync(m_store16.get(), store16, nHits * n16 * sizeof(uint16_t), cudaMemcpyHostToDevice, stream));
} else {
std::copy(store32, store32 + nHits * n32, m_store32.get()); // want to copy it
std::copy(store16, store16 + nHits * n16, m_store16.get());
}

//getters
auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };
auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };

//Store 32
view->m_xl = get32(Storage32::kXLocal);
view->m_yl = get32(Storage32::kYLocal);
view->m_xerr = get32(Storage32::kXerror);
view->m_yerr = get32(Storage32::kYerror);
view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(Storage32::kCharge));
view->m_xg = get32(Storage32::kXGlobal);
view->m_yg = get32(Storage32::kYGlobal);
view->m_zg = get32(Storage32::kZGlobal);
view->m_rg = get32(Storage32::kRGlobal);

m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
m_phiBinnerStorage = view->m_phiBinnerStorage =
reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));

//Store 16
view->m_detInd = get16(Storage16::kDetId);
m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));
view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));

// transfer view
if (memoryPool::onDevice == where) {
cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream));
} else {
m_view = std::move(view);
}
}



using namespace memoryPool::cuda;

memoryPool::buffer<float> TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_t stream) const {
auto ret = make_buffer<float>(5 * nHits(), stream,memoryPool::onHost);
cudaCheck(
cudaMemcpyAsync(ret.get(), m_store32.get(), 5 * sizeof(float) * nHits(), cudaMemcpyDefault, stream));
return ret;
}


memoryPool::buffer<float> TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const {
auto ret = make_buffer<float>(static_cast<int>(n32) * nHits(), stream,memoryPool::onHost);
cudaCheck(
cudaMemcpyAsync(ret.get(), m_store32.get(), static_cast<int>(n32) * sizeof(float) * nHits(), cudaMemcpyDefault, stream));
return ret;
}


memoryPool::buffer<uint16_t> TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const {
auto ret = make_buffer<uint16_t>(static_cast<int>(n16) * nHits(), stream,memoryPool::onHost);
cudaCheck(
cudaMemcpyAsync(ret.get(), m_store16.get(), static_cast<int>(n16) * sizeof(uint16_t) * nHits(), cudaMemcpyDefault, stream));
return ret;
}

memoryPool::buffer<uint32_t> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
auto ret = make_buffer<uint32_t>(nMaxModules() + 1, stream,memoryPool::onHost);
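// if no module-start array was provided, skip the copy and hand the buffer back as-is (left uninitialized)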
if (m_hitsModuleStart) cudaCheck(
cudaMemcpyAsync(ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (nMaxModules() + 1), cudaMemcpyDefault, stream));
return ret;
}

void TrackingRecHit2DHost::copyFromGPU(TrackingRecHit2DGPU const* input, cudaStream_t stream) {
assert(input);
m_store32 = input->localCoordToHostAsync(stream);
}
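For orientation, a minimal usage sketch, not part of the commit: it assumes TrackingRecHit2DGPU is the memoryPool::onDevice flavour of TrackingRecHit2DHeterogeneous (as the member definitions above suggest), that the constructor's trailing input argument may be passed as nullptr, and that cpeParams and hitsModuleStart already point to device memory; all names in the sketch are illustrative.

// hypothetical sketch, not part of this commit
void buildAndReadBack(uint32_t nHits,
                      int32_t offsetBPIX2,
                      pixelCPEforGPU::ParamsOnGPU const* cpeParams,  // assumed on device
                      uint32_t const* hitsModuleStart,               // assumed on device
                      cudaStream_t stream) {
  // build the device-side SoA (Phase-1 geometry, no host input to copy from)
  TrackingRecHit2DGPU hits(nHits, /*isPhase2=*/false, offsetBPIX2,
                           cpeParams, hitsModuleStart,
                           memoryPool::onDevice, stream, nullptr);

  // pull the first five 32-bit words per hit (the local coordinates) back to a host buffer
  auto localCoord = hits.localCoordToHostAsync(stream);
  cudaCheck(cudaStreamSynchronize(stream));  // make the asynchronous copy visible on the host
  if (nHits > 0) {
    float xl0 = localCoord.get()[0];  // x_local of the first hit
    (void)xl0;
  }
}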



81 changes: 81 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPoolImpl.h
@@ -0,0 +1,81 @@
// #include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h"

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

#include <cuda_runtime.h>
#include <cuda_runtime_api.h>

#include <iostream>

namespace {

// free callback
void CUDART_CB freeCallback(void *p) {
// std::cout << "free callaback" << std::endl;
auto payload = (memoryPool::Payload *)(p);
memoryPool::scheduleFree(payload);
}

}

struct CudaAlloc {
static void scheduleFree(memoryPool::Payload * payload, cudaStream_t stream) {
// std::cout << "schedule free" << std::endl;
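// with a stream, enqueue the free as a host callback so the buffers are recycled only after all work already queued on the stream has completed; without one, hand the payload straight to memoryPool::scheduleFree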
if (stream)
cudaCheck(cudaLaunchHostFunc(stream, freeCallback, payload));
else
memoryPool::scheduleFree(payload);
}
};


struct CudaDeviceAlloc : public CudaAlloc {
using Pointer = void *;

static Pointer alloc(size_t size) {
Pointer p = nullptr;
auto err = cudaMalloc(&p, size);
return err == cudaSuccess ? p : nullptr;
}
static void free(Pointer ptr) { cudaFree(ptr); }

};

struct CudaHostAlloc : public CudaAlloc {
using Pointer = void *;

static Pointer alloc(size_t size) {
Pointer p = nullptr;
auto err = cudaMallocHost(&p, size);
return err == cudaSuccess ? p : nullptr;
}
static void free(Pointer ptr) { cudaFreeHost(ptr); }
};


namespace memoryPool {
namespace cuda {

void dumpStat();

SimplePoolAllocator *getPool(Where where);

// allocate either on the current device or on the host (actually anywhere: not CUDA specific)
inline std::pair<void *, int> alloc(uint64_t size, SimplePoolAllocator &pool) {
int i = pool.alloc(size);
void *p = pool.pointer(i);
return std::pair<void *, int>(p, i);
}

// schedule free
inline void free(cudaStream_t stream, std::vector<int> buckets, SimplePoolAllocator &pool) {
auto payload = new Payload{&pool, std::move(buckets)};
CudaHostAlloc::scheduleFree(payload,stream);
}

} // namespace cuda
} // namespace memoryPool
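For orientation, a hypothetical caller-side sketch, not part of the commit, showing how the two helpers above pair up around a SimplePoolAllocator: alloc hands back both the raw pointer and the pool bucket it came from, and free schedules that bucket's return on the stream (through the scheduleFree host callback), so memory still referenced by in-flight work is not recycled early.

// hypothetical sketch, not part of this commit
inline void scratchBufferExample(size_t nBytes, cudaStream_t stream) {
  // pool serving the device that is current on this thread
  SimplePoolAllocator* pool = memoryPool::cuda::getPool(memoryPool::onDevice);
  auto [ptr, bucket] = memoryPool::cuda::alloc(nBytes, *pool);
  if (nullptr == ptr)
    return;  // pool exhausted
  // ... enqueue kernels on `stream` that use `ptr` ...
  // hand the bucket back; the actual recycling is deferred until the host-function
  // callback enqueued on `stream` runs
  memoryPool::cuda::free(stream, std::vector<int>{bucket}, *pool);
}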


64 changes: 64 additions & 0 deletions HeterogeneousCore/CUDAUtilities/src/cudaMemoryPool.cc
@@ -0,0 +1,64 @@
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h"

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

#include <cuda_runtime.h>
#include <cuda_runtime_api.h>

#include <iostream>

namespace {

constexpr int poolSize = 128 * 1024;

SimplePoolAllocatorImpl<PosixAlloc> cpuPool(poolSize);

SimplePoolAllocatorImpl<CudaHostAlloc> hostPool(poolSize);

struct DevicePools {
using Pool = SimplePoolAllocatorImpl<CudaDeviceAlloc>;
DevicePools(int size) {
int devices = 0;
auto status = cudaGetDeviceCount(&devices);
if (status == cudaSuccess && devices > 0) {
m_devicePools.reserve(devices);
for (int i = 0; i < devices; ++i)
m_devicePools.emplace_back(new Pool(size));
}
}
// return the pool for the current device
Pool &operator()() {
int dev = -1;
cudaGetDevice(&dev);
return *m_devicePools[dev];
}

std::vector<std::unique_ptr<Pool>> m_devicePools;
};

DevicePools devicePool(poolSize);

} // namespace

namespace memoryPool {
namespace cuda {

void dumpStat() {
std::cout << "device pool" << std::endl;
devicePool().dumpStat();
std::cout << "host pool" << std::endl;
hostPool.dumpStat();
}

SimplePoolAllocator *getPool(Where where) {
return onCPU == where
? (SimplePoolAllocator *)(&cpuPool)
: (onDevice == where ? (SimplePoolAllocator *)(&devicePool()) : (SimplePoolAllocator *)(&hostPool));
}

} // namespace cuda
} // namespace memoryPool

