diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneousImpl.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneousImpl.h
new file mode 100644
index 0000000000000..b2d0d2b719b65
--- /dev/null
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneousImpl.h
@@ -0,0 +1,222 @@
+// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
+
+#include <algorithm>
+#include <cassert>
+
+TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(
+    uint32_t nHits,
+    bool isPhase2,
+    int32_t offsetBPIX2,
+    pixelCPEforGPU::ParamsOnGPU const* cpeParams,
+    uint32_t const* hitsModuleStart,
+    memoryPool::Where where,
+    cudaStream_t stream,
+    TrackingRecHit2DHeterogeneous const* input)
+    : m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) {
+  using namespace memoryPool::cuda;
+
+  memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared(stream, where));
+  assert(deleter.pool());
+  auto view = make_buffer<TrackingRecHit2DSOAView>(
+      1, stream, memoryPool::onCPU == where ? memoryPool::onCPU : memoryPool::onHost);
+  assert(view.get());
+  m_nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
+  view->m_nHits = nHits;
+  view->m_nMaxModules = m_nMaxModules;
+  m_view = make_buffer<TrackingRecHit2DSOAView>(1, deleter);  // leave it on host and pass it by value?
+  assert(m_view.get());
+  m_AverageGeometryStore = make_buffer<TrackingRecHit2DSOAView::AverageGeometry>(1, deleter);
+  view->m_averageGeometry = m_AverageGeometryStore.get();
+  view->m_cpeParams = cpeParams;
+  view->m_hitsModuleStart = hitsModuleStart;
+
+  // if empty do not bother
+  if (0 == nHits) {
+    if (memoryPool::onDevice == where) {
+      cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream));
+    } else {
+      memoryPool::cuda::swapBuffer(m_view, view);
+    }
+    return;
+  }
+
+  // the single arrays are not 128 bit aligned...
+  // the hits are actually accessed in order only in building
+  // if ordering is relevant they may have to be stored phi-ordered by layer or so
+  // this will break 1to1 correspondence with cluster and module locality
+  // so unless proven VERY inefficient we keep it ordered as generated
+
+  // host copy is "reduced" (to be reviewed at some point)
+  if (memoryPool::onHost == where) {
+    // it has to compile for ALL cases
+    copyFromGPU(input, stream);
+  } else {
+    assert(input == nullptr);
+
+    auto nL = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers;
+
+    m_store16 = make_buffer<uint16_t>(nHits * n16, deleter);
+    m_store32 = make_buffer<float>(nHits * n32 + nL + 1, deleter);
+    m_PhiBinnerStore = make_buffer<TrackingRecHit2DSOAView::PhiBinner>(1, deleter);
+  }
+
+  static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
+  static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type));
+
+  auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };
+
+  // copy all the pointers
+  m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
+  m_phiBinnerStorage = view->m_phiBinnerStorage =
+      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));
+
+  view->m_xl = get32(Storage32::kXLocal);
+  view->m_yl = get32(Storage32::kYLocal);
+  view->m_xerr = get32(Storage32::kXerror);
+  view->m_yerr = get32(Storage32::kYerror);
+  view->m_chargeAndStatus = reinterpret_cast<decltype(view->m_chargeAndStatus)>(get32(Storage32::kCharge));
+
+  if (memoryPool::onHost != where) {
+    assert(input == nullptr);
+    view->m_xg = get32(Storage32::kXGlobal);
+    view->m_yg = get32(Storage32::kYGlobal);
+    view->m_zg = get32(Storage32::kZGlobal);
+    view->m_rg = get32(Storage32::kRGlobal);
+
+    auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
+    m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));
+
+    view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
+    view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
+    view->m_detInd = get16(Storage16::kDetId);
+
+    m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
+    m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(Storage32::kLayers));
+  }
+
+  // transfer view
+  if (memoryPool::onDevice == where) {
+    cudaCheck(cudaMemcpyAsync(m_view.get(), view.get(), sizeof(TrackingRecHit2DSOAView), cudaMemcpyHostToDevice, stream));
+    // cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream));
+  } else {
+    memoryPool::cuda::swapBuffer(m_view, view);
+  }
+}
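+
+// Illustrative only (not used by the framework): a producer owning device-side
+// cpeParams and hitsModuleStart pointers and a CUDA stream would build the
+// device SoA roughly as
+//   TrackingRecHit2DHeterogeneous hits(nHits, isPhase2, offsetBPIX2, cpeParams,
+//                                      hitsModuleStart, memoryPool::onDevice,
+//                                      stream, nullptr);
+// all per-hit storage is then taken from the device memory pool via `deleter`.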
+
+// this is intended to be used only for CPU SoA but doesn't hurt to have it for all cases
+TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(
+    float* store32, uint16_t* store16, uint32_t* modules, int nHits, memoryPool::Where where, cudaStream_t stream)
+    : m_nHits(nHits), m_hitsModuleStart(modules) {
+  using namespace memoryPool::cuda;
+
+  auto view = make_buffer<TrackingRecHit2DSOAView>(
+      1, stream, memoryPool::onCPU == where ? memoryPool::onCPU : memoryPool::onHost);
+
+  m_view = make_buffer<TrackingRecHit2DSOAView>(1, stream, where);
+
+  view->m_nHits = nHits;
+
+  if (0 == nHits) {
+    if (memoryPool::onDevice == where) {
+      cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream));
+    } else {
+      m_view = std::move(view);
+    }
+    return;
+  }
+
+  m_store16 = make_buffer<uint16_t>(nHits * n16, stream, where);
+  m_store32 = make_buffer<float>(nHits * n32, stream, where);
+  m_PhiBinnerStore = make_buffer<TrackingRecHit2DSOAView::PhiBinner>(1, stream, where);
+  m_AverageGeometryStore = make_buffer<TrackingRecHit2DSOAView::AverageGeometry>(1, stream, where);
+
+  view->m_averageGeometry = m_AverageGeometryStore.get();
+  view->m_hitsModuleStart = m_hitsModuleStart;
+
+  // store transfer
+  if (memoryPool::onDevice == where) {
+    cudaCheck(cudaMemcpyAsync(m_store32.get(), store32, sizeof(float) * nHits * n32, cudaMemcpyHostToDevice, stream));
+    cudaCheck(cudaMemcpyAsync(m_store16.get(), store16, sizeof(uint16_t) * nHits * n16, cudaMemcpyHostToDevice, stream));
+  } else {
+    std::copy(store32, store32 + nHits * n32, m_store32.get());  // want to copy it
+    std::copy(store16, store16 + nHits * n16, m_store16.get());
+  }
+
+  // getters
+  auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };
+  auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
+
+  // Store 32
+  view->m_xl = get32(Storage32::kXLocal);
+  view->m_yl = get32(Storage32::kYLocal);
+  view->m_xerr = get32(Storage32::kXerror);
+  view->m_yerr = get32(Storage32::kYerror);
+  view->m_chargeAndStatus = reinterpret_cast<decltype(view->m_chargeAndStatus)>(get32(Storage32::kCharge));
+  view->m_xg = get32(Storage32::kXGlobal);
+  view->m_yg = get32(Storage32::kYGlobal);
+  view->m_zg = get32(Storage32::kZGlobal);
+  view->m_rg = get32(Storage32::kRGlobal);
+
+  m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
+  m_phiBinnerStorage = view->m_phiBinnerStorage =
+      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));
+
+  // Store 16
+  view->m_detInd = get16(Storage16::kDetId);
+  m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));
+  view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
+  view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
+
+  // transfer view
+  if (memoryPool::onDevice == where) {
+    cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream));
+  } else {
+    m_view = std::move(view);
+  }
+}
+
+using namespace memoryPool::cuda;
+
+memoryPool::buffer<float> TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_t stream) const {
+  auto ret = make_buffer<float>(5 * nHits(), stream, memoryPool::onHost);
+  cudaCheck(cudaMemcpyAsync(ret.get(), m_store32.get(), 5 * sizeof(float) * nHits(), cudaMemcpyDefault, stream));
+  return ret;
+}
+
+memoryPool::buffer<float> TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const {
+  auto ret = make_buffer<float>(static_cast<int>(n32) * nHits(), stream, memoryPool::onHost);
+  cudaCheck(cudaMemcpyAsync(
+      ret.get(), m_store32.get(), static_cast<int>(n32) * sizeof(float) * nHits(), cudaMemcpyDefault, stream));
+  return ret;
+}
+
+memoryPool::buffer<uint16_t> TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const {
+  auto ret = make_buffer<uint16_t>(static_cast<int>(n16) * nHits(), stream, memoryPool::onHost);
+  cudaCheck(cudaMemcpyAsync(
+      ret.get(), m_store16.get(), static_cast<int>(n16) * sizeof(uint16_t) * nHits(), cudaMemcpyDefault, stream));
+  return ret;
+}
+
+memoryPool::buffer<uint32_t> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
+  auto ret = make_buffer<uint32_t>(nMaxModules() + 1, stream, memoryPool::onHost);
+  if (m_hitsModuleStart)
+    cudaCheck(cudaMemcpyAsync(
+        ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (nMaxModules() + 1), cudaMemcpyDefault, stream));
+  return ret;
+}
+
+void TrackingRecHit2DHost::copyFromGPU(TrackingRecHit2DGPU const* input, cudaStream_t stream) {
+  assert(input);
+  m_store32 = input->localCoordToHostAsync(stream);
+}
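
Note (reviewer annotation, not part of the patch): a minimal sketch of how the host-side copies above might be consumed. `hits` is assumed to be a `TrackingRecHit2DGPU` built on a device pool and `stream` the associated CUDA stream; the first five 32-bit planes per hit are, in `Storage32` order, local x, local y, their errors, and charge/status.

    // Illustrative readback of the local-coordinate block (5 floats per hit).
    auto lc = hits.localCoordToHostAsync(stream);
    cudaCheck(cudaStreamSynchronize(stream));  // the copy above is asynchronous
    float xLocalOfFirstHit = lc.get()[0];      // plane Storage32::kXLocal, hit 0
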
diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPoolImpl.h b/HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPoolImpl.h
new file mode 100644
index 0000000000000..88edce0929818
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPoolImpl.h
@@ -0,0 +1,81 @@
+// #include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include <cuda_runtime.h>
+#include <cstdint>
+
+#include <utility>
+#include <vector>
+
+namespace {
+
+  // free callback
+  void CUDART_CB freeCallback(void *p) {
+    // std::cout << "free callback" << std::endl;
+    auto payload = (memoryPool::Payload *)(p);
+    memoryPool::scheduleFree(payload);
+  }
+
+}  // namespace
+
+struct CudaAlloc {
+  static void scheduleFree(memoryPool::Payload *payload, cudaStream_t stream) {
+    // std::cout << "schedule free" << std::endl;
+    if (stream)
+      cudaCheck(cudaLaunchHostFunc(stream, freeCallback, payload));
+    else
+      memoryPool::scheduleFree(payload);
+  }
+};
+
+struct CudaDeviceAlloc : public CudaAlloc {
+  using Pointer = void *;
+
+  static Pointer alloc(size_t size) {
+    Pointer p = nullptr;
+    auto err = cudaMalloc(&p, size);
+    return err == cudaSuccess ? p : nullptr;
+  }
+  static void free(Pointer ptr) { cudaFree(ptr); }
+};
+
+struct CudaHostAlloc : public CudaAlloc {
+  using Pointer = void *;
+
+  static Pointer alloc(size_t size) {
+    Pointer p = nullptr;
+    auto err = cudaMallocHost(&p, size);
+    return err == cudaSuccess ? p : nullptr;
+  }
+  static void free(Pointer ptr) { cudaFreeHost(ptr); }
+};
+
+namespace memoryPool {
+  namespace cuda {
+
+    void dumpStat();
+
+    SimplePoolAllocator *getPool(Where where);
+
+    // allocate either on current device or on host (actually anywhere, not cuda specific)
+    inline std::pair<void *, int> alloc(uint64_t size, SimplePoolAllocator &pool) {
+      int i = pool.alloc(size);
+      void *p = pool.pointer(i);
+      return std::pair<void *, int>(p, i);
+    }
+
+    // schedule free
+    inline void free(cudaStream_t stream, std::vector<int> buckets, SimplePoolAllocator &pool) {
+      auto payload = new Payload{&pool, std::move(buckets)};
+      CudaHostAlloc::scheduleFree(payload, stream);
+    }
+
+  }  // namespace cuda
+}  // namespace memoryPool
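
Note (reviewer annotation, not part of the patch): the helpers above are normally reached through `memoryPool::Deleter` and `make_buffer`, but their contract can be illustrated directly. `stream` is assumed to be a valid `cudaStream_t`; the 1024-byte size is arbitrary.

    // Illustrative only: take one bucket from the device pool and schedule its
    // release; with a non-null stream the bucket is handed back by freeCallback
    // only after all work already queued on that stream has completed.
    SimplePoolAllocator* devPool = memoryPool::cuda::getPool(memoryPool::onDevice);
    auto [ptr, bucket] = memoryPool::cuda::alloc(1024, *devPool);
    // ... launch kernels that use ptr on `stream` ...
    memoryPool::cuda::free(stream, {bucket}, *devPool);
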
diff --git a/HeterogeneousCore/CUDAUtilities/src/cudaMemoryPool.cc b/HeterogeneousCore/CUDAUtilities/src/cudaMemoryPool.cc
new file mode 100644
index 0000000000000..316a49d52265d
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/src/cudaMemoryPool.cc
@@ -0,0 +1,64 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPoolImpl.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include <cuda_runtime.h>
+#include <iostream>
+
+#include <memory>
+#include <vector>
+
+namespace {
+
+  constexpr int poolSize = 128 * 1024;
+
+  SimplePoolAllocatorImpl cpuPool(poolSize);
+
+  SimplePoolAllocatorImpl<CudaHostAlloc> hostPool(poolSize);
+
+  struct DevicePools {
+    using Pool = SimplePoolAllocatorImpl<CudaDeviceAlloc>;
+    DevicePools(int size) {
+      int devices = 0;
+      auto status = cudaGetDeviceCount(&devices);
+      if (status == cudaSuccess && devices > 0) {
+        m_devicePools.reserve(devices);
+        for (int i = 0; i < devices; ++i)
+          m_devicePools.emplace_back(new Pool(size));
+      }
+    }
+    // return pool for current device
+    Pool &operator()() {
+      int dev = -1;
+      cudaGetDevice(&dev);
+      return *m_devicePools[dev];
+    }
+
+    std::vector<std::unique_ptr<Pool>> m_devicePools;
+  };
+
+  DevicePools devicePool(poolSize);
+
+}  // namespace
+
+namespace memoryPool {
+  namespace cuda {
+
+    void dumpStat() {
+      std::cout << "device pool" << std::endl;
+      devicePool().dumpStat();
+      std::cout << "host pool" << std::endl;
+      hostPool.dumpStat();
+    }
+
+    SimplePoolAllocator *getPool(Where where) {
+      return onCPU == where
+                 ? (SimplePoolAllocator *)(&cpuPool)
+                 : (onDevice == where ? (SimplePoolAllocator *)(&devicePool()) : (SimplePoolAllocator *)(&hostPool));
+    }
+
+  }  // namespace cuda
+}  // namespace memoryPool
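
Note (reviewer annotation, not part of the patch): `getPool` dispatches on the `Where` enum to one of the three singleton pools defined above; the device pool is per CUDA device and is looked up through the device current on the calling thread, so the device must be selected first (`deviceIndex` below is a placeholder).

    // Illustrative only: select a device, fetch its pool, and dump statistics.
    cudaCheck(cudaSetDevice(deviceIndex));
    SimplePoolAllocator* devPool = memoryPool::cuda::getPool(memoryPool::onDevice);
    SimplePoolAllocator* pinnedPool = memoryPool::cuda::getPool(memoryPool::onHost);
    memoryPool::cuda::dumpStat();  // prints device-pool and host-pool statistics to stdout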