Commit
Showing 3 changed files with 367 additions and 0 deletions.
222 changes: 222 additions & 0 deletions
CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneousImpl.h
@@ -0,0 +1,222 @@
// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" | ||
|
||
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" | ||
#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" | ||
|
||
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" | ||
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" | ||
|
||
|
||
TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(
    uint32_t nHits,
    bool isPhase2,
    int32_t offsetBPIX2,
    pixelCPEforGPU::ParamsOnGPU const* cpeParams,
    uint32_t const* hitsModuleStart,
    memoryPool::Where where,
    cudaStream_t stream,
    TrackingRecHit2DHeterogeneous const* input)
    : m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) {
  using namespace memoryPool::cuda;

  // all pooled buffers allocated below share the same stream-aware deleter
  memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared<memoryPool::cuda::BundleDelete>(stream, where));
  assert(deleter.pool());

  // scratch view: filled here (on the CPU or on the pinned host) and transferred at the end
  auto view = make_buffer<TrackingRecHit2DSOAView>(1, stream, memoryPool::onCPU == where ? memoryPool::onCPU : memoryPool::onHost);
  assert(view.get());

  m_nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
  view->m_nHits = nHits;
  view->m_nMaxModules = m_nMaxModules;
  m_view = make_buffer<TrackingRecHit2DSOAView>(1, deleter);  // leave it on host and pass it by value?
  assert(m_view.get());
  m_AverageGeometryStore = make_buffer<TrackingRecHit2DSOAView::AverageGeometry>(1, deleter);
  view->m_averageGeometry = m_AverageGeometryStore.get();
  view->m_cpeParams = cpeParams;
  view->m_hitsModuleStart = hitsModuleStart;

  // if empty, do not bother
  if (0 == nHits) {
    if (memoryPool::onDevice == where) {
      cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream));
    } else {
      memoryPool::cuda::swapBuffer(m_view, view);
    }
    return;
  }

  // the single arrays are not 128 bit aligned...
  // the hits are actually accessed in order only in building
  // if ordering is relevant they may have to be stored phi-ordered by layer or so
  // this would break the 1-to-1 correspondence with cluster and module locality
  // so unless proven VERY inefficient we keep it ordered as generated

  // host copy is "reduced" (to be reviewed at some point)
  if (memoryPool::onHost == where) {
    // it has to compile for ALL cases
    copyFromGPU(input, stream);
  } else {
    assert(input == nullptr);

    auto nL = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers;

    m_store16 = make_buffer<uint16_t>(nHits * n16, deleter);
    m_store32 = make_buffer<float>(nHits * n32 + nL + 1, deleter);
    m_PhiBinnerStore = make_buffer<TrackingRecHit2DSOAView::PhiBinner>(1, deleter);
  }

  static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
  static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type));

  auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };

  // copy all the pointers
  m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
  m_phiBinnerStorage = view->m_phiBinnerStorage =
      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));

  view->m_xl = get32(Storage32::kXLocal);
  view->m_yl = get32(Storage32::kYLocal);
  view->m_xerr = get32(Storage32::kXerror);
  view->m_yerr = get32(Storage32::kYerror);
  view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(Storage32::kCharge));

  if (memoryPool::onHost != where) {
    assert(input == nullptr);
    view->m_xg = get32(Storage32::kXGlobal);
    view->m_yg = get32(Storage32::kYGlobal);
    view->m_zg = get32(Storage32::kZGlobal);
    view->m_rg = get32(Storage32::kRGlobal);

    auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
    m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));

    view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
    view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
    view->m_detInd = get16(Storage16::kDetId);

    m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
    m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(Storage32::kLayers));
  }

  // transfer the view
  if (memoryPool::onDevice == where) {
    cudaCheck(cudaMemcpyAsync(m_view.get(), view.get(), sizeof(TrackingRecHit2DSOAView), cudaMemcpyHostToDevice, stream));
    // cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream));
  } else {
    memoryPool::cuda::swapBuffer(m_view, view);
  }
}

// this is intended to be used only for the CPU SoA, but it doesn't hurt to have it for all cases
TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(
    float* store32, uint16_t* store16, uint32_t* modules, int nHits, memoryPool::Where where, cudaStream_t stream)
    : m_nHits(nHits), m_hitsModuleStart(modules) {
  using namespace memoryPool::cuda;
  auto view = make_buffer<TrackingRecHit2DSOAView>(1, stream, memoryPool::onCPU == where ? memoryPool::onCPU : memoryPool::onHost);

  m_view = make_buffer<TrackingRecHit2DSOAView>(1, stream, where);

  view->m_nHits = nHits;

  if (0 == nHits) {
    if (memoryPool::onDevice == where) {
      cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream));
    } else {
      m_view = std::move(view);
    }
    return;
  }

  m_store16 = make_buffer<uint16_t>(nHits * n16, stream, where);
  m_store32 = make_buffer<float>(nHits * n32, stream, where);
  m_PhiBinnerStore = make_buffer<TrackingRecHit2DSOAView::PhiBinner>(1, stream, where);
  m_AverageGeometryStore = make_buffer<TrackingRecHit2DSOAView::AverageGeometry>(1, stream, where);

  view->m_averageGeometry = m_AverageGeometryStore.get();
  view->m_hitsModuleStart = m_hitsModuleStart;

  // store transfer (cudaMemcpyAsync sizes are in bytes)
  if (memoryPool::onDevice == where) {
    cudaCheck(cudaMemcpyAsync(m_store32.get(), store32, nHits * n32 * sizeof(float), cudaMemcpyHostToDevice, stream));
    cudaCheck(cudaMemcpyAsync(m_store16.get(), store16, nHits * n16 * sizeof(uint16_t), cudaMemcpyHostToDevice, stream));
  } else {
    std::copy(store32, store32 + nHits * n32, m_store32.get());  // want to copy it
    std::copy(store16, store16 + nHits * n16, m_store16.get());
  }

  // getters
  auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };
  auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };

  // store 32
  view->m_xl = get32(Storage32::kXLocal);
  view->m_yl = get32(Storage32::kYLocal);
  view->m_xerr = get32(Storage32::kXerror);
  view->m_yerr = get32(Storage32::kYerror);
  view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(Storage32::kCharge));
  view->m_xg = get32(Storage32::kXGlobal);
  view->m_yg = get32(Storage32::kYGlobal);
  view->m_zg = get32(Storage32::kZGlobal);
  view->m_rg = get32(Storage32::kRGlobal);

  m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
  m_phiBinnerStorage = view->m_phiBinnerStorage =
      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));

  // store 16
  view->m_detInd = get16(Storage16::kDetId);
  m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));
  view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
  view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));

  // transfer the view
  if (memoryPool::onDevice == where) {
    cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream));
  } else {
    m_view = std::move(view);
  }
}

using namespace memoryPool::cuda;

memoryPool::buffer<float> TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_t stream) const {
  auto ret = make_buffer<float>(5 * nHits(), stream, memoryPool::onHost);
  // reduced copy of the first five 32-bit words per hit; cudaMemcpyAsync sizes are in bytes
  cudaCheck(cudaMemcpyAsync(ret.get(), m_store32.get(), 5 * sizeof(float) * nHits(), cudaMemcpyDefault, stream));
  return ret;
}

memoryPool::buffer<float> TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const {
  auto ret = make_buffer<float>(static_cast<int>(n32) * nHits(), stream, memoryPool::onHost);
  cudaCheck(cudaMemcpyAsync(ret.get(), m_store32.get(), static_cast<int>(n32) * nHits() * sizeof(float), cudaMemcpyDefault, stream));
  return ret;
}

memoryPool::buffer<uint16_t> TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const {
  auto ret = make_buffer<uint16_t>(static_cast<int>(n16) * nHits(), stream, memoryPool::onHost);
  cudaCheck(cudaMemcpyAsync(ret.get(), m_store16.get(), static_cast<int>(n16) * nHits() * sizeof(uint16_t), cudaMemcpyDefault, stream));
  return ret;
}

memoryPool::buffer<uint32_t> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
  auto ret = make_buffer<uint32_t>(nMaxModules() + 1, stream, memoryPool::onHost);
  if (m_hitsModuleStart)
    cudaCheck(cudaMemcpyAsync(ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (nMaxModules() + 1), cudaMemcpyDefault, stream));
  return ret;
}

void TrackingRecHit2DHost::copyFromGPU(TrackingRecHit2DGPU const* input, cudaStream_t stream) {
  assert(input);
  m_store32 = input->localCoordToHostAsync(stream);
}

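Both constructors follow the same pattern: a scratch TrackingRecHit2DSOAView is filled on the host with pointers into the pooled stores, and only that small struct is shipped to the device in a single asynchronous copy. The sketch below shows how a caller might drive the first constructor; it is not part of the commit, it assumes the headers introduced here are on the include path, and cpeParamsOnDevice and hitsModuleStartOnDevice are hypothetical pointers owned by the caller.

```cpp
// Sketch only, not part of the commit.
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

#include <cuda_runtime.h>

void buildDeviceHits(uint32_t nHits,
                     pixelCPEforGPU::ParamsOnGPU const* cpeParamsOnDevice,
                     uint32_t const* hitsModuleStartOnDevice) {
  cudaStream_t stream;
  cudaCheck(cudaStreamCreate(&stream));

  // Per-hit stores come from the memory pool tied to this stream; the SoA view
  // is assembled on the host and copied to the device once by the constructor.
  TrackingRecHit2DHeterogeneous hits(nHits,
                                     /*isPhase2=*/false,
                                     /*offsetBPIX2=*/0,
                                     cpeParamsOnDevice,
                                     hitsModuleStartOnDevice,
                                     memoryPool::onDevice,
                                     stream,
                                     /*input=*/nullptr);

  // ... enqueue kernels that fill and consume the hit SoA on the same stream ...

  cudaCheck(cudaStreamSynchronize(stream));
  cudaCheck(cudaStreamDestroy(stream));
}
```

Because the BundleDelete deleter carries the stream, the pooled buffers are handed back to the pool only after the work already queued on that stream has run (via the cudaLaunchHostFunc callback defined in cudaMemoryPoolImpl.h below).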
81 changes: 81 additions & 0 deletions
HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPoolImpl.h
@@ -0,0 +1,81 @@
// #include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" | ||
|
||
#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h" | ||
|
||
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" | ||
|
||
#include <cuda_runtime.h> | ||
#include <cuda_runtime_api.h> | ||
|
||
#include<iostream> | ||
|
||
namespace { | ||
|
||
// free callback | ||
void CUDART_CB freeCallback(void *p) { | ||
// std::cout << "free callaback" << std::endl; | ||
auto payload = (memoryPool::Payload *)(p); | ||
memoryPool::scheduleFree(payload); | ||
} | ||
|
||
} | ||

struct CudaAlloc {
  static void scheduleFree(memoryPool::Payload *payload, cudaStream_t stream) {
    // std::cout << "schedule free" << std::endl;
    if (stream)
      cudaCheck(cudaLaunchHostFunc(stream, freeCallback, payload));
    else
      memoryPool::scheduleFree(payload);
  }
};

struct CudaDeviceAlloc : public CudaAlloc {
  using Pointer = void *;

  static Pointer alloc(size_t size) {
    Pointer p = nullptr;
    auto err = cudaMalloc(&p, size);
    return err == cudaSuccess ? p : nullptr;
  }
  static void free(Pointer ptr) { cudaFree(ptr); }
};

struct CudaHostAlloc : public CudaAlloc {
  using Pointer = void *;

  static Pointer alloc(size_t size) {
    Pointer p = nullptr;
    auto err = cudaMallocHost(&p, size);
    return err == cudaSuccess ? p : nullptr;
  }
  static void free(Pointer ptr) { cudaFreeHost(ptr); }
};

namespace memoryPool {
  namespace cuda {

    void dumpStat();

    SimplePoolAllocator *getPool(Where where);

    // allocate either on current device or on host (actually anywhere, not cuda specific)
    inline std::pair<void *, int> alloc(uint64_t size, SimplePoolAllocator &pool) {
      int i = pool.alloc(size);
      void *p = pool.pointer(i);
      return std::pair<void *, int>(p, i);
    }

    // schedule free
    inline void free(cudaStream_t stream, std::vector<int> buckets, SimplePoolAllocator &pool) {
      auto payload = new Payload{&pool, std::move(buckets)};
      CudaHostAlloc::scheduleFree(payload, stream);
    }

  }  // namespace cuda
}  // namespace memoryPool

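To make the deferred-free mechanism concrete, here is a small sketch of the low-level helpers declared above: alloc() takes a bucket from a pool and returns the raw pointer together with the bucket index, while free() hands the index back through a Payload whose actual release is deferred with cudaLaunchHostFunc until everything already queued on the stream has run. It is only an illustration, assuming this header (included after cudaMemoryPool.h) provides the declarations used; error handling is omitted.

```cpp
// Sketch only: exercises the low-level pool helpers above (error handling omitted).
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPoolImpl.h"

#include <cstdint>
#include <vector>

void poolRoundTrip(cudaStream_t stream) {
  // Pool backing allocations on the current device.
  SimplePoolAllocator *pool = memoryPool::cuda::getPool(memoryPool::onDevice);

  // Take one bucket of 1 MB: alloc() returns the raw pointer and the bucket index.
  auto [ptr, bucket] = memoryPool::cuda::alloc(1 << 20, *pool);

  // ... enqueue kernels or copies on `stream` that use `ptr` ...
  (void)ptr;

  // Give the bucket back: the release goes through freeCallback, i.e. it happens
  // only once the work already queued on `stream` has completed.
  memoryPool::cuda::free(stream, std::vector<int>{bucket}, *pool);
}
```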
64 changes: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" | ||
|
||
#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h" | ||
|
||
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" | ||
|
||
#include <cuda_runtime.h> | ||
#include <cuda_runtime_api.h> | ||
|
||
#include<iostream> | ||
|
||
namespace { | ||
|
||
constexpr int poolSize = 128 * 1024; | ||
|
||
SimplePoolAllocatorImpl<PosixAlloc> cpuPool(poolSize); | ||
|
||
SimplePoolAllocatorImpl<CudaHostAlloc> hostPool(poolSize); | ||
|
||
struct DevicePools { | ||
using Pool = SimplePoolAllocatorImpl<CudaDeviceAlloc>; | ||
DevicePools(int size) { | ||
int devices = 0; | ||
auto status = cudaGetDeviceCount(&devices); | ||
if (status == cudaSuccess && devices > 0) { | ||
m_devicePools.reserve(devices); | ||
for (int i = 0; i < devices; ++i) | ||
m_devicePools.emplace_back(new Pool(size)); | ||
} | ||
} | ||
//return pool for current device | ||
Pool &operator()() { | ||
int dev = -1; | ||
cudaGetDevice(&dev); | ||
return *m_devicePools[dev]; | ||
} | ||
|
||
std::vector<std::unique_ptr<Pool>> m_devicePools; | ||
}; | ||
|
||
DevicePools devicePool(poolSize); | ||
|
||
} // namespace | ||

namespace memoryPool {
  namespace cuda {

    void dumpStat() {
      std::cout << "device pool" << std::endl;
      devicePool().dumpStat();
      std::cout << "host pool" << std::endl;
      hostPool.dumpStat();
    }

    SimplePoolAllocator *getPool(Where where) {
      return onCPU == where
                 ? (SimplePoolAllocator *)(&cpuPool)
                 : (onDevice == where ? (SimplePoolAllocator *)(&devicePool()) : (SimplePoolAllocator *)(&hostPool));
    }

  }  // namespace cuda
}  // namespace memoryPool
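The three pools defined above (cpuPool, hostPool, and one device pool per visible GPU) are what getPool() hands out, and they are what the make_buffer calls in the TrackingRecHit2DHeterogeneous constructors ultimately draw from. Below is a rough usage sketch, under the assumption that make_buffer, copy and the buffer type are declared in cudaMemoryPool.h with the signatures used in the first file of this commit; it is an illustration, not part of the change.

```cpp
// Sketch only: allocate from the host and device pools, copy between them,
// and dump pool occupancy. Assumes the declarations used in this commit.
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

#include <cuda_runtime.h>

void copyThroughPools(cudaStream_t stream) {
  using namespace memoryPool::cuda;

  constexpr size_t n = 1024;

  // One buffer from the pinned-host pool, one from the pool of the current device.
  auto hostBuf = make_buffer<float>(n, stream, memoryPool::onHost);
  auto devBuf = make_buffer<float>(n, stream, memoryPool::onDevice);

  for (size_t i = 0; i < n; ++i)
    hostBuf.get()[i] = float(i);

  // Asynchronous host-to-device copy; as with cudaMemcpyAsync, the size is in bytes.
  cudaCheck(copy(devBuf, hostBuf, n * sizeof(float), stream));

  // Print the occupancy of the device and host pools (useful when tuning poolSize).
  dumpStat();
}
```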