Commit: move inline
VinInn committed May 6, 2022
1 parent 21a646e commit 5291489
Showing 3 changed files with 367 additions and 0 deletions.
@@ -0,0 +1,222 @@
// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"

#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"


TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(
uint32_t nHits,
bool isPhase2,
int32_t offsetBPIX2,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
memoryPool::Where where,
cudaStream_t stream,
TrackingRecHit2DHeterogeneous const* input)
: m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) {

using namespace memoryPool::cuda;

memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared<memoryPool::cuda::BundleDelete>(stream,where));
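// (one shared deleter: every buffer allocated with it below is bundled and handed back to the pool together, with the release tied to `stream`)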
assert(deleter.pool());
auto view = make_buffer<TrackingRecHit2DSOAView>(1,stream, memoryPool::onCPU==where ? memoryPool::onCPU : memoryPool::onHost);
assert(view.get());
m_nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
view->m_nHits = nHits;
view->m_nMaxModules = m_nMaxModules;
m_view = make_buffer<TrackingRecHit2DSOAView>(1, deleter);  // leave it on host and pass it by value?
assert(m_view.get());
m_AverageGeometryStore = make_buffer<TrackingRecHit2DSOAView::AverageGeometry>(1,deleter);
view->m_averageGeometry = m_AverageGeometryStore.get();
view->m_cpeParams = cpeParams;
view->m_hitsModuleStart = hitsModuleStart;

// if empty, do not bother
if (0 == nHits) {
if (memoryPool::onDevice == where) {
cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream));
} else {
memoryPool::cuda::swapBuffer(m_view,view);
}
return;
}

// the single arrays are not 128-bit aligned...
// the hits are actually accessed in order only during building;
// if the ordering ever becomes relevant they may have to be stored phi-ordered by layer or so,
// but that would break the 1-to-1 correspondence with clusters and the module locality,
// so unless proven VERY inefficient we keep them ordered as generated

// host copy is "reduced" (to be reviewed at some point)
if (memoryPool::onHost == where) {
// it has to compile for ALL cases
copyFromGPU(input, stream);
} else {
assert(input == nullptr);

auto nL = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers;
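// note: the 32-bit store below is oversized by nL + 1 words so that the per-layer hit-start offsets (Storage32::kLayers) can live at its end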

m_store16 = make_buffer<uint16_t>(nHits * n16, deleter);
m_store32 = make_buffer<float>(nHits * n32 + nL + 1, deleter);
m_PhiBinnerStore = make_buffer<TrackingRecHit2DSOAView::PhiBinner>(1,deleter);
}

static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type));

auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };

// copy all the pointers
m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
m_phiBinnerStorage = view->m_phiBinnerStorage =
reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));

view->m_xl = get32(Storage32::kXLocal);
view->m_yl = get32(Storage32::kYLocal);
view->m_xerr = get32(Storage32::kXerror);
view->m_yerr = get32(Storage32::kYerror);
view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(Storage32::kCharge));

if (memoryPool::onHost != where) {
assert(input == nullptr);
view->m_xg = get32(Storage32::kXGlobal);
view->m_yg = get32(Storage32::kYGlobal);
view->m_zg = get32(Storage32::kZGlobal);
view->m_rg = get32(Storage32::kRGlobal);

auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));

view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
view->m_detInd = get16(Storage16::kDetId);

m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(Storage32::kLayers));
}

// transfer view
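// the SoA view was filled on the host above: for device storage copy it over asynchronously, otherwise the host staging buffer itself simply becomes the view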
if (memoryPool::onDevice == where) {
cudaCheck(cudaMemcpyAsync(m_view.get(), view.get(),sizeof(TrackingRecHit2DSOAView), cudaMemcpyHostToDevice, stream));
// cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream));
} else {
memoryPool::cuda::swapBuffer(m_view,view);
}
}

// this is intended to be used only for the CPU SoA, but it doesn't hurt to have it for all cases
TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(
float* store32, uint16_t* store16, uint32_t* modules, int nHits, memoryPool::Where where, cudaStream_t stream)
: m_nHits(nHits), m_hitsModuleStart(modules) {

using namespace memoryPool::cuda;
auto view = make_buffer<TrackingRecHit2DSOAView>(1,stream,memoryPool::onCPU==where? memoryPool::onCPU : memoryPool::onHost);

m_view = make_buffer<TrackingRecHit2DSOAView>(1,stream,where);

view->m_nHits = nHits;

if (0 == nHits) {
if (memoryPool::onDevice == where) {
cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream));
} else {
m_view = std::move(view);
}
return;
}

m_store16 = make_buffer<uint16_t>(nHits * n16, stream,where);
m_store32 = make_buffer<float>(nHits * n32, stream,where);
m_PhiBinnerStore = make_buffer<TrackingRecHit2DSOAView::PhiBinner>(1,stream,where);
m_AverageGeometryStore = make_buffer<TrackingRecHit2DSOAView::AverageGeometry>(1,stream,where);

view->m_averageGeometry = m_AverageGeometryStore.get();
view->m_hitsModuleStart = m_hitsModuleStart;

//store transfer
if (memoryPool::onDevice == where) {
cudaCheck(cudaMemcpyAsync(m_store32.get(), store32, nHits * n32 * sizeof(float), cudaMemcpyHostToDevice, stream));
cudaCheck(cudaMemcpyAsync(m_store16.get(), store16, nHits * n16 * sizeof(uint16_t), cudaMemcpyHostToDevice, stream));
} else {
std::copy(store32, store32 + nHits * n32, m_store32.get()); // want to copy it
std::copy(store16, store16 + nHits * n16, m_store16.get());
}

//getters
auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };
auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };

//Store 32
view->m_xl = get32(Storage32::kXLocal);
view->m_yl = get32(Storage32::kYLocal);
view->m_xerr = get32(Storage32::kXerror);
view->m_yerr = get32(Storage32::kYerror);
view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(Storage32::kCharge));
view->m_xg = get32(Storage32::kXGlobal);
view->m_yg = get32(Storage32::kYGlobal);
view->m_zg = get32(Storage32::kZGlobal);
view->m_rg = get32(Storage32::kRGlobal);

m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
m_phiBinnerStorage = view->m_phiBinnerStorage =
reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));

//Store 16
view->m_detInd = get16(Storage16::kDetId);
m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));
view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));

// transfer view
if (memoryPool::onDevice == where) {
cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream));
} else {
m_view = std::move(view);
}
}



using namespace memoryPool::cuda;

memoryPool::buffer<float> TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_t stream) const {
auto ret = make_buffer<float>(5 * nHits(), stream,memoryPool::onHost);
cudaCheck(
cudaMemcpyAsync(ret.get(), m_store32.get(), 5 * sizeof(float) * nHits(), cudaMemcpyDefault, stream));
return ret;
}


memoryPool::buffer<float> TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const {
auto ret = make_buffer<float>(static_cast<int>(n32) * nHits(), stream,memoryPool::onHost);
cudaCheck(
cudaMemcpyAsync(ret.get(), m_store32.get(), static_cast<int>(n32) * sizeof(float) * nHits(), cudaMemcpyDefault, stream));
return ret;
}


memoryPool::buffer<uint16_t> TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const {
auto ret = make_buffer<uint16_t>(static_cast<int>(n16) * nHits(), stream,memoryPool::onHost);
cudaCheck(
cudaMemcpyAsync(ret.get(), m_store16.get(), static_cast<int>(n16) * sizeof(uint16_t) * nHits(), cudaMemcpyDefault, stream));
return ret;
}

memoryPool::buffer<uint32_t> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
auto ret = make_buffer<uint32_t>(nMaxModules() + 1, stream,memoryPool::onHost);
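// if no module-start array was provided, skip the copy and hand the buffer back as-is (left uninitialized)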
if (m_hitsModuleStart) cudaCheck(
cudaMemcpyAsync(ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (nMaxModules() + 1), cudaMemcpyDefault, stream));
return ret;
}

void TrackingRecHit2DHost::copyFromGPU(TrackingRecHit2DGPU const* input, cudaStream_t stream) {
assert(input);
m_store32 = input->localCoordToHostAsync(stream);
}
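For orientation, a minimal usage sketch, not part of the commit: it assumes TrackingRecHit2DGPU is the memoryPool::onDevice flavour of TrackingRecHit2DHeterogeneous (as the member definitions above suggest), that the constructor's trailing input argument may be passed as nullptr, and that cpeParams and hitsModuleStart already point to device memory; all names in the sketch are illustrative.

// hypothetical sketch, not part of this commit
void buildAndReadBack(uint32_t nHits,
                      int32_t offsetBPIX2,
                      pixelCPEforGPU::ParamsOnGPU const* cpeParams,  // assumed on device
                      uint32_t const* hitsModuleStart,               // assumed on device
                      cudaStream_t stream) {
  // build the device-side SoA (Phase-1 geometry, no host input to copy from)
  TrackingRecHit2DGPU hits(nHits, /*isPhase2=*/false, offsetBPIX2,
                           cpeParams, hitsModuleStart,
                           memoryPool::onDevice, stream, nullptr);

  // pull the first five 32-bit words per hit (the local coordinates) back to a host buffer
  auto localCoord = hits.localCoordToHostAsync(stream);
  cudaCheck(cudaStreamSynchronize(stream));  // make the asynchronous copy visible on the host
  if (nHits > 0) {
    float xl0 = localCoord.get()[0];  // x_local of the first hit
    (void)xl0;
  }
}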



81 changes: 81 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPoolImpl.h
@@ -0,0 +1,81 @@
// #include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h"

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

#include <cuda_runtime.h>
#include <cuda_runtime_api.h>

#include <iostream>

namespace {

// free callback
void CUDART_CB freeCallback(void *p) {
// std::cout << "free callaback" << std::endl;
auto payload = (memoryPool::Payload *)(p);
memoryPool::scheduleFree(payload);
}

}

struct CudaAlloc {
static void scheduleFree(memoryPool::Payload * payload, cudaStream_t stream) {
// std::cout << "schedule free" << std::endl;
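// with a stream, enqueue the free as a host callback so the buffers are recycled only after all work already queued on the stream has completed; without one, hand the payload straight to memoryPool::scheduleFree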
if (stream)
cudaCheck(cudaLaunchHostFunc(stream, freeCallback, payload));
else
memoryPool::scheduleFree(payload);
}
};


struct CudaDeviceAlloc : public CudaAlloc {
using Pointer = void *;

static Pointer alloc(size_t size) {
Pointer p = nullptr;
auto err = cudaMalloc(&p, size);
return err == cudaSuccess ? p : nullptr;
}
static void free(Pointer ptr) { cudaFree(ptr); }

};

struct CudaHostAlloc : public CudaAlloc {
using Pointer = void *;

static Pointer alloc(size_t size) {
Pointer p = nullptr;
auto err = cudaMallocHost(&p, size);
return err == cudaSuccess ? p : nullptr;
}
static void free(Pointer ptr) { cudaFreeHost(ptr); }
};


namespace memoryPool {
namespace cuda {

void dumpStat();

SimplePoolAllocator *getPool(Where where);

// allocate either on the current device or on the host (actually anywhere: not CUDA specific)
inline std::pair<void *, int> alloc(uint64_t size, SimplePoolAllocator &pool) {
int i = pool.alloc(size);
void *p = pool.pointer(i);
return std::pair<void *, int>(p, i);
}

// schedule free
inline void free(cudaStream_t stream, std::vector<int> buckets, SimplePoolAllocator &pool) {
auto payload = new Payload{&pool, std::move(buckets)};
CudaHostAlloc::scheduleFree(payload,stream);
}

} // namespace cuda
} // namespace memoryPool
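For orientation, a hypothetical caller-side sketch, not part of the commit, showing how the two helpers above pair up around a SimplePoolAllocator: alloc hands back both the raw pointer and the pool bucket it came from, and free schedules that bucket's return on the stream (through the scheduleFree host callback), so memory still referenced by in-flight work is not recycled early.

// hypothetical sketch, not part of this commit
inline void scratchBufferExample(size_t nBytes, cudaStream_t stream) {
  // pool serving the device that is current on this thread
  SimplePoolAllocator* pool = memoryPool::cuda::getPool(memoryPool::onDevice);
  auto [ptr, bucket] = memoryPool::cuda::alloc(nBytes, *pool);
  if (nullptr == ptr)
    return;  // pool exhausted
  // ... enqueue kernels on `stream` that use `ptr` ...
  // hand the bucket back; the actual recycling is deferred until the host-function
  // callback enqueued on `stream` runs
  memoryPool::cuda::free(stream, std::vector<int>{bucket}, *pool);
}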


64 changes: 64 additions & 0 deletions HeterogeneousCore/CUDAUtilities/src/cudaMemoryPool.cc
@@ -0,0 +1,64 @@
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h"

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

#include <cuda_runtime.h>
#include <cuda_runtime_api.h>

#include <iostream>

namespace {

constexpr int poolSize = 128 * 1024;

SimplePoolAllocatorImpl<PosixAlloc> cpuPool(poolSize);

SimplePoolAllocatorImpl<CudaHostAlloc> hostPool(poolSize);

struct DevicePools {
using Pool = SimplePoolAllocatorImpl<CudaDeviceAlloc>;
DevicePools(int size) {
int devices = 0;
auto status = cudaGetDeviceCount(&devices);
if (status == cudaSuccess && devices > 0) {
m_devicePools.reserve(devices);
for (int i = 0; i < devices; ++i)
m_devicePools.emplace_back(new Pool(size));
}
}
// return the pool for the current device
Pool &operator()() {
int dev = -1;
cudaGetDevice(&dev);
return *m_devicePools[dev];
}

std::vector<std::unique_ptr<Pool>> m_devicePools;
};

DevicePools devicePool(poolSize);

} // namespace

namespace memoryPool {
namespace cuda {

void dumpStat() {
std::cout << "device pool" << std::endl;
devicePool().dumpStat();
std::cout << "host pool" << std::endl;
hostPool.dumpStat();
}

SimplePoolAllocator *getPool(Where where) {
return onCPU == where
? (SimplePoolAllocator *)(&cpuPool)
: (onDevice == where ? (SimplePoolAllocator *)(&devicePool()) : (SimplePoolAllocator *)(&hostPool));
}

} // namespace cuda
} // namespace memoryPool

