
Commit

change name
VinInn committed May 5, 2022
1 parent c429d13 commit 6b050bd
Showing 6 changed files with 94 additions and 81 deletions.
CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
@@ -2,10 +2,12 @@
#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h

#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"

template <typename Traits>
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

template <memoryPool::Where where>
class TrackingRecHit2DHeterogeneous {
public:
enum class Storage32 {
@@ -30,7 +32,7 @@ class TrackingRecHit2DHeterogeneous {
};

template <typename T>
using unique_ptr = typename Traits::template unique_ptr<T>;
using unique_ptr = typename memoryPool::unique_ptr<T>;

using PhiBinner = TrackingRecHit2DSOAView::PhiBinner;

@@ -43,7 +45,7 @@ class TrackingRecHit2DHeterogeneous {
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream,
TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input = nullptr);
TrackingRecHit2DHeterogeneous<memoryPool::onDevice> const* input = nullptr);

explicit TrackingRecHit2DHeterogeneous(
float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream = nullptr);
@@ -67,22 +69,22 @@ class TrackingRecHit2DHeterogeneous {
auto phiBinnerStorage() { return m_phiBinnerStorage; }
auto iphi() { return m_iphi; }

cms::cuda::host::unique_ptr<float[]> localCoordToHostAsync(cudaStream_t stream) const;
unique_ptr<float> localCoordToHostAsync(cudaStream_t stream) const;

cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const;
unique_ptr<uint32_t> hitsModuleStartToHostAsync(cudaStream_t stream) const;

cms::cuda::host::unique_ptr<uint16_t[]> store16ToHostAsync(cudaStream_t stream) const;
cms::cuda::host::unique_ptr<float[]> store32ToHostAsync(cudaStream_t stream) const;
unique_ptr<uint16_t> store16ToHostAsync(cudaStream_t stream) const;
unique_ptr<float> store32ToHostAsync(cudaStream_t stream) const;

// needs specialization for Host
void copyFromGPU(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input, cudaStream_t stream);
void copyFromGPU(TrackingRecHit2DHeterogeneous<memoryPool::onDevice> const* input, cudaStream_t stream);

private:
static constexpr uint32_t n16 = 4; // number of elements in m_store16
static constexpr uint32_t n32 = 10; // number of elements in m_store32
static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious
static_assert(n32 == static_cast<uint32_t>(Storage32::kLayers));
unique_ptr<uint16_t[]> m_store16; //!
unique_ptr<uint16_t> m_store16; //!
unique_ptr<float[]> m_store32; //!

unique_ptr<TrackingRecHit2DSOAView::PhiBinner> m_PhiBinnerStore; //!
@@ -103,39 +105,38 @@ class TrackingRecHit2DHeterogeneous {
int16_t* m_iphi;
};

using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::CPUTraits>;
using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous<cms::cudacompat::HostTraits>;
using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous<memoryPool::onDevice>;
using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous<memoryPool::onCPU>;
using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous<memoryPool::onHost>;

#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

template <typename Traits>
TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
template <memoryPool::Where where>
TrackingRecHit2DHeterogeneous<where>::TrackingRecHit2DHeterogeneous(
uint32_t nHits,
bool isPhase2,
int32_t offsetBPIX2,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream,
TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input)
TrackingRecHit2DHeterogeneous<memoryPool::onDevice> const* input)
: m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) {
auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);
using namespace memoryPool::cuda;
auto view = make_unique<TrackingRecHit2DSOAView>(1, stream, memoryPool::onCPU == where ? memoryPool::onCPU : memoryPool::onHost);

m_nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;

view->m_nHits = nHits;
view->m_nMaxModules = m_nMaxModules;
m_view = Traits::template make_unique<TrackingRecHit2DSOAView>(stream); // leave it on host and pass it by value?
m_AverageGeometryStore = Traits::template make_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
m_view = make_unique<TrackingRecHit2DSOAView>(1, stream, where);  // leave it on host and pass it by value?
m_AverageGeometryStore = make_unique<TrackingRecHit2DSOAView::AverageGeometry>(1, stream, where);
view->m_averageGeometry = m_AverageGeometryStore.get();
view->m_cpeParams = cpeParams;
view->m_hitsModuleStart = hitsModuleStart;

// if empty do not bother
if (0 == nHits) {
if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
cms::cuda::copyAsync(m_view, view, stream);
if constexpr (memoryPool::onDevice == where) {
cudaCheck(cudaMemcpyAsync(m_view.get(), view.get(), sizeof(TrackingRecHit2DSOAView), cudaMemcpyHostToDevice, stream));
} else {
m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version
}
@@ -149,17 +150,17 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
// so unless proven VERY inefficient we keep it ordered as generated

// host copy is "reduced" (to be reviewed at some point)
if constexpr (std::is_same_v<Traits, cms::cudacompat::HostTraits>) {
if constexpr (memoryPool::onHost == where) {
// it has to compile for ALL cases
copyFromGPU(input, stream);
} else {
assert(input == nullptr);

auto nL = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers;

m_store16 = Traits::template make_unique<uint16_t[]>(nHits * n16, stream);
m_store32 = Traits::template make_unique<float[]>(nHits * n32 + nL + 1, stream);
m_PhiBinnerStore = Traits::template make_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);
m_store16 = make_unique<uint16_t>(nHits * n16, stream, where);
m_store32 = make_unique<float>(nHits * n32 + nL + 1, stream, where);
m_PhiBinnerStore = make_unique<TrackingRecHit2DSOAView::PhiBinner>(1, stream, where);
}

static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
@@ -178,7 +179,7 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
view->m_yerr = get32(Storage32::kYerror);
view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(Storage32::kCharge));

if constexpr (!std::is_same_v<Traits, cms::cudacompat::HostTraits>) {
if constexpr (memoryPool::onHost != where) {
assert(input == nullptr);
view->m_xg = get32(Storage32::kXGlobal);
view->m_yg = get32(Storage32::kYGlobal);
@@ -197,45 +198,46 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
}

// transfer view
if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
cms::cuda::copyAsync(m_view, view, stream);
if constexpr (memoryPool::onDevice == where) {
cudaCheck(cudaMemcpyAsync(m_view.get(), view.get(), sizeof(TrackingRecHit2DSOAView), cudaMemcpyHostToDevice, stream));
} else {
m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version
}
}

//this is intended to be used only for CPU SoA but doesn't hurt to have it for all cases
template <typename Traits>
TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
template <memoryPool::Where where>
TrackingRecHit2DHeterogeneous<where>::TrackingRecHit2DHeterogeneous(
float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream)
: m_nHits(nHits), m_hitsModuleStart(modules) {
auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);

m_view = Traits::template make_unique<TrackingRecHit2DSOAView>(stream);
auto view = make_unique<TrackingRecHit2DSOAView>(1, stream, memoryPool::onCPU == where ? memoryPool::onCPU : memoryPool::onHost);

m_view = make_unique<TrackingRecHit2DSOAView>(1, stream, where);

view->m_nHits = nHits;

if (0 == nHits) {
if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
cms::cuda::copyAsync(m_view, view, stream);
if constexpr (memoryPool::onDevice == where) {
cudaCheck(cudaMemcpyAsync(m_view.get(), view.get(), sizeof(TrackingRecHit2DSOAView), cudaMemcpyHostToDevice, stream));
} else {
m_view = std::move(view);
}
return;
}

m_store16 = Traits::template make_unique<uint16_t[]>(nHits * n16, stream);
m_store32 = Traits::template make_unique<float[]>(nHits * n32, stream);
m_PhiBinnerStore = Traits::template make_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);
m_AverageGeometryStore = Traits::template make_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
m_store16 = make_unique<uint16_t>(nHits * n16, stream, where);
m_store32 = make_unique<float>(nHits * n32, stream, where);
m_PhiBinnerStore = make_unique<TrackingRecHit2DSOAView::PhiBinner>(1, stream, where);
m_AverageGeometryStore = make_unique<TrackingRecHit2DSOAView::AverageGeometry>(1, stream, where);

view->m_averageGeometry = m_AverageGeometryStore.get();
view->m_hitsModuleStart = m_hitsModuleStart;

//store transfer
if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
cms::cuda::copyAsync(m_store16, store16, stream);
cms::cuda::copyAsync(m_store32, store32, stream);
if constexpr (memoryPool::onDevice == where) {
cudaCheck(cudaMemcpyAsync(m_store32.get(), store32, sizeof(float) * nHits * n32, cudaMemcpyHostToDevice, stream));
cudaCheck(cudaMemcpyAsync(m_store16.get(), store16, sizeof(uint16_t) * nHits * n16, cudaMemcpyHostToDevice, stream));
} else {
std::copy(store32, store32 + nHits * n32, m_store32.get()); // want to copy it
std::copy(store16, store16 + nHits * n16, m_store16.get());
@@ -267,10 +269,10 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));

// transfer view
if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
cms::cuda::copyAsync(m_view, view, stream);
if constexpr (memoryPool::onDevice == where) {
cudaCheck(cudaMemcpyAsync(m_view.get(), view.get(), sizeof(TrackingRecHit2DSOAView), cudaMemcpyHostToDevice, stream));
} else {
m_view = std::move(view);
}
}
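For readers new to this pattern: the constructors above stage a host-side copy of the SoA view, fill it with device pointers, and then transfer the whole struct to the device with one cudaMemcpyAsync. Below is a minimal standalone sketch of that pattern using plain CUDA runtime calls instead of the memory pool; MiniView is an illustrative stand-in for TrackingRecHit2DSOAView, not part of this code base.

#include <cuda_runtime.h>

struct MiniView {
  float* data;         // device pointer stored inside the view
  unsigned int nHits;  // payload size
};

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  constexpr unsigned int nHits = 20;

  // device storage the view will point to
  float* d_data = nullptr;
  cudaMalloc(&d_data, sizeof(float) * nHits);

  // host-side staging view (pinned, so the async copy does not block)
  MiniView* h_view = nullptr;
  cudaMallocHost(&h_view, sizeof(MiniView));
  h_view->data = d_data;
  h_view->nHits = nHits;

  // device-side view, filled with a single async copy of the whole struct
  MiniView* d_view = nullptr;
  cudaMalloc(&d_view, sizeof(MiniView));
  cudaMemcpyAsync(d_view, h_view, sizeof(MiniView), cudaMemcpyHostToDevice, stream);

  // the staging view must outlive the copy
  cudaStreamSynchronize(stream);

  cudaFree(d_view);
  cudaFreeHost(h_view);
  cudaFree(d_data);
  cudaStreamDestroy(stream);
  return 0;
}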

CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
@@ -4,6 +4,7 @@
#include <cuda_runtime.h>

#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
@@ -25,7 +26,7 @@ class TrackingRecHit2DSOAView {

using AverageGeometry = pixelTopology::AverageGeometry;

template <typename>
template <memoryPool::Where>
friend class TrackingRecHit2DHeterogeneous;
friend class TrackingRecHit2DReduced;

31 changes: 17 additions & 14 deletions CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc
@@ -1,33 +1,36 @@
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

using namespace memoryPool::cuda;

template <>
cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<float[]>(5 * nHits(), stream);
cms::cuda::copyAsync(ret, m_store32, 5 * nHits(), stream);
memoryPool::unique_ptr<float> TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_t stream) const {
auto ret = make_unique<float>(5 * nHits(), stream, onHost);
cudaCheck(
cudaMemcpyAsync(ret.get(), m_store32.get(), sizeof(float) * 5 * nHits(), cudaMemcpyDefault, stream));
return ret;
}

template <>
cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<float[]>(static_cast<int>(n32) * nHits(), stream);
cms::cuda::copyAsync(ret, m_store32, static_cast<int>(n32) * nHits(), stream);
memoryPool::unique_ptr<float> TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const {
auto ret = make_unique<float>(static_cast<int>(n32) * nHits(), stream, onHost);
cudaCheck(
cudaMemcpyAsync(ret.get(), m_store32.get(), sizeof(float) * static_cast<int>(n32) * nHits(), cudaMemcpyDefault, stream));
return ret;
}

template <>
cms::cuda::host::unique_ptr<uint16_t[]> TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<uint16_t[]>(static_cast<int>(n16) * nHits(), stream);
cms::cuda::copyAsync(ret, m_store16, static_cast<int>(n16) * nHits(), stream);
memoryPool::unique_ptr<uint16_t> TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const {
auto ret = make_unique<uint16_t>(static_cast<int>(n16) * nHits(), stream, onHost);
cudaCheck(
cudaMemcpyAsync(ret.get(), m_store16.get(), sizeof(uint16_t) * static_cast<int>(n16) * nHits(), cudaMemcpyDefault, stream));
return ret;
}

template <>
cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<uint32_t[]>(nMaxModules() + 1, stream);
memoryPool::unique_ptr<uint32_t> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
auto ret = make_unique<uint32_t>(nMaxModules() + 1, stream, onHost);
cudaCheck(
cudaMemcpyAsync(ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (nMaxModules() + 1), cudaMemcpyDefault, stream));
return ret;
17 changes: 11 additions & 6 deletions HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h
@@ -18,6 +18,11 @@ namespace memoryPool {
// schedule free
void free(cudaStream_t stream, std::vector<int> buckets, SimplePoolAllocator &pool);

template<typename T>
auto copy(buffer<T> & dst,buffer<T> const & src, uint64_t size, cudaStream_t stream) {
return cudaMemcpyAsync(dst.get(), src.get(), sizeof(T)*size, cudaMemcpyDefault, stream);
}

struct CudaDeleterBase : public DeleterBase {
CudaDeleterBase(cudaStream_t const &stream, Where where) : DeleterBase(getPool(where)), m_stream(stream) {}

@@ -46,24 +51,24 @@ namespace memoryPool {
};

template <typename T>
unique_ptr<T> make_unique(uint64_t size, Deleter del) {
buffer<T> make_buffer(uint64_t size, Deleter del) {
auto ret = alloc(sizeof(T) * size, *del.pool());
if (ret.second < 0)
throw std::bad_alloc();
del.setBucket(ret.second);
return unique_ptr<T>((T *)(ret.first), del);
return buffer<T>((T *)(ret.first), del);
}

template <typename T>
unique_ptr<T> make_unique(uint64_t size, cudaStream_t const &stream, Where where) {
return make_unique<T>(sizeof(T) * size, Deleter(std::make_shared<DeleteOne>(stream, getPool(where))));
buffer<T> make_buffer(uint64_t size, cudaStream_t const &stream, Where where) {
return make_buffer<T>(sizeof(T) * size, Deleter(std::make_shared<DeleteOne>(stream, getPool(where))));
}

/*
template< class T, class... Args >
memoryPool::unique_ptr<T> make_unique( Args&&... args );
memoryPool::buffer<T> make_buffer( Args&&... args );
template< class T, class... Args >
memoryPool::unique_ptr<T> make_unique(Deleter del, Args&&... args );
memoryPool::buffer<T> make_buffer(Deleter del, Args&&... args );
*/

} // namespace cuda
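A sketch of how the new make_buffer()/copy() helpers fit together; it mirrors the calls in the updated testPoolUI.cu below and assumes a CMSSW build environment in which the device and host pools have already been initialised. The function name transferExample is illustrative only.

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

void transferExample(cudaStream_t stream) {
  // one 20-element buffer in device memory, one in pinned host memory
  auto dev = memoryPool::cuda::make_buffer<int>(20, stream, memoryPool::onDevice);
  auto host = memoryPool::cuda::make_buffer<int>(20, stream, memoryPool::onHost);

  for (int i = 0; i < 20; ++i)
    host.get()[i] = i;

  // copy() wraps cudaMemcpyAsync with cudaMemcpyDefault and multiplies the element count by sizeof(T)
  cudaCheck(memoryPool::cuda::copy(dev, host, 20, stream));
  cudaCheck(memoryPool::cuda::copy(host, dev, 20, stream));
  cudaCheck(cudaStreamSynchronize(stream));
}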
2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDAUtilities/interface/memoryPool.h
@@ -42,6 +42,6 @@ namespace memoryPool {
};

template <typename T>
using unique_ptr = std::unique_ptr<T, Deleter>;
using buffer = std::unique_ptr<T, Deleter>;

} // namespace memoryPool
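For context, buffer<T> is simply a std::unique_ptr whose stateful deleter returns the allocation to a pool instead of freeing it. A toy, self-contained illustration of that idea follows; ToyPool, ToyDeleter and the other names are illustrative only, not the CMSSW classes, and the toy ignores block sizes on reuse (the real pool buckets by size).

#include <cstdlib>
#include <iostream>
#include <memory>
#include <vector>

struct ToyPool {
  std::vector<void*> freeList;  // blocks ready for reuse
  void* allocate(std::size_t bytes) {
    if (!freeList.empty()) {  // reuse a returned block if one is available
      void* p = freeList.back();
      freeList.pop_back();
      return p;
    }
    return std::malloc(bytes);
  }
  void release(void* p) { freeList.push_back(p); }  // hand the block back to the pool
  ~ToyPool() {
    for (void* p : freeList)
      std::free(p);
  }
};

struct ToyDeleter {
  ToyPool* pool = nullptr;
  void operator()(void* p) const { pool->release(p); }
};

template <typename T>
using toy_buffer = std::unique_ptr<T, ToyDeleter>;  // same shape as memoryPool::buffer<T>

template <typename T>
toy_buffer<T> make_toy_buffer(std::size_t n, ToyPool& pool) {
  return toy_buffer<T>(static_cast<T*>(pool.allocate(sizeof(T) * n)), ToyDeleter{&pool});
}

int main() {
  ToyPool pool;
  {
    auto b = make_toy_buffer<int>(20, pool);
    b.get()[0] = 42;
  }  // the deleter returns the block to the pool here instead of freeing it
  std::cout << "blocks available for reuse: " << pool.freeList.size() << "\n";  // prints 1
  return 0;
}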
28 changes: 15 additions & 13 deletions HeterogeneousCore/CUDAUtilities/test/testPoolUI.cu
@@ -3,7 +3,7 @@

template <memoryPool::Where where>
struct dataProducer {
static auto operator()(cudaStream_t stream) { return memoryPool::cuda::make_unique<int>(20, stream, where); }
auto operator()(cudaStream_t stream) { return memoryPool::cuda::make_buffer<int>(20, stream, where); }
};

int main() {
@@ -33,28 +33,30 @@ int main() {
auto& stream = streams[0];

{
auto pd = memoryPool::cuda::make_unique<int>(20, stream, memoryPool::onDevice);
auto ph = memoryPool::cuda::make_unique<int>(20, stream, memoryPool::onHost);
auto pc = memoryPool::cuda::make_unique<int>(20, stream, memoryPool::onCPU);
auto pd = memoryPool::cuda::make_buffer<int>(20, stream, memoryPool::onDevice);
auto ph = memoryPool::cuda::make_buffer<int>(20, stream, memoryPool::onHost);
auto pc = memoryPool::cuda::make_buffer<int>(20, stream, memoryPool::onCPU);

auto dp = dataProducer<onDevice>(stream);
auto dp = dataProducer<memoryPool::onDevice>()(stream);

memoryPool::cuda::copy(ph, pd, 20, stream);
memoryPool::cuda::copy(pd, ph, 20, stream);
memoryPool::cuda::dumpStat();
}

{
memoryPool::Deleter devDeleter(std::make_shared<memoryPool::cuda::BundleDelete>(stream, memoryPool::onDevice));
memoryPool::Deleter hosDeleter(std::make_shared<memoryPool::cuda::BundleDelete>(stream, memoryPool::onHost));

auto p0 = memoryPool::cuda::make_unique<int>(20, devDeleter);
auto p1 = memoryPool::cuda::make_unique<double>(20, devDeleter);
auto p2 = memoryPool::cuda::make_unique<bool>(20, devDeleter);
auto p3 = memoryPool::cuda::make_unique<int>(20, devDeleter);
auto p0 = memoryPool::cuda::make_buffer<int>(20, devDeleter);
auto p1 = memoryPool::cuda::make_buffer<double>(20, devDeleter);
auto p2 = memoryPool::cuda::make_buffer<bool>(20, devDeleter);
auto p3 = memoryPool::cuda::make_buffer<int>(20, devDeleter);

auto hp0 = memoryPool::cuda::make_unique<int>(20, hosDeleter);
auto hp1 = memoryPool::cuda::make_unique<double>(20, hosDeleter);
auto hp2 = memoryPool::cuda::make_unique<bool>(20, hosDeleter);
auto hp3 = memoryPool::cuda::make_unique<int>(20, hosDeleter);
auto hp0 = memoryPool::cuda::make_buffer<int>(20, hosDeleter);
auto hp1 = memoryPool::cuda::make_buffer<double>(20, hosDeleter);
auto hp2 = memoryPool::cuda::make_buffer<bool>(20, hosDeleter);
auto hp3 = memoryPool::cuda::make_buffer<int>(20, hosDeleter);

memoryPool::cuda::dumpStat();
}
