Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RFC] Reduce calls to cudaEventRecord() via the caching allocators #412

Open
wants to merge 5 commits into
base: CMSSW_11_0_X_Patatrack
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"

BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cudaStream_t stream) {
data_d_ = cms::cuda::make_device_unique<Data>(stream);
data_d_ = cms::cuda::make_device_unique<Data>();
cudaCheck(cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream));
}
78 changes: 41 additions & 37 deletions CUDADataFormats/Common/interface/HeterogeneousSoA.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,81 +56,86 @@ namespace cudaCompat {
template <typename T>
using unique_ptr = cms::cuda::device::unique_ptr<T>;

template <typename T>
static auto make_unique(cudaStream_t stream) {
return cms::cuda::make_device_unique<T>(stream);
}

template <typename T>
static auto make_unique(size_t size, cudaStream_t stream) {
return cms::cuda::make_device_unique<T>(size, stream);
template <typename T, typename... Args>
static auto make_unique(Args &&... args) {
return cms::cuda::make_device_unique<T>(std::forward<Args>(args)...);
}

template <typename T>
static auto make_host_unique(cudaStream_t stream) {
return cms::cuda::make_host_unique<T>(stream);
template <typename T, typename... Args>
static auto make_host_unique(Args &&... args) {
return cms::cuda::make_host_unique<T>(std::forward<Args>(args)...);
}

template <typename T>
static auto make_device_unique(cudaStream_t stream) {
return cms::cuda::make_device_unique<T>(stream);
}

template <typename T>
static auto make_device_unique(size_t size, cudaStream_t stream) {
return cms::cuda::make_device_unique<T>(size, stream);
template <typename T, typename... Args>
static auto make_device_unique(Args &&... args) {
return cms::cuda::make_device_unique<T>(std::forward<Args>(args)...);
}
};

struct HostTraits {
template <typename T>
using unique_ptr = cms::cuda::host::unique_ptr<T>;

template <typename T>
static auto make_unique(cudaStream_t stream) {
return cms::cuda::make_host_unique<T>(stream);
template <typename T, typename... Args>
static auto make_unique(Args &&... args) {
return cms::cuda::make_host_unique<T>(std::forward<Args>(args)...);
}

template <typename T>
static auto make_host_unique(cudaStream_t stream) {
return cms::cuda::make_host_unique<T>(stream);
template <typename T, typename... Args>
static auto make_host_unique(Args &&... args) {
return cms::cuda::make_host_unique<T>(std::forward<Args>(args)...);
}

template <typename T>
static auto make_device_unique(cudaStream_t stream) {
return cms::cuda::make_device_unique<T>(stream);
}

template <typename T>
static auto make_device_unique(size_t size, cudaStream_t stream) {
return cms::cuda::make_device_unique<T>(size, stream);
template <typename T, typename... Args>
static auto make_device_unique(Args &&... args) {
return cms::cuda::make_device_unique<T>(std::forward<Args>(args)...);
}
};

struct CPUTraits {
template <typename T>
using unique_ptr = std::unique_ptr<T>;

template <typename T>
static auto make_unique() {
return std::make_unique<T>();
}
template <typename T>
static auto make_unique(cudaStream_t) {
return std::make_unique<T>();
}

template <typename T>
static auto make_unique(size_t size) {
return std::make_unique<T>(size);
}
template <typename T>
static auto make_unique(size_t size, cudaStream_t) {
return std::make_unique<T>(size);
}

template <typename T>
static auto make_host_unique() {
return std::make_unique<T>();
}
template <typename T>
static auto make_host_unique(cudaStream_t) {
return std::make_unique<T>();
}

template <typename T>
static auto make_device_unique() {
return std::make_unique<T>();
}
template <typename T>
static auto make_device_unique(cudaStream_t) {
return std::make_unique<T>();
}

template <typename T>
static auto make_device_unique(size_t size) {
return std::make_unique<T>(size);
}
template <typename T>
static auto make_device_unique(size_t size, cudaStream_t) {
return std::make_unique<T>(size);
Expand All @@ -146,13 +151,12 @@ class HeterogeneousSoAImpl {
template <typename V>
using unique_ptr = typename Traits::template unique_ptr<V>;

HeterogeneousSoAImpl() = default; // make root happy
HeterogeneousSoAImpl();
~HeterogeneousSoAImpl() = default;
HeterogeneousSoAImpl(HeterogeneousSoAImpl &&) = default;
HeterogeneousSoAImpl &operator=(HeterogeneousSoAImpl &&) = default;

explicit HeterogeneousSoAImpl(unique_ptr<T> &&p) : m_ptr(std::move(p)) {}
explicit HeterogeneousSoAImpl(cudaStream_t stream);

T const *get() const { return m_ptr.get(); }

Expand All @@ -165,8 +169,8 @@ class HeterogeneousSoAImpl {
};

template <typename T, typename Traits>
HeterogeneousSoAImpl<T, Traits>::HeterogeneousSoAImpl(cudaStream_t stream) {
m_ptr = Traits::template make_unique<T>(stream);
HeterogeneousSoAImpl<T, Traits>::HeterogeneousSoAImpl() {
m_ptr = Traits::template make_unique<T>();
}

// in reality valid only for GPU version...
Expand Down
12 changes: 7 additions & 5 deletions CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,19 @@
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"

SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxClusters, cudaStream_t stream) {
moduleStart_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters + 1, stream);
clusInModule_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
moduleId_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
clusModuleStart_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters + 1, stream);
moduleStart_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters + 1);
clusInModule_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters);
moduleId_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters);
clusModuleStart_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters + 1);

// device-side ownership to guarantee that the host memory is alive
// until the copy finishes
auto view = cms::cuda::make_host_unique<DeviceConstView>(stream);
view->moduleStart_ = moduleStart_d.get();
view->clusInModule_ = clusInModule_d.get();
view->moduleId_ = moduleId_d.get();
view->clusModuleStart_ = clusModuleStart_d.get();

view_d = cms::cuda::make_device_unique<DeviceConstView>(stream);
view_d = cms::cuda::make_device_unique<DeviceConstView>();
cms::cuda::copyAsync(view_d, view, stream);
}
6 changes: 4 additions & 2 deletions CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@

SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cudaStream_t stream)
: formatterErrors_h(std::move(errors)) {
error_d = cms::cuda::make_device_unique<GPU::SimpleVector<PixelErrorCompact>>(stream);
data_d = cms::cuda::make_device_unique<PixelErrorCompact[]>(maxFedWords, stream);
error_d = cms::cuda::make_device_unique<GPU::SimpleVector<PixelErrorCompact>>();
data_d = cms::cuda::make_device_unique<PixelErrorCompact[]>(maxFedWords);

cms::cuda::memsetAsync(data_d, 0x00, maxFedWords, stream);

// device-side ownership to guarantee that the host memory is alive
// until the copy finishes
error_h = cms::cuda::make_host_unique<GPU::SimpleVector<PixelErrorCompact>>(stream);
GPU::make_SimpleVector(error_h.get(), maxFedWords, data_d.get());
assert(error_h->empty());
Expand Down
18 changes: 10 additions & 8 deletions CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,25 @@
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"

SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) {
xx_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream);
yy_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream);
adc_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream);
moduleInd_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream);
clus_d = cms::cuda::make_device_unique<int32_t[]>(maxFedWords, stream);
xx_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords);
yy_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords);
adc_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords);
moduleInd_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords);
clus_d = cms::cuda::make_device_unique<int32_t[]>(maxFedWords);

pdigi_d = cms::cuda::make_device_unique<uint32_t[]>(maxFedWords, stream);
rawIdArr_d = cms::cuda::make_device_unique<uint32_t[]>(maxFedWords, stream);
pdigi_d = cms::cuda::make_device_unique<uint32_t[]>(maxFedWords);
rawIdArr_d = cms::cuda::make_device_unique<uint32_t[]>(maxFedWords);

// device-side ownership to guarantee that the host memory is alive
// until the copy finishes
auto view = cms::cuda::make_host_unique<DeviceConstView>(stream);
view->xx_ = xx_d.get();
view->yy_ = yy_d.get();
view->adc_ = adc_d.get();
view->moduleInd_ = moduleInd_d.get();
view->clus_ = clus_d.get();

view_d = cms::cuda::make_device_unique<DeviceConstView>(stream);
view_d = cms::cuda::make_device_unique<DeviceConstView>();
cms::cuda::copyAsync(view_d, view, stream);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,13 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
uint32_t const* hitsModuleStart,
cudaStream_t stream)
: m_nHits(nHits), m_hitsModuleStart(hitsModuleStart) {
// device-side ownership to guarantee that the host memory is alive
// until the copy finishes
auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);

view->m_nHits = nHits;
m_view = Traits::template make_device_unique<TrackingRecHit2DSOAView>(stream);
m_AverageGeometryStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
m_view = Traits::template make_device_unique<TrackingRecHit2DSOAView>();
m_AverageGeometryStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::AverageGeometry>();
view->m_averageGeometry = m_AverageGeometryStore.get();
view->m_cpeParams = cpeParams;
view->m_hitsModuleStart = hitsModuleStart;
Expand All @@ -101,9 +103,9 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
// if ordering is relevant they may have to be stored phi-ordered by layer or so
// this will break 1to1 correspondence with cluster and module locality
// so unless proven VERY inefficient we keep it ordered as generated
m_store16 = Traits::template make_device_unique<uint16_t[]>(nHits * n16, stream);
m_store32 = Traits::template make_device_unique<float[]>(nHits * n32 + 11, stream);
m_HistStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::Hist>(stream);
m_store16 = Traits::template make_device_unique<uint16_t[]>(nHits * n16);
m_store32 = Traits::template make_device_unique<float[]>(nHits * n32 + 11);
m_HistStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::Hist>();

auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
auto get32 = [&](int i) { return m_store32.get() + i * nHits; };
Expand Down
8 changes: 8 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/allocate_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@
namespace cms {
namespace cuda {
// Allocate device memory
// This variant does not create device-side ownership
void *allocate_device(int dev, size_t nbytes);

// Allocate device memory
// This variant creates device-side ownership. When freed, all work
// in the stream up to the freeing point must be finished for the
// memory block to be considered free (except for new allocation in
// the same stream)
void *allocate_device(int dev, size_t nbytes, cudaStream_t stream);

// Free device memory (to be called from unique_ptr)
Expand Down
8 changes: 8 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/allocate_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@
namespace cms {
namespace cuda {
// Allocate pinned host memory (to be called from unique_ptr)
// This variant does not create device-side ownership
void *allocate_host(size_t nbytes);

// Allocate pinned host memory (to be called from unique_ptr)
// This variant creates device-side ownership. When freed, all work
// in the stream up to the freeing point must be finished for the
// memory block to be considered free (except for new allocation in
// the same stream)
void *allocate_host(size_t nbytes, cudaStream_t stream);

// Free pinned host memory (to be called from unique_ptr)
Expand Down
38 changes: 38 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,16 @@ namespace cms {
} // namespace impl
} // namespace device

// Allocate a single, trivially-constructible T in device memory on the
// current device, without associating a CUDA stream. Per allocate_device(),
// this variant creates no device-side ownership: the memory is considered
// free as soon as the returned unique_ptr is destroyed, so the caller must
// ensure no asynchronous work still uses it at that point.
template <typename T>
typename device::impl::make_device_unique_selector<T>::non_array make_device_unique() {
// device code cannot run constructors for this allocation path
static_assert(std::is_trivially_constructible<T>::value,
"Allocating with non-trivial constructor on the device memory is not supported");
int dev = currentDevice();
void *mem = allocate_device(dev, sizeof(T));
// the deleter remembers the device so the free targets the right one
return typename device::impl::make_device_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
device::impl::DeviceDeleter{dev}};
}

template <typename T>
typename device::impl::make_device_unique_selector<T>::non_array make_device_unique(cudaStream_t stream) {
static_assert(std::is_trivially_constructible<T>::value,
Expand All @@ -58,6 +68,17 @@ namespace cms {
device::impl::DeviceDeleter{dev}};
}

// Allocate an unbounded array T[] of n trivially-constructible elements in
// device memory on the current device, without associating a CUDA stream.
// As with the scalar overload, no device-side ownership is created: the
// block is considered free immediately when the unique_ptr is destroyed.
template <typename T>
typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique(size_t n) {
using element_type = typename std::remove_extent<T>::type;
// device code cannot run constructors for this allocation path
static_assert(std::is_trivially_constructible<element_type>::value,
"Allocating with non-trivial constructor on the device memory is not supported");
int dev = currentDevice();
void *mem = allocate_device(dev, n * sizeof(element_type));
return typename device::impl::make_device_unique_selector<T>::unbounded_array{
reinterpret_cast<element_type *>(mem), device::impl::DeviceDeleter{dev}};
}

template <typename T>
typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique(size_t n,
cudaStream_t stream) {
Expand All @@ -74,6 +95,14 @@ namespace cms {
typename device::impl::make_device_unique_selector<T>::bounded_array make_device_unique(Args &&...) = delete;

// No check for the trivial constructor, make it clear in the interface
// Streamless variant: like make_device_unique<T>(), but deliberately skips
// the trivially-constructible static_assert — the "_uninitialized" name
// signals that the memory is raw and the caller takes responsibility for
// initializing it (e.g. via cudaMemcpyAsync or a kernel).
template <typename T>
typename device::impl::make_device_unique_selector<T>::non_array make_device_unique_uninitialized() {
int dev = currentDevice();
void *mem = allocate_device(dev, sizeof(T));
return typename device::impl::make_device_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
device::impl::DeviceDeleter{dev}};
}

template <typename T>
typename device::impl::make_device_unique_selector<T>::non_array make_device_unique_uninitialized(
cudaStream_t stream) {
Expand All @@ -83,6 +112,15 @@ namespace cms {
device::impl::DeviceDeleter{dev}};
}

// Streamless array variant of make_device_unique_uninitialized: allocates
// n elements of raw device memory on the current device with no
// trivially-constructible check and no device-side ownership. The caller
// must initialize the memory before use and ensure no asynchronous work
// still references it when the unique_ptr is destroyed.
template <typename T>
typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique_uninitialized(size_t n) {
using element_type = typename std::remove_extent<T>::type;
int dev = currentDevice();
void *mem = allocate_device(dev, n * sizeof(element_type));
return typename device::impl::make_device_unique_selector<T>::unbounded_array{
reinterpret_cast<element_type *>(mem), device::impl::DeviceDeleter{dev}};
}

template <typename T>
typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique_uninitialized(
size_t n, cudaStream_t stream) {
Expand Down
Loading