Skip to content

Commit

Permalink
Add infrastructure around cub CachingDeviceAllocator, and use it in SiPixelRawToCluster (cms-sw#172)

Browse files Browse the repository at this point in the history

Add infrastructure around cub CachingDeviceAllocator for device
memory allocations, and CachingHostAllocator for pinned (or managed)
host memory.

CUDAService uses the CachingHostAllocator to allocate requested
GPU->CPU/CPU->GPU buffers and data products.
Configuration options can be used to request:
  - to print all memory (re)allocations and frees;
  - to preallocate device and host buffers.

SiPixelRawToCluster uses the CachingDeviceAllocator for temporary
buffers and data products.

Fix a memory problem with SiPixelFedCablingMapGPUWrapper::ModulesToUnpack.
  • Loading branch information
makortel authored and fwyzard committed Nov 27, 2018
1 parent 24ae7d8 commit fcd3552
Show file tree
Hide file tree
Showing 24 changed files with 1,468 additions and 310 deletions.
16 changes: 16 additions & 0 deletions CUDADataFormats/Common/interface/device_unique_ptr.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#ifndef CUDADataFormats_Common_interface_device_unique_ptr_h
#define CUDADataFormats_Common_interface_device_unique_ptr_h

#include <memory>
#include <functional>

namespace edm {
namespace cuda {
namespace device {
// Owning smart pointer for GPU device memory.
// The deleter is type-erased (std::function<void(void*)>) so that the
// allocator that handed out the memory can supply an arbitrary callable
// to return the buffer; the trade-off is one extra indirection and a
// possible allocation per pointer compared to a stateless deleter type.
template <typename T>
using unique_ptr = std::unique_ptr<T, std::function<void(void *)>>;
}
}
}

#endif
16 changes: 16 additions & 0 deletions CUDADataFormats/Common/interface/host_unique_ptr.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#ifndef CUDADataFormats_Common_interface_host_unique_ptr_h
#define CUDADataFormats_Common_interface_host_unique_ptr_h

#include <memory>
#include <functional>

namespace edm {
namespace cuda {
namespace host {
// Owning smart pointer for pinned (or managed) host memory.
// The deleter is type-erased (std::function<void(void*)>) so that the
// allocator that handed out the memory can supply an arbitrary callable
// to return the buffer instead of a fixed free function.
template <typename T>
using unique_ptr = std::unique_ptr<T, std::function<void(void *)>>;
}
}
}

#endif
8 changes: 8 additions & 0 deletions CUDADataFormats/SiPixelCluster/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<use name="FWCore/ServiceRegistry"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="cuda-api-wrappers"/>

<export>
<lib name="1"/>
</export>

73 changes: 73 additions & 0 deletions CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#ifndef CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h
#define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h

#include <cstdint>

#include <cuda/api_wrappers.h>

#include "CUDADataFormats/Common/interface/device_unique_ptr.h"

// Structure-of-arrays data format for pixel clusters, resident in GPU
// device memory. The arrays are allocated through the CUDAService
// caching allocator (see the corresponding .cc) and owned via
// edm::cuda::device::unique_ptr, so the type is movable but not
// copyable.
class SiPixelClustersCUDA {
public:
SiPixelClustersCUDA() = default;
// Allocates the device arrays on the given CUDA stream:
//  - feds:      number of elements of the per-pixel cluster-id array
//  - nelements: number of modules (per-module arrays; the "+1"-sized
//               arrays are allocated in the .cc)
explicit SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream);
~SiPixelClustersCUDA() = default;

// Device memory is uniquely owned: movable, not copyable.
SiPixelClustersCUDA(const SiPixelClustersCUDA&) = delete;
SiPixelClustersCUDA& operator=(const SiPixelClustersCUDA&) = delete;
SiPixelClustersCUDA(SiPixelClustersCUDA&&) = default;
SiPixelClustersCUDA& operator=(SiPixelClustersCUDA&&) = default;

// Mutable raw device pointers.
uint32_t *moduleStart() { return moduleStart_d.get(); }
int32_t *clus() { return clus_d.get(); }
uint32_t *clusInModule() { return clusInModule_d.get(); }
uint32_t *moduleId() { return moduleId_d.get(); }
uint32_t *clusModuleStart() { return clusModuleStart_d.get(); }

// Read-only raw device pointers (const overloads).
uint32_t const *moduleStart() const { return moduleStart_d.get(); }
int32_t const *clus() const { return clus_d.get(); }
uint32_t const *clusInModule() const { return clusInModule_d.get(); }
uint32_t const *moduleId() const { return moduleId_d.get(); }
uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); }

// Explicitly-const accessors, usable to force the const overload on a
// non-const object.
uint32_t const *c_moduleStart() const { return moduleStart_d.get(); }
int32_t const *c_clus() const { return clus_d.get(); }
uint32_t const *c_clusInModule() const { return clusInModule_d.get(); }
uint32_t const *c_moduleId() const { return moduleId_d.get(); }
uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); }

// Lightweight, trivially-copyable view holding the raw device pointers;
// intended to be dereferenced in device code. When compiled with nvcc
// the accessors read through __ldg (read-only data cache).
class DeviceConstView {
public:
DeviceConstView() = default;

#ifdef __CUDACC__
__device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_+i); }
__device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_+i); }
__device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_+i); }
__device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_+i); }
__device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_+i); }
#endif

// The owning class fills the pointers in its constructor.
// (Use the "friend class" form for consistency with SiPixelDigisCUDA.)
friend class SiPixelClustersCUDA;

private:
uint32_t const *moduleStart_ = nullptr;
int32_t const *clus_ = nullptr;
uint32_t const *clusInModule_ = nullptr;
uint32_t const *moduleId_ = nullptr;
uint32_t const *clusModuleStart_ = nullptr;
};

// Device pointer to the device-resident copy of the view.
// NOTE(review): returns a pointer-to-non-const from a const method;
// consider DeviceConstView const * (as in SiPixelDigisCUDA) for
// const-correctness — confirm no caller relies on the non-const type.
DeviceConstView *view() const { return view_d.get(); }

private:
edm::cuda::device::unique_ptr<uint32_t[]> moduleStart_d; // index of the first pixel of each module
edm::cuda::device::unique_ptr<int32_t[]> clus_d; // cluster id of each pixel
edm::cuda::device::unique_ptr<uint32_t[]> clusInModule_d; // number of clusters found in each module
edm::cuda::device::unique_ptr<uint32_t[]> moduleId_d; // module id of each module

// originally from rechits
edm::cuda::device::unique_ptr<uint32_t[]> clusModuleStart_d;

edm::cuda::device::unique_ptr<DeviceConstView> view_d; // "me" pointer
};

#endif
24 changes: 24 additions & 0 deletions CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"

#include "FWCore/ServiceRegistry/interface/Service.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"

// Allocates all device arrays on the given CUDA stream via the
// CUDAService caching allocators, fills a pinned-host staging copy of
// the DeviceConstView with the raw device pointers, and schedules an
// asynchronous host-to-device copy of the view on the same stream.
SiPixelClustersCUDA::SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream) {
  edm::Service<CUDAService> cs;

  // Per-module arrays are sized "nelements"; the *ModuleStart arrays get
  // one extra element; the per-pixel cluster-id array is sized "feds".
  moduleStart_d = cs->make_device_unique<uint32_t[]>(nelements+1, stream);
  clus_d = cs->make_device_unique< int32_t[]>(feds, stream);
  clusInModule_d = cs->make_device_unique<uint32_t[]>(nelements, stream);
  moduleId_d = cs->make_device_unique<uint32_t[]>(nelements, stream);
  clusModuleStart_d = cs->make_device_unique<uint32_t[]>(nelements+1, stream);

  // Stage the view in pinned host memory so the copy below can be async.
  auto view = cs->make_host_unique<DeviceConstView>(stream);
  view->moduleStart_ = moduleStart_d.get();
  view->clus_ = clus_d.get();
  view->clusInModule_ = clusInModule_d.get();
  view->moduleId_ = moduleId_d.get();
  view->clusModuleStart_ = clusModuleStart_d.get();

  view_d = cs->make_device_unique<DeviceConstView>(stream);
  // NOTE(review): the return status of cudaMemcpyAsync is not checked.
  // NOTE(review): `view` (the host staging buffer) is destroyed when the
  // constructor returns, possibly before the async copy completes —
  // presumably safe because the caching host allocator defers reuse of
  // the buffer until the stream's work is done; confirm.
  cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id());
}
7 changes: 7 additions & 0 deletions CUDADataFormats/SiPixelDigi/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<use name="FWCore/ServiceRegistry"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="cuda-api-wrappers"/>

<export>
<lib name="1"/>
</export>
65 changes: 65 additions & 0 deletions CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h
#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h

#include "CUDADataFormats/Common/interface/device_unique_ptr.h"
#include "FWCore/Utilities/interface/propagate_const.h"

#include <cuda/api_wrappers.h>

// Structure-of-arrays data format for pixel digis, resident in GPU
// device memory. Arrays are allocated through the CUDAService caching
// allocator (see the corresponding .cc) and owned via
// edm::cuda::device::unique_ptr, so the type is movable but not
// copyable.
class SiPixelDigisCUDA {
public:
SiPixelDigisCUDA() = default;
// Allocates the device arrays ("nelements" digis) on the given stream.
explicit SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream);
~SiPixelDigisCUDA() = default;

// Device memory is uniquely owned: movable, not copyable.
SiPixelDigisCUDA(const SiPixelDigisCUDA&) = delete;
SiPixelDigisCUDA& operator=(const SiPixelDigisCUDA&) = delete;
SiPixelDigisCUDA(SiPixelDigisCUDA&&) = default;
SiPixelDigisCUDA& operator=(SiPixelDigisCUDA&&) = default;

// Mutable raw device pointers.
uint16_t * xx() { return xx_d.get(); }
uint16_t * yy() { return yy_d.get(); }
uint16_t * adc() { return adc_d.get(); }
uint16_t * moduleInd() { return moduleInd_d.get(); }

// Read-only raw device pointers (const overloads).
uint16_t const *xx() const { return xx_d.get(); }
uint16_t const *yy() const { return yy_d.get(); }
uint16_t const *adc() const { return adc_d.get(); }
uint16_t const *moduleInd() const { return moduleInd_d.get(); }

// Explicitly-const accessors, usable to force the const overload on a
// non-const object.
uint16_t const *c_xx() const { return xx_d.get(); }
uint16_t const *c_yy() const { return yy_d.get(); }
uint16_t const *c_adc() const { return adc_d.get(); }
uint16_t const *c_moduleInd() const { return moduleInd_d.get(); }

// Lightweight, trivially-copyable view holding the raw device pointers;
// intended to be dereferenced in device code. When compiled with nvcc
// the accessors read through __ldg (read-only data cache).
class DeviceConstView {
public:
DeviceConstView() = default;

#ifdef __CUDACC__
__device__ __forceinline__ uint16_t xx(int i) const { return __ldg(xx_+i); }
__device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_+i); }
__device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_+i); }
__device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_+i); }
#endif

// The owning class fills the pointers in its constructor.
friend class SiPixelDigisCUDA;

private:
uint16_t const *xx_ = nullptr;
uint16_t const *yy_ = nullptr;
uint16_t const *adc_ = nullptr;
uint16_t const *moduleInd_ = nullptr;
};

// Device pointer to the device-resident copy of the view.
const DeviceConstView *view() const { return view_d.get(); }

private:
edm::cuda::device::unique_ptr<uint16_t[]> xx_d; // local coordinates of each pixel
edm::cuda::device::unique_ptr<uint16_t[]> yy_d; //
edm::cuda::device::unique_ptr<uint16_t[]> adc_d; // ADC of each pixel
edm::cuda::device::unique_ptr<uint16_t[]> moduleInd_d; // module id of each pixel
edm::cuda::device::unique_ptr<DeviceConstView> view_d; // "me" pointer
};

#endif
24 changes: 24 additions & 0 deletions CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"

#include "FWCore/ServiceRegistry/interface/Service.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"

#include <cuda_runtime.h>

// Allocates the per-digi device arrays on the given CUDA stream via the
// CUDAService caching allocators, fills a pinned-host staging copy of
// the DeviceConstView with the raw device pointers, and schedules an
// asynchronous host-to-device copy of the view on the same stream.
SiPixelDigisCUDA::SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream) {
  edm::Service<CUDAService> cs;

  xx_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
  yy_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
  adc_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
  moduleInd_d = cs->make_device_unique<uint16_t[]>(nelements, stream);

  // Stage the view in pinned host memory so the copy below can be async.
  auto view = cs->make_host_unique<DeviceConstView>(stream);
  view->xx_ = xx_d.get();
  view->yy_ = yy_d.get();
  view->adc_ = adc_d.get();
  view->moduleInd_ = moduleInd_d.get();

  view_d = cs->make_device_unique<DeviceConstView>(stream);
  // NOTE(review): the return status of cudaMemcpyAsync is not checked.
  // NOTE(review): `view` (the host staging buffer) is destroyed when the
  // constructor returns, possibly before the async copy completes —
  // presumably safe because the caching host allocator defers reuse of
  // the buffer until the stream's work is done; confirm.
  cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id());
}
16 changes: 10 additions & 6 deletions HeterogeneousCore/CUDACore/src/GPUCuda.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,19 @@ namespace heterogeneous {
waitingTaskHolder, // copy needed for the catch block
locationSetter = iEvent.locationSetter()
](cuda::stream::id_t streamId, cuda::status_t status) mutable {
if(status == cudaSuccess) {
if (status == cudaSuccess) {
locationSetter(HeterogeneousDeviceId(HeterogeneousDevice::kGPUCuda, deviceId));
LogTrace("GPUCuda") << " GPU kernel finished (in callback) device " << deviceId << " CUDA stream " << streamId;
waitingTaskHolder.doneWaiting(nullptr);
}
else {
auto error = cudaGetErrorName(status);
auto message = cudaGetErrorString(status);
waitingTaskHolder.doneWaiting(std::make_exception_ptr(cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << deviceId << " error " << error << ": " << message));
} else {
// wrap the exception in a try-catch block to let GDB "catch throw" break on it
try {
auto error = cudaGetErrorName(status);
auto message = cudaGetErrorString(status);
throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << deviceId << " error " << error << ": " << message;
} catch(...) {
waitingTaskHolder.doneWaiting(std::current_exception());
}
}
});
} catch(...) {
Expand Down
2 changes: 2 additions & 0 deletions HeterogeneousCore/CUDAServices/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/MessageLogger"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="cuda"/>
<use name="cuda-api-wrappers"/>
<use name="cub"/>

<export>
<lib name="1"/>
Expand Down
87 changes: 87 additions & 0 deletions HeterogeneousCore/CUDAServices/interface/CUDAService.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,35 @@
#include <utility>
#include <vector>

#include <cuda/api_wrappers.h>

#include "FWCore/Utilities/interface/StreamID.h"

#include "CUDADataFormats/Common/interface/device_unique_ptr.h"
#include "CUDADataFormats/Common/interface/host_unique_ptr.h"

namespace edm {
class ParameterSet;
class ActivityRegistry;
class ConfigurationDescriptions;
}

// Return-type selectors mirroring the std::make_unique overload set:
//  - T   (non-array)      -> single-object unique_ptr ("non_array")
//  - T[] (unbounded array)-> array unique_ptr ("unbounded_array")
//  - T[N](bounded array)  -> dummy "bounded_array" type, used only to
//                            declare the corresponding overload deleted
namespace cudaserviceimpl {
  template <typename T>
  struct make_device_unique_selector { using non_array = edm::cuda::device::unique_ptr<T>; };
  template <typename T>
  struct make_device_unique_selector<T[]> { using unbounded_array = edm::cuda::device::unique_ptr<T[]>; };
  template <typename T, size_t N>
  struct make_device_unique_selector<T[N]> { struct bounded_array {}; };

  template <typename T>
  struct make_host_unique_selector { using non_array = edm::cuda::host::unique_ptr<T>; };
  template <typename T>
  struct make_host_unique_selector<T[]> { using unbounded_array = edm::cuda::host::unique_ptr<T[]>; };
  template <typename T, size_t N>
  struct make_host_unique_selector<T[N]> { struct bounded_array {}; };
}

/**
* TODO:
* - CUDA stream management?
Expand Down Expand Up @@ -47,7 +68,73 @@ class CUDAService {
// Get the current device
int getCurrentDevice() const;

// Allocate device memory
//
// Single-object overload: allocates sizeof(T) bytes of device memory on
// the current device, associated with the given stream, and wraps it in
// a unique_ptr whose deleter hands the memory back via free_device().
// Note: no constructor of T is run on the raw memory, and no destructor
// is run on free — T should be trivially constructible/destructible.
// NOTE(review): the deleter captures `this`, so returned pointers must
// not outlive the CUDAService.
template <typename T>
typename cudaserviceimpl::make_device_unique_selector<T>::non_array
make_device_unique(cuda::stream_t<>& stream) {
  int dev = getCurrentDevice();
  void *mem = allocate_device(dev, sizeof(T), stream);
  return typename cudaserviceimpl::make_device_unique_selector<T>::non_array(reinterpret_cast<T *>(mem),
                                                                             [this, dev](void *ptr) {
                                                                               this->free_device(dev, ptr);
                                                                             });
}

// Unbounded-array overload (T = U[]): allocates n*sizeof(U) bytes; same
// ownership and lifetime caveats as the single-object overload.
template <typename T>
typename cudaserviceimpl::make_device_unique_selector<T>::unbounded_array
make_device_unique(size_t n, cuda::stream_t<>& stream) {
  int dev = getCurrentDevice();
  using element_type = typename std::remove_extent<T>::type;
  void *mem = allocate_device(dev, n*sizeof(element_type), stream);
  return typename cudaserviceimpl::make_device_unique_selector<T>::unbounded_array(reinterpret_cast<element_type *>(mem),
                                                                                   [this, dev](void *ptr) {
                                                                                     this->free_device(dev, ptr);
                                                                                   });
}

// Bounded arrays (T[N]) are not supported, matching std::make_unique.
template <typename T, typename ...Args>
typename cudaserviceimpl::make_device_unique_selector<T>::bounded_array
make_device_unique(Args&&...) = delete;

// Allocate pinned host memory
//
// Single-object overload: allocates sizeof(T) bytes of pinned host
// memory, associated with the given stream, and wraps it in a
// unique_ptr whose deleter hands the memory back via free_host().
// Note: no constructor of T is run on the raw memory, and no destructor
// is run on free — T should be trivially constructible/destructible.
// NOTE(review): the deleter captures `this`, so returned pointers must
// not outlive the CUDAService.
template <typename T>
typename cudaserviceimpl::make_host_unique_selector<T>::non_array
make_host_unique(cuda::stream_t<>& stream) {
  void *mem = allocate_host(sizeof(T), stream);
  return typename cudaserviceimpl::make_host_unique_selector<T>::non_array(reinterpret_cast<T *>(mem),
                                                                           [this](void *ptr) {
                                                                             this->free_host(ptr);
                                                                           });
}

// Unbounded-array overload (T = U[]): allocates n*sizeof(U) bytes; same
// ownership and lifetime caveats as the single-object overload.
template <typename T>
typename cudaserviceimpl::make_host_unique_selector<T>::unbounded_array
make_host_unique(size_t n, cuda::stream_t<>& stream) {
  using element_type = typename std::remove_extent<T>::type;
  void *mem = allocate_host(n*sizeof(element_type), stream);
  return typename cudaserviceimpl::make_host_unique_selector<T>::unbounded_array(reinterpret_cast<element_type *>(mem),
                                                                                 [this](void *ptr) {
                                                                                   this->free_host(ptr);
                                                                                 });
}

// Bounded arrays (T[N]) are not supported, matching std::make_unique.
template <typename T, typename ...Args>
typename cudaserviceimpl::make_host_unique_selector<T>::bounded_array
make_host_unique(Args&&...) = delete;

// Free device memory (to be called from unique_ptr)
void free_device(int device, void *ptr);

// Free pinned host memory (to be called from unique_ptr)
void free_host(void *ptr);

private:
// PIMPL to hide details of allocator
struct Allocator;
std::unique_ptr<Allocator> allocator_;
// Raw allocation entry points backing make_device_unique/make_host_unique.
void *allocate_device(int dev, size_t nbytes, cuda::stream_t<>& stream);
void *allocate_host(size_t nbytes, cuda::stream_t<>& stream);

int numberOfDevices_ = 0;  // presumably the number of usable CUDA devices; initialized elsewhere (not visible here)
unsigned int numberOfStreamsTotal_ = 0;  // set elsewhere in CUDAService — confirm semantics against the .cc
std::vector<std::pair<int, int>> computeCapabilities_;  // looks like (major, minor) per device — confirm against the .cc
Expand Down
Loading

0 comments on commit fcd3552

Please sign in to comment.