Skip to content

Commit

Permalink
Reduce GPU memory usage (#509)
Browse files Browse the repository at this point in the history
Adjust the growth factor in the caching allocators to use more granular bins, reducing the memory wasted by the allocations.

Use a dynamic buffer for CA cells components.

Fix a possible data race in the prefix scan.
  • Loading branch information
VinInn authored and fwyzard committed Jul 15, 2020
1 parent 3f09c90 commit 11c3c16
Show file tree
Hide file tree
Showing 14 changed files with 156 additions and 50 deletions.
7 changes: 7 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ namespace cms {
extern thread_local dim3 blockIdx;
extern thread_local dim3 gridDim;

template <typename T1, typename T2>
T1 atomicCAS(T1* address, T1 compare, T2 val) {
T1 old = *address;
*address = old == compare ? val : old;
return old;
}

template <typename T1, typename T2>
T1 atomicInc(T1* a, T2 b) {
auto ret = *a;
Expand Down
11 changes: 7 additions & 4 deletions HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ namespace cms {
namespace cuda {

// limited to 32*32 elements....
template <typename T>
__host__ __device__ __forceinline__ void blockPrefixScan(T const* __restrict__ ci,
T* __restrict__ co,
template <typename VT, typename T>
__host__ __device__ __forceinline__ void blockPrefixScan(VT const* ci,
VT* co,
uint32_t size,
T* ws
#ifndef __CUDA_ARCH__
Expand Down Expand Up @@ -138,7 +138,9 @@ namespace cms {

// in principle not limited....
template <typename T>
__global__ void multiBlockPrefixScan(T const* ci, T* co, int32_t size, int32_t* pc) {
__global__ void multiBlockPrefixScan(T const* ici, T* ico, int32_t size, int32_t* pc) {
volatile T const* ci = ici;
volatile T* co = ico;
__shared__ T ws[32];
#ifdef __CUDA_ARCH__
assert(sizeof(T) * gridDim.x <= dynamic_smem_size()); // size of psum below
Expand All @@ -152,6 +154,7 @@ namespace cms {
// count blocks that finished
__shared__ bool isLastBlockDone;
if (0 == threadIdx.x) {
__threadfence();
auto value = atomicAdd(pc, 1); // block counter
isLastBlockDone = (value == (int(gridDim.x) - 1));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ namespace cms::cuda::allocator {
// Use caching or not
constexpr bool useCaching = true;
// Growth factor (bin_growth in cub::CachingDeviceAllocator
constexpr unsigned int binGrowth = 8;
constexpr unsigned int binGrowth = 2;
// Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator
constexpr unsigned int minBin = 1;
constexpr unsigned int minBin = 8;
// Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
constexpr unsigned int maxBin = 10;
constexpr unsigned int maxBin = 30;
// Total storage for the allocator. 0 means no limit.
constexpr size_t maxCachedBytes = 0;
// Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken.
Expand Down
6 changes: 3 additions & 3 deletions RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ namespace CAConstants {
constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); }
#ifndef ONLY_PHICUT
#ifndef GPU_SMALL_EVENTS
constexpr uint32_t maxNumberOfDoublets() { return 448 * 1024; }
constexpr uint32_t maxNumberOfDoublets() { return 512 * 1024; }
constexpr uint32_t maxCellsPerHit() { return 128; }
#else
constexpr uint32_t maxNumberOfDoublets() { return 128 * 1024; }
Expand All @@ -37,7 +37,7 @@ namespace CAConstants {
constexpr uint32_t maxNumberOfDoublets() { return 2 * 1024 * 1024; }
constexpr uint32_t maxCellsPerHit() { return 8 * 128; }
#endif
constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 4; }
constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 8; }

constexpr uint32_t maxNumberOfLayerPairs() { return 20; }
constexpr uint32_t maxNumberOfLayers() { return 10; }
Expand All @@ -49,7 +49,7 @@ namespace CAConstants {

#ifndef ONLY_PHICUT
using CellNeighbors = cms::cuda::VecArray<uint32_t, 36>;
using CellTracks = cms::cuda::VecArray<tindex_type, 42>;
using CellTracks = cms::cuda::VecArray<tindex_type, 48>;
#else
using CellNeighbors = cms::cuda::VecArray<uint32_t, 64>;
using CellTracks = cms::cuda::VecArray<tindex_type, 64>;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,20 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
device_isOuterHitOfCell_.reset(
(GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell)));
assert(device_isOuterHitOfCell_.get());

cellStorage_.reset((unsigned char *)malloc(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) +
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks)));
device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get();
device_theCellTracksContainer_ =
(GPUCACell::CellTracks *)(cellStorage_.get() +
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors));

gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(),
nhits,
device_theCellNeighbors_,
device_theCellNeighborsContainer_.get(),
device_theCellTracks_,
device_theCellTracksContainer_.get());
device_theCellNeighbors_.get(),
device_theCellNeighborsContainer_,
device_theCellTracks_.get(),
device_theCellTracksContainer_);

// device_theCells_ = Traits:: template make_unique<GPUCACell[]>(cs, m_params.maxNumberOfDoublets_, stream);
device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * m_params.maxNumberOfDoublets_));
Expand All @@ -47,8 +55,8 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
assert(nActualPairs <= gpuPixelDoublets::nPairs);
gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellTracks_,
device_theCellNeighbors_.get(),
device_theCellTracks_.get(),
hh.view(),
device_isOuterHitOfCell_.get(),
nActualPairs,
Expand Down Expand Up @@ -84,7 +92,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
hh.view(),
device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellNeighbors_.get(),
device_isOuterHitOfCell_.get(),
m_params.hardCurvCut_,
m_params.ptmin_,
Expand All @@ -101,7 +109,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
kernel_find_ntuplets(hh.view(),
device_theCells_.get(),
device_nCells_,
device_theCellTracks_,
device_theCellTracks_.get(),
tuples_d,
device_hitTuple_apc_,
quality_d,
Expand Down Expand Up @@ -129,8 +137,8 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
device_hitTuple_apc_,
device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellTracks_,
device_theCellNeighbors_.get(),
device_theCellTracks_.get(),
device_isOuterHitOfCell_.get(),
nhits,
m_params.maxNumberOfDoublets_,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
hh.view(),
device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellNeighbors_.get(),
device_isOuterHitOfCell_.get(),
m_params.hardCurvCut_,
m_params.ptmin_,
Expand All @@ -78,7 +78,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
kernel_find_ntuplets<<<numberOfBlocks, blockSize, 0, cudaStream>>>(hh.view(),
device_theCells_.get(),
device_nCells_,
device_theCellTracks_,
device_theCellTracks_.get(),
tuples_d,
device_hitTuple_apc_,
quality_d,
Expand Down Expand Up @@ -132,8 +132,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
device_hitTuple_apc_,
device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellTracks_,
device_theCellNeighbors_.get(),
device_theCellTracks_.get(),
device_isOuterHitOfCell_.get(),
nhits,
m_params.maxNumberOfDoublets_,
Expand All @@ -144,6 +144,9 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
cudaDeviceSynchronize();
cudaCheck(cudaGetLastError());
#endif

// free space asap
// device_isOuterHitOfCell_.reset();
}

template <>
Expand All @@ -162,16 +165,26 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
// in principle we can use "nhits" to heuristically dimension the workspace...
device_isOuterHitOfCell_ = cms::cuda::make_device_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits), stream);
assert(device_isOuterHitOfCell_.get());

cellStorage_ = cms::cuda::make_device_unique<unsigned char[]>(
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) +
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks),
stream);
device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get();
device_theCellTracksContainer_ =
(GPUCACell::CellTracks *)(cellStorage_.get() +
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors));

{
int threadsPerBlock = 128;
// at least one block!
int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock;
gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, stream>>>(device_isOuterHitOfCell_.get(),
nhits,
device_theCellNeighbors_,
device_theCellNeighborsContainer_.get(),
device_theCellTracks_,
device_theCellTracksContainer_.get());
device_theCellNeighbors_.get(),
device_theCellNeighborsContainer_,
device_theCellTracks_.get(),
device_theCellTracksContainer_);
cudaCheck(cudaGetLastError());
}

Expand Down Expand Up @@ -201,8 +214,8 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
dim3 thrs(stride, threadsPerBlock, 1);
gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream>>>(device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellTracks_,
device_theCellNeighbors_.get(),
device_theCellTracks_.get(),
hh.view(),
device_isOuterHitOfCell_.get(),
nActualPairs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,10 +179,11 @@ class CAHitNtupletGeneratorKernels {

private:
// workspace
CAConstants::CellNeighborsVector* device_theCellNeighbors_ = nullptr;
unique_ptr<CAConstants::CellNeighbors[]> device_theCellNeighborsContainer_;
CAConstants::CellTracksVector* device_theCellTracks_ = nullptr;
unique_ptr<CAConstants::CellTracks[]> device_theCellTracksContainer_;
unique_ptr<unsigned char[]> cellStorage_;
unique_ptr<CAConstants::CellNeighborsVector> device_theCellNeighbors_;
CAConstants::CellNeighbors* device_theCellNeighborsContainer_;
unique_ptr<CAConstants::CellTracksVector> device_theCellTracks_;
CAConstants::CellTracks* device_theCellTracksContainer_;

unique_ptr<GPUCACell[]> device_theCells_;
unique_ptr<GPUCACell::OuterHitOfCell[]> device_isOuterHitOfCell_;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,8 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) {
// ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER)
//////////////////////////////////////////////////////////

/* not used at the moment
cudaCheck(cudaMalloc(&device_theCellNeighbors_, sizeof(CAConstants::CellNeighborsVector)));
cudaCheck(cudaMemset(device_theCellNeighbors_, 0, sizeof(CAConstants::CellNeighborsVector)));
cudaCheck(cudaMalloc(&device_theCellTracks_, sizeof(CAConstants::CellTracksVector)));
cudaCheck(cudaMemset(device_theCellTracks_, 0, sizeof(CAConstants::CellTracksVector)));
*/
device_theCellNeighbors_ = Traits::template make_unique<CAConstants::CellNeighborsVector>(stream);
device_theCellTracks_ = Traits::template make_unique<CAConstants::CellTracksVector>(stream);

device_hitToTuple_ = Traits::template make_unique<HitToTuple>(stream);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets,
printf("Tuples overflow\n");
if (*nCells >= maxNumberOfDoublets)
printf("Cells overflow\n");
if (cellNeighbors && cellNeighbors->full())
printf("cellNeighbors overflow\n");
if (cellTracks && cellTracks->full())
printf("cellTracks overflow\n");
}

for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,14 +168,15 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH

CAHitNtupletGeneratorKernelsGPU kernels(m_params);
kernels.counters_ = m_counters;
HelixFitOnGPU fitter(bfield, m_params.fit5as4_);

kernels.allocateOnGPU(stream);
fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa);

kernels.buildDoublets(hits_d, stream);
kernels.launchKernels(hits_d, soa, stream);
kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available"

HelixFitOnGPU fitter(bfield, m_params.fit5as4_);
fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa);
if (m_params.useRiemannFit_) {
fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream);
} else {
Expand Down
48 changes: 40 additions & 8 deletions RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,24 +56,56 @@ class GPUCACell {
theInnerZ = hh.zGlobal(innerHitId);
theInnerR = hh.rGlobal(innerHitId);

outerNeighbors().reset();
tracks().reset();
// link to default empty
theOuterNeighbors = &cellNeighbors[0];
theTracks = &cellTracks[0];
assert(outerNeighbors().empty());
assert(tracks().empty());
}

__device__ __forceinline__ int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector& cellNeighbors) {
// use smart cache
if (outerNeighbors().empty()) {
auto i = cellNeighbors.extend(); // maybe waisted....
if (i > 0) {
cellNeighbors[i].reset();
#ifdef __CUDACC__
auto zero = (ptrAsInt)(&cellNeighbors[0]);
atomicCAS((ptrAsInt*)(&theOuterNeighbors),
zero,
(ptrAsInt)(&cellNeighbors[i])); // if fails we cannot give "i" back...
#else
theOuterNeighbors = &cellNeighbors[i];
#endif
} else
return -1;
}
__threadfence();
return outerNeighbors().push_back(t);
}

__device__ __forceinline__ int addTrack(CellTracks::value_t t, CellTracksVector& cellTracks) {
if (tracks().empty()) {
auto i = cellTracks.extend(); // maybe waisted....
if (i > 0) {
cellTracks[i].reset();
#ifdef __CUDACC__
auto zero = (ptrAsInt)(&cellTracks[0]);
atomicCAS((ptrAsInt*)(&theTracks), zero, (ptrAsInt)(&cellTracks[i])); // if fails we cannot give "i" back...
#else
theTracks = &cellTracks[i];
#endif
} else
return -1;
}
__threadfence();
return tracks().push_back(t);
}

__device__ __forceinline__ CellTracks& tracks() { return theTracks; }
__device__ __forceinline__ CellTracks const& tracks() const { return theTracks; }
__device__ __forceinline__ CellNeighbors& outerNeighbors() { return theOuterNeighbors; }
__device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return theOuterNeighbors; }
__device__ __forceinline__ CellTracks& tracks() { return *theTracks; }
__device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; }
__device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; }
__device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; }
__device__ __forceinline__ float get_inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); }
__device__ __forceinline__ float get_outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); }
__device__ __forceinline__ float get_inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); }
Expand Down Expand Up @@ -297,8 +329,8 @@ class GPUCACell {
}

private:
CellNeighbors theOuterNeighbors;
CellTracks theTracks;
CellNeighbors* theOuterNeighbors;
CellTracks* theTracks;

public:
int32_t theDoubletId;
Expand Down
11 changes: 11 additions & 0 deletions RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,17 @@ namespace gpuPixelDoublets {
int first = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = first; i < nHits; i += gridDim.x * blockDim.x)
isOuterHitOfCell[i].reset();

if (0 == first) {
cellNeighbors->construct(CAConstants::maxNumOfActiveDoublets(), cellNeighborsContainer);
cellTracks->construct(CAConstants::maxNumOfActiveDoublets(), cellTracksContainer);
auto i = cellNeighbors->extend();
assert(0 == i);
(*cellNeighbors)[0].reset();
i = cellTracks->extend();
assert(0 == i);
(*cellTracks)[0].reset();
}
}

constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y
Expand Down
5 changes: 5 additions & 0 deletions RecoPixelVertexing/PixelTriplets/test/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,8 @@

<bin file="CircleEq_t.cpp">
</bin>

<bin file="CAsizes_t.cpp">
<use name="cuda"/>
<use name="eigen"/>
</bin>
Loading

0 comments on commit 11c3c16

Please sign in to comment.