Skip to content

Commit

Permalink
Reduce GPU memory usage (#509)
Browse files Browse the repository at this point in the history
Adjust the growth factor in the caching allocators to use more granular bins, reducing the memory wasted by the allocations.

Use a dynamic buffer for CA cells components.

Fix a possible data race in the prefix scan.
  • Loading branch information
VinInn authored and fwyzard committed Nov 16, 2020
1 parent 0f18fd0 commit 3f1677a
Show file tree
Hide file tree
Showing 11 changed files with 139 additions and 43 deletions.
6 changes: 3 additions & 3 deletions RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ namespace CAConstants {
constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); }
#ifndef ONLY_PHICUT
#ifndef GPU_SMALL_EVENTS
constexpr uint32_t maxNumberOfDoublets() { return 448 * 1024; }
constexpr uint32_t maxNumberOfDoublets() { return 512 * 1024; }
constexpr uint32_t maxCellsPerHit() { return 128; }
#else
constexpr uint32_t maxNumberOfDoublets() { return 128 * 1024; }
Expand All @@ -37,7 +37,7 @@ namespace CAConstants {
constexpr uint32_t maxNumberOfDoublets() { return 2 * 1024 * 1024; }
constexpr uint32_t maxCellsPerHit() { return 8 * 128; }
#endif
constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 4; }
constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 8; }

constexpr uint32_t maxNumberOfLayerPairs() { return 20; }
constexpr uint32_t maxNumberOfLayers() { return 10; }
Expand All @@ -49,7 +49,7 @@ namespace CAConstants {

#ifndef ONLY_PHICUT
using CellNeighbors = cms::cuda::VecArray<uint32_t, 36>;
using CellTracks = cms::cuda::VecArray<tindex_type, 42>;
using CellTracks = cms::cuda::VecArray<tindex_type, 48>;
#else
using CellNeighbors = cms::cuda::VecArray<uint32_t, 64>;
using CellTracks = cms::cuda::VecArray<tindex_type, 64>;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,20 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
device_isOuterHitOfCell_.reset(
(GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell)));
assert(device_isOuterHitOfCell_.get());

cellStorage_.reset((unsigned char *)malloc(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) +
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks)));
device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get();
device_theCellTracksContainer_ =
(GPUCACell::CellTracks *)(cellStorage_.get() +
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors));

gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(),
nhits,
device_theCellNeighbors_,
device_theCellNeighborsContainer_.get(),
device_theCellTracks_,
device_theCellTracksContainer_.get());
device_theCellNeighbors_.get(),
device_theCellNeighborsContainer_,
device_theCellTracks_.get(),
device_theCellTracksContainer_);

// device_theCells_ = Traits:: template make_unique<GPUCACell[]>(cs, m_params.maxNumberOfDoublets_, stream);
device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * m_params.maxNumberOfDoublets_));
Expand All @@ -47,8 +55,8 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
assert(nActualPairs <= gpuPixelDoublets::nPairs);
gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellTracks_,
device_theCellNeighbors_.get(),
device_theCellTracks_.get(),
hh.view(),
device_isOuterHitOfCell_.get(),
nActualPairs,
Expand Down Expand Up @@ -84,7 +92,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
hh.view(),
device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellNeighbors_.get(),
device_isOuterHitOfCell_.get(),
m_params.hardCurvCut_,
m_params.ptmin_,
Expand All @@ -101,7 +109,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
kernel_find_ntuplets(hh.view(),
device_theCells_.get(),
device_nCells_,
device_theCellTracks_,
device_theCellTracks_.get(),
tuples_d,
device_hitTuple_apc_,
quality_d,
Expand Down Expand Up @@ -129,8 +137,8 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
device_hitTuple_apc_,
device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellTracks_,
device_theCellNeighbors_.get(),
device_theCellTracks_.get(),
device_isOuterHitOfCell_.get(),
nhits,
m_params.maxNumberOfDoublets_,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
hh.view(),
device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellNeighbors_.get(),
device_isOuterHitOfCell_.get(),
m_params.hardCurvCut_,
m_params.ptmin_,
Expand All @@ -78,7 +78,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
kernel_find_ntuplets<<<numberOfBlocks, blockSize, 0, cudaStream>>>(hh.view(),
device_theCells_.get(),
device_nCells_,
device_theCellTracks_,
device_theCellTracks_.get(),
tuples_d,
device_hitTuple_apc_,
quality_d,
Expand Down Expand Up @@ -132,8 +132,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
device_hitTuple_apc_,
device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellTracks_,
device_theCellNeighbors_.get(),
device_theCellTracks_.get(),
device_isOuterHitOfCell_.get(),
nhits,
m_params.maxNumberOfDoublets_,
Expand All @@ -144,6 +144,9 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
cudaDeviceSynchronize();
cudaCheck(cudaGetLastError());
#endif

// free space asap
// device_isOuterHitOfCell_.reset();
}

template <>
Expand All @@ -162,16 +165,26 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
// in principle we can use "nhits" to heuristically dimension the workspace...
device_isOuterHitOfCell_ = cms::cuda::make_device_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits), stream);
assert(device_isOuterHitOfCell_.get());

cellStorage_ = cms::cuda::make_device_unique<unsigned char[]>(
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) +
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks),
stream);
device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get();
device_theCellTracksContainer_ =
(GPUCACell::CellTracks *)(cellStorage_.get() +
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors));

{
int threadsPerBlock = 128;
// at least one block!
int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock;
gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, stream>>>(device_isOuterHitOfCell_.get(),
nhits,
device_theCellNeighbors_,
device_theCellNeighborsContainer_.get(),
device_theCellTracks_,
device_theCellTracksContainer_.get());
device_theCellNeighbors_.get(),
device_theCellNeighborsContainer_,
device_theCellTracks_.get(),
device_theCellTracksContainer_);
cudaCheck(cudaGetLastError());
}

Expand Down Expand Up @@ -201,8 +214,8 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
dim3 thrs(stride, threadsPerBlock, 1);
gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream>>>(device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellTracks_,
device_theCellNeighbors_.get(),
device_theCellTracks_.get(),
hh.view(),
device_isOuterHitOfCell_.get(),
nActualPairs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,10 +179,11 @@ class CAHitNtupletGeneratorKernels {

private:
// workspace
CAConstants::CellNeighborsVector* device_theCellNeighbors_ = nullptr;
unique_ptr<CAConstants::CellNeighbors[]> device_theCellNeighborsContainer_;
CAConstants::CellTracksVector* device_theCellTracks_ = nullptr;
unique_ptr<CAConstants::CellTracks[]> device_theCellTracksContainer_;
unique_ptr<unsigned char[]> cellStorage_;
unique_ptr<CAConstants::CellNeighborsVector> device_theCellNeighbors_;
CAConstants::CellNeighbors* device_theCellNeighborsContainer_;
unique_ptr<CAConstants::CellTracksVector> device_theCellTracks_;
CAConstants::CellTracks* device_theCellTracksContainer_;

unique_ptr<GPUCACell[]> device_theCells_;
unique_ptr<GPUCACell::OuterHitOfCell[]> device_isOuterHitOfCell_;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,8 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) {
// ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER)
//////////////////////////////////////////////////////////

/* not used at the moment
cudaCheck(cudaMalloc(&device_theCellNeighbors_, sizeof(CAConstants::CellNeighborsVector)));
cudaCheck(cudaMemset(device_theCellNeighbors_, 0, sizeof(CAConstants::CellNeighborsVector)));
cudaCheck(cudaMalloc(&device_theCellTracks_, sizeof(CAConstants::CellTracksVector)));
cudaCheck(cudaMemset(device_theCellTracks_, 0, sizeof(CAConstants::CellTracksVector)));
*/
device_theCellNeighbors_ = Traits::template make_unique<CAConstants::CellNeighborsVector>(stream);
device_theCellTracks_ = Traits::template make_unique<CAConstants::CellTracksVector>(stream);

device_hitToTuple_ = Traits::template make_unique<HitToTuple>(stream);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets,
printf("Tuples overflow\n");
if (*nCells >= maxNumberOfDoublets)
printf("Cells overflow\n");
if (cellNeighbors && cellNeighbors->full())
printf("cellNeighbors overflow\n");
if (cellTracks && cellTracks->full())
printf("cellTracks overflow\n");
}

for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,14 +168,15 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH

CAHitNtupletGeneratorKernelsGPU kernels(m_params);
kernels.counters_ = m_counters;
HelixFitOnGPU fitter(bfield, m_params.fit5as4_);

kernels.allocateOnGPU(stream);
fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa);

kernels.buildDoublets(hits_d, stream);
kernels.launchKernels(hits_d, soa, stream);
kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available"

HelixFitOnGPU fitter(bfield, m_params.fit5as4_);
fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa);
if (m_params.useRiemannFit_) {
fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream);
} else {
Expand Down
48 changes: 40 additions & 8 deletions RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,24 +56,56 @@ class GPUCACell {
theInnerZ = hh.zGlobal(innerHitId);
theInnerR = hh.rGlobal(innerHitId);

outerNeighbors().reset();
tracks().reset();
// link to default empty
theOuterNeighbors = &cellNeighbors[0];
theTracks = &cellTracks[0];
assert(outerNeighbors().empty());
assert(tracks().empty());
}

__device__ __forceinline__ int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector& cellNeighbors) {
// use smart cache
if (outerNeighbors().empty()) {
auto i = cellNeighbors.extend(); // maybe waisted....
if (i > 0) {
cellNeighbors[i].reset();
#ifdef __CUDACC__
auto zero = (ptrAsInt)(&cellNeighbors[0]);
atomicCAS((ptrAsInt*)(&theOuterNeighbors),
zero,
(ptrAsInt)(&cellNeighbors[i])); // if fails we cannot give "i" back...
#else
theOuterNeighbors = &cellNeighbors[i];
#endif
} else
return -1;
}
__threadfence();
return outerNeighbors().push_back(t);
}

__device__ __forceinline__ int addTrack(CellTracks::value_t t, CellTracksVector& cellTracks) {
if (tracks().empty()) {
auto i = cellTracks.extend(); // maybe waisted....
if (i > 0) {
cellTracks[i].reset();
#ifdef __CUDACC__
auto zero = (ptrAsInt)(&cellTracks[0]);
atomicCAS((ptrAsInt*)(&theTracks), zero, (ptrAsInt)(&cellTracks[i])); // if fails we cannot give "i" back...
#else
theTracks = &cellTracks[i];
#endif
} else
return -1;
}
__threadfence();
return tracks().push_back(t);
}

__device__ __forceinline__ CellTracks& tracks() { return theTracks; }
__device__ __forceinline__ CellTracks const& tracks() const { return theTracks; }
__device__ __forceinline__ CellNeighbors& outerNeighbors() { return theOuterNeighbors; }
__device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return theOuterNeighbors; }
__device__ __forceinline__ CellTracks& tracks() { return *theTracks; }
__device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; }
__device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; }
__device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; }
__device__ __forceinline__ float get_inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); }
__device__ __forceinline__ float get_outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); }
__device__ __forceinline__ float get_inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); }
Expand Down Expand Up @@ -297,8 +329,8 @@ class GPUCACell {
}

private:
CellNeighbors theOuterNeighbors;
CellTracks theTracks;
CellNeighbors* theOuterNeighbors;
CellTracks* theTracks;

public:
int32_t theDoubletId;
Expand Down
11 changes: 11 additions & 0 deletions RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,17 @@ namespace gpuPixelDoublets {
int first = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = first; i < nHits; i += gridDim.x * blockDim.x)
isOuterHitOfCell[i].reset();

if (0 == first) {
cellNeighbors->construct(CAConstants::maxNumOfActiveDoublets(), cellNeighborsContainer);
cellTracks->construct(CAConstants::maxNumOfActiveDoublets(), cellTracksContainer);
auto i = cellNeighbors->extend();
assert(0 == i);
(*cellNeighbors)[0].reset();
i = cellTracks->extend();
assert(0 == i);
(*cellTracks)[0].reset();
}
}

constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y
Expand Down
5 changes: 5 additions & 0 deletions RecoPixelVertexing/PixelTriplets/test/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,8 @@

<bin file="CircleEq_t.cpp">
</bin>

<bin file="CAsizes_t.cpp">
<use name="cuda"/>
<use name="eigen"/>
</bin>
25 changes: 25 additions & 0 deletions RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#include "RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h"

#include <typeinfo>
#include <iostream>

template <typename T>
void print() {
std::cout << "size of " << typeid(T).name() << ' ' << sizeof(T) << std::endl;
}

int main() {
using namespace CAConstants;

print<GPUCACell>();
print<CellNeighbors>();
print<CellTracks>();
print<OuterHitOfCell>();
print<TuplesContainer>();
print<HitToTuple>();
print<TupleMultiplicity>();

print<CellNeighborsVector>();

return 0;
}

0 comments on commit 3f1677a

Please sign in to comment.