Use a dynamic buffer for CA cell components and adjust the allocator growth factor to reduce memory use #509

Merged: 8 commits, Jul 15, 2020
7 changes: 7 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
@@ -27,6 +27,13 @@ namespace cms {
extern thread_local dim3 blockIdx;
extern thread_local dim3 gridDim;

template <typename T1, typename T2>
T1 atomicCAS(T1* address, T1 compare, T2 val) {
T1 old = *address;
*address = old == compare ? val : old;
return old;
}

template <typename T1, typename T2>
T1 atomicInc(T1* a, T2 b) {
auto ret = *a;
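The host-side atomicCAS fallback added above only has to be correct for serial CPU execution, so it reduces to a conditional store that returns the previous value. A minimal standalone sketch of how it behaves when used to claim a pointer-sized slot, in the same spirit as the GPUCACell changes further down (values and names here are illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Re-statement of the serial fallback above: store val only if *address == compare,
// always returning the value that was there before.
template <typename T1, typename T2>
T1 atomicCAS(T1* address, T1 compare, T2 val) {
  T1 old = *address;
  *address = (old == compare) ? T1(val) : old;
  return old;
}

int main() {
  std::uintptr_t slot = 0;  // e.g. a pointer stored as an integer, as GPUCACell does
  std::uintptr_t fresh = 0x1000;

  auto old = atomicCAS(&slot, std::uintptr_t(0), fresh);  // succeeds: slot was still 0
  assert(old == 0 && slot == fresh);

  old = atomicCAS(&slot, std::uintptr_t(0), std::uintptr_t(0x2000));  // fails: already claimed
  assert(old == fresh && slot == fresh);
  return 0;
}
```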
11 changes: 7 additions & 4 deletions HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
@@ -41,9 +41,9 @@ namespace cms {
namespace cuda {

// limited to 32*32 elements....
template <typename T>
__host__ __device__ __forceinline__ void blockPrefixScan(T const* __restrict__ ci,
T* __restrict__ co,
template <typename VT, typename T>
__host__ __device__ __forceinline__ void blockPrefixScan(VT const* ci,
VT* co,
Review comment: is VT supposed to be either T or volatile T?

Author reply: yes, at least in this context.

uint32_t size,
T* ws
#ifndef __CUDA_ARCH__
@@ -138,7 +138,9 @@ namespace cms {

// in principle not limited....
template <typename T>
__global__ void multiBlockPrefixScan(T const* ci, T* co, int32_t size, int32_t* pc) {
__global__ void multiBlockPrefixScan(T const* ici, T* ico, int32_t size, int32_t* pc) {
volatile T const* ci = ici;
volatile T* co = ico;
__shared__ T ws[32];
#ifdef __CUDA_ARCH__
assert(sizeof(T) * gridDim.x <= dynamic_smem_size()); // size of psum below
@@ -152,6 +154,7 @@ namespace cms {
// count blocks that finished
__shared__ bool isLastBlockDone;
if (0 == threadIdx.x) {
__threadfence();
auto value = atomicAdd(pc, 1); // block counter
isLastBlockDone = (value == (int(gridDim.x) - 1));
}
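The __threadfence() added before the block counter, together with routing the final pass through volatile pointers (the VT parameter discussed above), is the usual "last block done" idiom: every block must make its partial result visible device-wide before bumping the counter, and the block elected to finish must not read stale values. A self-contained sketch of that idiom, independent of the prefix-scan code above (kernel and variable names are illustrative):

```cuda
#include <cstdio>

__device__ unsigned int blocksDone = 0;

__global__ void sumPerBlockThenCombine(const float* in, float* partial, float* out, int n) {
  // Step 1: each block produces one partial result (the per-block reduction is elided).
  if (threadIdx.x == 0) {
    float s = 0.f;
    for (int i = blockIdx.x; i < n; i += gridDim.x)
      s += in[i];
    partial[blockIdx.x] = s;
  }

  // Step 2: make the partial result visible device-wide *before* announcing completion.
  __shared__ bool isLastBlockDone;
  if (threadIdx.x == 0) {
    __threadfence();                            // order the partial[] write before the counter bump
    unsigned int done = atomicAdd(&blocksDone, 1);
    isLastBlockDone = (done == gridDim.x - 1);  // exactly one block sees true
  }
  __syncthreads();

  // Step 3: the last block combines everyone's partials; volatile avoids reading stale values.
  if (isLastBlockDone && threadIdx.x == 0) {
    volatile float* p = partial;
    float total = 0.f;
    for (unsigned int b = 0; b < gridDim.x; ++b)
      total += p[b];
    *out = total;
    blocksDone = 0;                             // reset for a later launch
  }
}
```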
@@ -13,11 +13,11 @@ namespace cms::cuda::allocator {
// Use caching or not
constexpr bool useCaching = true;
// Growth factor (bin_growth in cub::CachingDeviceAllocator)
constexpr unsigned int binGrowth = 8;
constexpr unsigned int binGrowth = 2;
Review comment: Makes sense.

// Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CachingDeviceAllocator)
constexpr unsigned int minBin = 1;
constexpr unsigned int minBin = 8;
Review comment (@fwyzard, Jul 13, 2020): so, the smallest bin is now 256 (instead of 8) bytes ...

Review comment: (which makes sense, I don't think cudaMalloc actually returns memory chunks smaller than 256 bytes, since in all the tests I ran it looks like the memory is always aligned at least to that)

// Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
constexpr unsigned int maxBin = 10;
constexpr unsigned int maxBin = 30;
Review comment (@fwyzard, Jul 13, 2020): ... and the largest is 1 GB (as before)?

// Total storage for the allocator. 0 means no limit.
constexpr size_t maxCachedBytes = 0;
// Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken.
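For reference, bin sizes in the caching allocator are binGrowth^bin for bin in [minBin, maxBin], so the new settings keep the 256 B to 1 GiB range noted in the comments above while providing many more, finer-grained bins (23 powers of 2 instead of 10 powers of 8). A small sketch of the arithmetic, as an illustration rather than code from this PR:

```cpp
#include <cstdio>

constexpr unsigned long long ipow(unsigned long long base, unsigned int exp) {
  return exp == 0 ? 1ULL : base * ipow(base, exp - 1);
}

int main() {
  constexpr unsigned int binGrowth = 2, minBin = 8, maxBin = 30;
  static_assert(ipow(binGrowth, minBin) == 256, "smallest bin: 256 bytes");
  static_assert(ipow(binGrowth, maxBin) == (1ULL << 30), "largest bin: 1 GiB");
  // Previous settings: binGrowth = 8, minBin = 1, maxBin = 10, i.e. 8 bytes up to 8^10 = 2^30,
  // so the upper bound is unchanged but the bin spacing is now much finer.
  for (unsigned int b = minBin; b <= maxBin; ++b)
    printf("bin %2u -> %llu bytes\n", b, ipow(binGrowth, b));
  return 0;
}
```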
6 changes: 3 additions & 3 deletions RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h
@@ -27,7 +27,7 @@ namespace CAConstants {
constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); }
#ifndef ONLY_PHICUT
#ifndef GPU_SMALL_EVENTS
constexpr uint32_t maxNumberOfDoublets() { return 448 * 1024; }
constexpr uint32_t maxNumberOfDoublets() { return 512 * 1024; }
constexpr uint32_t maxCellsPerHit() { return 128; }
#else
constexpr uint32_t maxNumberOfDoublets() { return 128 * 1024; }
@@ -37,7 +37,7 @@ namespace CAConstants {
constexpr uint32_t maxNumberOfDoublets() { return 2 * 1024 * 1024; }
constexpr uint32_t maxCellsPerHit() { return 8 * 128; }
#endif
constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 4; }
constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 8; }

constexpr uint32_t maxNumberOfLayerPairs() { return 20; }
constexpr uint32_t maxNumberOfLayers() { return 10; }
@@ -49,7 +49,7 @@ namespace CAConstants {

#ifndef ONLY_PHICUT
using CellNeighbors = cms::cuda::VecArray<uint32_t, 36>;
using CellTracks = cms::cuda::VecArray<tindex_type, 42>;
using CellTracks = cms::cuda::VecArray<tindex_type, 48>;
#else
using CellNeighbors = cms::cuda::VecArray<uint32_t, 64>;
using CellTracks = cms::cuda::VecArray<tindex_type, 64>;
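A rough estimate of the scratch buffer these constants imply for the new cellStorage_ allocation shown in the buildDoublets changes below. The element sizes are assumptions (a VecArray<T, N> laid out as N elements plus a 32-bit counter, and a 16-bit tindex_type), not numbers taken from this PR:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  constexpr uint32_t maxNumberOfDoublets = 512 * 1024;
  constexpr uint32_t maxNumOfActiveDoublets = maxNumberOfDoublets / 8;  // 64K entries

  // Assumed layouts: CellNeighbors ~ VecArray<uint32_t, 36>, CellTracks ~ VecArray<tindex_type, 48>.
  constexpr size_t approxCellNeighbors = 36 * sizeof(uint32_t) + sizeof(int32_t);  // ~148 B
  constexpr size_t approxCellTracks = 48 * sizeof(uint16_t) + sizeof(int32_t);     // ~100 B

  constexpr size_t cellStorageBytes =
      maxNumOfActiveDoublets * (approxCellNeighbors + approxCellTracks);
  printf("cellStorage_ ~ %.1f MB for %u active doublets\n",
         cellStorageBytes / (1024. * 1024.), (unsigned)maxNumOfActiveDoublets);
  return 0;
}
```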
@@ -24,12 +24,20 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
device_isOuterHitOfCell_.reset(
(GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell)));
assert(device_isOuterHitOfCell_.get());

cellStorage_.reset((unsigned char *)malloc(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) +
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks)));
device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get();
device_theCellTracksContainer_ =
(GPUCACell::CellTracks *)(cellStorage_.get() +
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors));

gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(),
nhits,
device_theCellNeighbors_,
device_theCellNeighborsContainer_.get(),
device_theCellTracks_,
device_theCellTracksContainer_.get());
device_theCellNeighbors_.get(),
device_theCellNeighborsContainer_,
device_theCellTracks_.get(),
device_theCellTracksContainer_);

// device_theCells_ = Traits:: template make_unique<GPUCACell[]>(cs, m_params.maxNumberOfDoublets_, stream);
device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * m_params.maxNumberOfDoublets_));
@@ -47,8 +55,8 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
assert(nActualPairs <= gpuPixelDoublets::nPairs);
gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellTracks_,
device_theCellNeighbors_.get(),
device_theCellTracks_.get(),
hh.view(),
device_isOuterHitOfCell_.get(),
nActualPairs,
@@ -84,7 +92,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
hh.view(),
device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellNeighbors_.get(),
device_isOuterHitOfCell_.get(),
m_params.hardCurvCut_,
m_params.ptmin_,
@@ -101,7 +109,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
kernel_find_ntuplets(hh.view(),
device_theCells_.get(),
device_nCells_,
device_theCellTracks_,
device_theCellTracks_.get(),
tuples_d,
device_hitTuple_apc_,
quality_d,
@@ -129,8 +137,8 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
device_hitTuple_apc_,
device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellTracks_,
device_theCellNeighbors_.get(),
device_theCellTracks_.get(),
device_isOuterHitOfCell_.get(),
nhits,
m_params.maxNumberOfDoublets_,
@@ -51,7 +51,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
hh.view(),
device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellNeighbors_.get(),
device_isOuterHitOfCell_.get(),
m_params.hardCurvCut_,
m_params.ptmin_,
@@ -78,7 +78,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
kernel_find_ntuplets<<<numberOfBlocks, blockSize, 0, cudaStream>>>(hh.view(),
device_theCells_.get(),
device_nCells_,
device_theCellTracks_,
device_theCellTracks_.get(),
tuples_d,
device_hitTuple_apc_,
quality_d,
@@ -132,8 +132,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
device_hitTuple_apc_,
device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellTracks_,
device_theCellNeighbors_.get(),
device_theCellTracks_.get(),
device_isOuterHitOfCell_.get(),
nhits,
m_params.maxNumberOfDoublets_,
@@ -144,6 +144,9 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
cudaDeviceSynchronize();
cudaCheck(cudaGetLastError());
#endif

// free space asap
// device_isOuterHitOfCell_.reset();
Review comment: is this the change that didn't make any difference?

Author reply: yes, I thought I committed the one with the "reset"; will test again.

}

template <>
@@ -162,16 +165,26 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
// in principle we can use "nhits" to heuristically dimension the workspace...
device_isOuterHitOfCell_ = cms::cuda::make_device_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits), stream);
assert(device_isOuterHitOfCell_.get());

cellStorage_ = cms::cuda::make_device_unique<unsigned char[]>(
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) +
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks),
stream);
device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get();
device_theCellTracksContainer_ =
(GPUCACell::CellTracks *)(cellStorage_.get() +
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors));

{
int threadsPerBlock = 128;
// at least one block!
int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock;
gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, stream>>>(device_isOuterHitOfCell_.get(),
nhits,
device_theCellNeighbors_,
device_theCellNeighborsContainer_.get(),
device_theCellTracks_,
device_theCellTracksContainer_.get());
device_theCellNeighbors_.get(),
device_theCellNeighborsContainer_,
device_theCellTracks_.get(),
device_theCellTracksContainer_);
cudaCheck(cudaGetLastError());
}

@@ -201,8 +214,8 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
dim3 thrs(stride, threadsPerBlock, 1);
gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream>>>(device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_,
device_theCellTracks_,
device_theCellNeighbors_.get(),
device_theCellTracks_.get(),
hh.view(),
device_isOuterHitOfCell_.get(),
nActualPairs,
@@ -179,10 +179,11 @@ class CAHitNtupletGeneratorKernels {

private:
// workspace
CAConstants::CellNeighborsVector* device_theCellNeighbors_ = nullptr;
unique_ptr<CAConstants::CellNeighbors[]> device_theCellNeighborsContainer_;
CAConstants::CellTracksVector* device_theCellTracks_ = nullptr;
unique_ptr<CAConstants::CellTracks[]> device_theCellTracksContainer_;
unique_ptr<unsigned char[]> cellStorage_;
unique_ptr<CAConstants::CellNeighborsVector> device_theCellNeighbors_;
CAConstants::CellNeighbors* device_theCellNeighborsContainer_;
unique_ptr<CAConstants::CellTracksVector> device_theCellTracks_;
CAConstants::CellTracks* device_theCellTracksContainer_;

unique_ptr<GPUCACell[]> device_theCells_;
unique_ptr<GPUCACell::OuterHitOfCell[]> device_isOuterHitOfCell_;
@@ -12,12 +12,8 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) {
// ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER)
//////////////////////////////////////////////////////////

/* not used at the moment
cudaCheck(cudaMalloc(&device_theCellNeighbors_, sizeof(CAConstants::CellNeighborsVector)));
cudaCheck(cudaMemset(device_theCellNeighbors_, 0, sizeof(CAConstants::CellNeighborsVector)));
cudaCheck(cudaMalloc(&device_theCellTracks_, sizeof(CAConstants::CellTracksVector)));
cudaCheck(cudaMemset(device_theCellTracks_, 0, sizeof(CAConstants::CellTracksVector)));
*/
device_theCellNeighbors_ = Traits::template make_unique<CAConstants::CellNeighborsVector>(stream);
device_theCellTracks_ = Traits::template make_unique<CAConstants::CellTracksVector>(stream);

device_hitToTuple_ = Traits::template make_unique<HitToTuple>(stream);

@@ -79,6 +79,10 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets,
printf("Tuples overflow\n");
if (*nCells >= maxNumberOfDoublets)
printf("Cells overflow\n");
if (cellNeighbors && cellNeighbors->full())
printf("cellNeighbors overflow\n");
if (cellTracks && cellTracks->full())
printf("cellTracks overflow\n");
}

for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
@@ -168,14 +168,15 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH

CAHitNtupletGeneratorKernelsGPU kernels(m_params);
kernels.counters_ = m_counters;
HelixFitOnGPU fitter(bfield, m_params.fit5as4_);

kernels.allocateOnGPU(stream);
fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa);

kernels.buildDoublets(hits_d, stream);
kernels.launchKernels(hits_d, soa, stream);
kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available"

HelixFitOnGPU fitter(bfield, m_params.fit5as4_);
fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa);
if (m_params.useRiemannFit_) {
fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream);
} else {
48 changes: 40 additions & 8 deletions RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
@@ -56,24 +56,56 @@ class GPUCACell {
theInnerZ = hh.zGlobal(innerHitId);
theInnerR = hh.rGlobal(innerHitId);

outerNeighbors().reset();
tracks().reset();
// link to default empty
theOuterNeighbors = &cellNeighbors[0];
theTracks = &cellTracks[0];
assert(outerNeighbors().empty());
assert(tracks().empty());
}

__device__ __forceinline__ int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector& cellNeighbors) {
// use smart cache
if (outerNeighbors().empty()) {
auto i = cellNeighbors.extend(); // maybe wasted....
if (i > 0) {
cellNeighbors[i].reset();
#ifdef __CUDACC__
auto zero = (ptrAsInt)(&cellNeighbors[0]);
atomicCAS((ptrAsInt*)(&theOuterNeighbors),
zero,
(ptrAsInt)(&cellNeighbors[i])); // if fails we cannot give "i" back...
#else
theOuterNeighbors = &cellNeighbors[i];
#endif
} else
return -1;
}
__threadfence();
return outerNeighbors().push_back(t);
}

__device__ __forceinline__ int addTrack(CellTracks::value_t t, CellTracksVector& cellTracks) {
if (tracks().empty()) {
auto i = cellTracks.extend(); // maybe wasted....
if (i > 0) {
cellTracks[i].reset();
#ifdef __CUDACC__
auto zero = (ptrAsInt)(&cellTracks[0]);
atomicCAS((ptrAsInt*)(&theTracks), zero, (ptrAsInt)(&cellTracks[i])); // if fails we cannot give "i" back...
#else
theTracks = &cellTracks[i];
#endif
} else
return -1;
}
__threadfence();
return tracks().push_back(t);
}

__device__ __forceinline__ CellTracks& tracks() { return theTracks; }
__device__ __forceinline__ CellTracks const& tracks() const { return theTracks; }
__device__ __forceinline__ CellNeighbors& outerNeighbors() { return theOuterNeighbors; }
__device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return theOuterNeighbors; }
__device__ __forceinline__ CellTracks& tracks() { return *theTracks; }
__device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; }
__device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; }
__device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; }
__device__ __forceinline__ float get_inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); }
__device__ __forceinline__ float get_outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); }
__device__ __forceinline__ float get_inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); }
@@ -297,8 +329,8 @@ class GPUCACell {
}

private:
CellNeighbors theOuterNeighbors;
CellTracks theTracks;
CellNeighbors* theOuterNeighbors;
CellTracks* theTracks;

public:
int32_t theDoubletId;
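The new addOuterNeighbor/addTrack logic above replaces the per-cell fixed-size containers with a pointer that starts at the shared default-empty slot 0 and is upgraded, first writer wins, to a freshly extended slot via atomicCAS; losers of the race simply push into whichever slot won. A host-side toy analogue of that pattern using std::atomic, purely illustrative and not taken from the PR:

```cpp
#include <array>
#include <atomic>
#include <cstdio>

// Toy pool of "neighbour lists"; slot 0 is the shared, permanently empty default.
struct List {
  int size = 0;
  int data[4];
};

std::array<List, 16> pool;     // backing storage, analogous to the CellNeighborsVector
std::atomic<int> poolSize{1};  // slot 0 is pre-reserved as the default-empty list

struct Cell {
  std::atomic<List*> list{&pool[0]};  // every cell starts by pointing at the default slot

  int add(int value) {
    if (list.load()->size == 0) {     // still on (or racing for) the empty default
      int i = poolSize.fetch_add(1);  // "extend": the slot may be wasted if we lose the race
      if (i >= int(pool.size()))
        return -1;                    // pool overflow
      List* expected = &pool[0];
      list.compare_exchange_strong(expected, &pool[i]);  // first writer wins
    }
    List* l = list.load();
    int pos = l->size++;              // not itself atomic in this toy; the real code uses an atomic push_back
    l->data[pos] = value;
    return pos;
  }
};

int main() {
  Cell c;
  printf("%d %d\n", c.add(7), c.add(9));  // prints "0 1"; the cell now owns pool slot 1
  return 0;
}
```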
11 changes: 11 additions & 0 deletions RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
@@ -73,6 +73,17 @@ namespace gpuPixelDoublets {
int first = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = first; i < nHits; i += gridDim.x * blockDim.x)
isOuterHitOfCell[i].reset();

if (0 == first) {
cellNeighbors->construct(CAConstants::maxNumOfActiveDoublets(), cellNeighborsContainer);
cellTracks->construct(CAConstants::maxNumOfActiveDoublets(), cellTracksContainer);
auto i = cellNeighbors->extend();
assert(0 == i);
(*cellNeighbors)[0].reset();
i = cellTracks->extend();
assert(0 == i);
(*cellTracks)[0].reset();
}
}

constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y
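The construct/extend/full calls used above (and in kernel_checkOverflows) define the contract the externally backed vectors must satisfy: construct(capacity, data) adopts storage owned elsewhere, in this case the cellStorage_ buffer, extend() claims the next free slot, and full() reports exhaustion. A simplified host-only stand-in consistent with that usage, for illustration only (not the actual cms::cuda implementation):

```cpp
#include <atomic>
#include <cassert>

// Simplified stand-in for a vector living on top of externally owned storage.
template <typename T>
struct ExternalVector {
  void construct(int capacity, T* data) {  // adopt pre-allocated storage (e.g. cellStorage_)
    m_capacity = capacity;
    m_data = data;
    m_size = 0;
  }
  int extend() {  // claim the next slot; -1 once the capacity is exhausted
    int i = m_size.fetch_add(1);
    return i < m_capacity ? i : -1;
  }
  bool full() const { return m_size.load() >= m_capacity; }
  T& operator[](int i) { return m_data[i]; }

  std::atomic<int> m_size{0};
  int m_capacity = 0;
  T* m_data = nullptr;
};

int main() {
  int storage[4];  // plays the role of the pre-allocated byte buffer
  ExternalVector<int> v;
  v.construct(4, storage);
  assert(v.extend() == 0);  // the kernel above reserves slot 0 as the default-empty entry
  assert(v.extend() == 1);
  v[1] = 42;
  assert(!v.full());
  return 0;
}
```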
5 changes: 5 additions & 0 deletions RecoPixelVertexing/PixelTriplets/test/BuildFile.xml
@@ -27,3 +27,8 @@

<bin file="CircleEq_t.cpp">
</bin>

<bin file="CAsizes_t.cpp">
<use name="cuda"/>
<use name="eigen"/>
</bin>
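The new CAsizes_t build target presumably compiles a small size-reporting test for the CA types whose capacities were tuned in this PR. A hypothetical sketch of what such a test could look like; the real CAsizes_t.cpp may differ and the include path is a guess:

```cpp
#include <iostream>

#include "RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h"  // assumed include path

template <typename T>
void printSize(const char* name) {
  std::cout << name << ": " << sizeof(T) << " bytes" << std::endl;
}

int main() {
  using namespace CAConstants;
  printSize<CellNeighbors>("CellNeighbors");
  printSize<CellTracks>("CellTracks");
  printSize<CellNeighborsVector>("CellNeighborsVector");
  printSize<CellTracksVector>("CellTracksVector");
  return 0;
}
```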