-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
use a dynamic buffer for CA cells components, adjust allocator growing factor to reduce memory used #509
use a dynamic buffer for CA cells components, adjust allocator growing factor to reduce memory used #509
Changes from all commits
090b1b2
e4b82bc
fc0bdd2
00287f8
29caebf
abcf9aa
715f4f0
0b021bf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,11 +13,11 @@ namespace cms::cuda::allocator { | |
// Use caching or not | ||
constexpr bool useCaching = true; | ||
// Growth factor (bin_growth in cub::CachingDeviceAllocator) | ||
constexpr unsigned int binGrowth = 8; | ||
constexpr unsigned int binGrowth = 2; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes sense. |
||
// Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CachingDeviceAllocator) | ||
constexpr unsigned int minBin = 1; | ||
constexpr unsigned int minBin = 8; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so, the smallest bin is now 256 (instead of 8) bytes ... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (which makes sense, I don't think |
||
// Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail. | ||
constexpr unsigned int maxBin = 10; | ||
constexpr unsigned int maxBin = 30; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ... and the largest is 1 GB (as before) ? |
||
// Total storage for the allocator. 0 means no limit. | ||
constexpr size_t maxCachedBytes = 0; | ||
// Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,7 +51,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * | |
hh.view(), | ||
device_theCells_.get(), | ||
device_nCells_, | ||
device_theCellNeighbors_, | ||
device_theCellNeighbors_.get(), | ||
device_isOuterHitOfCell_.get(), | ||
m_params.hardCurvCut_, | ||
m_params.ptmin_, | ||
|
@@ -78,7 +78,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * | |
kernel_find_ntuplets<<<numberOfBlocks, blockSize, 0, cudaStream>>>(hh.view(), | ||
device_theCells_.get(), | ||
device_nCells_, | ||
device_theCellTracks_, | ||
device_theCellTracks_.get(), | ||
tuples_d, | ||
device_hitTuple_apc_, | ||
quality_d, | ||
|
@@ -132,8 +132,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * | |
device_hitTuple_apc_, | ||
device_theCells_.get(), | ||
device_nCells_, | ||
device_theCellNeighbors_, | ||
device_theCellTracks_, | ||
device_theCellNeighbors_.get(), | ||
device_theCellTracks_.get(), | ||
device_isOuterHitOfCell_.get(), | ||
nhits, | ||
m_params.maxNumberOfDoublets_, | ||
|
@@ -144,6 +144,9 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * | |
cudaDeviceSynchronize(); | ||
cudaCheck(cudaGetLastError()); | ||
#endif | ||
|
||
// free space asap | ||
// device_isOuterHitOfCell_.reset(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this the change that didn't make any difference ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, I thought I committed the one with the "reset", will test again |
||
} | ||
|
||
template <> | ||
|
@@ -162,16 +165,26 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr | |
// in principle we can use "nhits" to heuristically dimension the workspace... | ||
device_isOuterHitOfCell_ = cms::cuda::make_device_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits), stream); | ||
assert(device_isOuterHitOfCell_.get()); | ||
|
||
cellStorage_ = cms::cuda::make_device_unique<unsigned char[]>( | ||
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + | ||
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks), | ||
stream); | ||
device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); | ||
device_theCellTracksContainer_ = | ||
(GPUCACell::CellTracks *)(cellStorage_.get() + | ||
CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors)); | ||
|
||
{ | ||
int threadsPerBlock = 128; | ||
// at least one block! | ||
int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; | ||
gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, stream>>>(device_isOuterHitOfCell_.get(), | ||
nhits, | ||
device_theCellNeighbors_, | ||
device_theCellNeighborsContainer_.get(), | ||
device_theCellTracks_, | ||
device_theCellTracksContainer_.get()); | ||
device_theCellNeighbors_.get(), | ||
device_theCellNeighborsContainer_, | ||
device_theCellTracks_.get(), | ||
device_theCellTracksContainer_); | ||
cudaCheck(cudaGetLastError()); | ||
} | ||
|
||
|
@@ -201,8 +214,8 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr | |
dim3 thrs(stride, threadsPerBlock, 1); | ||
gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream>>>(device_theCells_.get(), | ||
device_nCells_, | ||
device_theCellNeighbors_, | ||
device_theCellTracks_, | ||
device_theCellNeighbors_.get(), | ||
device_theCellTracks_.get(), | ||
hh.view(), | ||
device_isOuterHitOfCell_.get(), | ||
nActualPairs, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,3 +27,8 @@ | |
|
||
<bin file="CircleEq_t.cpp"> | ||
</bin> | ||
|
||
<bin file="CAsizes_t.cpp"> | ||
<use name="cuda"/> | ||
<use name="eigen"/> | ||
</bin> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is `VT` supposed to be either `T` or `volatile T`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes, at least in this context