Skip to content

Commit

Permalink
Speed up the doublet finder (#260)
Browse files Browse the repository at this point in the history
Introduce inner-loop parallelization in the doublet finder using the
stride pattern already used in the "fishbone" kernel, and use a 2D grid
instead of a hand-made stride.
  • Loading branch information
VinInn authored and fwyzard committed Jan 24, 2019
1 parent fb0dbd9 commit 00e8cf4
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -144,17 +144,19 @@ kernel_connect(AtomicPairCounter * apc1, AtomicPairCounter * apc2, // just to z
constexpr auto hardCurvCut = 1.f/(0.35f * 87.f); // FIXME VI tune
constexpr auto ptmin = 0.9f; // FIXME original "tune"

auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
auto cellIndex = threadIdx.y + blockIdx.y * blockDim.y;
auto first = threadIdx.x;
auto stride = blockDim.x;

if (0==cellIndex) { (*apc1)=0; (*apc2)=0; }// ready for next kernel
if (0==(cellIndex+first)) { (*apc1)=0; (*apc2)=0; }// ready for next kernel

if (cellIndex >= (*nCells) ) return;
auto const & thisCell = cells[cellIndex];
if (thisCell.theDoubletId<0) return;
auto innerHitId = thisCell.get_inner_hit_id();
auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size();
auto vi = isOuterHitOfCell[innerHitId].data();
for (auto j = 0; j < numberOfPossibleNeighbors; ++j) {
for (auto j = first; j < numberOfPossibleNeighbors; j+=stride) {
auto otherCell = __ldg(vi+j);
if (cells[otherCell].theDoubletId<0) continue;
if (thisCell.check_alignment(hh,
Expand All @@ -172,6 +174,8 @@ void kernel_find_ntuplets(
unsigned int minHitsPerNtuplet)
{

// recursive: not obvious to widen

auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
if (cellIndex >= (*nCells) ) return;
auto &thisCell = cells[cellIndex];
Expand Down Expand Up @@ -246,23 +250,29 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms....
assert(nhits <= PixelGPUConstants::maxNumberOfHits);

if (earlyFishbone_) {
auto blockSize = 128;
auto nthTot = 64;
auto stride = 4;
auto blockSize = nthTot/stride;
auto numberOfBlocks = (nhits + blockSize - 1)/blockSize;
numberOfBlocks *=stride;

fishbone<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
dim3 blks(1,numberOfBlocks,1);
dim3 thrs(stride,blockSize,1);
fishbone<<<blks,thrs, 0, cudaStream>>>(
hh.gpu_d,
device_theCells_, device_nCells_,
device_isOuterHitOfCell_,
nhits, stride, false
nhits, false
);
cudaCheck(cudaGetLastError());
}

auto blockSize = 64;
auto nthTot = 64;
auto stride = 4;
auto blockSize = nthTot/stride;
auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1)/blockSize;
kernel_connect<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
dim3 blks(1,numberOfBlocks,1);
dim3 thrs(stride,blockSize,1);

kernel_connect<<<blks, thrs, 0, cudaStream>>>(
gpu_.apc_d, device_hitToTuple_apc_, // needed only to be reset, ready for next kernel
hh.gpu_d,
device_theCells_, device_nCells_,
Expand All @@ -282,14 +292,17 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms....
cudautils::finalizeBulk<<<numberOfBlocks, blockSize, 0, cudaStream>>>(gpu_.apc_d,gpu_.tuples_d);

if (lateFishbone_) {
auto stride=4;
numberOfBlocks = (nhits + blockSize - 1)/blockSize;
numberOfBlocks *=stride;
fishbone<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
auto nthTot = 128;
auto stride = 16;
auto blockSize = nthTot/stride;
auto numberOfBlocks = (nhits + blockSize - 1)/blockSize;
dim3 blks(1,numberOfBlocks,1);
dim3 thrs(stride,blockSize,1);
fishbone<<<blks,thrs, 0, cudaStream>>>(
hh.gpu_d,
device_theCells_, device_nCells_,
device_isOuterHitOfCell_,
nhits, stride, true
nhits, true
);
cudaCheck(cudaGetLastError());
}
Expand All @@ -312,9 +325,13 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms....
void CAHitQuadrupletGeneratorKernels::buildDoublets(HitsOnCPU const & hh, cudaStream_t stream) {
auto nhits = hh.nHits;

int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize;
int stride=1;
int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize/stride;
int blocks = (3 * nhits + threadsPerBlock - 1) / threadsPerBlock;
gpuPixelDoublets::getDoubletsFromHisto<<<blocks, threadsPerBlock, 0, stream>>>(device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_);
dim3 blks(1,blocks,1);
dim3 thrs(stride,threadsPerBlock,1);
gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream>>>(
device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_);
cudaCheck(cudaGetLastError());
}

Expand All @@ -330,4 +347,3 @@ void CAHitQuadrupletGeneratorKernels::classifyTuples(HitsOnCPU const & hh, Tuple
kernel_fastDuplicateRemover<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_theCells_, device_nCells_,tuples.tuples_d,tuples.helix_fit_results_d, tuples.quality_d);

}

19 changes: 9 additions & 10 deletions RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ namespace gpuPixelDoublets {
GPUCACell * cells, uint32_t const * __restrict__ nCells,
GPUCACell::OuterHitOfCell const * __restrict__ isOuterHitOfCell,
uint32_t nHits,
uint32_t stride, bool checkTrack) {
bool checkTrack) {

constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit;

Expand All @@ -35,13 +35,12 @@ namespace gpuPixelDoublets {
uint8_t const * __restrict__ layerp = hh.phase1TopologyLayer_d;
auto layer = [&](uint16_t id) { return __ldg(layerp+id/phase1PixelTopology::maxModuleStride);};

auto ldx = threadIdx.x + blockIdx.x * blockDim.x;
auto idx = ldx/stride;
auto first = ldx - idx*stride;
assert(first<stride);
// x runs faster...
auto idy = threadIdx.y + blockIdx.y * blockDim.y;
auto first = threadIdx.x;

if (idx>=nHits) return;
auto const & vc = isOuterHitOfCell[idx];
if (idy>=nHits) return;
auto const & vc = isOuterHitOfCell[idy];
auto s = vc.size();
if (s<2) return;
// if aligned, kill one of the two.
Expand All @@ -66,8 +65,8 @@ namespace gpuPixelDoublets {
++sg;
}
if (sg<2) return;
// here we parallelize
for (uint32_t ic=first; ic<sg-1; ic+=stride) {
// here we parallelize
for (uint32_t ic=first; ic<sg-1; ic+=blockDim.x) {
auto & ci = cells[cc[ic]];
for (auto jc=ic+1; jc<sg; ++jc) {
auto & cj = cells[cc[jc]];
Expand All @@ -90,4 +89,4 @@ namespace gpuPixelDoublets {

}

#endif
#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h
12 changes: 8 additions & 4 deletions RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,11 @@ namespace gpuPixelDoublets {
}
auto ntot = innerLayerCumulativeSize[nPairs-1];

auto idx = blockIdx.x * blockDim.x + threadIdx.x;
for (auto j = idx; j < ntot; j += blockDim.x * gridDim.x) {
// x runs faster
auto idy = blockIdx.y * blockDim.y + threadIdx.y;
auto first = threadIdx.x;
auto stride = blockDim.x;
for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y ) {

uint32_t pairLayerId=0;
while (j >= innerLayerCumulativeSize[pairLayerId++]);
Expand Down Expand Up @@ -115,7 +118,8 @@ namespace gpuPixelDoublets {
nmin += hist.size(kk+hoff);
auto const * __restrict__ p = hist.begin(kk+hoff);
auto const * __restrict__ e = hist.end(kk+hoff);
for (;p < e; ++p) {
p+=first;
for (;p < e; p+=stride) {
auto oi=__ldg(p);
assert(oi>=offsets[outer]);
assert(oi<offsets[outer+1]);
Expand All @@ -139,7 +143,7 @@ namespace gpuPixelDoublets {
} // loop in block...
}

constexpr auto getDoubletsFromHistoMaxBlockSize = 64;
constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y
constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16;

__global__
Expand Down

0 comments on commit 00e8cf4

Please sign in to comment.