Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up the doublet finder #242

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -144,17 +144,19 @@ kernel_connect(AtomicPairCounter * apc1, AtomicPairCounter * apc2, // just to z
constexpr auto hardCurvCut = 1.f/(0.35f * 87.f); // FIXME VI tune
constexpr auto ptmin = 0.9f; // FIXME original "tune"

auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
auto cellIndex = threadIdx.y + blockIdx.y * blockDim.y;
auto first = threadIdx.x;
auto stride = blockDim.x;

if (0==cellIndex) { (*apc1)=0; (*apc2)=0; }// ready for next kernel
if (0==(cellIndex+first)) { (*apc1)=0; (*apc2)=0; }// ready for next kernel

if (cellIndex >= (*nCells) ) return;
auto const & thisCell = cells[cellIndex];
if (thisCell.theDoubletId<0) return;
auto innerHitId = thisCell.get_inner_hit_id();
auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size();
auto vi = isOuterHitOfCell[innerHitId].data();
for (auto j = 0; j < numberOfPossibleNeighbors; ++j) {
for (auto j = first; j < numberOfPossibleNeighbors; j+=stride) {
auto otherCell = __ldg(vi+j);
if (cells[otherCell].theDoubletId<0) continue;
if (thisCell.check_alignment(hh,
Expand All @@ -172,6 +174,8 @@ void kernel_find_ntuplets(
unsigned int minHitsPerNtuplet)
{

// recursive: not obvious to widen

auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
if (cellIndex >= (*nCells) ) return;
auto &thisCell = cells[cellIndex];
Expand Down Expand Up @@ -246,23 +250,27 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms....
assert(nhits <= PixelGPUConstants::maxNumberOfHits);

if (earlyFishbone_) {
auto blockSize = 128;
auto blockSize = 64;
auto stride = 4;
auto numberOfBlocks = (nhits + blockSize - 1)/blockSize;
numberOfBlocks *=stride;

fishbone<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
dim3 blks(1,numberOfBlocks,1);
dim3 thrs(stride,blockSize,1);
fishbone<<<blks,thrs, 0, cudaStream>>>(
hh.gpu_d,
device_theCells_, device_nCells_,
device_isOuterHitOfCell_,
nhits, stride, false
nhits, false
);
cudaCheck(cudaGetLastError());
}

auto blockSize = 64;
auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1)/blockSize;
kernel_connect<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
auto stride = 4;
dim3 blks(1,numberOfBlocks,1);
dim3 thrs(stride,blockSize,1);

kernel_connect<<<blks, thrs, 0, cudaStream>>>(
gpu_.apc_d, device_hitToTuple_apc_, // needed only to be reset, ready for next kernel
hh.gpu_d,
device_theCells_, device_nCells_,
Expand All @@ -282,14 +290,16 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms....
cudautils::finalizeBulk<<<numberOfBlocks, blockSize, 0, cudaStream>>>(gpu_.apc_d,gpu_.tuples_d);

if (lateFishbone_) {
auto stride=4;
numberOfBlocks = (nhits + blockSize - 1)/blockSize;
numberOfBlocks *=stride;
fishbone<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
auto blockSize = 64;
auto stride = 4;
auto numberOfBlocks = (nhits + blockSize - 1)/blockSize;
dim3 blks(1,numberOfBlocks,1);
dim3 thrs(stride,blockSize,1);
fishbone<<<blks,thrs, 0, cudaStream>>>(
hh.gpu_d,
device_theCells_, device_nCells_,
device_isOuterHitOfCell_,
nhits, stride, true
nhits, true
);
cudaCheck(cudaGetLastError());
}
Expand All @@ -312,9 +322,13 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms....
void CAHitQuadrupletGeneratorKernels::buildDoublets(HitsOnCPU const & hh, cudaStream_t stream) {
auto nhits = hh.nHits;

int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize;
int stride=4;
int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize/stride;
int blocks = (3 * nhits + threadsPerBlock - 1) / threadsPerBlock;
gpuPixelDoublets::getDoubletsFromHisto<<<blocks, threadsPerBlock, 0, stream>>>(device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_);
dim3 blks(1,blocks,1);
dim3 thrs(stride,threadsPerBlock,1);
gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream>>>(
device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_);
cudaCheck(cudaGetLastError());
}

Expand All @@ -330,4 +344,3 @@ void CAHitQuadrupletGeneratorKernels::classifyTuples(HitsOnCPU const & hh, Tuple
kernel_fastDuplicateRemover<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_theCells_, device_nCells_,tuples.tuples_d,tuples.helix_fit_results_d, tuples.quality_d);

}

19 changes: 9 additions & 10 deletions RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ namespace gpuPixelDoublets {
GPUCACell * cells, uint32_t const * __restrict__ nCells,
GPUCACell::OuterHitOfCell const * __restrict__ isOuterHitOfCell,
uint32_t nHits,
uint32_t stride, bool checkTrack) {
bool checkTrack) {

constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit;

Expand All @@ -35,13 +35,12 @@ namespace gpuPixelDoublets {
uint8_t const * __restrict__ layerp = hh.phase1TopologyLayer_d;
auto layer = [&](uint16_t id) { return __ldg(layerp+id/phase1PixelTopology::maxModuleStride);};

auto ldx = threadIdx.x + blockIdx.x * blockDim.x;
auto idx = ldx/stride;
auto first = ldx - idx*stride;
assert(first<stride);
// x run faster...
auto idy = threadIdx.y + blockIdx.y * blockDim.y;
auto first = threadIdx.x;

if (idx>=nHits) return;
auto const & vc = isOuterHitOfCell[idx];
if (idy>=nHits) return;
auto const & vc = isOuterHitOfCell[idy];
auto s = vc.size();
if (s<2) return;
// if aligned, kill one of the two.
Expand All @@ -66,8 +65,8 @@ namespace gpuPixelDoublets {
++sg;
}
if (sg<2) return;
// here we parallelize
for (uint32_t ic=first; ic<sg-1; ic+=stride) {
// here we parallelize
for (uint32_t ic=first; ic<sg-1; ic+=blockDim.x) {
auto & ci = cells[cc[ic]];
for (auto jc=ic+1; jc<sg; ++jc) {
auto & cj = cells[cc[jc]];
Expand All @@ -90,4 +89,4 @@ namespace gpuPixelDoublets {

}

#endif
#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h
12 changes: 8 additions & 4 deletions RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,11 @@ namespace gpuPixelDoublets {
}
auto ntot = innerLayerCumulativeSize[nPairs-1];

auto idx = blockIdx.x * blockDim.x + threadIdx.x;
for (auto j = idx; j < ntot; j += blockDim.x * gridDim.x) {
// x runs faster
auto idy = blockIdx.y * blockDim.y + threadIdx.y;
auto first = threadIdx.x;
auto stride = blockDim.x;
for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y ) {

uint32_t pairLayerId=0;
while (j >= innerLayerCumulativeSize[pairLayerId++]);
Expand Down Expand Up @@ -115,7 +118,8 @@ namespace gpuPixelDoublets {
nmin += hist.size(kk+hoff);
auto const * __restrict__ p = hist.begin(kk+hoff);
auto const * __restrict__ e = hist.end(kk+hoff);
for (;p < e; ++p) {
p+=first;
for (;p < e; p+=stride) {
auto oi=__ldg(p);
assert(oi>=offsets[outer]);
assert(oi<offsets[outer+1]);
Expand All @@ -139,7 +143,7 @@ namespace gpuPixelDoublets {
} // loop in block...
}

constexpr auto getDoubletsFromHistoMaxBlockSize = 64;
constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y
constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16;

__global__
Expand Down