Skip to content

Commit

Permalink
Optimise gpuPixelDoublets::doubletsFromHisto() kernel (cms-sw#167)
Browse files Browse the repository at this point in the history
Pre-compute few constants that could not be declared constexpr.
Reduce temporary buffer size.
Reduce the block size of the calls to gpuPixelDoublets::getDoubletsFromHisto() from 256 to 64, to make better usage of the GPU processors.
  • Loading branch information
fwyzard authored Sep 13, 2018
1 parent ba5ec1d commit 8293c5e
Show file tree
Hide file tree
Showing 2 changed files with 155 additions and 141 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ kernel_checkOverflows(GPU::SimpleVector<Quadruplet> *foundNtuplets,
if (idx < (*nCells) ) {
auto &thisCell = cells[idx];
if (thisCell.theOuterNeighbors.full()) //++tooManyNeighbors[thisCell.theLayerPairId];
printf("OuterNeighbors overflow %d in %d\n",idx,thisCell.theLayerPairId);
printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId);
}
if (idx < nHits) {
if (isOuterHitOfCell[idx].full()) // ++tooManyOuterHitOfCell;
Expand All @@ -43,8 +43,8 @@ kernel_connect(GPU::SimpleVector<Quadruplet> *foundNtuplets,
const float phiCut, const float hardPtCut,
unsigned int maxNumberOfDoublets_, unsigned int maxNumberOfHits_) {

float region_origin_x =0.;
float region_origin_y =0.;
float region_origin_x = 0.;
float region_origin_y = 0.;

auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;

Expand Down Expand Up @@ -86,7 +86,7 @@ __global__ void kernel_find_ntuplets(

__global__ void
kernel_print_found_ntuplets(GPU::SimpleVector<Quadruplet> *foundNtuplets, int maxPrint) {
for (int i = 0; i < std::min(maxPrint,foundNtuplets->size()); ++i) {
for (int i = 0; i < std::min(maxPrint, foundNtuplets->size()); ++i) {
printf("\nquadruplet %d: %d %d %d %d\n", i,
(*foundNtuplets)[i].hitId[0],
(*foundNtuplets)[i].hitId[1],
Expand Down Expand Up @@ -177,15 +177,15 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion &region,
cudaCheck(cudaGetLastError());


numberOfBlocks = (std::max(int(nhits),maxNumberOfDoublets_) + 512 - 1)/512;
numberOfBlocks = (std::max(int(nhits), maxNumberOfDoublets_) + 512 - 1)/512;
kernel_checkOverflows<<<numberOfBlocks, 512, 0, cudaStream>>>(
d_foundNtupletsVec_[regionIndex],
device_theCells_, device_nCells_,
device_isOuterHitOfCell_, nhits
);


// kernel_print_found_ntuplets<<<1,1,0, cudaStream>>>(d_foundNtupletsVec_[regionIndex],10);
// kernel_print_found_ntuplets<<<1, 1, 0, cudaStream>>>(d_foundNtupletsVec_[regionIndex], 10);

if(transferToCPU) {
cudaCheck(cudaMemcpyAsync(h_foundNtupletsVec_[regionIndex], d_foundNtupletsVec_[regionIndex],
Expand All @@ -203,7 +203,7 @@ void CAHitQuadrupletGeneratorGPU::cleanup(cudaStream_t cudaStream) {
cudaCheck(cudaMemsetAsync(device_isOuterHitOfCell_, 0,
maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(GPU::VecArray<unsigned int, maxCellsPerHit_>),
cudaStream));
cudaCheck(cudaMemsetAsync(device_nCells_,0,sizeof(uint32_t),cudaStream));
cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), cudaStream));
}

std::vector<std::array<int, 4>>
Expand All @@ -220,10 +220,10 @@ CAHitQuadrupletGeneratorGPU::fetchKernelResult(int regionIndex)
}

void CAHitQuadrupletGeneratorGPU::buildDoublets(HitsOnCPU const & hh, cudaStream_t stream) {
auto nhits = hh.nHits;
auto nhits = hh.nHits;

int threadsPerBlock = 256;
int blocks = (3*nhits + threadsPerBlock - 1) / threadsPerBlock;
gpuPixelDoublets::getDoubletsFromHisto<<<blocks, threadsPerBlock, 0, stream>>>(device_theCells_,device_nCells_,hh.gpu_d, device_isOuterHitOfCell_);
int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize;
int blocks = (3 * nhits + threadsPerBlock - 1) / threadsPerBlock;
gpuPixelDoublets::getDoubletsFromHisto<<<blocks, threadsPerBlock, 0, stream>>>(device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_);
cudaCheck(cudaGetLastError());
}
274 changes: 144 additions & 130 deletions RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,157 +20,171 @@ namespace gpuPixelDoublets {

template<typename Hist>
__device__
void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, uint32_t nPairs, GPUCACell * cells, uint32_t * nCells,
int16_t const * __restrict__ iphi, Hist const * __restrict__ hist, uint32_t const * __restrict__ offsets,
void doubletsFromHisto(uint8_t const * __restrict__ layerPairs,
uint32_t nPairs,
GPUCACell * cells,
uint32_t * nCells,
int16_t const * __restrict__ iphi,
Hist const * __restrict__ hist,
uint32_t const * __restrict__ offsets,
siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh,
GPU::VecArray< unsigned int, 256> * isOuterHitOfCell,
int16_t const * phicuts, float const * minz, float const * maxz, float const * maxr) {

GPU::VecArray< unsigned int, 256> * isOuterHitOfCell,
int16_t const * __restrict__ phicuts,
float const * __restrict__ minz,
float const * __restrict__ maxz,
float const * __restrict__ maxr)
{
auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; };

// to be optimized later
uint32_t innerLayerCumulativeSize[64];
assert(nPairs<=64);
// nPairsMax to be optimized later (originally was 64).
// If it should be much bigger, consider using a block-wide parallel prefix scan,
// e.g. see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html
const int nPairsMax = 16;
assert(nPairs <= nPairsMax);
uint32_t innerLayerCumulativeSize[nPairsMax];
innerLayerCumulativeSize[0] = layerSize(layerPairs[0]);
for (uint32_t i=1; i<nPairs; ++i) {
innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i-1] + layerSize(layerPairs[2*i]);
for (uint32_t i = 1; i < nPairs; ++i) {
innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i-1] + layerSize(layerPairs[2*i]);
}

auto ntot = innerLayerCumulativeSize[nPairs-1];


auto idx = blockIdx.x*blockDim.x + threadIdx.x;
for(auto j=idx;j<ntot;j+=blockDim.x*gridDim.x) {

uint32_t pairLayerId=0;
while(j>=innerLayerCumulativeSize[pairLayerId++]); --pairLayerId; // move to lower_bound ??

assert(pairLayerId<nPairs);
assert(j<innerLayerCumulativeSize[pairLayerId]);
assert(0==pairLayerId || j>=innerLayerCumulativeSize[pairLayerId-1]);

uint8_t inner = layerPairs[2*pairLayerId];
uint8_t outer = layerPairs[2*pairLayerId+1];
assert(outer>inner);

auto i = (0==pairLayerId) ? j : j-innerLayerCumulativeSize[pairLayerId-1];
i += offsets[inner];

// printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j);

assert(i>=offsets[inner]);
assert(i<offsets[inner+1]);

// found hit corresponding to our cuda thread!!!!!
// do the job

auto mep = iphi[i];
auto mez = hh.zg_d[i];
auto mer = hh.rg_d[i];
auto cutoff = [&](int j) { return
abs(hh.zg_d[j]-mez) > maxz[pairLayerId] ||
abs(hh.zg_d[j]-mez) < minz[pairLayerId] ||
hh.rg_d[j]-mer > maxr[pairLayerId];
};

constexpr float z0cut = 12.f;
constexpr float hardPtCut = 0.5f;
constexpr float minRadius = hardPtCut * 87.f;
constexpr float minRadius2T4 = 4.f*minRadius*minRadius;
auto ptcut = [&](int j) {
auto r2t4 = minRadius2T4;
auto ri = mer;
auto ro = hh.rg_d[j];
auto dphi = short2phi( min( abs(int16_t(mep-iphi[j])),abs(int16_t(iphi[j]-mep)) ) );
return dphi*dphi*(r2t4 -ri*ro) > (ro-ri)*(ro-ri);
};
auto z0cutoff = [&](int j) {
auto zo = hh.zg_d[j];
auto ro = hh.rg_d[j];
auto dr = ro-mer;
return dr > maxr[pairLayerId] ||
dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr;
};

auto iphicut = phicuts[pairLayerId];

auto kl = hist[outer].bin(int16_t(mep-iphicut));
auto kh = hist[outer].bin(int16_t(mep+iphicut));
auto incr = [](auto & k) { return k = (k+1)%Hist::nbins();};
int tot = 0;
int nmin = 0;
auto khh = kh;
incr(khh);

int tooMany=0;
for (auto kk=kl; kk!=khh; incr(kk)) {
if (kk!=kl && kk!=kh) nmin+=hist[outer].size(kk);
for (auto p=hist[outer].begin(kk); p<hist[outer].end(kk); ++p) {
auto oi=*p;
assert(oi>=offsets[outer]);
assert(oi<offsets[outer+1]);

if (std::min(std::abs(int16_t(iphi[oi]-mep)), std::abs(int16_t(mep-iphi[oi]))) > iphicut)
continue;
if (z0cutoff(oi) || ptcut(oi)) continue;
auto ind = atomicInc(nCells,MaxNumOfDoublets);
// int layerPairId, int doubletId, int innerHitId,int outerHitId)
cells[ind].init(hh,pairLayerId,ind,i,oi);
isOuterHitOfCell[oi].push_back(ind);
if (isOuterHitOfCell[oi].full()) ++tooMany;
++tot;
auto idx = blockIdx.x * blockDim.x + threadIdx.x;
for (auto j = idx; j < ntot; j += blockDim.x * gridDim.x) {

uint32_t pairLayerId=0;
while (j >= innerLayerCumulativeSize[pairLayerId++]);
--pairLayerId; // move to lower_bound ??

assert(pairLayerId < nPairs);
assert(j < innerLayerCumulativeSize[pairLayerId]);
assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId-1]);

uint8_t inner = layerPairs[2*pairLayerId];
uint8_t outer = layerPairs[2*pairLayerId+1];
assert(outer > inner);

auto i = (0 == pairLayerId) ? j : j-innerLayerCumulativeSize[pairLayerId-1];
i += offsets[inner];

// printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j);

assert(i >= offsets[inner]);
assert(i < offsets[inner+1]);

// found hit corresponding to our cuda thread, now do the job
auto mep = iphi[i];
auto mez = hh.zg_d[i];
auto mer = hh.rg_d[i];

constexpr float z0cut = 12.f; // cm
constexpr float hardPtCut = 0.5f; // GeV
constexpr float minRadius = hardPtCut * 87.78f; // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field)
constexpr float minRadius2T4 = 4.f*minRadius*minRadius;
auto ptcut = [&](int j) {
auto r2t4 = minRadius2T4;
auto ri = mer;
auto ro = hh.rg_d[j];
auto dphi = short2phi( min( abs(int16_t(mep-iphi[j])), abs(int16_t(iphi[j]-mep)) ) );
return dphi*dphi * (r2t4 - ri*ro) > (ro-ri)*(ro-ri);
};
auto z0cutoff = [&](int j) {
auto zo = hh.zg_d[j];
auto ro = hh.rg_d[j];
auto dr = ro-mer;
return dr > maxr[pairLayerId] ||
dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr;
};

auto iphicut = phicuts[pairLayerId];

auto kl = hist[outer].bin(int16_t(mep-iphicut));
auto kh = hist[outer].bin(int16_t(mep+iphicut));
auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();};
int tot = 0;
int nmin = 0;
auto khh = kh;
incr(khh);

int tooMany=0;
for (auto kk = kl; kk != khh; incr(kk)) {
if (kk != kl && kk != kh)
nmin += hist[outer].size(kk);
for (auto p = hist[outer].begin(kk); p < hist[outer].end(kk); ++p) {
auto oi=*p;
assert(oi>=offsets[outer]);
assert(oi<offsets[outer+1]);

if (std::min(std::abs(int16_t(iphi[oi]-mep)), std::abs(int16_t(mep-iphi[oi]))) > iphicut)
continue;
if (z0cutoff(oi) || ptcut(oi)) continue;
auto ind = atomicInc(nCells, MaxNumOfDoublets);
// int layerPairId, int doubletId, int innerHitId, int outerHitId)
cells[ind].init(hh, pairLayerId, ind, i, oi);
isOuterHitOfCell[oi].push_back(ind);
if (isOuterHitOfCell[oi].full()) ++tooMany;
++tot;
}
}
}
if (tooMany>0) printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner,outer, kl,kh,nmin,tot);
if (tooMany > 0)
printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner, outer, kl, kh, nmin, tot);

if (hist[outer].nspills>0)
printf("spill bin to be checked in %d %d\n",outer,hist[outer].nspills);
if (hist[outer].nspills > 0)
printf("spill bin to be checked in %d %d\n", outer, hist[outer].nspills);

// if (0==hist[outer].nspills) assert(tot>=nmin);
// look in spill bin as well....
// if (0==hist[outer].nspills) assert(tot>=nmin);
// look in spill bin as well....


} // loop in block...
} // loop in block...
}

__global__
void getDoubletsFromHisto(GPUCACell * cells, uint32_t * nCells, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp,
GPU::VecArray< unsigned int, 256> *isOuterHitOfCell) {

uint8_t const layerPairs[2*13] = {0,1 ,1,2 ,2,3
// ,0,4 ,1,4 ,2,4 ,4,5 ,5,6
,0,7 ,1,7 ,2,7 ,7,8 ,8,9
,0,4 ,1,4 ,2,4 ,4,5 ,5,6
};
constexpr auto getDoubletsFromHistoMaxBlockSize = 64;

const int16_t phi0p05 = phi2short(0.05);
const int16_t phi0p06 = phi2short(0.06);
const int16_t phi0p07 = phi2short(0.07);
__global__
__launch_bounds__(getDoubletsFromHistoMaxBlockSize)
void getDoubletsFromHisto(GPUCACell * cells,
uint32_t * nCells,
siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp,
GPU::VecArray<unsigned int, 256> * isOuterHitOfCell)
{
constexpr int nPairs = 13;
constexpr const uint8_t layerPairs[2*nPairs] = {
0, 1, 1, 2, 2, 3,
// 0, 4, 1, 4, 2, 4, 4, 5, 5, 6,
0, 7, 1, 7, 2, 7, 7, 8, 8, 9,
0, 4, 1, 4, 2, 4, 4, 5, 5, 6
};

int16_t const phicuts[13] { phi0p05, phi0p05, phi0p06
,phi0p07, phi0p06, phi0p06, phi0p05, phi0p05
,phi0p07, phi0p06, phi0p06, phi0p05, phi0p05
};
constexpr int16_t phi0p05 = 522; // round(521.52189...) = phi2short(0.05);
constexpr int16_t phi0p06 = 626; // round(625.82270...) = phi2short(0.06);
constexpr int16_t phi0p07 = 730; // round(730.12648...) = phi2short(0.07);

float const minz[13] = { 0., 0., 0.
,0., 0., 0., 0., 0.
,0., 0., 0., 0., 0.
};
constexpr const int16_t phicuts[nPairs] {
phi0p05, phi0p05, phi0p06,
phi0p07, phi0p06, phi0p06, phi0p05, phi0p05,
phi0p07, phi0p06, phi0p06, phi0p05, phi0p05
};

float const maxz[13] = { 20.,15.,12.
,30.,20.,20., 50., 50.
,30.,20.,20., 50., 50.
};
float const minz[nPairs] = {
0., 0., 0.,
0., 0., 0., 0., 0.,
0., 0., 0., 0., 0.
};

float const maxr[13] = { 20., 20., 20.
,9., 7., 6., 5., 5.
,9., 7., 6., 5., 5.
};
float const maxz[nPairs] = {
20., 15., 12.,
30., 20., 20., 50., 50.,
30., 20., 20., 50., 50.
};

float const maxr[nPairs] = {
20., 20., 20.,
9., 7., 6., 5., 5.,
9., 7., 6., 5., 5.
};

auto const & __restrict__ hh = *hhp;
doubletsFromHisto(layerPairs, 13, cells, nCells,
hh.iphi_d,hh.hist_d,hh.hitsLayerStart_d,
doubletsFromHisto(layerPairs, nPairs, cells, nCells,
hh.iphi_d, hh.hist_d, hh.hitsLayerStart_d,
hh, isOuterHitOfCell,
phicuts, minz, maxz, maxr);
}
Expand Down

0 comments on commit 8293c5e

Please sign in to comment.