Skip to content

Commit

Permalink
explicitly require 1D single block kernels to use Acc1D and have one block with asserts
Browse files Browse the repository at this point in the history
  • Loading branch information
slava77devel committed Aug 12, 2024
1 parent c6a2468 commit 43ce20e
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 74 deletions.
77 changes: 22 additions & 55 deletions RecoTracker/LSTCore/src/alpaka/Event.dev.cc
Original file line number Diff line number Diff line change
Expand Up @@ -255,13 +255,10 @@ void lst::Event<Acc3D>::addPixelSegmentToEvent(std::vector<unsigned int> const&

alpaka::memcpy(queue, dst_view_miniDoubletModuleOccupancy, pixelMaxMDs_buf_h);

Vec3D const threadsPerBlockCreateMD{1, 1, 1024};
Vec3D const blocksPerGridCreateMD{1, 1, 1};
WorkDiv3D const createMDArrayRangesGPU_workDiv =
createWorkDiv(blocksPerGridCreateMD, threadsPerBlockCreateMD, elementsPerThread);
WorkDiv1D const createMDArrayRangesGPU_workDiv = createWorkDiv<Vec1D>({1}, {1024}, {1});

lst::createMDArrayRangesGPU createMDArrayRangesGPU_kernel;
alpaka::exec<Acc3D>(
alpaka::exec<Acc1D>(
queue, createMDArrayRangesGPU_workDiv, createMDArrayRangesGPU_kernel, *modulesBuffers_.data(), *rangesInGPU);

auto nTotalMDs_buf_h = cms::alpakatools::make_host_buffer<unsigned int[]>(queue, (Idx)1u);
Expand All @@ -281,13 +278,10 @@ void lst::Event<Acc3D>::addPixelSegmentToEvent(std::vector<unsigned int> const&
// can be optimized here: because we didn't distinguish pixel segments and outer-tracker segments and call them both "segments", so they use the index continuously.
// If we want to further study the memory footprint in detail, we can separate the two and allocate different memories to them

Vec3D const threadsPerBlockCreateSeg{1, 1, 1024};
Vec3D const blocksPerGridCreateSeg{1, 1, 1};
WorkDiv3D const createSegmentArrayRanges_workDiv =
createWorkDiv(blocksPerGridCreateSeg, threadsPerBlockCreateSeg, elementsPerThread);
WorkDiv1D const createSegmentArrayRanges_workDiv = createWorkDiv<Vec1D>({1}, {1024}, {1});

lst::createSegmentArrayRanges createSegmentArrayRanges_kernel;
alpaka::exec<Acc3D>(queue,
alpaka::exec<Acc1D>(queue,
createSegmentArrayRanges_workDiv,
createSegmentArrayRanges_kernel,
*modulesBuffers_.data(),
Expand Down Expand Up @@ -388,13 +382,10 @@ void lst::Event<Acc3D>::createMiniDoublets() {

alpaka::memcpy(queue, dst_view_miniDoubletModuleOccupancy, pixelMaxMDs_buf_h);

Vec3D const threadsPerBlockCreateMD{1, 1, 1024};
Vec3D const blocksPerGridCreateMD{1, 1, 1};
WorkDiv3D const createMDArrayRangesGPU_workDiv =
createWorkDiv(blocksPerGridCreateMD, threadsPerBlockCreateMD, elementsPerThread);
WorkDiv1D const createMDArrayRangesGPU_workDiv = createWorkDiv<Vec1D>({1}, {1024}, {1});

lst::createMDArrayRangesGPU createMDArrayRangesGPU_kernel;
alpaka::exec<Acc3D>(
alpaka::exec<Acc1D>(
queue, createMDArrayRangesGPU_workDiv, createMDArrayRangesGPU_kernel, *modulesBuffers_.data(), *rangesInGPU);

auto nTotalMDs_buf_h = cms::alpakatools::make_host_buffer<unsigned int[]>(queue, (Idx)1u);
Expand Down Expand Up @@ -424,13 +415,10 @@ void lst::Event<Acc3D>::createMiniDoublets() {
*mdsInGPU,
*rangesInGPU);

Vec3D const threadsPerBlockAddMD{1, 1, 1024};
Vec3D const blocksPerGridAddMD{1, 1, 1};
WorkDiv3D const addMiniDoubletRangesToEventExplicit_workDiv =
createWorkDiv(blocksPerGridAddMD, threadsPerBlockAddMD, elementsPerThread);
WorkDiv1D const addMiniDoubletRangesToEventExplicit_workDiv = createWorkDiv<Vec1D>({1}, {1024}, {1});

lst::addMiniDoubletRangesToEventExplicit addMiniDoubletRangesToEventExplicit_kernel;
alpaka::exec<Acc3D>(queue,
alpaka::exec<Acc1D>(queue,
addMiniDoubletRangesToEventExplicit_workDiv,
addMiniDoubletRangesToEventExplicit_kernel,
*modulesBuffers_.data(),
Expand Down Expand Up @@ -465,13 +453,10 @@ void lst::Event<Acc3D>::createSegmentsWithModuleMap() {
*segmentsInGPU,
*rangesInGPU);

Vec3D const threadsPerBlockAddSeg{1, 1, 1024};
Vec3D const blocksPerGridAddSeg{1, 1, 1};
WorkDiv3D const addSegmentRangesToEventExplicit_workDiv =
createWorkDiv(blocksPerGridAddSeg, threadsPerBlockAddSeg, elementsPerThread);
WorkDiv1D const addSegmentRangesToEventExplicit_workDiv = createWorkDiv<Vec1D>({1}, {1024}, {1});

lst::addSegmentRangesToEventExplicit addSegmentRangesToEventExplicit_kernel;
alpaka::exec<Acc3D>(queue,
alpaka::exec<Acc1D>(queue,
addSegmentRangesToEventExplicit_workDiv,
addSegmentRangesToEventExplicit_kernel,
*modulesBuffers_.data(),
Expand All @@ -485,13 +470,10 @@ void lst::Event<Acc3D>::createSegmentsWithModuleMap() {

void lst::Event<Acc3D>::createTriplets() {
if (tripletsInGPU == nullptr) {
Vec3D const threadsPerBlockCreateTrip{1, 1, 1024};
Vec3D const blocksPerGridCreateTrip{1, 1, 1};
WorkDiv3D const createTripletArrayRanges_workDiv =
createWorkDiv(blocksPerGridCreateTrip, threadsPerBlockCreateTrip, elementsPerThread);
WorkDiv1D const createTripletArrayRanges_workDiv = createWorkDiv<Vec1D>({1}, {1024}, {1});

lst::createTripletArrayRanges createTripletArrayRanges_kernel;
alpaka::exec<Acc3D>(queue,
alpaka::exec<Acc1D>(queue,
createTripletArrayRanges_workDiv,
createTripletArrayRanges_kernel,
*modulesBuffers_.data(),
Expand Down Expand Up @@ -563,13 +545,10 @@ void lst::Event<Acc3D>::createTriplets() {
index_gpu_buf.data(),
nonZeroModules);

Vec3D const threadsPerBlockAddTrip{1, 1, 1024};
Vec3D const blocksPerGridAddTrip{1, 1, 1};
WorkDiv3D const addTripletRangesToEventExplicit_workDiv =
createWorkDiv(blocksPerGridAddTrip, threadsPerBlockAddTrip, elementsPerThread);
WorkDiv1D const addTripletRangesToEventExplicit_workDiv = createWorkDiv<Vec1D>({1}, {1024}, {1});

lst::addTripletRangesToEventExplicit addTripletRangesToEventExplicit_kernel;
alpaka::exec<Acc3D>(queue,
alpaka::exec<Acc1D>(queue,
addTripletRangesToEventExplicit_workDiv,
addTripletRangesToEventExplicit_kernel,
*modulesBuffers_.data(),
Expand Down Expand Up @@ -604,13 +583,10 @@ void lst::Event<Acc3D>::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_
*segmentsInGPU,
*pixelQuintupletsInGPU);

Vec3D const threadsPerBlock_addpT3asTrackCandidatesInGPU{1, 1, 512};
Vec3D const blocksPerGrid_addpT3asTrackCandidatesInGPU{1, 1, 1};
WorkDiv3D const addpT3asTrackCandidatesInGPU_workDiv = createWorkDiv(
blocksPerGrid_addpT3asTrackCandidatesInGPU, threadsPerBlock_addpT3asTrackCandidatesInGPU, elementsPerThread);
WorkDiv1D const addpT3asTrackCandidatesInGPU_workDiv = createWorkDiv<Vec1D>({1}, {512}, {1});

lst::addpT3asTrackCandidatesInGPU addpT3asTrackCandidatesInGPU_kernel;
alpaka::exec<Acc3D>(queue,
alpaka::exec<Acc1D>(queue,
addpT3asTrackCandidatesInGPU_workDiv,
addpT3asTrackCandidatesInGPU_kernel,
nLowerModules_,
Expand Down Expand Up @@ -849,13 +825,10 @@ void lst::Event<Acc3D>::createPixelTriplets() {
}

void lst::Event<Acc3D>::createQuintuplets() {
Vec3D const threadsPerBlockCreateQuints{1, 1, 1024};
Vec3D const blocksPerGridCreateQuints{1, 1, 1};
WorkDiv3D const createEligibleModulesListForQuintupletsGPU_workDiv =
createWorkDiv(blocksPerGridCreateQuints, threadsPerBlockCreateQuints, elementsPerThread);
WorkDiv1D const createEligibleModulesListForQuintupletsGPU_workDiv = createWorkDiv<Vec1D>({1}, {1024}, {1});

lst::createEligibleModulesListForQuintupletsGPU createEligibleModulesListForQuintupletsGPU_kernel;
alpaka::exec<Acc3D>(queue,
alpaka::exec<Acc1D>(queue,
createEligibleModulesListForQuintupletsGPU_workDiv,
createEligibleModulesListForQuintupletsGPU_kernel,
*modulesBuffers_.data(),
Expand Down Expand Up @@ -910,13 +883,10 @@ void lst::Event<Acc3D>::createQuintuplets() {
*quintupletsInGPU,
*rangesInGPU);

Vec3D const threadsPerBlockAddQuint{1, 1, 1024};
Vec3D const blocksPerGridAddQuint{1, 1, 1};
WorkDiv3D const addQuintupletRangesToEventExplicit_workDiv =
createWorkDiv(blocksPerGridAddQuint, threadsPerBlockAddQuint, elementsPerThread);
WorkDiv1D const addQuintupletRangesToEventExplicit_workDiv = createWorkDiv<Vec1D>({1}, {1024}, {1});

lst::addQuintupletRangesToEventExplicit addQuintupletRangesToEventExplicit_kernel;
alpaka::exec<Acc3D>(queue,
alpaka::exec<Acc1D>(queue,
addQuintupletRangesToEventExplicit_workDiv,
addQuintupletRangesToEventExplicit_kernel,
*modulesBuffers_.data(),
Expand Down Expand Up @@ -1044,13 +1014,10 @@ void lst::Event<Acc3D>::createPixelQuintuplets() {
removeDupPixelQuintupletsInGPUFromMap_kernel,
*pixelQuintupletsInGPU);

Vec3D const threadsPerBlockAddpT5asTrackCan{1, 1, 256};
Vec3D const blocksPerGridAddpT5asTrackCan{1, 1, 1};
WorkDiv3D const addpT5asTrackCandidateInGPU_workDiv =
createWorkDiv(blocksPerGridAddpT5asTrackCan, threadsPerBlockAddpT5asTrackCan, elementsPerThread);
WorkDiv1D const addpT5asTrackCandidateInGPU_workDiv = createWorkDiv<Vec1D>({1}, {256}, {1});

lst::addpT5asTrackCandidateInGPU addpT5asTrackCandidateInGPU_kernel;
alpaka::exec<Acc3D>(queue,
alpaka::exec<Acc1D>(queue,
addpT5asTrackCandidateInGPU_workDiv,
addpT5asTrackCandidateInGPU_kernel,
nLowerModules_,
Expand Down
14 changes: 11 additions & 3 deletions RecoTracker/LSTCore/src/alpaka/MiniDoublet.h
Original file line number Diff line number Diff line change
Expand Up @@ -968,6 +968,10 @@ namespace lst {
ALPAKA_FN_ACC void operator()(TAcc const& acc,
struct lst::Modules modulesInGPU,
struct lst::ObjectRanges rangesInGPU) const {
// implementation is 1D with a single block
static_assert(std::is_same_v<TAcc, Acc1D>, "Should be Acc1D");
ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0] == 1));

auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

Expand All @@ -978,10 +982,10 @@ namespace lst {
}
alpaka::syncBlockThreads(acc);

// Initialize variables outside of the for loop.
// Create variables outside of the for loop.
int occupancy, category_number, eta_number;

for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) {
for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) {
short module_rings = modulesInGPU.rings[i];
short module_layers = modulesInGPU.layers[i];
short module_subdets = modulesInGPU.subdets[i];
Expand Down Expand Up @@ -1062,10 +1066,14 @@ namespace lst {
struct lst::MiniDoublets mdsInGPU,
struct lst::ObjectRanges rangesInGPU,
struct lst::Hits hitsInGPU) const {
// implementation is 1D with a single block
static_assert(std::is_same_v<TAcc, Acc1D>, "Should be Acc1D");
ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0] == 1));

auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) {
for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) {
if (mdsInGPU.nMDs[i] == 0 or hitsInGPU.hitRanges[i * 2] == -1) {
rangesInGPU.mdRanges[i * 2] = -1;
rangesInGPU.mdRanges[i * 2 + 1] = -1;
Expand Down
16 changes: 12 additions & 4 deletions RecoTracker/LSTCore/src/alpaka/Quintuplet.h
Original file line number Diff line number Diff line change
Expand Up @@ -2669,6 +2669,10 @@ namespace lst {
lst::Modules modulesInGPU,
lst::Triplets tripletsInGPU,
lst::ObjectRanges rangesInGPU) const {
// implementation is 1D with a single block
static_assert(std::is_same_v<TAcc, Acc1D>, "Should be Acc1D");
ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0] == 1));

auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

Expand All @@ -2681,10 +2685,10 @@ namespace lst {
}
alpaka::syncBlockThreads(acc);

// Initialize variables outside of the for loop.
// Create variables outside of the for loop.
int occupancy, category_number, eta_number;

for (int i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) {
for (int i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) {
// Condition for a quintuple to exist for a module
// TCs don't exist for layers 5 and 6 barrel, and layers 2,3,4,5 endcap
short module_rings = modulesInGPU.rings[i];
Expand Down Expand Up @@ -2756,7 +2760,7 @@ namespace lst {

// Wait for all threads to finish before reporting final values
alpaka::syncBlockThreads(acc);
if (globalThreadIdx[2] == 0) {
if (cms::alpakatools::once_per_block(acc)) {
*rangesInGPU.nEligibleT5Modules = static_cast<uint16_t>(nEligibleT5Modulesx);
*rangesInGPU.device_nTotalQuints = static_cast<unsigned int>(nTotalQuintupletsx);
}
Expand All @@ -2769,10 +2773,14 @@ namespace lst {
lst::Modules modulesInGPU,
lst::Quintuplets quintupletsInGPU,
lst::ObjectRanges rangesInGPU) const {
// implementation is 1D with a single block
static_assert(std::is_same_v<TAcc, Acc1D>, "Should be Acc1D");
ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0] == 1));

auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) {
for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) {
if (quintupletsInGPU.nQuintuplets[i] == 0 or rangesInGPU.quintupletModuleIndices[i] == -1) {
rangesInGPU.quintupletRanges[i * 2] = -1;
rangesInGPU.quintupletRanges[i * 2 + 1] = -1;
Expand Down
16 changes: 12 additions & 4 deletions RecoTracker/LSTCore/src/alpaka/Segment.h
Original file line number Diff line number Diff line change
Expand Up @@ -801,6 +801,10 @@ namespace lst {
lst::Modules modulesInGPU,
lst::ObjectRanges rangesInGPU,
lst::MiniDoublets mdsInGPU) const {
// implementation is 1D with a single block
static_assert(std::is_same_v<TAcc, Acc1D>, "Should be Acc1D");
ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0] == 1));

auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

Expand All @@ -811,10 +815,10 @@ namespace lst {
}
alpaka::syncBlockThreads(acc);

// Initialize variables outside of the for loop.
// Create variables outside of the for loop.
int occupancy, category_number, eta_number;

for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) {
for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) {
if (modulesInGPU.nConnectedModules[i] == 0) {
rangesInGPU.segmentModuleIndices[i] = nTotalSegments;
rangesInGPU.segmentModuleOccupancy[i] = 0;
Expand Down Expand Up @@ -888,7 +892,7 @@ namespace lst {

// Wait for all threads to finish before reporting final values
alpaka::syncBlockThreads(acc);
if (globalThreadIdx[2] == 0) {
if (cms::alpakatools::once_per_block(acc)) {
rangesInGPU.segmentModuleIndices[*modulesInGPU.nLowerModules] = nTotalSegments;
*rangesInGPU.device_nTotalSegs = nTotalSegments;
}
Expand All @@ -901,10 +905,14 @@ namespace lst {
lst::Modules modulesInGPU,
lst::Segments segmentsInGPU,
lst::ObjectRanges rangesInGPU) const {
// implementation is 1D with a single block
static_assert(std::is_same_v<TAcc, Acc1D>, "Should be Acc1D");
ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0] == 1));

auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) {
for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) {
if (segmentsInGPU.nSegments[i] == 0) {
rangesInGPU.segmentRanges[i * 2] = -1;
rangesInGPU.segmentRanges[i * 2 + 1] = -1;
Expand Down
16 changes: 12 additions & 4 deletions RecoTracker/LSTCore/src/alpaka/TrackCandidate.h
Original file line number Diff line number Diff line change
Expand Up @@ -389,13 +389,17 @@ namespace lst {
lst::TrackCandidates trackCandidatesInGPU,
lst::Segments segmentsInGPU,
lst::ObjectRanges rangesInGPU) const {
// implementation is 1D with a single block
static_assert(std::is_same_v<TAcc, Acc1D>, "Should be Acc1D");
ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0] == 1));

auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

unsigned int nPixelTriplets = *pixelTripletsInGPU.nPixelTriplets;
unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[nLowerModules];
for (unsigned int pixelTripletIndex = globalThreadIdx[2]; pixelTripletIndex < nPixelTriplets;
pixelTripletIndex += gridThreadExtent[2]) {
for (unsigned int pixelTripletIndex = globalThreadIdx[0]; pixelTripletIndex < nPixelTriplets;
pixelTripletIndex += gridThreadExtent[0]) {
if ((pixelTripletsInGPU.isDup[pixelTripletIndex]))
continue;

Expand Down Expand Up @@ -534,13 +538,17 @@ namespace lst {
lst::TrackCandidates trackCandidatesInGPU,
lst::Segments segmentsInGPU,
lst::ObjectRanges rangesInGPU) const {
// implementation is 1D with a single block
static_assert(std::is_same_v<TAcc, Acc1D>, "Should be Acc1D");
ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0] == 1));

auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

int nPixelQuintuplets = *pixelQuintupletsInGPU.nPixelQuintuplets;
unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[nLowerModules];
for (int pixelQuintupletIndex = globalThreadIdx[2]; pixelQuintupletIndex < nPixelQuintuplets;
pixelQuintupletIndex += gridThreadExtent[2]) {
for (int pixelQuintupletIndex = globalThreadIdx[0]; pixelQuintupletIndex < nPixelQuintuplets;
pixelQuintupletIndex += gridThreadExtent[0]) {
if (pixelQuintupletsInGPU.isDup[pixelQuintupletIndex])
continue;

Expand Down
Loading

0 comments on commit 43ce20e

Please sign in to comment.