Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adjustments in AlpakaCore/prefixScan and its test + Add helper functions to handle workdiv #167

Merged
merged 23 commits into from
Mar 30, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
902ee9e
prefixScan: Keep same workload between CUDA and ALPAKA tests, to be a…
ghugo83 Feb 1, 2021
6c09567
Remove unsupported #pragma unroll. Remove unused variables. Cannot us…
ghugo83 Feb 1, 2021
0038c02
Only calculate warpPrefixScan workDiv for GPU case
ghugo83 Feb 1, 2021
bea0450
Cleaner to directly call with cms::alpakatools::
ghugo83 Feb 1, 2021
4a4ca88
testPrefixScan: take into account dependency on number of elements in…
ghugo83 Feb 1, 2021
06d0c82
IMPORTANT: Should not have alpaka::wait::wait(queue); in between kern…
ghugo83 Feb 1, 2021
805756b
Comments
ghugo83 Feb 1, 2021
99cb983
clang-format
ghugo83 Feb 1, 2021
9dc2943
[alpaka] Add CMS_UNROLL_LOOP macro written by fwyzard from https://gi…
ghugo83 Feb 3, 2021
d6d8c0a
Add helper functions to directly handle element indices in non-stride…
ghugo83 Feb 12, 2021
8fb9967
Fixed issues in helper
ghugo83 Feb 12, 2021
d1f739b
Also add possibility to offset the indices in the helper functions + …
ghugo83 Feb 12, 2021
72b7c47
[alpaka] All call sites are now using the helper functions
ghugo83 Feb 12, 2021
a1f88bc
[alpakatest] Call sites are now using the 1D helper functions. No add…
ghugo83 Feb 15, 2021
3a9481b
Simplify helper functions
ghugo83 Feb 15, 2021
2f2b54b
Added comments in helpers
ghugo83 Feb 15, 2021
9d0b0d5
Latest fix in helper + Propagate latest helper functions to all alpak…
ghugo83 Feb 15, 2021
1c98947
clang-format
ghugo83 Feb 15, 2021
48d4d15
Pass function by value in helper functions
ghugo83 Mar 26, 2021
36f8c3d
Pass lambdas by && in helper functions
ghugo83 Mar 29, 2021
b4c3682
Inlining these helper functions leads to better perf
ghugo83 Mar 29, 2021
7c64537
Revert "Inlining these helper functions leads to better perf"
ghugo83 Mar 29, 2021
89d69fe
Revert "Pass lambdas by && in helper functions"
ghugo83 Mar 29, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 19 additions & 48 deletions src/alpaka/AlpakaCore/HistoContainer.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,21 +23,14 @@ namespace cms {
T const *__restrict__ v,
uint32_t const *__restrict__ offsets) const {
const uint32_t nt = offsets[nh];
const uint32_t gridDimension(alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u]);
const auto &[firstElementIdxNoStride, endElementIdxNoStride] =
cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(nt));
for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u];
threadIdx < nt;
threadIdx += gridDimension, endElementIdx += gridDimension) {
for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) {
auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i);
assert((*off) > 0);
int32_t ih = off - offsets - 1;
assert(ih >= 0);
assert(ih < int(nh));
h->count(acc, v[i], ih);
}
}
cms::alpakatools::for_each_element_1D_grid_stride(acc, nt, [&](uint32_t i) {
auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i);
assert((*off) > 0);
int32_t ih = off - offsets - 1;
assert(ih >= 0);
assert(ih < int(nh));
h->count(acc, v[i], ih);
});
}
};

Expand All @@ -49,35 +42,23 @@ namespace cms {
T const *__restrict__ v,
uint32_t const *__restrict__ offsets) const {
const uint32_t nt = offsets[nh];
const uint32_t gridDimension(alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u]);
const auto &[firstElementIdxNoStride, endElementIdxNoStride] =
cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(nt));

for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u];
threadIdx < nt;
threadIdx += gridDimension, endElementIdx += gridDimension) {
for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) {
auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i);
assert((*off) > 0);
int32_t ih = off - offsets - 1;
assert(ih >= 0);
assert(ih < int(nh));
h->fill(acc, v[i], i, ih);
}
}
cms::alpakatools::for_each_element_1D_grid_stride(acc, nt, [&](uint32_t i) {
auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i);
assert((*off) > 0);
int32_t ih = off - offsets - 1;
assert(ih >= 0);
assert(ih < int(nh));
h->fill(acc, v[i], i, ih);
});
}
};

struct launchZero {
template <typename T_Acc, typename Histo>
ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void operator()(const T_Acc &acc,
Histo *__restrict__ h) const {
const auto &[firstElementIdxGlobal, endElementIdxGlobal] =
cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(Histo::totbins()));

for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) {
h->off[i] = 0;
}
cms::alpakatools::for_each_element_in_thread_1D_index_in_grid(
acc, Histo::totbins(), [&](uint32_t i) { h->off[i] = 0; });
}
};

Expand Down Expand Up @@ -273,17 +254,7 @@ namespace cms {
return;
}

const uint32_t gridDimension(alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u]);
const auto &[firstElementIdxNoStride, endElementIdxNoStride] =
cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(totbins()));

for (uint32_t threadIdx = m + firstElementIdxNoStride[0u], endElementIdx = m + endElementIdxNoStride[0u];
threadIdx < totbins();
threadIdx += gridDimension, endElementIdx += gridDimension) {
for (uint32_t i = threadIdx; i < std::min(endElementIdx, totbins()); ++i) {
off[i] = n;
}
}
cms::alpakatools::for_each_element_1D_grid_stride(acc, totbins(), m, [&](uint32_t i) { off[i] = n; });
}

template <typename T_Acc>
Expand Down
252 changes: 235 additions & 17 deletions src/alpaka/AlpakaCore/alpakaWorkDivHelper.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,50 +29,268 @@ namespace cms {
}

/*
* Computes the range of the element(s) global index(es) in grid.
* 1D helper to only access 1 element per block
* (should obviously only be needed for debug / printout).
*/
template <typename T_Acc>
ALPAKA_FN_ACC bool once_per_block_1D(const T_Acc& acc, uint32_t i) {
  // True only for the first element of each block, i.e. when the index is a multiple of the block size.
  const uint32_t elementsPerBlock(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
  return (i % elementsPerBlock == 0);
}

/*
* Computes the range of the elements indexes, local to the block.
* Warning: the max index is not truncated by the max number of elements of interest.
*/
template <typename T_Acc, typename T_Dim = alpaka::dim::Dim<T_Acc>>
ALPAKA_FN_ACC std::pair<Vec<T_Dim>, Vec<T_Dim>> element_global_index_range(const T_Acc& acc) {
Vec<T_Dim> firstElementIdxGlobalVec = Vec<T_Dim>::zeros();
Vec<T_Dim> endElementIdxUncutGlobalVec = Vec<T_Dim>::zeros();
ALPAKA_FN_ACC std::pair<Vec<T_Dim>, Vec<T_Dim>> element_index_range_in_block(const T_Acc& acc,
const Vec<T_Dim>& elementIdxShift) {
Vec<T_Dim> firstElementIdxVec = Vec<T_Dim>::zeros();
Vec<T_Dim> endElementIdxUncutVec = Vec<T_Dim>::zeros();

// Loop on all grid dimensions.
for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) {
// Global thread index in grid (along dimension dimIndex).
const uint32_t threadIdxGlobal(alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[dimIndex]);
// Take into account the thread index in block.
const uint32_t threadIdxLocal(alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc)[dimIndex]);
const uint32_t threadDimension(alpaka::workdiv::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[dimIndex]);

// Global element index in grid (along dimension dimIndex).
// Compute the elements indexes in block.
// Obviously relevant for CPU only.
// For GPU, threadDimension = 1, and firstElementIdxGlobal = endElementIdxGlobal = threadIndexGlobal.
const uint32_t firstElementIdxGlobal = threadIdxGlobal * threadDimension;
const uint32_t endElementIdxUncutGlobal = firstElementIdxGlobal + threadDimension;
// For GPU, threadDimension = 1, and elementIdx = firstElementIdx = threadIdx + elementIdxShift.
const uint32_t firstElementIdxLocal = threadIdxLocal * threadDimension;
const uint32_t firstElementIdx = firstElementIdxLocal + elementIdxShift[dimIndex]; // Add the shift!
const uint32_t endElementIdxUncut = firstElementIdx + threadDimension;

firstElementIdxGlobalVec[dimIndex] = firstElementIdxGlobal;
endElementIdxUncutGlobalVec[dimIndex] = endElementIdxUncutGlobal;
firstElementIdxVec[dimIndex] = firstElementIdx;
endElementIdxUncutVec[dimIndex] = endElementIdxUncut;
}

return {firstElementIdxGlobalVec, endElementIdxUncutGlobalVec};
// Return element indexes, shifted by elementIdxShift.
return {firstElementIdxVec, endElementIdxUncutVec};
}

/*
* Computes the range of the element(s) global index(es) in grid.
* Computes the range of the elements indexes, local to the block.
* Truncated by the max number of elements of interest.
*/
template <typename T_Acc, typename T_Dim>
ALPAKA_FN_ACC std::pair<Vec<T_Dim>, Vec<T_Dim>> element_global_index_range_truncated(
const T_Acc& acc, const Vec<T_Dim>& maxNumberOfElements) {
ALPAKA_FN_ACC std::pair<Vec<T_Dim>, Vec<T_Dim>> element_index_range_in_block_truncated(
const T_Acc& acc, const Vec<T_Dim>& maxNumberOfElements, const Vec<T_Dim>& elementIdxShift) {
// Check dimension
static_assert(alpaka::dim::Dim<T_Acc>::value == T_Dim::value,
"Accelerator and maxNumberOfElements need to have same dimension.");
auto&& [firstElementIdxGlobalVec, endElementIdxGlobalVec] = element_global_index_range(acc);
auto&& [firstElementIdxLocalVec, endElementIdxLocalVec] = element_index_range_in_block(acc, elementIdxShift);

// Truncate
for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) {
endElementIdxLocalVec[dimIndex] = std::min(endElementIdxLocalVec[dimIndex], maxNumberOfElements[dimIndex]);
}

// Return element indexes, shifted by elementIdxShift, and truncated by maxNumberOfElements.
return {firstElementIdxLocalVec, endElementIdxLocalVec};
}

/*
 * Computes the range of the elements indexes in grid.
 * Warning: the max index is not truncated by the max number of elements of interest.
 */
template <typename T_Acc, typename T_Dim = alpaka::dim::Dim<T_Acc>>
ALPAKA_FN_ACC std::pair<Vec<T_Dim>, Vec<T_Dim>> element_index_range_in_grid(const T_Acc& acc,
                                                                            Vec<T_Dim>& elementIdxShift) {
  // Promote the shift from block-local to grid-global coordinates, dimension by dimension.
  for (typename T_Dim::value_type d(0); d < T_Dim::value; ++d) {
    // Index of this block in the grid, and number of elements per block (along dimension d).
    const uint32_t blockIdxInGrid(alpaka::idx::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[d]);
    const uint32_t blockDimension(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[d]);

    // Offset by the elements handled by the preceding blocks.
    elementIdxShift[d] += blockIdxInGrid * blockDimension;
  }

  // Delegate to the block-local computation, now carrying a grid-global shift.
  return element_index_range_in_block(acc, elementIdxShift);
}

/*
 * Computes the range of the elements indexes in grid.
 * Truncated by the max number of elements of interest.
 */
template <typename T_Acc, typename T_Dim>
ALPAKA_FN_ACC std::pair<Vec<T_Dim>, Vec<T_Dim>> element_index_range_in_grid_truncated(
    const T_Acc& acc, const Vec<T_Dim>& maxNumberOfElements, Vec<T_Dim>& elementIdxShift) {
  // The accelerator and the bound vector must agree on dimensionality.
  static_assert(alpaka::dim::Dim<T_Acc>::value == T_Dim::value,
                "Accelerator and maxNumberOfElements need to have same dimension.");

  auto&& [beginIdxVec, endIdxVec] = element_index_range_in_grid(acc, elementIdxShift);

  // Clamp the (uncut) end indexes to the number of elements of interest.
  for (typename T_Dim::value_type d(0); d < T_Dim::value; ++d) {
    endIdxVec[d] = std::min(endIdxVec[d], maxNumberOfElements[d]);
  }

  // Element indexes, shifted by elementIdxShift, and truncated by maxNumberOfElements.
  return {beginIdxVec, endIdxVec};
}

/*
 * Computes the range of the element(s) index(es) in grid.
 * Truncated by the max number of elements of interest.
 */
template <typename T_Acc, typename T_Dim>
ALPAKA_FN_ACC std::pair<Vec<T_Dim>, Vec<T_Dim>> element_index_range_in_grid_truncated(
    const T_Acc& acc, const Vec<T_Dim>& maxNumberOfElements) {
  // No extra offset: start from the grid origin.
  Vec<T_Dim> noShift = Vec<T_Dim>::zeros();
  return element_index_range_in_grid_truncated(acc, maxNumberOfElements, noShift);
}

/*********************************************
* 1D HELPERS, LOOP ON ALL CPU ELEMENTS
********************************************/

/*
* Loop on all (CPU) elements.
* Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift.
* Indexes are local to the BLOCK.
*/
template <typename T_Acc, typename Func>
ALPAKA_FN_ACC void for_each_element_in_thread_1D_index_in_block(const T_Acc& acc,
const uint32_t maxNumberOfElements,
const uint32_t elementIdxShift,
const Func func) {
Copy link
Collaborator

@makortel makortel Mar 27, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should be either const Func& (as it was) or Func&&. It being const prevents mutating functors (which I'd think to be fine), and taking it by value implies a copy (which is not needed).

Copy link
Contributor Author

@ghugo83 ghugo83 Mar 29, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm the copies are eluded anyway?
Don't know, in practice, passing the lambda by value clearly leads to better performance than passing by const &, or passing by && and then forwarding.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Passing the lambda by && and then forwarding, and inlining these functions, seems to be a good compromise, will go for that.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By quick look (that I should have done earlier), std and SYCL seem to take such functors by "non-const value", and Kokkos and Alpaka by "const reference". Just do double check, did you observe that passing the lambda by value gives better performance than reference?

If I've understood correctly, calling std::forward() many times on the same object is not strictly speaking safe (has the same issue as std::move()), even if it may work in practice in many cases.

Sorry for going back-and-forth, how about going back to the by-value (but without const), or keep the universal reference and remove the std::forward?

Copy link
Contributor Author

@ghugo83 ghugo83 Mar 29, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Re-looking better at this, I do not see clear diff in perf.

  • By value still makes the assumption that the function is light, which should be the case most of the time, but is not that nice.

  • Func&& without forwarding loses a bit of interest, as the parameter (even a temporary) would be bound to an lvalue, no?

Bah maybe this is too much hair-pulling

I think this should be either const Func& (as it was) or Func&&

I think I could revert to the initial implementation with const& ?

Also, as this does not change the helper functions interface anyway, maybe we could accept this, move on, and re-evaluate later when the full track reconstruction has been ported to have a better metric on the potential impact on perf?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • By value still makes the assumption that the function is light, which should be the case most of the time, but is not that nice.

I agree, although that is the precedent that std sets (not that we should always agree with std).

  • Func&& without forwarding loses a bit of interest, as the parameter (even a temporary) would be bound to an lvalue, no?

Within the function body the func of Func&& func would indeed be an lvalue. But the function argument itself would bind to anything without requiring the func to be const.

Bah maybe this is too much hair-pulling

I think this should be either const Func& (as it was) or Func&&

I think I could revert to the initial implementation with const& ?

Also, as this does not change the helper functions interface anyway, maybe we could accept this, move on, and re-evaluate later when the full track reconstruction has been ported to have a better metric on the potential impact on perf?

I agree, we should move on. I'm going to run tests for this and merge then. (although the initial implementation was/is const Func instead of const Func&)

const auto& [firstElementIdx, endElementIdx] = cms::alpakatools::element_index_range_in_block_truncated(
acc, Vec1::all(maxNumberOfElements), Vec1::all(elementIdxShift));

for (uint32_t elementIdx = firstElementIdx[0u]; elementIdx < endElementIdx[0u]; ++elementIdx) {
func(elementIdx);
}
}

/*
 * Overload for elementIdxShift = 0.
 * Takes the functor by const reference: taking it by value forces a copy at every
 * call and was flagged in review; const& preserves the call sites unchanged.
 */
template <typename T_Acc, typename Func>
ALPAKA_FN_ACC void for_each_element_in_thread_1D_index_in_block(const T_Acc& acc,
                                                                const uint32_t maxNumberOfElements,
                                                                const Func& func) {
  // Delegate with a zero shift: element indexes start at the beginning of the block.
  const uint32_t elementIdxShift = 0;
  cms::alpakatools::for_each_element_in_thread_1D_index_in_block(acc, maxNumberOfElements, elementIdxShift, func);
}

/*
 * Loop on all (CPU) elements.
 * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift.
 * Indexes are expressed in GRID 'frame-of-reference'.
 * Takes the functor by const reference (avoids the per-call copy of a by-value parameter).
 */
template <typename T_Acc, typename Func>
ALPAKA_FN_ACC void for_each_element_in_thread_1D_index_in_grid(const T_Acc& acc,
                                                               const uint32_t maxNumberOfElements,
                                                               uint32_t elementIdxShift,
                                                               const Func& func) {
  // Take into account the block index in grid to compute the element indices.
  const uint32_t blockIdxInGrid(alpaka::idx::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
  const uint32_t blockDimension(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
  elementIdxShift += blockIdxInGrid * blockDimension;

  for_each_element_in_thread_1D_index_in_block(acc, maxNumberOfElements, elementIdxShift, func);
}

/*
 * Overload for elementIdxShift = 0.
 * Takes the functor by const reference (avoids the per-call copy of a by-value parameter).
 */
template <typename T_Acc, typename Func>
ALPAKA_FN_ACC void for_each_element_in_thread_1D_index_in_grid(const T_Acc& acc,
                                                               const uint32_t maxNumberOfElements,
                                                               const Func& func) {
  // Delegate with a zero shift: element indexes start at the grid origin.
  const uint32_t elementIdxShift = 0;
  cms::alpakatools::for_each_element_in_thread_1D_index_in_grid(acc, maxNumberOfElements, elementIdxShift, func);
}

/******************************************************************************
* 1D HELPERS, LOOP ON ALL CPU ELEMENTS, AND ELEMENT/THREAD STRIDED ACCESS
******************************************************************************/

/*
 * (CPU) Loop on all elements + (CPU/GPU) Strided access.
 * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift.
 * Stride to full problem size, by BLOCK size.
 * Indexes are local to the BLOCK.
 * Takes the functor by const reference (avoids the per-call copy of a by-value parameter).
 */
template <typename T_Acc, typename Func>
ALPAKA_FN_ACC void for_each_element_1D_block_stride(const T_Acc& acc,
                                                    const uint32_t maxNumberOfElements,
                                                    const uint32_t elementIdxShift,
                                                    const Func& func) {
  // Get thread / element indices in block (the end index is NOT yet truncated).
  const auto& [firstElementIdxNoStride, endElementIdxNoStride] =
      cms::alpakatools::element_index_range_in_block(acc, Vec1::all(elementIdxShift));

  // Stride = block size.
  const uint32_t blockDimension(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);

  // Strided access: each pass advances the window by one full block.
  for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u];
       threadIdx < maxNumberOfElements;
       threadIdx += blockDimension, endElementIdx += blockDimension) {
    // (CPU) Loop on all elements of this thread, truncated to the problem size.
    for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) {
      func(i);
    }
  }
}

/*
 * Overload for elementIdxShift = 0.
 * Takes the functor by const reference (avoids the per-call copy of a by-value parameter).
 */
template <typename T_Acc, typename Func>
ALPAKA_FN_ACC void for_each_element_1D_block_stride(const T_Acc& acc,
                                                    const uint32_t maxNumberOfElements,
                                                    const Func& func) {
  // Delegate with a zero shift.
  const uint32_t elementIdxShift = 0;
  cms::alpakatools::for_each_element_1D_block_stride(acc, maxNumberOfElements, elementIdxShift, func);
}

/*
 * (CPU) Loop on all elements + (CPU/GPU) Strided access.
 * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift.
 * Stride to full problem size, by GRID size.
 * Indexes are local to the GRID.
 * Takes the functor by const reference (avoids the per-call copy of a by-value parameter).
 */
template <typename T_Acc, typename Func>
ALPAKA_FN_ACC void for_each_element_1D_grid_stride(const T_Acc& acc,
                                                   const uint32_t maxNumberOfElements,
                                                   const uint32_t elementIdxShift,
                                                   const Func& func) {
  Vec1 elementIdxShiftVec = Vec1::all(elementIdxShift);

  // Get thread / element indices in grid (the end index is NOT yet truncated).
  const auto& [firstElementIdxNoStride, endElementIdxNoStride] =
      cms::alpakatools::element_index_range_in_grid(acc, elementIdxShiftVec);

  // Stride = grid size.
  const uint32_t gridDimension(alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u]);

  // Strided access: each pass advances the window by one full grid.
  for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u];
       threadIdx < maxNumberOfElements;
       threadIdx += gridDimension, endElementIdx += gridDimension) {
    // (CPU) Loop on all elements of this thread, truncated to the problem size.
    for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) {
      func(i);
    }
  }
}

/*
 * Overload for elementIdxShift = 0.
 * Takes the functor by const reference (avoids the per-call copy of a by-value parameter).
 */
template <typename T_Acc, typename Func>
ALPAKA_FN_ACC void for_each_element_1D_grid_stride(const T_Acc& acc,
                                                   const uint32_t maxNumberOfElements,
                                                   const Func& func) {
  // Delegate with a zero shift.
  const uint32_t elementIdxShift = 0;
  cms::alpakatools::for_each_element_1D_grid_stride(acc, maxNumberOfElements, elementIdxShift, func);
}

} // namespace alpakatools
} // namespace cms

Expand Down
Loading