From 79bb2fe383ca74f03363bb2b88d005c6a6a1aa77 Mon Sep 17 00:00:00 2001 From: Adrian Antonio Petre Date: Wed, 16 Jun 2021 17:06:15 +0200 Subject: [PATCH] x --- src/alpaka/AlpakaCore/alpakaWorkDivHelper.h | 239 ++++++++++++++------ 1 file changed, 175 insertions(+), 64 deletions(-) diff --git a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h index e5bb1b838..cb157570d 100644 --- a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h +++ b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h @@ -326,27 +326,32 @@ namespace cms { template class elements_with_stride { public: + ALPAKA_FN_ACC elements_with_stride(const T_Acc& acc) { - const Idx threadIdxLocal(alpaka::getIdx(acc)[0u]); - const Idx blockIdxInGrid(alpaka::getIdx(acc)[0u]); + const unsigned int dimIndex = 0; + const Idx threadIdxLocal(alpaka::getIdx(acc)[dimIndex]); + const Idx blockIdxInGrid(alpaka::getIdx(acc)[dimIndex]); - const Idx blockDimension(alpaka::getWorkDiv(acc)[0u]); - const Idx gridDimension(alpaka::getWorkDiv(acc)[0u]); + const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); + const Idx gridDimension(alpaka::getWorkDiv(acc)[dimIndex]); thread_ = blockDimension * blockIdxInGrid + threadIdxLocal; stride_ = blockDimension * gridDimension; extent_ = stride_; + blockDim = blockDimension; } - ALPAKA_FN_ACC elements_with_stride(const T_Acc& acc, T extent) : extent_(extent) { - const Idx threadIdxLocal(alpaka::getIdx(acc)[0u]); - const Idx blockIdxInGrid(alpaka::getIdx(acc)[0u]); + ALPAKA_FN_ACC elements_with_stride(const T_Acc& acc, T extent, const unsigned int dimIndex = 0) : extent_(extent) { + + const Idx threadIdxLocal(alpaka::getIdx(acc)[dimIndex]); + const Idx blockIdxInGrid(alpaka::getIdx(acc)[dimIndex]); - const Idx blockDimension(alpaka::getWorkDiv(acc)[0u]); - const Idx gridDimension(alpaka::getWorkDiv(acc)[0u]); + const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); + const Idx gridDimension(alpaka::getWorkDiv(acc)[dimIndex]); thread_ = blockDimension 
* blockIdxInGrid + threadIdxLocal; stride_ = blockDimension * gridDimension; + blockDim = blockDimension; } @@ -357,14 +362,25 @@ namespace cms { ALPAKA_FN_ACC constexpr T operator*() const { return index_; } ALPAKA_FN_ACC constexpr iterator& operator++() { - // increment the first coordinate - index_ += stride_; - if (index_ < extent_) - return *this; + #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + // increment the index + index_ += stride_; + if (index_ < extent_) + return *this; + + #else // CPU Backend + // Iterate over all the elements for one thread + index_ += 1; + if (index_ < old_index_ + blockDim) { + return *this; + } + #endif + // the iterator has reached or ovrflowed the end of the extent, clamp it to the extent index_ = extent_; return *this; + } ALPAKA_FN_ACC constexpr iterator operator++(int) { @@ -380,26 +396,29 @@ namespace cms { } private: - ALPAKA_FN_ACC constexpr iterator(T thread, T stride, T extent) - : thread_{thread}, stride_{stride}, extent_{extent}, index_{thread_} {} + ALPAKA_FN_ACC constexpr iterator(T thread, T stride, T extent, T blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{thread_}, old_index_{index_}, blockDim{blockDim} {} - ALPAKA_FN_ACC constexpr iterator(T thread, T stride, T extent, T index) - : thread_{thread}, stride_{stride}, extent_{extent}, index_{index} {} + ALPAKA_FN_ACC constexpr iterator(T thread, T stride, T extent, T index, T blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{index}, old_index_{index_}, blockDim{blockDim} {} const T thread_; const T stride_; const T extent_; T index_; + const T old_index_; + const T blockDim; }; - ALPAKA_FN_ACC constexpr iterator begin() const { return iterator(thread_, stride_, extent_); } + ALPAKA_FN_ACC constexpr iterator begin() const { return iterator(thread_, stride_, extent_, blockDim); } - ALPAKA_FN_ACC constexpr iterator end() const { return iterator(thread_, stride_, extent_, extent_); } + ALPAKA_FN_ACC constexpr iterator end() const { 
return iterator(thread_, stride_, extent_, extent_, blockDim); } private: T thread_; T stride_; T extent_; + T blockDim; }; /* @@ -418,9 +437,13 @@ namespace cms { const Vec3 blockDimension(alpaka::getWorkDiv(acc)); const Vec3 gridDimension(alpaka::getWorkDiv(acc)); - thread_ = {blockDimension[0u] * blockIdxInGrid[0u] + threadIdxLocal[0u], 0, 0}; + thread_ = {blockDimension[0u] * blockIdxInGrid[0u] + threadIdxLocal[0u], + blockDimension[1u] * blockIdxInGrid[1u] + threadIdxLocal[1u], + blockDimension[2u] * blockIdxInGrid[2u] + threadIdxLocal[2u]}; stride_ = {blockDimension[0u] * gridDimension[0u], 1, 1}; extent_ = stride_; + + blockDim = blockDimension; } ALPAKA_FN_ACC elements_with_stride_1d(const T_Acc& acc, Vec3 extent) : extent_(extent) { @@ -430,8 +453,12 @@ namespace cms { const Vec3 blockDimension(alpaka::getWorkDiv(acc)); const Vec3 gridDimension(alpaka::getWorkDiv(acc)); - thread_ = {blockDimension[0u] * blockIdxInGrid[0u] + threadIdxLocal[0u], 0, 0}; + thread_ = {blockDimension[0u] * blockIdxInGrid[0u] + threadIdxLocal[0u], + blockDimension[1u] * blockIdxInGrid[1u] + threadIdxLocal[1u], + blockDimension[2u] * blockIdxInGrid[2u] + threadIdxLocal[2u]}; stride_ = {blockDimension[0u] * gridDimension[0u], 1, 1}; + + blockDim = blockDimension; } class iterator { @@ -441,10 +468,32 @@ namespace cms { ALPAKA_FN_ACC Vec3 operator*() const { return index_; } ALPAKA_FN_ACC constexpr iterator& operator++() { - // increment the first coordinate - index_[0u] += stride_[0u]; - if (index_[0u] < extent_[0u]) - return *this; + + #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + // increment the first coordinate + index_[0u] += stride_[0u]; + if (index_[0u] < extent_[0u]) + return *this; + #else + // increment the 3rd index and check its value + index_[2u] += 1; + if (index_[2u] == old_index_[2u] + blockDim[2u]) + index_[2u] = old_index_[2u]; + + // if the 3rd index was reset, increment the 2nd index + if (index_[2u] == old_index_[2u]) + index_[1u] += 1; + if (index_[1u] == 
old_index_[1u] + blockDim[1u]) + index_[1u] = old_index_[1u]; + + // if the 3rd and 2nd indices were reset, increment the first coordinate + if (index_[1u] == old_index_[1u] && index_[2u] == old_index_[2u]) + index_[0u] += 1; + + if (index_[0u] < old_index_[0u] + blockDim[0u] && index_[0u] < extent_[0u]) { + return *this; + } + #endif // the iterator has reached or ovrflowed the end of the extent, clamp it to the extent index_ = extent_; @@ -464,26 +513,29 @@ namespace cms { } private: - ALPAKA_FN_ACC iterator(Vec3 thread, Vec3 stride, Vec3 extent) - : thread_{thread}, stride_{stride}, extent_{extent}, index_{thread_} {} + ALPAKA_FN_ACC iterator(Vec3 thread, Vec3 stride, Vec3 extent, Vec3 blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{thread_}, old_index_{index_}, blockDim{blockDim} {} - ALPAKA_FN_ACC iterator(Vec3 thread, Vec3 stride, Vec3 extent, Vec3 index) - : thread_{thread}, stride_{stride}, extent_{extent}, index_{index} {} + ALPAKA_FN_ACC iterator(Vec3 thread, Vec3 stride, Vec3 extent, Vec3 index, Vec3 blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{index}, old_index_{index_}, blockDim{blockDim} {} Vec3 thread_; Vec3 stride_; Vec3 extent_; Vec3 index_ ; + Vec3 old_index_; + Vec3 blockDim; }; - ALPAKA_FN_ACC constexpr iterator begin() const { return iterator(thread_, stride_, extent_); } + ALPAKA_FN_ACC constexpr iterator begin() const { return iterator(thread_, stride_, extent_, blockDim); } - ALPAKA_FN_ACC constexpr iterator end() const { return iterator(thread_, stride_, extent_, extent_); } + ALPAKA_FN_ACC constexpr iterator end() const { return iterator(thread_, stride_, extent_, extent_, blockDim); } private: - Vec3 thread_ = Vec3::all(1); - Vec3 stride_ = Vec3::all(1); - Vec3 extent_ = Vec3::all(1); + Vec3 thread_ = Vec3::all(1); + Vec3 stride_ = Vec3::all(1); + Vec3 extent_ = Vec3::all(1); + Vec3 blockDim = Vec3::all(1); }; @@ -505,11 +557,13 @@ namespace cms { thread_ = {blockDimension[0u] * 
blockIdxInGrid[0u] + threadIdxLocal[0u], blockDimension[1u] * blockIdxInGrid[1u] + threadIdxLocal[1u], - 0}; + blockDimension[2u] * blockIdxInGrid[2u] + threadIdxLocal[2u]}; stride_ = {blockDimension[0u] * gridDimension[0u], blockDimension[1u] * gridDimension[1u], 1}; extent_ = stride_; + + blockDim = blockDimension; } @@ -523,10 +577,12 @@ namespace cms { thread_ = {blockDimension[0u] * blockIdxInGrid[0u] + threadIdxLocal[0u], blockDimension[1u] * blockIdxInGrid[1u] + threadIdxLocal[1u], - 0}; + blockDimension[2u] * blockIdxInGrid[2u] + threadIdxLocal[2u]}; stride_ = {blockDimension[0u] * gridDimension[0u], blockDimension[1u] * gridDimension[1u], 1}; + + blockDim = blockDimension; } @@ -538,16 +594,39 @@ namespace cms { ALPAKA_FN_ACC Vec3 operator*() const { return index_; } ALPAKA_FN_ACC constexpr iterator& operator++() { - // increment the first coordinate - index_[0u] += stride_[0u]; - if (index_[0u] < extent_[0u]) - return *this; + - // if the first coordinate overflows, reset it and increment the second coordinate - index_[0u] = thread_[0u]; - index_[1u] += stride_[1u]; - if (index_[1u] < extent_[1u]) - return *this; + #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + // increment the first coordinate + index_[0u] += stride_[0u]; + if (index_[0u] < extent_[0u]) + return *this; + + // if the first coordinate overflows, reset it and increment the second coordinate + index_[0u] = thread_[0u]; + index_[1u] += stride_[1u]; + if (index_[1u] < extent_[1u]) + return *this; + #else + // increment the 3rd index and check its value + index_[2u] += 1; + if (index_[2u] == old_index_[2u] + blockDim[2u]) + index_[2u] = old_index_[2u]; + + // if the 3rd index was reset, increment the 2nd index + if (index_[2u] == old_index_[2u]) + index_[1u] += 1; + if (index_[1u] == old_index_[1u] + blockDim[1u] || index_[1u] == extent_[1u]) + index_[1u] = old_index_[1u]; + + // if the 3rd and 2nd indices were reset, increment the first coordinate + if (index_[1u] == old_index_[1u] && index_[2u] == 
old_index_[2u]) + index_[0u] += 1; + + if (index_[0u] < old_index_[0u] + blockDim[0u] && index_[0u] < extent_[0u] && index_[1u] < extent_[1u]) { + return *this; + } + #endif // the iterator has reached or ovrflowed the end of the extent, clamp it to the extent index_ = extent_; @@ -567,26 +646,29 @@ namespace cms { } private: - ALPAKA_FN_ACC iterator(Vec3 thread, Vec3 stride, Vec3 extent) - : thread_{thread}, stride_{stride}, extent_{extent}, index_{thread_} {} + ALPAKA_FN_ACC iterator(Vec3 thread, Vec3 stride, Vec3 extent, Vec3 blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{thread_}, old_index_{index_}, blockDim{blockDim} {} - ALPAKA_FN_ACC iterator(Vec3 thread, Vec3 stride, Vec3 extent, Vec3 index) - : thread_{thread}, stride_{stride}, extent_{extent}, index_{index} {} + ALPAKA_FN_ACC iterator(Vec3 thread, Vec3 stride, Vec3 extent, Vec3 index, Vec3 blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{index}, old_index_{index_}, blockDim{blockDim} {} Vec3 thread_; Vec3 stride_; Vec3 extent_; Vec3 index_; + Vec3 old_index_; + Vec3 blockDim; }; - ALPAKA_FN_ACC constexpr iterator begin() const { return iterator(thread_, stride_, extent_); } + ALPAKA_FN_ACC constexpr iterator begin() const { return iterator(thread_, stride_, extent_, blockDim); } - ALPAKA_FN_ACC constexpr iterator end() const { return iterator(thread_, stride_, extent_, extent_); } + ALPAKA_FN_ACC constexpr iterator end() const { return iterator(thread_, stride_, extent_, extent_, blockDim); } private: - Vec3 thread_ = Vec3::all(1); - Vec3 stride_ = Vec3::all(1); - Vec3 extent_ = Vec3::all(1); + Vec3 thread_ = Vec3::all(1); + Vec3 stride_ = Vec3::all(1); + Vec3 extent_ = Vec3::all(1); + Vec3 blockDim = Vec3::all(1); }; @@ -613,6 +695,8 @@ namespace cms { blockDimension[1u] * gridDimension[1u], blockDimension[2u] * gridDimension[2u]}; extent_ = stride_; + + blockDim = blockDimension; } @@ -630,6 +714,8 @@ namespace cms { stride_ = {blockDimension[0u] * 
gridDimension[0u], blockDimension[1u] * gridDimension[1u], blockDimension[2u] * gridDimension[2u]}; + + blockDim = blockDimension; } @@ -641,7 +727,9 @@ namespace cms { ALPAKA_FN_ACC Vec3 operator*() const { return index_; } ALPAKA_FN_ACC constexpr iterator& operator++() { - // increment the first coordinate + + #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + // increment the first coordinate index_[0u] += stride_[0u]; if (index_[0u] < extent_[0u]) return *this; @@ -657,6 +745,26 @@ namespace cms { index_[2u] += stride_[2u]; if (index_[2u] < extent_[2u]) return *this; + #else + // increment the 3rd index and check its value + index_[2u] += 1; + if (index_[2u] == old_index_[2u] + blockDim[2u] || index_[2u] == extent_[2u]) + index_[2u] = old_index_[2u]; + + // if the 3rd index was reset, increment the 2nd index + if (index_[2u] == old_index_[2u]) + index_[1u] += 1; + if (index_[1u] == old_index_[1u] + blockDim[1u] || index_[1u] == extent_[1u]) + index_[1u] = old_index_[1u]; + + // if the 3rd and 2nd indices were reset, increment the first coordinate + if (index_[1u] == old_index_[1u] && index_[2u] == old_index_[2u]) + index_[0u] += 1; + if (index_[0u] < old_index_[0u] + blockDim[0u] && index_[0u] < extent_[0u] && + index_[1u] < extent_[1u] && index_[2u] < extent_[2u]) { + return *this; + } + #endif // the iterator has reached or ovrflowed the end of the extent, clamp it to the extent index_ = extent_; @@ -676,26 +784,29 @@ namespace cms { } private: - ALPAKA_FN_ACC iterator(Vec3 thread, Vec3 stride, Vec3 extent) - : thread_{thread}, stride_{stride}, extent_{extent}, index_{thread_} {} + ALPAKA_FN_ACC iterator(Vec3 thread, Vec3 stride, Vec3 extent, Vec3 blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{thread_}, old_index_{index_}, blockDim{blockDim} {} - ALPAKA_FN_ACC iterator(Vec3 thread, Vec3 stride, Vec3 extent, Vec3 index) - : thread_{thread}, stride_{stride}, extent_{extent}, index_{index} {} + ALPAKA_FN_ACC iterator(Vec3 thread, Vec3 stride, Vec3 
extent, Vec3 index, Vec3 blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{index}, old_index_{index_}, blockDim{blockDim} {} Vec3 thread_; Vec3 stride_; Vec3 extent_; Vec3 index_; + Vec3 old_index_; + Vec3 blockDim; }; - ALPAKA_FN_ACC constexpr iterator begin() const { return iterator(thread_, stride_, extent_); } + ALPAKA_FN_ACC constexpr iterator begin() const { return iterator(thread_, stride_, extent_, blockDim); } - ALPAKA_FN_ACC constexpr iterator end() const { return iterator(thread_, stride_, extent_, extent_); } + ALPAKA_FN_ACC constexpr iterator end() const { return iterator(thread_, stride_, extent_, extent_, blockDim); } private: - Vec3 thread_ = Vec3::all(1); - Vec3 stride_ = Vec3::all(1); - Vec3 extent_ = Vec3::all(1); + Vec3 thread_ = Vec3::all(1); + Vec3 stride_ = Vec3::all(1); + Vec3 extent_ = Vec3::all(1); + Vec3 blockDim = Vec3::all(1); }; } // namespace alpakatools