diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h index 0d295855976da..1ac317bb02f99 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h +++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h @@ -302,6 +302,207 @@ namespace cms::alpakatools { const Vec extent_; }; + /********************************************* + * RANGE COMPUTATION + ********************************************/ + + /* + * Computes the range of the elements indexes, local to the block. + * Warning: the max index is not truncated by the max number of elements of interest. + */ + template + ALPAKA_FN_ACC std::pair element_index_range_in_block(const TAcc& acc, + const Idx elementIdxShift, + const unsigned int dimIndex = 0u) { + // Take into account the thread index in block. + const Idx threadIdxLocal(alpaka::getIdx(acc)[dimIndex]); + const Idx threadDimension(alpaka::getWorkDiv(acc)[dimIndex]); + + // Compute the elements indexes in block. + // Obviously relevant for CPU only. + // For GPU, threadDimension == 1, and elementIdx == firstElementIdx == threadIdx + elementIdxShift. + const Idx firstElementIdxLocal = threadIdxLocal * threadDimension; + const Idx firstElementIdx = firstElementIdxLocal + elementIdxShift; // Add the shift! + const Idx endElementIdxUncut = firstElementIdx + threadDimension; + + // Return element indexes, shifted by elementIdxShift. + return {firstElementIdx, endElementIdxUncut}; + } + + /* + * Computes the range of the elements indexes, local to the block. + * Truncated by the max number of elements of interest. + */ + template + ALPAKA_FN_ACC std::pair element_index_range_in_block_truncated(const TAcc& acc, + const Idx maxNumberOfElements, + const Idx elementIdxShift, + const unsigned int dimIndex = 0u) { + auto [firstElementIdxLocal, endElementIdxLocal] = element_index_range_in_block(acc, elementIdxShift, dimIndex); + + // Truncate + endElementIdxLocal = std::min(endElementIdxLocal, maxNumberOfElements); + + // Return element indexes, shifted by elementIdxShift, and truncated by maxNumberOfElements. + return {firstElementIdxLocal, endElementIdxLocal}; + } + + /* + * Computes the range of the elements indexes in grid. + * Warning: the max index is not truncated by the max number of elements of interest. + */ + template + ALPAKA_FN_ACC std::pair element_index_range_in_grid(const TAcc& acc, + Idx elementIdxShift, + const unsigned int dimIndex = 0u) { + // Take into account the block index in grid. + const Idx blockIdxInGrid(alpaka::getIdx(acc)[dimIndex]); + const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); + + // Shift to get global indices in grid (instead of local to the block) + elementIdxShift += blockIdxInGrid * blockDimension; + + // Return element indexes, shifted by elementIdxShift. + return element_index_range_in_block(acc, elementIdxShift, dimIndex); + } + + /* + * Loop on all (CPU) elements. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Indexes are local to the BLOCK. 
+ */ + template + ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc, + const Idx maxNumberOfElements, + const Idx elementIdxShift, + const Func func, + const unsigned int dimIndex = 0) { + const auto& [firstElementIdx, endElementIdx] = + element_index_range_in_block_truncated(acc, maxNumberOfElements, elementIdxShift, dimIndex); + + for (Idx elementIdx = firstElementIdx; elementIdx < endElementIdx; ++elementIdx) { + func(elementIdx); + } + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc, + const Idx maxNumberOfElements, + const Func func, + const unsigned int dimIndex = 0) { + const Idx elementIdxShift = 0; + for_each_element_in_block(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + + /************************************************************** + * LOOP ON ALL ELEMENTS WITH ONE LOOP + **************************************************************/ + + /* + * Case where the input index i has reached the end of threadDimension: strides the input index. + * Otherwise: do nothing. + * NB 1: This helper function is used as a trick to only have one loop (like in legacy), instead of 2 loops + * (like in all the other Alpaka helpers, 'for_each_element_in_block_strided' for example, + * because of the additional loop over elements in Alpaka model). + * This allows to keep the 'continue' and 'break' statements as-is from legacy code, + * and hence avoids a lot of legacy code reshuffling. + * NB 2: Modifies i, firstElementIdx and endElementIdx. + */ + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool next_valid_element_index_strided( + Idx& i, Idx& firstElementIdx, Idx& endElementIdx, const Idx stride, const Idx maxNumberOfElements) { + bool isNextStrideElementValid = true; + if (i == endElementIdx) { + firstElementIdx += stride; + endElementIdx += stride; + i = firstElementIdx; + if (i >= maxNumberOfElements) { + isNextStrideElementValid = false; + } + } + return isNextStrideElementValid; + } + + template + ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc, + const Idx maxNumberOfElements, + const Idx elementIdxShift, + const Func func, + const unsigned int dimIndex = 0) { + // Get thread / element indices in block. + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + element_index_range_in_block(acc, elementIdxShift, dimIndex); + + // Stride = block size. + const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); + + // Strided access. + for (Idx threadIdx = firstElementIdxNoStride, endElementIdx = endElementIdxNoStride; + threadIdx < maxNumberOfElements; + threadIdx += blockDimension, endElementIdx += blockDimension) { + // (CPU) Loop on all elements. + if (endElementIdx > maxNumberOfElements) { + endElementIdx = maxNumberOfElements; + } + for (Idx i = threadIdx; i < endElementIdx; ++i) { + func(i); + } + } + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc, + const Idx maxNumberOfElements, + const Func func, + const unsigned int dimIndex = 0) { + const Idx elementIdxShift = 0; + for_each_element_in_block_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + + template + ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc, + const Idx maxNumberOfElements, + const Idx elementIdxShift, + const Func func, + const unsigned int dimIndex = 0) { + // Get thread / element indices in block. 
+ const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + element_index_range_in_grid(acc, elementIdxShift, dimIndex); + + // Stride = grid size. + const Idx gridDimension(alpaka::getWorkDiv(acc)[dimIndex]); + + // Strided access. + for (Idx threadIdx = firstElementIdxNoStride, endElementIdx = endElementIdxNoStride; + threadIdx < maxNumberOfElements; + threadIdx += gridDimension, endElementIdx += gridDimension) { + // (CPU) Loop on all elements. + if (endElementIdx > maxNumberOfElements) { + endElementIdx = maxNumberOfElements; + } + for (Idx i = threadIdx; i < endElementIdx; ++i) { + func(i); + } + } + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc, + const Idx maxNumberOfElements, + const Func func, + const unsigned int dimIndex = 0) { + const Idx elementIdxShift = 0; + for_each_element_in_grid_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + } // namespace cms::alpakatools #endif // HeterogeneousCore_AlpakaInterface_interface_workdivision_h diff --git a/HeterogeneousCore/AlpakaUtilities/BuildFile.xml b/HeterogeneousCore/AlpakaUtilities/BuildFile.xml new file mode 100644 index 0000000000000..45d216ff7552d --- /dev/null +++ b/HeterogeneousCore/AlpakaUtilities/BuildFile.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/HeterogeneousCore/AlpakaUtilities/interface/AtomicPairCounter.h b/HeterogeneousCore/AlpakaUtilities/interface/AtomicPairCounter.h new file mode 100644 index 0000000000000..f36fe138514ba --- /dev/null +++ b/HeterogeneousCore/AlpakaUtilities/interface/AtomicPairCounter.h @@ -0,0 +1,53 @@ +#ifndef AlpakaCore_AtomicPairCounter_h +#define AlpakaCore_AtomicPairCounter_h + +#include + +#include + +namespace cms::alpakatools { + + class AtomicPairCounter { + public: + using c_type = unsigned long long int; + + ALPAKA_FN_HOST_ACC AtomicPairCounter() {} + ALPAKA_FN_HOST_ACC AtomicPairCounter(c_type i) { counter.ac = i; } + + ALPAKA_FN_HOST_ACC AtomicPairCounter& operator=(c_type i) { + counter.ac = i; + return *this; + } + + struct Counters { + uint32_t n; // in a "One to Many" association is the number of "One" + uint32_t m; // in a "One to Many" association is the total number of associations + }; + + union Atomic2 { + Counters counters; + c_type ac; + }; + + static constexpr c_type incr = 1UL << 32; + + ALPAKA_FN_ACC Counters get() const { return counter.counters; } + + // increment n by 1 and m by i. 
return previous value + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE Counters add(const TAcc& acc, uint32_t i) { + c_type c = i; + c += incr; + + Atomic2 ret; + ret.ac = alpaka::atomicAdd(acc, &counter.ac, c, alpaka::hierarchy::Blocks{}); + return ret.counters; + } + + private: + Atomic2 counter; + }; + +} // namespace cms::alpakatools + +#endif // AlpakaCore_AtomicPairCounter_h diff --git a/HeterogeneousCore/AlpakaUtilities/interface/HistoContainer.h b/HeterogeneousCore/AlpakaUtilities/interface/HistoContainer.h new file mode 100644 index 0000000000000..212f24a87d3b5 --- /dev/null +++ b/HeterogeneousCore/AlpakaUtilities/interface/HistoContainer.h @@ -0,0 +1,303 @@ +#ifndef AlpakaCore_HistoContainer_h +#define AlpakaCore_HistoContainer_h + +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/AlpakaUtilities/interface/AtomicPairCounter.h" +#include "HeterogeneousCore/AlpakaUtilities/interface/alpakastdAlgorithm.h" +#include "HeterogeneousCore/AlpakaUtilities/interface/prefixScan.h" + +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +namespace cms { + namespace alpakatools { + + struct countFromVector { + template + ALPAKA_FN_ACC void operator()(const TAcc &acc, + Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) const { + const uint32_t nt = offsets[nh]; + for_each_element_in_grid_strided(acc, nt, [&](uint32_t i) { + auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); + ALPAKA_ASSERT_OFFLOAD((*off) > 0); + int32_t ih = off - offsets - 1; + ALPAKA_ASSERT_OFFLOAD(ih >= 0); + ALPAKA_ASSERT_OFFLOAD(ih < int(nh)); + h->count(acc, v[i], ih); + }); + } + }; + + struct fillFromVector { + template + ALPAKA_FN_ACC void operator()(const TAcc &acc, + Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) const { + const uint32_t nt = offsets[nh]; + for_each_element_in_grid_strided(acc, nt, [&](uint32_t i) { + auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); + ALPAKA_ASSERT_OFFLOAD((*off) > 0); + int32_t ih = off - offsets - 1; + ALPAKA_ASSERT_OFFLOAD(ih >= 0); + ALPAKA_ASSERT_OFFLOAD(ih < int(nh)); + h->fill(acc, v[i], i, ih); + }); + } + }; + + template + inline __attribute__((always_inline)) void launchZero(Histo *__restrict__ h, TQueue &queue) { + auto histoOffView = make_device_view(alpaka::getDev(queue), h->off, Histo::totbins()); + alpaka::memset(queue, histoOffView, 0); + } + + template + inline __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, TQueue &queue) { + uint32_t *poff = h->off; + + const int num_items = Histo::totbins(); + + const auto threadsPerBlockOrElementsPerThread = 1024u; + const auto blocksPerGrid = divide_up_by(num_items, threadsPerBlockOrElementsPerThread); + const auto workDiv = make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + alpaka::exec(queue, workDiv, multiBlockPrefixScanFirstStep(), poff, poff, num_items); + + const auto workDivWith1Block = make_workdiv(1, threadsPerBlockOrElementsPerThread); + alpaka::exec( + queue, workDivWith1Block, multiBlockPrefixScanSecondStep(), poff, poff, num_items, blocksPerGrid); + } + + template + inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *v, + uint32_t const *offsets, + uint32_t totSize, + uint32_t nthreads, + TQueue &queue) { + launchZero(h, queue); + + const auto 
threadsPerBlockOrElementsPerThread = nthreads; + const auto blocksPerGrid = divide_up_by(totSize, nthreads); + const auto workDiv = make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + + alpaka::exec(queue, workDiv, countFromVector(), h, nh, v, offsets); + launchFinalize(h, queue); + + alpaka::exec(queue, workDiv, fillFromVector(), h, nh, v, offsets); + } + + struct finalizeBulk { + template + ALPAKA_FN_ACC void operator()(const TAcc &acc, AtomicPairCounter const *apc, Assoc *__restrict__ assoc) const { + assoc->bulkFinalizeFill(acc, *apc); + } + }; + + // iteratate over N bins left and right of the one containing "v" + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void forEachInBins(Hist const &hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins() - 1), bs + n); + bs = std::max(0, bs - n); + ALPAKA_ASSERT_OFFLOAD(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + + // iteratate over bins containing all values in window wmin, wmax + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + ALPAKA_ASSERT_OFFLOAD(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + + template + class HistoContainer { + public: + using Counter = uint32_t; + + using CountersOnly = HistoContainer; + + using index_type = I; + using UT = typename std::make_unsigned::type; + + static constexpr uint32_t ilog2(uint32_t v) { + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) + if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } + + static constexpr uint32_t sizeT() { return S; } + static constexpr uint32_t nbins() { return NBINS; } + static constexpr int32_t nhists() { return NHISTS; } + static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } + static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } + static constexpr int32_t capacity() { return SIZE; } + + static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } + + static constexpr UT bin(T t) { + constexpr uint32_t shift = sizeT() - nbits(); + constexpr uint32_t mask = (1 << nbits()) - 1; + return (t >> shift) & mask; + } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE void zero() { + for (auto &i : off) + i = 0; + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void add(const TAcc &acc, CountersOnly const &co) { + for (uint32_t i = 0; i < totbins(); ++i) { + alpaka::atomicAdd(acc, off + i, co.off[i], alpaka::hierarchy::Blocks{}); + } + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE static uint32_t atomicIncrement(const TAcc &acc, Counter &x) { + return alpaka::atomicAdd(acc, &x, 1u, alpaka::hierarchy::Blocks{}); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE static uint32_t atomicDecrement(const TAcc &acc, Counter &x) { + return alpaka::atomicSub(acc, &x, 1u, alpaka::hierarchy::Blocks{}); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void countDirect(const TAcc &acc, T b) { + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + atomicIncrement(acc, off[b]); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fillDirect(const TAcc &acc, T b, index_type j) { + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + auto w = atomicDecrement(acc, off[b]); + ALPAKA_ASSERT_OFFLOAD(w > 0); + bins[w - 1] = j; + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE int32_t 
+ bulkFill(const TAcc &acc, AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.add(acc, n); + if (c.m >= nbins()) + return -int32_t(c.m); + off[c.m] = c.n; + for (uint32_t j = 0; j < n; ++j) + bins[c.n + j] = v[j]; + return c.m; + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalize(const TAcc &acc, AtomicPairCounter const &apc) { + off[apc.get().m] = apc.get().n; + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalizeFill(const TAcc &acc, AtomicPairCounter const &apc) { + auto m = apc.get().m; + auto n = apc.get().n; + + if (m >= nbins()) { // overflow! + off[nbins()] = uint32_t(off[nbins() - 1]); + return; + } + + for_each_element_in_grid_strided(acc, totbins(), m, [&](uint32_t i) { off[i] = n; }); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const TAcc &acc, T t) { + uint32_t b = bin(t); + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + atomicIncrement(acc, off[b]); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const TAcc &acc, T t, index_type j) { + uint32_t b = bin(t); + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + auto w = atomicDecrement(acc, off[b]); + ALPAKA_ASSERT_OFFLOAD(w > 0); + bins[w - 1] = j; + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const TAcc &acc, T t, uint32_t nh) { + uint32_t b = bin(t); + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + b += histOff(nh); + ALPAKA_ASSERT_OFFLOAD(b < totbins()); + atomicIncrement(acc, off[b]); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const TAcc &acc, T t, index_type j, uint32_t nh) { + uint32_t b = bin(t); + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + b += histOff(nh); + ALPAKA_ASSERT_OFFLOAD(b < totbins()); + auto w = atomicDecrement(acc, off[b]); + ALPAKA_ASSERT_OFFLOAD(w > 0); + bins[w - 1] = j; + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void finalize(const TAcc &acc, Counter *ws = nullptr) { + ALPAKA_ASSERT_OFFLOAD(off[totbins() - 1] == 0); + blockPrefixScan(acc, off, totbins(), ws); + ALPAKA_ASSERT_OFFLOAD(off[totbins() - 1] == off[totbins() - 2]); + } + + constexpr auto size() const { return uint32_t(off[totbins() - 1]); } + constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } + + constexpr index_type const *begin() const { return bins; } + constexpr index_type const *end() const { return begin() + size(); } + + constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } + constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } + + Counter off[totbins()]; + index_type bins[capacity()]; + }; + + template + using OneToManyAssoc = HistoContainer; + + } // namespace alpakatools +} // namespace cms +#endif // AlpakaCore_HistoContainer_h diff --git a/HeterogeneousCore/AlpakaUtilities/interface/SimpleVector.h b/HeterogeneousCore/AlpakaUtilities/interface/SimpleVector.h new file mode 100644 index 0000000000000..31fb463d26cc3 --- /dev/null +++ b/HeterogeneousCore/AlpakaUtilities/interface/SimpleVector.h @@ -0,0 +1,142 @@ +#ifndef HeterogeneousCore_AlpakaUtilities_SimpleVector_h +#define HeterogeneousCore_AlpakaUtilities_SimpleVector_h + +// author: Felice Pantaleo, CERN, 2018 +// alpaka integration in cmssw: Adriano Di Florio, 2022 + +#include +#include +#include + +namespace cms::alpakatools { + + template + struct SimpleVector { + constexpr SimpleVector() = default; + + // ownership of m_data stays within the caller + constexpr void construct(int capacity, T *data) { + m_size = 0; + m_capacity = capacity; + m_data = data; + } + + inline constexpr int push_back_unsafe(const T &element) { + 
auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&...args) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + ALPAKA_FN_ACC inline T &back() { return m_data[m_size - 1]; } + + ALPAKA_FN_ACC inline const T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + template + ALPAKA_FN_ACC int push_back(const TAcc &acc, const T &element) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + template + ALPAKA_FN_ACC int emplace_back(const TAcc &acc, Ts &&...args) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + // thread safe version of resize + template + ALPAKA_FN_ACC int extend(const TAcc &acc, int size = 1) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, size, alpaka::hierarchy::Blocks{}); + if (previousSize < m_capacity) { + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, size, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + template + ALPAKA_FN_ACC int shrink(const TAcc &acc, int size = 1) { + auto previousSize = alpaka::atomicSub(acc, &m_size, size, alpaka::hierarchy::Blocks{}); + if (previousSize >= size) { + return previousSize - size; + } else { + alpaka::atomicAdd(acc, &m_size, size, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + inline constexpr bool empty() const { return m_size <= 0; } + inline constexpr bool full() const { return m_size >= m_capacity; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline constexpr int size() const { return m_size; } + inline constexpr int capacity() const { return m_capacity; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr void set_data(T *data) { m_data = data; } + + private: + int m_size; + int m_capacity; + + T *m_data; + }; + + // ownership of m_data stays within the caller + template + SimpleVector make_SimpleVector(int capacity, T *data) { + SimpleVector ret; + ret.construct(capacity, data); + return ret; + } + + // ownership of m_data stays within the caller + template + SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { + auto ret = new (mem) SimpleVector(); + ret->construct(capacity, data); + return ret; + } + +} // namespace cms::alpakatools + +#endif // AlpakaCore_SimpleVector_h diff --git a/HeterogeneousCore/AlpakaUtilities/interface/VecArray.h b/HeterogeneousCore/AlpakaUtilities/interface/VecArray.h new file mode 100644 index 0000000000000..d263df62bc282 --- /dev/null +++ 
b/HeterogeneousCore/AlpakaUtilities/interface/VecArray.h @@ -0,0 +1,107 @@ +#ifndef AlpakaCore_VecArray_h +#define AlpakaCore_VecArray_h + +// +// Author: Felice Pantaleo, CERN +// + +#include + +#include + +namespace cms::alpakatools { + + template + class VecArray { + public: + using self = VecArray; + using value_t = T; + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&...args) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + inline constexpr T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + template + ALPAKA_FN_ACC int push_back(const TAcc &acc, const T &element) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + template + ALPAKA_FN_ACC int emplace_back(const TAcc &acc, Ts &&...args) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + inline constexpr T pop_back() { + if (m_size > 0) { + auto previousSize = m_size--; + return m_data[previousSize - 1]; + } else + return T(); + } + + inline constexpr T const *begin() const { return m_data; } + inline constexpr T const *end() const { return m_data + m_size; } + inline constexpr T *begin() { return m_data; } + inline constexpr T *end() { return m_data + m_size; } + inline constexpr int size() const { return m_size; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline static constexpr int capacity() { return maxSize; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr bool empty() const { return 0 == m_size; } + inline constexpr bool full() const { return maxSize == m_size; } + + private: + T m_data[maxSize]; + + int m_size; + }; + +} // namespace cms::alpakatools + +#endif // AlpakaCore_VecArray_h diff --git a/HeterogeneousCore/AlpakaUtilities/interface/alpakastdAlgorithm.h b/HeterogeneousCore/AlpakaUtilities/interface/alpakastdAlgorithm.h new file mode 100644 index 0000000000000..05f8584452b22 --- /dev/null +++ b/HeterogeneousCore/AlpakaUtilities/interface/alpakastdAlgorithm.h @@ -0,0 +1,35 @@ +#ifndef AlpakaCore_alpakastdAlgorithm_h +#define AlpakaCore_alpakastdAlgorithm_h + +#include +#include +#include + +#include + +// reimplementation of std algorithms able to compile with Alpaka, +// mostly by declaring them constexpr + +namespace alpaka_std { + + template > + ALPAKA_FN_HOST_ACC constexpr RandomIt upper_bound(RandomIt first, RandomIt last, const T &value, Compare comp = {}) { + auto count = last - first; + + 
while (count > 0) { + auto it = first; + auto step = count / 2; + it += step; + if (!comp(value, *it)) { + first = ++it; + count -= step + 1; + } else { + count = step; + } + } + return first; + } + +} // namespace alpaka_std + +#endif // AlpakaCore_alpakastdAlgorithm_h diff --git a/HeterogeneousCore/AlpakaUtilities/interface/prefixScan.h b/HeterogeneousCore/AlpakaUtilities/interface/prefixScan.h new file mode 100644 index 0000000000000..7b555518a24c6 --- /dev/null +++ b/HeterogeneousCore/AlpakaUtilities/interface/prefixScan.h @@ -0,0 +1,228 @@ +#ifndef AlpakaCore_prefixScan_h +#define AlpakaCore_prefixScan_h + +#include +#include + +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "FWCore/Utilities/interface/CMSUnrollLoop.h" + +namespace cms { + namespace alpakatools { + + // FIXME warpSize should be device-dependent + constexpr uint32_t warpSize = 32; + constexpr uint64_t warpMask = ~(~0ull << warpSize); + +#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || \ + (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)) + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void warpPrefixScan(uint32_t laneId, T const* ci, T* co, uint32_t i, uint32_t mask) { +#if defined(__HIP_DEVICE_COMPILE__) + ALPAKA_ASSERT_OFFLOAD(mask == warpMask); +#endif + // ci and co may be the same + auto x = ci[i]; + CMS_UNROLL_LOOP + for (uint32_t offset = 1; offset < warpSize; offset <<= 1) { +#if defined(__CUDA_ARCH__) + auto y = __shfl_up_sync(mask, x, offset); +#elif defined(__HIP_DEVICE_COMPILE__) + auto y = __shfl_up(x, offset); +#endif + if (laneId >= offset) + x += y; + } + co[i] = x; + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void warpPrefixScan(uint32_t laneId, T* c, uint32_t i, uint32_t mask) { +#if defined(__HIP_DEVICE_COMPILE__) + ALPAKA_ASSERT_OFFLOAD(mask == warpMask); +#endif + auto x = c[i]; + CMS_UNROLL_LOOP + for (uint32_t offset = 1; offset < warpSize; offset <<= 1) { +#if defined(__CUDA_ARCH__) + auto y = __shfl_up_sync(mask, x, offset); +#elif defined(__HIP_DEVICE_COMPILE__) + auto y = __shfl_up(x, offset); +#endif + if (laneId >= offset) + x += y; + } + c[i] = x; + } + +#endif // (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)) + + // limited to warpSize² elements + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void blockPrefixScan( + const TAcc& acc, T const* ci, T* co, uint32_t size, T* ws = nullptr) { +#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || \ + (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)) + uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); + uint32_t const blockThreadIdx(alpaka::getIdx(acc)[0u]); + ALPAKA_ASSERT_OFFLOAD(ws); + ALPAKA_ASSERT_OFFLOAD(size <= warpSize * warpSize); + ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % warpSize); + auto first = blockThreadIdx; +#if defined(__CUDA_ARCH__) + auto mask = __ballot_sync(warpMask, first < size); +#elif defined(__HIP_DEVICE_COMPILE__) + auto mask = warpMask; +#endif + auto laneId = blockThreadIdx & (warpSize - 1); + + for (auto i = first; i < size; i += blockDimension) { + warpPrefixScan(laneId, ci, co, i, mask); + auto warpId = i / warpSize; + // FIXME test ? 
+ ALPAKA_ASSERT_OFFLOAD(warpId < warpSize); + if ((warpSize - 1) == laneId) + ws[warpId] = co[i]; +#if defined(__CUDA_ARCH__) + mask = __ballot_sync(mask, i + blockDimension < size); +#endif + } + alpaka::syncBlockThreads(acc); + if (size <= warpSize) + return; + if (blockThreadIdx < warpSize) { + warpPrefixScan(laneId, ws, blockThreadIdx, warpMask); + } + alpaka::syncBlockThreads(acc); + for (auto i = first + warpSize; i < size; i += blockDimension) { + uint32_t warpId = i / warpSize; + co[i] += ws[warpId - 1]; + } + alpaka::syncBlockThreads(acc); +#else + co[0] = ci[0]; + for (uint32_t i = 1; i < size; ++i) + co[i] = ci[i] + co[i - 1]; +#endif // (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)) + } + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void blockPrefixScan(const TAcc& acc, + T* __restrict__ c, + uint32_t size, + T* __restrict__ ws = nullptr) { +#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || \ + (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)) + uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); + uint32_t const blockThreadIdx(alpaka::getIdx(acc)[0u]); + ALPAKA_ASSERT_OFFLOAD(ws); + ALPAKA_ASSERT_OFFLOAD(size <= warpSize * warpSize); + ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % warpSize); + auto first = blockThreadIdx; +#if defined(__CUDA_ARCH__) + auto mask = __ballot_sync(warpMask, first < size); +#elif defined(__HIP_DEVICE_COMPILE__) + auto mask = warpMask; +#endif + auto laneId = blockThreadIdx & (warpSize - 1); + + for (auto i = first; i < size; i += blockDimension) { + warpPrefixScan(laneId, c, i, mask); + auto warpId = i / warpSize; + ALPAKA_ASSERT_OFFLOAD(warpId < warpSize); + if ((warpSize - 1) == laneId) + ws[warpId] = c[i]; +#if defined(__CUDA_ARCH__) + mask = __ballot_sync(mask, i + blockDimension < size); +#endif + } + alpaka::syncBlockThreads(acc); + if (size <= warpSize) + return; + if (blockThreadIdx < warpSize) { + warpPrefixScan(laneId, ws, blockThreadIdx, warpMask); + } + alpaka::syncBlockThreads(acc); + for (auto i = first + warpSize; i < size; i += blockDimension) { + auto warpId = i / warpSize; + c[i] += ws[warpId - 1]; + } + alpaka::syncBlockThreads(acc); +#else + for (uint32_t i = 1; i < size; ++i) + c[i] += c[i - 1]; +#endif // (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)) + } + + // limited to warpSize⁴ elements + template + struct multiBlockPrefixScanFirstStep { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* ci, T* co, int32_t size) const { + uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); + uint32_t const threadDimension(alpaka::getWorkDiv(acc)[0u]); + uint32_t const blockIdx(alpaka::getIdx(acc)[0u]); + + auto& ws = alpaka::declareSharedVar(acc); + // first each block does a scan of size warpSize² (better be enough blocks) +#ifndef NDEBUG + [[maybe_unused]] uint32_t const gridDimension(alpaka::getWorkDiv(acc)[0u]); + ALPAKA_ASSERT_OFFLOAD(gridDimension / threadDimension <= (warpSize * warpSize)); +#endif +#if 0 + // this is not yet available in alpaka, see + // https://github.com/alpaka-group/alpaka/issues/1648 + ALPAKA_ASSERT_OFFLOAD(sizeof(T) * gridDimension <= dynamic_smem_size()); // size of psum below +#endif + int off = blockDimension * blockIdx * threadDimension; + if (size - off > 0) + blockPrefixScan(acc, ci + off, co + off, std::min(int(blockDimension * 
threadDimension), size - off), ws); + } + }; + + // limited to warpSize⁴ elements + template + struct multiBlockPrefixScanSecondStep { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* ci, T* co, int32_t size, int32_t numBlocks) const { + uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); + uint32_t const threadDimension(alpaka::getWorkDiv(acc)[0u]); + uint32_t const threadIdx(alpaka::getIdx(acc)[0u]); + + T* const psum = alpaka::getDynSharedMem(acc); + + // first each block does a scan of size warpSize² (better be enough blocks) + ALPAKA_ASSERT_OFFLOAD(static_cast(blockDimension * threadDimension) >= numBlocks); + for (int elemId = 0; elemId < static_cast(threadDimension); ++elemId) { + int index = +threadIdx * threadDimension + elemId; + + if (index < numBlocks) { + int lastElementOfPreviousBlockId = index * blockDimension * threadDimension - 1; + psum[index] = (lastElementOfPreviousBlockId < size and lastElementOfPreviousBlockId >= 0) + ? co[lastElementOfPreviousBlockId] + : T(0); + } + } + + alpaka::syncBlockThreads(acc); + + auto& ws = alpaka::declareSharedVar(acc); + blockPrefixScan(acc, psum, psum, numBlocks, ws); + + for (int elemId = 0; elemId < static_cast(threadDimension); ++elemId) { + int first = threadIdx * threadDimension + elemId; + for (int i = first + blockDimension * threadDimension; i < size; i += blockDimension * threadDimension) { + auto k = i / (blockDimension * threadDimension); + co[i] += psum[k]; + } + } + } + }; + + } // namespace alpakatools +} // namespace cms + +#endif // AlpakaCore_prefixScan_h diff --git a/HeterogeneousCore/AlpakaUtilities/interface/radixSort.h b/HeterogeneousCore/AlpakaUtilities/interface/radixSort.h new file mode 100644 index 0000000000000..24b179b9f3929 --- /dev/null +++ b/HeterogeneousCore/AlpakaUtilities/interface/radixSort.h @@ -0,0 +1,250 @@ +#ifndef AlpakaCore_radixSort_h +#define AlpakaCore_radixSort_h + +#include +#include +#include + +namespace cms::alpakatools { + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void dummyReorder( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {} + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void reorderSigned( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + //move negative first... + + auto& firstNeg = alpaka::declareSharedVar(acc); + firstNeg = a[ind[0]] < 0 ? 0 : size; + alpaka::syncBlockThreads(acc); + + // find first negative + for_each_element_in_block_strided(acc, size - 1, [&](uint32_t idx) { + if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) + firstNeg = idx + 1; + }); + + alpaka::syncBlockThreads(acc); + + for_each_element_in_block_strided(acc, size, firstNeg, [&](uint32_t idx) { ind2[idx - firstNeg] = ind[idx]; }); + alpaka::syncBlockThreads(acc); + + for_each_element_in_block_strided(acc, firstNeg, [&](uint32_t idx) { ind2[idx + size - firstNeg] = ind[idx]; }); + alpaka::syncBlockThreads(acc); + + for_each_element_in_block_strided(acc, size, [&](uint32_t idx) { ind[idx] = ind2[idx]; }); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void reorderFloat( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + //move negative first... + + auto& firstNeg = alpaka::declareSharedVar(acc); + firstNeg = a[ind[0]] < 0 ? 
0 : size; + alpaka::syncBlockThreads(acc); + + // find first negative + for_each_element_in_block_strided(acc, size - 1, [&](uint32_t idx) { + if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) + firstNeg = idx + 1; + }); + alpaka::syncBlockThreads(acc); + + for_each_element_in_block_strided(acc, size, firstNeg, [&](uint32_t idx) { ind2[size - idx - 1] = ind[idx]; }); + alpaka::syncBlockThreads(acc); + + for_each_element_in_block_strided(acc, firstNeg, [&](uint32_t idx) { ind2[idx + size - firstNeg] = ind[idx]; }); + alpaka::syncBlockThreads(acc); + + for_each_element_in_block_strided(acc, size, [&](uint32_t idx) { ind[idx] = ind2[idx]; }); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSortImpl( + const TAcc& acc, T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { +#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || \ + (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)) + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + const uint32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); + + constexpr int d = 8, w = 8 * sizeof(T); + constexpr int sb = 1 << d; + constexpr int ps = int(sizeof(T)) - NS; + + auto& c = alpaka::declareSharedVar(acc); + auto& ct = alpaka::declareSharedVar(acc); + auto& cu = alpaka::declareSharedVar(acc); + auto& ibs = alpaka::declareSharedVar(acc); + auto& p = alpaka::declareSharedVar(acc); + + ALPAKA_ASSERT_OFFLOAD(size > 0); + ALPAKA_ASSERT_OFFLOAD(blockDimension >= sb); + + p = ps; + + auto j = ind; + auto k = ind2; + + for_each_element_in_block_strided(acc, size, [&](uint32_t idx) { j[idx] = idx; }); + alpaka::syncBlockThreads(acc); + + while (alpaka::syncBlockThreadsPredicate(acc, (p < w / d))) { + for_each_element_in_block_strided(acc, sb, [&](uint32_t idx) { c[idx] = 0; }); + alpaka::syncBlockThreads(acc); + + // fill bins + for_each_element_in_block_strided(acc, size, [&](uint32_t idx) { + auto bin = (a[j[idx]] >> d * p) & (sb - 1); + alpaka::atomicAdd(acc, &c[bin], 1, alpaka::hierarchy::Threads{}); + }); + alpaka::syncBlockThreads(acc); + + // prefix scan "optimized"???... + for_each_element_in_block(acc, sb, [&](uint32_t idx) { + auto x = c[idx]; + auto laneId = idx & 0x1f; + + for (int offset = 1; offset < 32; offset <<= 1) { +#if defined(__CUDA_ARCH__) + auto y = __shfl_up_sync(0xffffffff, x, offset); +#elif defined(__HIP_DEVICE_COMPILE__) + auto y = __shfl_up(x, offset); +#endif + if (laneId >= (uint32_t)offset) + x += y; + } + ct[idx] = x; + }); + alpaka::syncBlockThreads(acc); + + for_each_element_in_block(acc, sb, [&](uint32_t idx) { + auto ss = (idx / 32) * 32 - 1; + c[idx] = ct[idx]; + for (int i = ss; i > 0; i -= 32) + c[idx] += ct[i]; + }); + + /* + //prefix scan for the nulls (for documentation) + if (threadIdxLocal==0) + for (int i = 1; i < sb; ++i) c[i] += c[i-1]; + */ + + // broadcast + ibs = size - 1; + alpaka::syncBlockThreads(acc); + + while (alpaka::syncBlockThreadsPredicate(acc, ibs > 0)) { + for_each_element_in_block(acc, sb, [&](uint32_t idx) { + cu[idx] = -1; + ct[idx] = -1; + }); + alpaka::syncBlockThreads(acc); + + for_each_element_in_block(acc, sb, [&](uint32_t idx) { + int i = ibs - idx; + int32_t bin = -1; + if (i >= 0) { + bin = (a[j[i]] >> d * p) & (sb - 1); + ct[idx] = bin; + alpaka::atomicMax(acc, &cu[bin], int(i), alpaka::hierarchy::Threads{}); + } + }); + alpaka::syncBlockThreads(acc); + + for_each_element_in_block(acc, sb, [&](uint32_t idx) { + int i = ibs - idx; + int32_t bin = (i >= 0 ? 
((a[j[i]] >> d * p) & (sb - 1)) : -1); + if (i >= 0 && i == cu[bin]) // ensure to keep them in order + for (int ii = idx; ii < sb; ++ii) + if (ct[ii] == bin) { + auto oi = ii - idx; + // assert(i>=oi);if(i>=oi) + k[--c[bin]] = j[i - oi]; + } + }); + alpaka::syncBlockThreads(acc); + + if (threadIdxLocal == 0) { + ibs -= sb; + // cms-patatrack/pixeltrack-standalone#210 + alpaka::mem_fence(acc, alpaka::memory_scope::Grid{}); + } + alpaka::syncBlockThreads(acc); + } + + /* + // broadcast for the nulls (for documentation) + if (threadIdxLocal==0) + for (int i=size-first-1; i>=0; i--) { // =blockDim.x) { + auto bin = (a[j[i]] >> d*p)&(sb-1); + auto ik = atomicSub(&c[bin],1); + k[ik-1] = j[i]; + } + */ + + alpaka::syncBlockThreads(acc); + ALPAKA_ASSERT_OFFLOAD(c[0] == 0); + + // swap (local, ok) + auto t = j; + j = k; + k = t; + + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + if (threadIdxLocal == 0) + ++p; + alpaka::syncBlockThreads(acc); + } + + if ((w != 8) && (0 == (NS & 1))) + ALPAKA_ASSERT_OFFLOAD(j == ind); // w/d is even so ind is correct + + if (j != ind) // odd... + for_each_element_in_block_strided(acc, size, [&](uint32_t idx) { ind[idx] = ind2[idx]; }); + + alpaka::syncBlockThreads(acc); + + // now move negative first... (if signed) + reorder(acc, a, ind, ind2, size); +#endif + } + + template ::value, T>::type* = nullptr> + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSort( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + radixSortImpl(acc, a, ind, ind2, size, dummyReorder); + } + + template ::value && std::is_signed::value, T>::type* = nullptr> + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSort( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + radixSortImpl(acc, a, ind, ind2, size, reorderSigned); + } + + template ::value, T>::type* = nullptr> + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSort( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + static_assert(sizeof(T) == sizeof(int), "radixSort with the wrong type size"); + using I = int; + radixSortImpl(acc, (I const*)(a), ind, ind2, size, reorderFloat); + } + +} // namespace cms::alpakatools + +#endif // AlpakaCore_radixSort_h
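
For context, a minimal sketch of how the element-loop helpers added to workdivision.h are meant to be used from kernel code (not part of this patch; the kernel, queue and buffer names are hypothetical, and the 1D accelerator tag is assumed from the framework configuration):

struct ScaleKernel {
  template <typename TAcc>
  ALPAKA_FN_ACC void operator()(const TAcc& acc, float* data, uint32_t size, float factor) const {
    // One pass over [0, size): on GPU back-ends each thread handles one index per grid stride,
    // on CPU back-ends each "thread" additionally loops over the elements it owns.
    cms::alpakatools::for_each_element_in_grid_strided(acc, size, [&](uint32_t i) { data[i] *= factor; });
  }
};

// Hypothetical host-side launch, following the pattern used by fillManyFromVector above:
//   const auto workDiv = cms::alpakatools::make_workdiv<Acc1D>(blocksPerGrid, threadsPerBlockOrElementsPerThread);
//   alpaka::exec<Acc1D>(queue, workDiv, ScaleKernel{}, data, size, 2.f);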
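
Similarly, a sketch of how AtomicPairCounter::add() is intended to be used when filling a flat one-to-many association; it mirrors what HistoContainer::bulkFill does, and the function and buffer names are illustrative only:

template <typename TAcc>
ALPAKA_FN_ACC void appendGroup(const TAcc& acc,
                               cms::alpakatools::AtomicPairCounter& apc,
                               uint32_t* content,      // flat storage shared by all groups
                               uint32_t const* items,  // the n items of this group
                               uint32_t n) {
  // Atomically reserve space for this group: c.m is the group index assigned to this call,
  // c.n is the first free position in 'content' before the call.
  auto c = apc.add(acc, n);
  for (uint32_t j = 0; j < n; ++j)
    content[c.n + j] = items[j];
}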
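
Finally, an illustrative sketch combining the strided element loop with SimpleVector's thread-safe push_back. SimpleVector does not own its storage, so both the vector object and its backing array are expected to live in device-accessible memory provided by the caller (all names below are hypothetical):

struct CollectAboveThreshold {
  template <typename TAcc>
  ALPAKA_FN_ACC void operator()(const TAcc& acc,
                                cms::alpakatools::SimpleVector<int>* out,
                                int const* values,
                                uint32_t n,
                                int threshold) const {
    cms::alpakatools::for_each_element_in_grid_strided(acc, n, [&](uint32_t i) {
      if (values[i] > threshold) {
        // push_back atomically reserves a slot and returns its index, or -1 (rolling back) on overflow.
        out->push_back(acc, values[i]);
      }
    });
  }
};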