diff --git a/HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h b/HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h
index 5c6d1d5719623..0805e7e823fce 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h
@@ -25,7 +25,7 @@ namespace cms::alpakatools {
       uint32_t second;  // in a "One to Many" association is the total number of associations
     };
 
-    ALPAKA_FN_ACC constexpr Counters get() const { return counter_.as_counters; }
+    ALPAKA_FN_HOST_ACC constexpr Counters get() const { return counter_.as_counters; }
 
     // atomically add as_counters, and return the previous value
     template <typename TAcc>
diff --git a/HeterogeneousCore/AlpakaInterface/interface/atomicMaxF.h b/HeterogeneousCore/AlpakaInterface/interface/atomicMaxF.h
index 726f769f70a49..cc61a405897fb 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/atomicMaxF.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/atomicMaxF.h
@@ -1,13 +1,16 @@
-#ifndef HeterogeneousCore_AlpakaCore_interface_atomicMaxF_h
-#define HeterogeneousCore_AlpakaCore_interface_atomicMaxF_h
+#ifndef HeterogeneousCore_AlpakaInterface_interface_atomicMaxF_h
+#define HeterogeneousCore_AlpakaInterface_interface_atomicMaxF_h
+
 #include <alpaka/alpaka.hpp>
 
 #include "FWCore/Utilities/interface/bit_cast.h"
-#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
 
-#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
+// FIXME: this should be rewritten using the correct template specialisation for the different accelerator types
+
 template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
-static __device__ __forceinline__ float atomicMaxF(const TAcc& acc, float* address, float val) {
+ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static float atomicMaxF(const TAcc& acc, float* address, float val) {
+#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
+  // GPU implementation uses __float_as_int / __int_as_float
   int ret = __float_as_int(*address);
   while (val > __int_as_float(ret)) {
     int old = ret;
@@ -15,10 +18,7 @@ static __device__ __forceinline__ float atomicMaxF(const TAcc& acc, float* addre
       break;
   }
   return __int_as_float(ret);
-}
 #else
-template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
-ALPAKA_FN_ACC ALPAKA_FN_INLINE static float atomicMaxF(const TAcc& acc, float* address, float val) {
   // CPU implementation uses edm::bit_cast
   int ret = edm::bit_cast<int>(*address);
   while (val > edm::bit_cast<float>(ret)) {
@@ -27,7 +27,7 @@ ALPAKA_FN_ACC ALPAKA_FN_INLINE static float atomicMaxF(const TAcc& acc, float* a
       break;
   }
   return edm::bit_cast<float>(ret);
-}
 #endif  // __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__
+}
 
-#endif  // HeterogeneousCore_AlpakaCore_interface_atomicMaxF_h
+#endif  // HeterogeneousCore_AlpakaInterface_interface_atomicMaxF_h
diff --git a/HeterogeneousCore/AlpakaInterface/interface/radixSort.h b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h
index f9b26cf3d17ae..25b4b96edd9b7 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/radixSort.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h
@@ -353,8 +353,8 @@ namespace cms::alpakatools {
             typename T,
             int NS = sizeof(T),  // number of significant bytes to use in sorting
             typename std::enable_if<requires_single_thread_per_block_v<TAcc>, T>::type* = nullptr>
-  ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort(
-      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
+  /* not ALPAKA_FN_ACC to avoid trying to compile it for the CUDA or ROCm back-ends */
+  ALPAKA_FN_INLINE void radixSort(const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
     static_assert(requires_single_thread_per_block_v<TAcc>, "CPU sort (not a radixSort) called wtth wrong accelerator");
     // Initialize the index array
     std::iota(ind, ind + size, 0);
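
For context, a minimal sketch of how the now host-device atomicMaxF() could be called from an Alpaka kernel; the kernel functor, buffer layout and 1-D grid-strided loop below are illustrative assumptions, not code taken from this patch.

#include <cstdint>

#include <alpaka/alpaka.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/atomicMaxF.h"

// Hypothetical kernel: fold all input values into a single maximum via atomicMaxF.
struct FindMaxKernel {
  template <typename TAcc>
  ALPAKA_FN_ACC void operator()(const TAcc& acc, float const* values, uint32_t size, float* result) const {
    // 1-D grid-strided loop: each thread folds its share of the elements into *result
    const uint32_t thread = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
    const uint32_t stride = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u];
    for (uint32_t i = thread; i < size; i += stride) {
      atomicMaxF(acc, result, values[i]);
    }
  }
};

The result buffer is assumed to be initialised to std::numeric_limits<float>::lowest() before the kernel is launched.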