From 6f490752bf905595b832fa0d964602b0adf0e062 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Mon, 20 Sep 2021 18:29:07 -0700 Subject: [PATCH 1/2] Replace a few stray host ::std uses with internals, add a __host form of fetch_min/max --- include/cuda/std/atomic | 20 ++---- include/cuda/std/detail/__config | 2 + libcxx/include/atomic | 4 ++ libcxx/include/support/atomic/atomic_base.h | 28 ++++++++ libcxx/include/support/atomic/atomic_cuda.h | 65 +------------------ .../support/atomic/atomic_cuda_scopes.h | 41 ++++++++++++ 6 files changed, 83 insertions(+), 77 deletions(-) create mode 100644 libcxx/include/support/atomic/atomic_cuda_scopes.h diff --git a/include/cuda/std/atomic b/include/cuda/std/atomic index 35fac823cf..fa277dd829 100644 --- a/include/cuda/std/atomic +++ b/include/cuda/std/atomic @@ -37,6 +37,7 @@ #undef ATOMIC_VAR_INIT #endif //__CUDACC_RTC__ + #include "cassert" #include "cstddef" #include "cstdint" @@ -63,8 +64,6 @@ namespace __detail { using std::__detail::__thread_scope_block_tag; using std::__detail::__thread_scope_device_tag; using std::__detail::__thread_scope_system_tag; -using std::__detail::__atomic_signal_fence_cuda; -using std::__detail::__atomic_thread_fence_cuda; } using memory_order = std::memory_order; @@ -173,32 +172,25 @@ inline __host__ __device__ void atomic_thread_fence(memory_order __m, thread_sco NV_IS_DEVICE, ( switch(_Scope) { case thread_scope::thread_scope_system: - __detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_system_tag()); + std::__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_system_tag()); break; case thread_scope::thread_scope_device: - __detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_device_tag()); + std::__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_device_tag()); break; case thread_scope::thread_scope_block: - __detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_block_tag()); + std::__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_block_tag()); break; } ), NV_IS_HOST, ( (void) _Scope; - ::std::atomic_thread_fence((::std::memory_order)__m); + std::atomic_thread_fence(__m); ) ) } inline __host__ __device__ void atomic_signal_fence(memory_order __m) { - NV_DISPATCH_TARGET( - NV_IS_DEVICE, ( - __detail::__atomic_signal_fence_cuda((int)__m); - ), - NV_IS_HOST, ( - ::std::atomic_signal_fence((::std::memory_order)__m); - ) - ) + std::atomic_signal_fence(__m); } _LIBCUDACXX_END_NAMESPACE_CUDA diff --git a/include/cuda/std/detail/__config b/include/cuda/std/detail/__config index e735f7a053..45c6fac4ec 100644 --- a/include/cuda/std/detail/__config +++ b/include/cuda/std/detail/__config @@ -71,6 +71,8 @@ #define __ELF__ #endif +#define _LIBCUDACXX_HAS_CUDA_ATOMIC_EXT + #include "libcxx/include/__config" #if defined(__CUDA_ARCH__) diff --git a/libcxx/include/atomic b/libcxx/include/atomic index ceb679ae6f..27217e51f4 100644 --- a/libcxx/include/atomic +++ b/libcxx/include/atomic @@ -680,6 +680,10 @@ __cxx_atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val) // Headers are wrapped like so: (cuda::std::|std::)detail namespace __detail { +#if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_EXT) +# include "support/atomic/atomic_cuda_scopes.h" +#endif + #if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_IMPL) # include "support/atomic/atomic_cuda.h" #elif defined(_LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL) diff --git a/libcxx/include/support/atomic/atomic_base.h b/libcxx/include/support/atomic/atomic_base.h index 4b362a8e6b..d03da59805 100644 --- a/libcxx/include/support/atomic/atomic_base.h +++ b/libcxx/include/support/atomic/atomic_base.h @@ -163,6 +163,34 @@ inline auto __cxx_atomic_fetch_xor(_Tp* __a, _Td __pattern, __cxx_atomic_order_to_int(__order)); } +template +inline auto __cxx_atomic_fetch_max(_Tp* __a, _Td __val, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + auto __expected = __cxx_atomic_load(__a, memory_order_relaxed); + auto __desired = __expected > __val ? __expected : __val; + + while(__desired == __val && + !__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) { + __desired = __expected > __val ? __expected : __val; + } + + return __expected; +} + +template +inline auto __cxx_atomic_fetch_min(_Tp* __a, _Td __val, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + auto __expected = __cxx_atomic_load(__a, memory_order_relaxed); + auto __desired = __expected < __val ? __expected : __val; + + while(__desired == __val && + !__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) { + __desired = __expected < __val ? __expected : __val; + } + + return __expected; +} + inline constexpr bool __cxx_atomic_is_lock_free(size_t __x) { #if defined(_LIBCUDACXX_NO_RUNTIME_LOCK_FREE) diff --git a/libcxx/include/support/atomic/atomic_cuda.h b/libcxx/include/support/atomic/atomic_cuda.h index 84ce6d7388..4ecc096b41 100644 --- a/libcxx/include/support/atomic/atomic_cuda.h +++ b/libcxx/include/support/atomic/atomic_cuda.h @@ -33,13 +33,6 @@ #define __ATOMIC_SEQ_CST 5 #endif //__ATOMIC_RELAXED -#ifndef __ATOMIC_BLOCK -#define __ATOMIC_SYSTEM 0 // 0 indicates default -#define __ATOMIC_DEVICE 1 -#define __ATOMIC_BLOCK 2 -#define __ATOMIC_THREAD 10 -#endif //__ATOMIC_BLOCK - inline __host__ __device__ int __stronger_order_cuda(int __a, int __b) { int const __max = __a > __b ? __a : __b; if(__max != __ATOMIC_RELEASE) @@ -52,42 +45,6 @@ inline __host__ __device__ int __stronger_order_cuda(int __a, int __b) { return __xform[__a < __b ? __a : __b]; } -enum thread_scope { - thread_scope_system = __ATOMIC_SYSTEM, - thread_scope_device = __ATOMIC_DEVICE, - thread_scope_block = __ATOMIC_BLOCK, - thread_scope_thread = __ATOMIC_THREAD -}; - -#define _LIBCUDACXX_ATOMIC_SCOPE_TYPE ::cuda::thread_scope -#define _LIBCUDACXX_ATOMIC_SCOPE_DEFAULT ::cuda::thread_scope::system - -struct __thread_scope_thread_tag { }; -struct __thread_scope_block_tag { }; -struct __thread_scope_device_tag { }; -struct __thread_scope_system_tag { }; - -template struct __scope_enum_to_tag { }; -/* This would be the implementation once an actual thread-scope backend exists. -template<> struct __scope_enum_to_tag<(int)thread_scope_thread> { - using type = __thread_scope_thread_tag; }; -Until then: */ -template<> struct __scope_enum_to_tag<(int)thread_scope_thread> { - using type = __thread_scope_block_tag; }; -template<> struct __scope_enum_to_tag<(int)thread_scope_block> { - using type = __thread_scope_block_tag; }; -template<> struct __scope_enum_to_tag<(int)thread_scope_device> { - using type = __thread_scope_device_tag; }; -template<> struct __scope_enum_to_tag<(int)thread_scope_system> { - using type = __thread_scope_system_tag; }; - -template -_LIBCUDACXX_INLINE_VISIBILITY auto constexpr __scope_tag() -> - typename __scope_enum_to_tag<_Scope>::type { - return typename __scope_enum_to_tag<_Scope>::type(); -} -// END TODO - // Wrap host atomic implementations into a sub-namespace namespace __host { #if defined(_LIBCUDACXX_COMPILER_MSVC) @@ -385,16 +342,7 @@ __host__ __device__ NV_IS_DEVICE, ( return __atomic_fetch_max_cuda(__cxx_get_underlying_device_atomic(__a), __val, __order, __scope_tag<_Sco>()); ), ( - // IS_HOST - _Tp __expected = __cxx_atomic_load(__a, memory_order_relaxed); - _Tp __desired = __expected > __val ? __expected : __val; - - while(__desired == __val && - !__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) { - __desired = __expected > __val ? __expected : __val; - } - - return __expected; + return __host::__cxx_atomic_fetch_max(&__a->__a_value, __val, __order); ) ) } @@ -406,16 +354,7 @@ __host__ __device__ NV_IS_DEVICE, ( return __atomic_fetch_min_cuda(__cxx_get_underlying_device_atomic(__a), __val, __order, __scope_tag<_Sco>()); ), ( - // IS_HOST - _Tp __expected = __cxx_atomic_load(__a, memory_order_relaxed); - _Tp __desired = __expected < __val ? __expected : __val; - - while(__desired == __val && - !__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) { - __desired = __expected < __val ? __expected : __val; - } - - return __expected; + return __host::__cxx_atomic_fetch_min(&__a->__a_value, __val, __order); ) ) } diff --git a/libcxx/include/support/atomic/atomic_cuda_scopes.h b/libcxx/include/support/atomic/atomic_cuda_scopes.h new file mode 100644 index 0000000000..4d8e1477dd --- /dev/null +++ b/libcxx/include/support/atomic/atomic_cuda_scopes.h @@ -0,0 +1,41 @@ +#ifndef __ATOMIC_BLOCK +#define __ATOMIC_SYSTEM 0 // 0 indicates default +#define __ATOMIC_DEVICE 1 +#define __ATOMIC_BLOCK 2 +#define __ATOMIC_THREAD 10 +#endif //__ATOMIC_BLOCK + +enum thread_scope { + thread_scope_system = __ATOMIC_SYSTEM, + thread_scope_device = __ATOMIC_DEVICE, + thread_scope_block = __ATOMIC_BLOCK, + thread_scope_thread = __ATOMIC_THREAD +}; + +#define _LIBCUDACXX_ATOMIC_SCOPE_TYPE ::cuda::thread_scope +#define _LIBCUDACXX_ATOMIC_SCOPE_DEFAULT ::cuda::thread_scope::system + +struct __thread_scope_thread_tag { }; +struct __thread_scope_block_tag { }; +struct __thread_scope_device_tag { }; +struct __thread_scope_system_tag { }; + +template struct __scope_enum_to_tag { }; +/* This would be the implementation once an actual thread-scope backend exists. +template<> struct __scope_enum_to_tag<(int)thread_scope_thread> { + using type = __thread_scope_thread_tag; }; +Until then: */ +template<> struct __scope_enum_to_tag<(int)thread_scope_thread> { + using type = __thread_scope_block_tag; }; +template<> struct __scope_enum_to_tag<(int)thread_scope_block> { + using type = __thread_scope_block_tag; }; +template<> struct __scope_enum_to_tag<(int)thread_scope_device> { + using type = __thread_scope_device_tag; }; +template<> struct __scope_enum_to_tag<(int)thread_scope_system> { + using type = __thread_scope_system_tag; }; + +template +_LIBCUDACXX_INLINE_VISIBILITY auto constexpr __scope_tag() -> + typename __scope_enum_to_tag<_Scope>::type { + return typename __scope_enum_to_tag<_Scope>::type(); +} From 21563c7bfeb9c3612570f4c3443afa2b7834b5b6 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Mon, 27 Sep 2021 19:47:02 -0700 Subject: [PATCH 2/2] Rename CUDA scopes header to atomic_scopes.h --- libcxx/include/atomic | 2 +- .../support/atomic/{atomic_cuda_scopes.h => atomic_scopes.h} | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) rename libcxx/include/support/atomic/{atomic_cuda_scopes.h => atomic_scopes.h} (93%) diff --git a/libcxx/include/atomic b/libcxx/include/atomic index 27217e51f4..7ca9731aaa 100644 --- a/libcxx/include/atomic +++ b/libcxx/include/atomic @@ -681,7 +681,7 @@ __cxx_atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val) // Headers are wrapped like so: (cuda::std::|std::)detail namespace __detail { #if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_EXT) -# include "support/atomic/atomic_cuda_scopes.h" +# include "support/atomic/atomic_scopes.h" #endif #if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_IMPL) diff --git a/libcxx/include/support/atomic/atomic_cuda_scopes.h b/libcxx/include/support/atomic/atomic_scopes.h similarity index 93% rename from libcxx/include/support/atomic/atomic_cuda_scopes.h rename to libcxx/include/support/atomic/atomic_scopes.h index 4d8e1477dd..89d73bb3f8 100644 --- a/libcxx/include/support/atomic/atomic_cuda_scopes.h +++ b/libcxx/include/support/atomic/atomic_scopes.h @@ -1,3 +1,6 @@ +#ifndef __LIBCUDACXX_ATOMIC_SCOPES_H +#define __LIBCUDACXX_ATOMIC_SCOPES_H + #ifndef __ATOMIC_BLOCK #define __ATOMIC_SYSTEM 0 // 0 indicates default #define __ATOMIC_DEVICE 1 @@ -39,3 +42,5 @@ _LIBCUDACXX_INLINE_VISIBILITY auto constexpr __scope_tag() -> typename __scope_enum_to_tag<_Scope>::type { return typename __scope_enum_to_tag<_Scope>::type(); } + +#endif // __LIBCUDACXX_ATOMIC_SCOPES_H