This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Fix GCC/Clang only compilation of <cuda/std/atomic> #207

Merged: 2 commits, Sep 28, 2021
include/cuda/std/atomic (6 additions, 14 deletions)
@@ -37,6 +37,7 @@
#undef ATOMIC_VAR_INIT
#endif //__CUDACC_RTC__

+
#include "cassert"
#include "cstddef"
#include "cstdint"
@@ -63,8 +64,6 @@ namespace __detail {
using std::__detail::__thread_scope_block_tag;
using std::__detail::__thread_scope_device_tag;
using std::__detail::__thread_scope_system_tag;
-using std::__detail::__atomic_signal_fence_cuda;
-using std::__detail::__atomic_thread_fence_cuda;
}

using memory_order = std::memory_order;
@@ -173,32 +172,25 @@ inline __host__ __device__ void atomic_thread_fence(memory_order __m, thread_sco
NV_IS_DEVICE, (
switch(_Scope) {
case thread_scope::thread_scope_system:
-__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_system_tag());
+std::__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_system_tag());
break;
case thread_scope::thread_scope_device:
-__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_device_tag());
+std::__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_device_tag());
break;
case thread_scope::thread_scope_block:
-__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_block_tag());
+std::__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_block_tag());
break;
}
),
NV_IS_HOST, (
(void) _Scope;
-::std::atomic_thread_fence((::std::memory_order)__m);
+std::atomic_thread_fence(__m);
)
)
}

inline __host__ __device__ void atomic_signal_fence(memory_order __m) {
-NV_DISPATCH_TARGET(
-NV_IS_DEVICE, (
-__detail::__atomic_signal_fence_cuda((int)__m);
-),
-NV_IS_HOST, (
-::std::atomic_signal_fence((::std::memory_order)__m);
-)
-)
+std::atomic_signal_fence(__m);
}

_LIBCUDACXX_END_NAMESPACE_CUDA
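For context, a minimal usage sketch of the fences this hunk fixes (hypothetical code, not part of the PR): with the call sites fully qualified as std::__detail::, the header compiles when GCC or Clang is the host compiler, and the host path now forwards straight to the standard fence.

#include <cuda/std/atomic>

// Hypothetical publish kernel: the release fence at device scope orders
// the plain store before the flag store.
__global__ void publish(cuda::std::atomic<int>* flag, int* data) {
  *data = 42;
  // On device this dispatches to __atomic_thread_fence_cuda with
  // __thread_scope_device_tag; on host the same function forwards to
  // std::atomic_thread_fence.
  cuda::atomic_thread_fence(cuda::std::memory_order_release,
                            cuda::thread_scope_device);
  flag->store(1, cuda::std::memory_order_relaxed);
}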
include/cuda/std/detail/__config (2 additions, 0 deletions)
@@ -71,6 +71,8 @@
#define __ELF__
#endif

+#define _LIBCUDACXX_HAS_CUDA_ATOMIC_EXT
+
#include "libcxx/include/__config"

#if defined(__CUDA_ARCH__)
libcxx/include/atomic (4 additions, 0 deletions)
@@ -680,6 +680,10 @@ __cxx_atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val)

// Headers are wrapped like so: (cuda::std::|std::)detail
namespace __detail {
+#if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_EXT)
+# include "support/atomic/atomic_cuda_scopes.h"
+#endif
+
#if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_IMPL)
# include "support/atomic/atomic_cuda.h"
#elif defined(_LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL)
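A note on why the macro is defined in cuda/std/detail/__config rather than next to its use (my reading, not text from the PR): as the comment above says, the same libc++ <atomic> body is wrapped both as std::__detail and as cuda::std::__detail, so the feature macro must already be visible when the wrapped __config is included. A simplified sketch of the pattern, with the names taken from the diff but the wrapping itself paraphrased:

// Simplified, illustrative paraphrase of the wrapping; not the real code.
#define _LIBCUDACXX_HAS_CUDA_ATOMIC_EXT  // set by cuda/std/detail/__config

namespace cuda { namespace std { namespace __detail {
#if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_EXT)
// ...contents of support/atomic/atomic_cuda_scopes.h land here, so the
// scope tags exist under cuda::std::__detail as well as std::__detail.
#endif
}}}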
libcxx/include/support/atomic/atomic_base.h (28 additions, 0 deletions)
@@ -163,6 +163,34 @@ inline auto __cxx_atomic_fetch_xor(_Tp* __a, _Td __pattern,
__cxx_atomic_order_to_int(__order));
}

+template <typename _Tp, typename _Td>
+inline auto __cxx_atomic_fetch_max(_Tp* __a, _Td __val,
+memory_order __order) -> __cxx_atomic_underlying_t<_Tp> {
+auto __expected = __cxx_atomic_load(__a, memory_order_relaxed);
+auto __desired = __expected > __val ? __expected : __val;
+
+while(__desired == __val &&
+!__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) {
+__desired = __expected > __val ? __expected : __val;
+}
+
+return __expected;
+}
+
+template <typename _Tp, typename _Td>
+inline auto __cxx_atomic_fetch_min(_Tp* __a, _Td __val,
+memory_order __order) -> __cxx_atomic_underlying_t<_Tp> {
+auto __expected = __cxx_atomic_load(__a, memory_order_relaxed);
+auto __desired = __expected < __val ? __expected : __val;
+
+while(__desired == __val &&
+!__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) {
+__desired = __expected < __val ? __expected : __val;
+}
+
+return __expected;
+}
+
inline constexpr
bool __cxx_atomic_is_lock_free(size_t __x) {
#if defined(_LIBCUDACXX_NO_RUNTIME_LOCK_FREE)
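To make the new helpers easier to follow, here is a self-contained equivalent of the fetch_max loop written against plain std::atomic (a sketch with assumed names; the failure order is pinned to relaxed here because the standard forbids a release failure order). The loop attempts the RMW only while __val would be the new maximum; if the loaded value already wins, it is returned without a store.

#include <atomic>

// Standalone sketch of the CAS-loop fetch_max above (names are assumptions).
template <class T>
T fetch_max_sketch(std::atomic<T>& a, T val, std::memory_order order) {
  T expected = a.load(std::memory_order_relaxed);
  T desired  = expected > val ? expected : val;
  // Retry only while `val` is the winning maximum; a failed CAS refreshes
  // `expected`, so the winner is recomputed each iteration.
  while (desired == val &&
         !a.compare_exchange_strong(expected, desired, order,
                                    std::memory_order_relaxed)) {
    desired = expected > val ? expected : val;
  }
  return expected;  // the observed previous value, as fetch_* returns
}

__cxx_atomic_fetch_min is the mirror image with < in place of >.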
libcxx/include/support/atomic/atomic_cuda.h (2 additions, 63 deletions)
@@ -33,13 +33,6 @@
#define __ATOMIC_SEQ_CST 5
#endif //__ATOMIC_RELAXED

-#ifndef __ATOMIC_BLOCK
-#define __ATOMIC_SYSTEM 0 // 0 indicates default
-#define __ATOMIC_DEVICE 1
-#define __ATOMIC_BLOCK 2
-#define __ATOMIC_THREAD 10
-#endif //__ATOMIC_BLOCK
-
inline __host__ __device__ int __stronger_order_cuda(int __a, int __b) {
int const __max = __a > __b ? __a : __b;
if(__max != __ATOMIC_RELEASE)
@@ -52,42 +45,6 @@ inline __host__ __device__ int __stronger_order_cuda(int __a, int __b) {
return __xform[__a < __b ? __a : __b];
}

-enum thread_scope {
-thread_scope_system = __ATOMIC_SYSTEM,
-thread_scope_device = __ATOMIC_DEVICE,
-thread_scope_block = __ATOMIC_BLOCK,
-thread_scope_thread = __ATOMIC_THREAD
-};
-
-#define _LIBCUDACXX_ATOMIC_SCOPE_TYPE ::cuda::thread_scope
-#define _LIBCUDACXX_ATOMIC_SCOPE_DEFAULT ::cuda::thread_scope::system
-
-struct __thread_scope_thread_tag { };
-struct __thread_scope_block_tag { };
-struct __thread_scope_device_tag { };
-struct __thread_scope_system_tag { };
-
-template<int _Scope> struct __scope_enum_to_tag { };
-/* This would be the implementation once an actual thread-scope backend exists.
-template<> struct __scope_enum_to_tag<(int)thread_scope_thread> {
-using type = __thread_scope_thread_tag; };
-Until then: */
-template<> struct __scope_enum_to_tag<(int)thread_scope_thread> {
-using type = __thread_scope_block_tag; };
-template<> struct __scope_enum_to_tag<(int)thread_scope_block> {
-using type = __thread_scope_block_tag; };
-template<> struct __scope_enum_to_tag<(int)thread_scope_device> {
-using type = __thread_scope_device_tag; };
-template<> struct __scope_enum_to_tag<(int)thread_scope_system> {
-using type = __thread_scope_system_tag; };
-
-template <int _Scope>
-_LIBCUDACXX_INLINE_VISIBILITY auto constexpr __scope_tag() ->
-typename __scope_enum_to_tag<_Scope>::type {
-return typename __scope_enum_to_tag<_Scope>::type();
-}
-// END TODO
-
// Wrap host atomic implementations into a sub-namespace
namespace __host {
#if defined(_LIBCUDACXX_COMPILER_MSVC)
@@ -385,16 +342,7 @@ __host__ __device__
NV_IS_DEVICE, (
return __atomic_fetch_max_cuda(__cxx_get_underlying_device_atomic(__a), __val, __order, __scope_tag<_Sco>());
), (
-// IS_HOST
-_Tp __expected = __cxx_atomic_load(__a, memory_order_relaxed);
-_Tp __desired = __expected > __val ? __expected : __val;
-
-while(__desired == __val &&
-!__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) {
-__desired = __expected > __val ? __expected : __val;
-}
-
-return __expected;
+return __host::__cxx_atomic_fetch_max(&__a->__a_value, __val, __order);
)
)
}
@@ -406,16 +354,7 @@ __host__ __device__
NV_IS_DEVICE, (
return __atomic_fetch_min_cuda(__cxx_get_underlying_device_atomic(__a), __val, __order, __scope_tag<_Sco>());
), (
-// IS_HOST
-_Tp __expected = __cxx_atomic_load(__a, memory_order_relaxed);
-_Tp __desired = __expected < __val ? __expected : __val;
-
-while(__desired == __val &&
-!__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) {
-__desired = __expected < __val ? __expected : __val;
-}
-
-return __expected;
+return __host::__cxx_atomic_fetch_min(&__a->__a_value, __val, __order);
)
)
}
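Both host fallbacks now route to the shared CAS-loop implementations added to atomic_base.h (via the __host:: sub-namespace) instead of duplicating the loop inline. For reference, a hypothetical caller of the extension; fetch_max on cuda::atomic is a libcu++ extension over std::atomic, and on device it takes the __atomic_fetch_max_cuda path above:

#include <cuda/atomic>

// Hypothetical kernel: every thread folds one input element into a
// device-scoped running maximum.
__global__ void running_max(cuda::atomic<int, cuda::thread_scope_device>* m,
                            const int* in, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    m->fetch_max(in[i], cuda::std::memory_order_relaxed);
}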
libcxx/include/support/atomic/atomic_cuda_scopes.h (new file, 41 additions)
@@ -0,0 +1,41 @@
#ifndef __ATOMIC_BLOCK
[Collaborator review comment on this file: I wonder if this name or something like atomic_scopes.h is better. We only expose these in the cuda:: versions of the interface, true, but it's used throughout...]

#define __ATOMIC_SYSTEM 0 // 0 indicates default
#define __ATOMIC_DEVICE 1
#define __ATOMIC_BLOCK 2
#define __ATOMIC_THREAD 10
#endif //__ATOMIC_BLOCK

enum thread_scope {
thread_scope_system = __ATOMIC_SYSTEM,
thread_scope_device = __ATOMIC_DEVICE,
thread_scope_block = __ATOMIC_BLOCK,
thread_scope_thread = __ATOMIC_THREAD
};

#define _LIBCUDACXX_ATOMIC_SCOPE_TYPE ::cuda::thread_scope
#define _LIBCUDACXX_ATOMIC_SCOPE_DEFAULT ::cuda::thread_scope::system

struct __thread_scope_thread_tag { };
struct __thread_scope_block_tag { };
struct __thread_scope_device_tag { };
struct __thread_scope_system_tag { };

template<int _Scope> struct __scope_enum_to_tag { };
/* This would be the implementation once an actual thread-scope backend exists.
template<> struct __scope_enum_to_tag<(int)thread_scope_thread> {
using type = __thread_scope_thread_tag; };
Until then: */
template<> struct __scope_enum_to_tag<(int)thread_scope_thread> {
using type = __thread_scope_block_tag; };
template<> struct __scope_enum_to_tag<(int)thread_scope_block> {
using type = __thread_scope_block_tag; };
template<> struct __scope_enum_to_tag<(int)thread_scope_device> {
using type = __thread_scope_device_tag; };
template<> struct __scope_enum_to_tag<(int)thread_scope_system> {
using type = __thread_scope_system_tag; };

template <int _Scope>
_LIBCUDACXX_INLINE_VISIBILITY auto constexpr __scope_tag() ->
typename __scope_enum_to_tag<_Scope>::type {
return typename __scope_enum_to_tag<_Scope>::type();
}
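The point of this file is the enum-to-tag mapping: each scope enumerator corresponds to a distinct empty type, so backend code selects a per-scope overload at compile time instead of branching at runtime. A self-contained sketch of the pattern (simplified, with illustrative names; membar.cta/membar.gpu stand in for the real per-scope code paths):

// Standalone illustration of enum-to-tag dispatch; all names are assumptions.
enum scope { scope_device = 1, scope_block = 2 };

struct block_tag {};
struct device_tag {};

template <int S> struct scope_to_tag;
template <> struct scope_to_tag<scope_block>  { using type = block_tag; };
template <> struct scope_to_tag<scope_device> { using type = device_tag; };

// One overload per scope; the empty tag argument is free at runtime.
inline const char* fence_asm(block_tag)  { return "membar.cta"; }
inline const char* fence_asm(device_tag) { return "membar.gpu"; }

template <int S>
const char* fence_for() {
  return fence_asm(typename scope_to_tag<S>::type{});
}
// fence_for<scope_block>() yields "membar.cta" with no runtime switch.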