From 6f490752bf905595b832fa0d964602b0adf0e062 Mon Sep 17 00:00:00 2001
From: Wesley Maxey <wesley.maxey@gmail.com>
Date: Mon, 20 Sep 2021 18:29:07 -0700
Subject: [PATCH 1/2] Replace a few stray host ::std <atomic> uses with
 internals, add a __host form of fetch_min/max

---
 include/cuda/std/atomic                       | 20 ++----
 include/cuda/std/detail/__config              |  2 +
 libcxx/include/atomic                         |  4 ++
 libcxx/include/support/atomic/atomic_base.h   | 28 ++++++++
 libcxx/include/support/atomic/atomic_cuda.h   | 65 +------------------
 .../support/atomic/atomic_cuda_scopes.h       | 41 ++++++++++++
 6 files changed, 83 insertions(+), 77 deletions(-)
 create mode 100644 libcxx/include/support/atomic/atomic_cuda_scopes.h
diff --git a/include/cuda/std/atomic b/include/cuda/std/atomic
index 35fac823cf..fa277dd829 100644
--- a/include/cuda/std/atomic
+++ b/include/cuda/std/atomic
@@ -37,6 +37,7 @@
     #undef ATOMIC_VAR_INIT
 #endif //__CUDACC_RTC__
 
+
 #include "cassert"
 #include "cstddef"
 #include "cstdint"
@@ -63,8 +64,6 @@ namespace __detail {
 using std::__detail::__thread_scope_block_tag;
 using std::__detail::__thread_scope_device_tag;
 using std::__detail::__thread_scope_system_tag;
-using std::__detail::__atomic_signal_fence_cuda;
-using std::__detail::__atomic_thread_fence_cuda;
 }
 
 using memory_order = std::memory_order;
@@ -173,32 +172,25 @@ inline __host__ __device__ void atomic_thread_fence(memory_order __m, thread_sco
         NV_IS_DEVICE, (
             switch(_Scope) {
             case thread_scope::thread_scope_system:
-                __detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_system_tag());
+                std::__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_system_tag());
                 break;
             case thread_scope::thread_scope_device:
-                __detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_device_tag());
+                std::__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_device_tag());
                 break;
             case thread_scope::thread_scope_block:
-                __detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_block_tag());
+                std::__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_block_tag());
                 break;
             }
         ),
         NV_IS_HOST, (
             (void) _Scope;
-            ::std::atomic_thread_fence((::std::memory_order)__m);
+            std::atomic_thread_fence(__m);
         )
     )
 }
 
 inline __host__ __device__ void atomic_signal_fence(memory_order __m) {
-    NV_DISPATCH_TARGET(
-        NV_IS_DEVICE, (
-            __detail::__atomic_signal_fence_cuda((int)__m);
-        ),
-        NV_IS_HOST, (
-            ::std::atomic_signal_fence((::std::memory_order)__m);
-        )
-    )
+    std::atomic_signal_fence(__m);
 }
 
 _LIBCUDACXX_END_NAMESPACE_CUDA
diff --git a/include/cuda/std/detail/__config b/include/cuda/std/detail/__config
index e735f7a053..45c6fac4ec 100644
--- a/include/cuda/std/detail/__config
+++ b/include/cuda/std/detail/__config
@@ -71,6 +71,8 @@
     #define __ELF__
 #endif
 
+#define _LIBCUDACXX_HAS_CUDA_ATOMIC_EXT
+
 #include "libcxx/include/__config"
 
 #if defined(__CUDA_ARCH__)
diff --git a/libcxx/include/atomic b/libcxx/include/atomic
index ceb679ae6f..27217e51f4 100644
--- a/libcxx/include/atomic
+++ b/libcxx/include/atomic
@@ -680,6 +680,10 @@ __cxx_atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val)
 
 // Headers are wrapped like so: (cuda::std::|std::)detail
 namespace __detail {
+#if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_EXT)
+#  include "support/atomic/atomic_cuda_scopes.h"
+#endif
+
 #if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_IMPL)
 #  include "support/atomic/atomic_cuda.h"
 #elif defined(_LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL)
diff --git a/libcxx/include/support/atomic/atomic_base.h b/libcxx/include/support/atomic/atomic_base.h
index 4b362a8e6b..d03da59805 100644
--- a/libcxx/include/support/atomic/atomic_base.h
+++ b/libcxx/include/support/atomic/atomic_base.h
@@ -163,6 +163,34 @@ inline auto __cxx_atomic_fetch_xor(_Tp* __a, _Td __pattern,
                             __cxx_atomic_order_to_int(__order));
 }
 
+template <typename _Tp, typename _Td>
+inline auto __cxx_atomic_fetch_max(_Tp* __a, _Td __val,
+                           memory_order __order) -> __cxx_atomic_underlying_t<_Tp> {
+  auto __expected = __cxx_atomic_load(__a, memory_order_relaxed);
+  auto __desired = __expected > __val ? __expected : __val;
+
+  while(__desired == __val &&
+          !__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) {
+      __desired = __expected > __val ? __expected : __val;
+  }
+
+  return __expected;
+}
+
+template <typename _Tp, typename _Td>
+inline auto __cxx_atomic_fetch_min(_Tp* __a, _Td __val,
+                           memory_order __order) -> __cxx_atomic_underlying_t<_Tp> {
+  auto __expected = __cxx_atomic_load(__a, memory_order_relaxed);
+  auto __desired = __expected < __val ? __expected : __val;
+
+  while(__desired == __val &&
+          !__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) {
+      __desired = __expected < __val ? __expected : __val;
+  }
+
+  return __expected;
+}
+
 inline constexpr
  bool __cxx_atomic_is_lock_free(size_t __x) {
   #if defined(_LIBCUDACXX_NO_RUNTIME_LOCK_FREE)
diff --git a/libcxx/include/support/atomic/atomic_cuda.h b/libcxx/include/support/atomic/atomic_cuda.h
index 84ce6d7388..4ecc096b41 100644
--- a/libcxx/include/support/atomic/atomic_cuda.h
+++ b/libcxx/include/support/atomic/atomic_cuda.h
@@ -33,13 +33,6 @@
 #define __ATOMIC_SEQ_CST 5
 #endif //__ATOMIC_RELAXED
 
-#ifndef __ATOMIC_BLOCK
-#define __ATOMIC_SYSTEM 0 // 0 indicates default
-#define __ATOMIC_DEVICE 1
-#define __ATOMIC_BLOCK 2
-#define __ATOMIC_THREAD 10
-#endif //__ATOMIC_BLOCK
-
 inline __host__ __device__ int __stronger_order_cuda(int __a, int __b) {
     int const __max = __a > __b ? __a : __b;
     if(__max != __ATOMIC_RELEASE)
@@ -52,42 +45,6 @@ inline __host__ __device__ int __stronger_order_cuda(int __a, int __b) {
     return __xform[__a < __b ? __a : __b];
 }
 
-enum thread_scope {
-    thread_scope_system = __ATOMIC_SYSTEM,
-    thread_scope_device = __ATOMIC_DEVICE,
-    thread_scope_block = __ATOMIC_BLOCK,
-    thread_scope_thread = __ATOMIC_THREAD
-};
-
-#define _LIBCUDACXX_ATOMIC_SCOPE_TYPE ::cuda::thread_scope
-#define _LIBCUDACXX_ATOMIC_SCOPE_DEFAULT ::cuda::thread_scope::system
-
-struct __thread_scope_thread_tag { };
-struct __thread_scope_block_tag { };
-struct __thread_scope_device_tag { };
-struct __thread_scope_system_tag { };
-
-template<int _Scope>  struct __scope_enum_to_tag { };
-/* This would be the implementation once an actual thread-scope backend exists.
-template<> struct __scope_enum_to_tag<(int)thread_scope_thread> {
-    using type = __thread_scope_thread_tag; };
-Until then: */
-template<> struct __scope_enum_to_tag<(int)thread_scope_thread> {
-    using type = __thread_scope_block_tag; };
-template<> struct __scope_enum_to_tag<(int)thread_scope_block> {
-    using type = __thread_scope_block_tag; };
-template<> struct __scope_enum_to_tag<(int)thread_scope_device> {
-    using type = __thread_scope_device_tag; };
-template<> struct __scope_enum_to_tag<(int)thread_scope_system> {
-    using type = __thread_scope_system_tag; };
-
-template <int _Scope>
-_LIBCUDACXX_INLINE_VISIBILITY auto constexpr __scope_tag() ->
-        typename __scope_enum_to_tag<_Scope>::type {
-    return typename __scope_enum_to_tag<_Scope>::type();
-}
-// END TODO
-
 // Wrap host atomic implementations into a sub-namespace
 namespace __host {
 #if defined(_LIBCUDACXX_COMPILER_MSVC)
@@ -385,16 +342,7 @@ __host__ __device__
         NV_IS_DEVICE, (
             return __atomic_fetch_max_cuda(__cxx_get_underlying_device_atomic(__a), __val, __order, __scope_tag<_Sco>());
         ), (
-            // IS_HOST
-            _Tp __expected = __cxx_atomic_load(__a, memory_order_relaxed);
-            _Tp __desired = __expected > __val ? __expected : __val;
-
-            while(__desired == __val &&
-                    !__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) {
-                __desired = __expected > __val ? __expected : __val;
-            }
-
-            return __expected;
+            return __host::__cxx_atomic_fetch_max(&__a->__a_value, __val, __order);
         )
     )
 }
@@ -406,16 +354,7 @@ __host__ __device__
         NV_IS_DEVICE, (
             return __atomic_fetch_min_cuda(__cxx_get_underlying_device_atomic(__a), __val, __order, __scope_tag<_Sco>());
         ), (
-            // IS_HOST
-            _Tp __expected = __cxx_atomic_load(__a, memory_order_relaxed);
-            _Tp __desired = __expected < __val ? __expected : __val;
-
-            while(__desired == __val &&
-                    !__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) {
-                __desired = __expected < __val ? __expected : __val;
-            }
-
-            return __expected;
+            return __host::__cxx_atomic_fetch_min(&__a->__a_value, __val, __order);
         )
     )
 }
diff --git a/libcxx/include/support/atomic/atomic_cuda_scopes.h b/libcxx/include/support/atomic/atomic_cuda_scopes.h
new file mode 100644
index 0000000000..4d8e1477dd
--- /dev/null
+++ b/libcxx/include/support/atomic/atomic_cuda_scopes.h
@@ -0,0 +1,41 @@
+#ifndef __ATOMIC_BLOCK
+#define __ATOMIC_SYSTEM 0 // 0 indicates default
+#define __ATOMIC_DEVICE 1
+#define __ATOMIC_BLOCK 2
+#define __ATOMIC_THREAD 10
+#endif //__ATOMIC_BLOCK
+
+enum thread_scope {
+    thread_scope_system = __ATOMIC_SYSTEM,
+    thread_scope_device = __ATOMIC_DEVICE,
+    thread_scope_block = __ATOMIC_BLOCK,
+    thread_scope_thread = __ATOMIC_THREAD
+};
+
+#define _LIBCUDACXX_ATOMIC_SCOPE_TYPE ::cuda::thread_scope
+#define _LIBCUDACXX_ATOMIC_SCOPE_DEFAULT ::cuda::thread_scope::system
+
+struct __thread_scope_thread_tag { };
+struct __thread_scope_block_tag { };
+struct __thread_scope_device_tag { };
+struct __thread_scope_system_tag { };
+
+template<int _Scope>  struct __scope_enum_to_tag { };
+/* This would be the implementation once an actual thread-scope backend exists.
+template<> struct __scope_enum_to_tag<(int)thread_scope_thread> {
+    using type = __thread_scope_thread_tag; };
+Until then: */
+template<> struct __scope_enum_to_tag<(int)thread_scope_thread> {
+    using type = __thread_scope_block_tag; };
+template<> struct __scope_enum_to_tag<(int)thread_scope_block> {
+    using type = __thread_scope_block_tag; };
+template<> struct __scope_enum_to_tag<(int)thread_scope_device> {
+    using type = __thread_scope_device_tag; };
+template<> struct __scope_enum_to_tag<(int)thread_scope_system> {
+    using type = __thread_scope_system_tag; };
+
+template <int _Scope>
+_LIBCUDACXX_INLINE_VISIBILITY auto constexpr __scope_tag() ->
+        typename __scope_enum_to_tag<_Scope>::type {
+    return typename __scope_enum_to_tag<_Scope>::type();
+}

From 21563c7bfeb9c3612570f4c3443afa2b7834b5b6 Mon Sep 17 00:00:00 2001
From: Wesley Maxey <wesley.maxey@gmail.com>
Date: Mon, 27 Sep 2021 19:47:02 -0700
Subject: [PATCH 2/2] Rename CUDA scopes header to atomic_scopes.h

---
 libcxx/include/atomic                                        | 2 +-
 .../support/atomic/{atomic_cuda_scopes.h => atomic_scopes.h} | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)
 rename libcxx/include/support/atomic/{atomic_cuda_scopes.h => atomic_scopes.h} (93%)

diff --git a/libcxx/include/atomic b/libcxx/include/atomic
index 27217e51f4..7ca9731aaa 100644
--- a/libcxx/include/atomic
+++ b/libcxx/include/atomic
@@ -681,7 +681,7 @@ __cxx_atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val)
 // Headers are wrapped like so: (cuda::std::|std::)detail
 namespace __detail {
 #if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_EXT)
-#  include "support/atomic/atomic_cuda_scopes.h"
+#  include "support/atomic/atomic_scopes.h"
 #endif
 
 #if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_IMPL)
diff --git a/libcxx/include/support/atomic/atomic_cuda_scopes.h b/libcxx/include/support/atomic/atomic_scopes.h
similarity index 93%
rename from libcxx/include/support/atomic/atomic_cuda_scopes.h
rename to libcxx/include/support/atomic/atomic_scopes.h
index 4d8e1477dd..89d73bb3f8 100644
--- a/libcxx/include/support/atomic/atomic_cuda_scopes.h
+++ b/libcxx/include/support/atomic/atomic_scopes.h
@@ -1,3 +1,6 @@
+#ifndef __LIBCUDACXX_ATOMIC_SCOPES_H
+#define __LIBCUDACXX_ATOMIC_SCOPES_H
+
 #ifndef __ATOMIC_BLOCK
 #define __ATOMIC_SYSTEM 0 // 0 indicates default
 #define __ATOMIC_DEVICE 1
@@ -39,3 +42,5 @@ _LIBCUDACXX_INLINE_VISIBILITY auto constexpr __scope_tag() ->
         typename __scope_enum_to_tag<_Scope>::type {
     return typename __scope_enum_to_tag<_Scope>::type();
 }
+
+#endif // __LIBCUDACXX_ATOMIC_SCOPES_H