NVIDIA · wmaxey · Jul 29, 2021 · Jan 30, 2021 · Mar 9, 2021 · Jul 23, 2021
diff --git a/.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp b/.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp
@@ -0,0 +1,109 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc, pre-sm-70
+
+// uncomment for a really verbose output detailing what test steps are being launched
+// #define DEBUG_TESTERS
+
+#include "helpers.h"
+
+#include <atomic>
+#include <cuda/barrier>
+
+template<typename Barrier>
+struct barrier_and_token
+{
+    using barrier_t = Barrier;
+    using token_t = typename barrier_t::arrival_token;
+
+    barrier_t barrier;
+    cuda::std::atomic<bool> parity_waiting{false};
+
+    template<typename ...Args>
+    __host__ __device__
+    barrier_and_token(Args && ...args) : barrier{ cuda::std::forward<Args>(args)... }
+    {
+    }
+};
+
+struct barrier_arrive_and_wait
+{
+    using async = cuda::std::true_type;
+
+    template<typename Data>
+    __host__ __device__
+    static void perform(Data & data)
+    {
+        while (data.parity_waiting.load(cuda::std::memory_order_acquire) == false)
+        {
+            data.parity_waiting.wait(false);
+        }
+        data.barrier.arrive_and_wait();
+    }
+};
+
+template <bool Phase>
+struct barrier_parity_wait
+{
+    using async = cuda::std::true_type;
+
+    template<typename Data>
+    __host__ __device__
+    static void perform(Data & data)
+    {
+        data.parity_waiting.store(true, cuda::std::memory_order_release);
+        data.parity_waiting.notify_all();
+        data.barrier.wait_parity(Phase);
+    }
+};
+
+struct clear_token
+{
+    template<typename Data>
+    __host__ __device__
+    static void perform(Data & data)
+    {
+        data.parity_waiting.store(false, cuda::std::memory_order_release);
+    }
+};
+
+using aw_aw_pw = performer_list<
+    barrier_parity_wait<false>,
+    barrier_arrive_and_wait,
+    barrier_arrive_and_wait,
+    async_tester_fence,
+    clear_token,
+    barrier_parity_wait<true>,
+    barrier_arrive_and_wait,
+    barrier_arrive_and_wait,
+    async_tester_fence,
+    clear_token
+>;
+
+void kernel_invoker()
+{
+    validate_not_movable<
+        barrier_and_token<cuda::std::barrier<>>,
+        aw_aw_pw
+      >(2);
+    validate_not_movable<
+        barrier_and_token<cuda::barrier<cuda::thread_scope_system>>,
+        aw_aw_pw
+      >(2);
+}
+
+int main(int arg, char ** argv)
+{
+#ifndef __CUDA_ARCH__
+    kernel_invoker();
+#endif
+
+    return 0;
+}
+
diff --git a/include/cuda/std/barrier b/include/cuda/std/barrier
@@ -40,8 +40,6 @@ class barrier : public std::__barrier_base<_CompletionF, _Sco> {
     template<thread_scope>
     friend class pipeline;
 
-    using std::__barrier_base<_CompletionF, _Sco>::__try_wait;
-
 public:
     barrier() = default;
 
@@ -88,24 +86,6 @@ class barrier<thread_scope_block, std::__empty_completion> : public __block_scop
 public:
     using arrival_token = typename __barrier_base::arrival_token;
 
-private:
-    struct __poll_tester {
-        barrier const* __this;
-        arrival_token __phase;
-
-        _LIBCUDACXX_INLINE_VISIBILITY
-        __poll_tester(barrier const* __this_, arrival_token&& __phase_)
-          : __this(__this_)
-          , __phase(_CUDA_VSTD::move(__phase_))
-        {}
-
-        inline _LIBCUDACXX_INLINE_VISIBILITY
-        bool operator()() const
-        {
-            return __this->__try_wait(__phase);
-        }
-    };
-
     _LIBCUDACXX_INLINE_VISIBILITY
     bool __try_wait(arrival_token __phase) const {
 #if __CUDA_ARCH__ >= 800
@@ -124,14 +104,35 @@ private:
         else
 #endif
         {
-            return __barrier.__try_wait(std::move(__phase));
+            return __barrier.try_wait(std::move(__phase));
         }
     }
 
     template<thread_scope>
     friend class pipeline;
 
-public:
+    _LIBCUDACXX_INLINE_VISIBILITY
+    bool __try_wait_parity(bool __parity) const {
+#if __CUDA_ARCH__ >= 800
+        if (__isShared(&__barrier)) {
+            int __ready = 0;
+            asm volatile ("{\n\t"
+                    ".reg .pred p;\n\t"
+                    "mbarrier.test_wait.parity.shared.b64 p, [%1], %2;\n\t"
+                    "selp.b32 %0, 1, 0, p;\n\t"
+                    "}"
+                : "=r"(__ready)
+                : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))), "r"(static_cast<std::uint32_t>(__parity))
+                : "memory");
+            return bool(__ready);
+        }
+        else
+#endif
+        {
+            return __barrier.try_wait_parity(__parity);
+        }
+    }
+
     barrier() = default;
 
     barrier(const barrier &) = delete;
@@ -216,7 +217,24 @@ public:
     _LIBCUDACXX_INLINE_VISIBILITY
     void wait(arrival_token && __phase) const
     {
-        _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
+        _CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester<barrier>(this, _CUDA_VSTD::move(__phase)));
+    }
+
+    _LIBCUDACXX_INLINE_VISIBILITY
+    bool try_wait(arrival_token __phase) const {
+        return __try_wait(_CUDA_VSTD::move(__phase));
+    }
+
+    inline _LIBCUDACXX_INLINE_VISIBILITY
+    bool try_wait_parity(bool __parity) const
+    {
+        return __try_wait_parity(__parity);
+    }
+
+    inline _LIBCUDACXX_INLINE_VISIBILITY
+    void wait_parity(bool __parity) const
+    {
+        _CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester_parity<barrier>(this, __parity));
     }
 
     inline _LIBCUDACXX_INLINE_VISIBILITY

diff --git a/libcxx/include/barrier b/libcxx/include/barrier
@@ -209,6 +209,12 @@ class __barrier_base {
     _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base<ptrdiff_t, _Sco> __expected, __arrived;
     _LIBCUDACXX_BARRIER_ALIGNMENTS _CompletionF                   __completion;
     _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base<bool, _Sco>      __phase;
+
+    _LIBCUDACXX_INLINE_VISIBILITY
+    bool __try_wait_phase(bool __old_phase) const
+    {
+        return __phase.load(memory_order_acquire) != __old_phase;
+    }
 public:
     using arrival_token = bool;
 
@@ -241,11 +247,15 @@ public:
         return __old_phase;
     }
     _LIBCUDACXX_INLINE_VISIBILITY
-    bool __try_wait(arrival_token __old_phase) const
+    bool try_wait(arrival_token __old) const
     {
-        return __phase != __old_phase;
+        return __try_wait_phase(__old);
+    }
+    _LIBCUDACXX_INLINE_VISIBILITY
+    bool __try_wait_parity(bool __parity) const
+    {
+        return __try_wait_phase(__parity);
     }
-
     _LIBCUDACXX_INLINE_VISIBILITY
     void wait(arrival_token&& __old_phase) const
     {
@@ -270,6 +280,42 @@ public:
     }
 };
 
+template<class __Barrier>
+struct __barrier_poll_tester {
+    __Barrier const* __this;
+    typename __Barrier::arrival_token __phase;
+
+    _LIBCUDACXX_INLINE_VISIBILITY
+    __barrier_poll_tester(__Barrier const* __this_, typename __Barrier::arrival_token&& __phase_)
+        : __this(__this_)
+        , __phase(_CUDA_VSTD::move(__phase_))
+    {}
+
+    _LIBCUDACXX_INLINE_VISIBILITY
+    bool operator()() const
+    {
+        return __this->try_wait(__phase);
+    }
+};
+
+template<class __Barrier>
+struct __barrier_poll_tester_parity {
+    __Barrier const* __this;
+    bool __parity;
+
+    _LIBCUDACXX_INLINE_VISIBILITY
+    __barrier_poll_tester_parity(__Barrier const* __this_, bool __parity_)
+        : __this(__this_)
+        , __parity(__parity_)
+    {}
+
+    inline _LIBCUDACXX_INLINE_VISIBILITY
+    bool operator()() const
+    {
+        return __this->try_wait_parity(__parity);
+    }
+};
+
 template<int _Sco>
 class __barrier_base<__empty_completion, _Sco> {
 
@@ -285,29 +331,23 @@ public:
     using arrival_token = uint64_t;
 
 private:
-    struct __poll_tester {
-        __barrier_base const* __this;
-        arrival_token __phase;
-
-        _LIBCUDACXX_INLINE_VISIBILITY
-        __poll_tester(__barrier_base const* __this_, arrival_token&& __phase_)
-          : __this(__this_)
-          , __phase(_CUDA_VSTD::move(__phase_))
-        {}
-
-        inline _LIBCUDACXX_INLINE_VISIBILITY
-        bool operator()() const
-        {
-            return __this->__try_wait(__phase);
-        }
-    };
-
-    static inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
+    static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
     uint64_t __init(ptrdiff_t __count) _NOEXCEPT
     {
         return (((1u << 31) - __count) << 32)
               | ((1u << 31) - __count);
     }
+    _LIBCUDACXX_INLINE_VISIBILITY
+    bool __try_wait_phase(uint64_t __phase) const
+    {
+        uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
+        return ((__current & __phase_bit) != __phase);
+    }
+    _LIBCUDACXX_INLINE_VISIBILITY
+    bool __try_wait_parity(bool __parity) const
+    {
+        return __try_wait_phase(__parity ? __phase_bit : 0);
+    }
 
 public:
     __barrier_base() = default;
@@ -323,10 +363,20 @@ public:
     __barrier_base& operator=(__barrier_base const&) = delete;
 
     _LIBCUDACXX_INLINE_VISIBILITY
-    bool __try_wait(arrival_token __phase) const
+    bool __try_wait(arrival_token __old) const
     {
-        uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
-        return ((__current & __phase_bit) != __phase);
+        return __try_wait_phase(__old & __phase_bit);
+    }
+
+    _LIBCUDACXX_INLINE_VISIBILITY
+    bool try_wait_parity(bool __parity) const
+    {
+        return __try_wait_parity(__parity);
+    }
+    _LIBCUDACXX_INLINE_VISIBILITY
+    bool try_wait(arrival_token __old) const
+    {
+        return __try_wait(__old);
     }
 
     _LIBCUDACXX_NODISCARD_ATTRIBUTE inline _LIBCUDACXX_INLINE_VISIBILITY
@@ -340,17 +390,22 @@ public:
         }
         return __old & __phase_bit;
     }
-    inline _LIBCUDACXX_INLINE_VISIBILITY
+    _LIBCUDACXX_INLINE_VISIBILITY
     void wait(arrival_token&& __phase) const
     {
-        __libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
+        __libcpp_thread_poll_with_backoff(__barrier_poll_tester<__barrier_base>(this, _CUDA_VSTD::move(__phase)));
     }
-    inline _LIBCUDACXX_INLINE_VISIBILITY
+    _LIBCUDACXX_INLINE_VISIBILITY
+    void wait_parity(bool __parity) const
+    {
+        __libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__barrier_base>(this, __parity));
+    }
+    _LIBCUDACXX_INLINE_VISIBILITY
     void arrive_and_wait()
     {
         wait(arrive());
     }
-    inline _LIBCUDACXX_INLINE_VISIBILITY
+    _LIBCUDACXX_INLINE_VISIBILITY
     void arrive_and_drop()
     {
         __phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed);