From 69e7f775ad33717e9d7a1a5c2d3a072522dc78ce Mon Sep 17 00:00:00 2001 From: Olivier Giroux Date: Fri, 29 Jan 2021 17:10:48 -0800 Subject: [PATCH 1/5] Added parity waiting --- include/cuda/std/barrier | 68 ++++++++++++++++++++++++++++------------ libcxx/include/barrier | 51 ++++++++++++++++++------------ 2 files changed, 79 insertions(+), 40 deletions(-) diff --git a/include/cuda/std/barrier b/include/cuda/std/barrier index b35e927e34..781be24296 100644 --- a/include/cuda/std/barrier +++ b/include/cuda/std/barrier @@ -77,6 +77,31 @@ _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE _LIBCUDACXX_BEGIN_NAMESPACE_CUDA +template +struct __barrier_poll_tester_parity { + __Barrier const* __this; + bool __parity; + + _LIBCUDACXX_INLINE_VISIBILITY + __barrier_poll_tester_parity(__Barrier const* __this_, bool __parity_) + : __this(__this_) + , __parity(__parity_) + {} + + inline _LIBCUDACXX_INLINE_VISIBILITY + bool operator()() const + { + return __this->__try_wait_parity(__parity); + } +}; + +template +inline _LIBCUDACXX_INLINE_VISIBILITY +void barrier_wait_for_parity(__Barrier const* __self, bool __parity) +{ + _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__Barrier>(__self, __parity)); +} + template<> class barrier : public __block_scope_barrier_base { using __barrier_base = std::__barrier_base; @@ -88,24 +113,6 @@ class barrier : public __block_scop public: using arrival_token = typename __barrier_base::arrival_token; -private: - struct __poll_tester { - barrier const* __this; - arrival_token __phase; - - _LIBCUDACXX_INLINE_VISIBILITY - __poll_tester(barrier const* __this_, arrival_token&& __phase_) - : __this(__this_) - , __phase(_CUDA_VSTD::move(__phase_)) - {} - - inline _LIBCUDACXX_INLINE_VISIBILITY - bool operator()() const - { - return __this->__try_wait(__phase); - } - }; - _LIBCUDACXX_INLINE_VISIBILITY bool __try_wait(arrival_token __phase) const { #if __CUDA_ARCH__ >= 800 @@ -131,7 +138,28 @@ private: template friend class pipeline; -public: + _LIBCUDACXX_INLINE_VISIBILITY + bool __try_wait_parity(bool __parity) const { +#if __CUDA_ARCH__ >= 800 + if (__isShared(&__barrier)) { + int __ready = 0; + asm volatile ("{\n\t" + ".reg .pred p;\n\t" + "mbarrier.test_wait.parity.shared.b64 p, [%1], %2;\n\t" + "selp.b32 %0, 1, 0, p;\n\t" + "}" + : "=r"(__ready) + : "r"(static_cast(__cvta_generic_to_shared(&__barrier))), "r"(static_cast(__parity)) + : "memory"); + return bool(__ready); + } + else +#endif + { + return __barrier.__try_wait_parity(__parity); + } + } + barrier() = default; barrier(const barrier &) = delete; @@ -216,7 +244,7 @@ public: _LIBCUDACXX_INLINE_VISIBILITY void wait(arrival_token && __phase) const { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase))); + _CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester(this, _CUDA_VSTD::move(__phase))); } inline _LIBCUDACXX_INLINE_VISIBILITY diff --git a/libcxx/include/barrier b/libcxx/include/barrier index 762df1934d..9b4a240894 100644 --- a/libcxx/include/barrier +++ b/libcxx/include/barrier @@ -270,6 +270,24 @@ public: } }; +template +struct __barrier_poll_tester { + __Barrier const* __this; + typename __Barrier::arrival_token __phase; + + _LIBCUDACXX_INLINE_VISIBILITY + __barrier_poll_tester(__Barrier const* __this_, typename __Barrier::arrival_token&& __phase_) + : __this(__this_) + , __phase(_CUDA_VSTD::move(__phase_)) + {} + + inline _LIBCUDACXX_INLINE_VISIBILITY + bool operator()() const + { + return __this->__try_wait(__phase); + } +}; + template class __barrier_base<__empty_completion, _Sco> { @@ -285,23 +303,6 @@ public: using arrival_token = uint64_t; private: - struct __poll_tester { - __barrier_base const* __this; - arrival_token __phase; - - _LIBCUDACXX_INLINE_VISIBILITY - __poll_tester(__barrier_base const* __this_, arrival_token&& __phase_) - : __this(__this_) - , __phase(_CUDA_VSTD::move(__phase_)) - {} - - inline _LIBCUDACXX_INLINE_VISIBILITY - bool operator()() const - { - return __this->__try_wait(__phase); - } - }; - static inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR uint64_t __init(ptrdiff_t __count) _NOEXCEPT { @@ -322,12 +323,22 @@ public: __barrier_base(__barrier_base const&) = delete; __barrier_base& operator=(__barrier_base const&) = delete; - _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait(arrival_token __phase) const + inline _LIBCUDACXX_INLINE_VISIBILITY + bool __try_wait_phase(uint64_t __phase) const { uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire); return ((__current & __phase_bit) != __phase); } + inline _LIBCUDACXX_INLINE_VISIBILITY + bool __try_wait_parity(bool __parity) const + { + return __try_wait_phase(__parity ? __phase_bit : 0); + } + inline _LIBCUDACXX_INLINE_VISIBILITY + bool __try_wait(arrival_token __old) const + { + return __try_wait_phase(__old & __phase_bit); + } _LIBCUDACXX_NODISCARD_ATTRIBUTE inline _LIBCUDACXX_INLINE_VISIBILITY arrival_token arrive(ptrdiff_t __update = 1) @@ -343,7 +354,7 @@ public: inline _LIBCUDACXX_INLINE_VISIBILITY void wait(arrival_token&& __phase) const { - __libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase))); + __libcpp_thread_poll_with_backoff(__barrier_poll_tester<__barrier_base<__empty_completion, _Sco>>(this, _CUDA_VSTD::move(__phase))); } inline _LIBCUDACXX_INLINE_VISIBILITY void arrive_and_wait() From 6e72dc6f676fe06bffb85fc572f1bbfe7f711efa Mon Sep 17 00:00:00 2001 From: Olivier Giroux Date: Tue, 9 Mar 2021 08:40:04 -0800 Subject: [PATCH 2/5] Added try_wait options --- include/cuda/std/barrier | 19 +++++++++++------ libcxx/include/barrier | 46 ++++++++++++++++++++++++---------------- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/include/cuda/std/barrier b/include/cuda/std/barrier index 781be24296..f9f31b93ab 100644 --- a/include/cuda/std/barrier +++ b/include/cuda/std/barrier @@ -40,8 +40,6 @@ class barrier : public std::__barrier_base<_CompletionF, _Sco> { template friend class pipeline; - using std::__barrier_base<_CompletionF, _Sco>::__try_wait; - public: barrier() = default; @@ -77,6 +75,13 @@ _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE _LIBCUDACXX_BEGIN_NAMESPACE_CUDA +template +inline _LIBCUDACXX_INLINE_VISIBILITY +bool barrier_try_wait_parity(__Barrier const* __this, bool __parity) +{ + return __this->__try_wait_parity(__parity); +} + template struct __barrier_poll_tester_parity { __Barrier const* __this; @@ -91,15 +96,15 @@ struct __barrier_poll_tester_parity { inline _LIBCUDACXX_INLINE_VISIBILITY bool operator()() const { - return __this->__try_wait_parity(__parity); + return barrier_try_wait_parity(__this, __parity); } }; template inline _LIBCUDACXX_INLINE_VISIBILITY -void barrier_wait_for_parity(__Barrier const* __self, bool __parity) +void barrier_wait_parity(__Barrier const* __this, bool __parity) { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__Barrier>(__self, __parity)); + _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__Barrier>(__this, __parity)); } template<> @@ -114,7 +119,7 @@ public: using arrival_token = typename __barrier_base::arrival_token; _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait(arrival_token __phase) const { + bool try_wait(arrival_token __phase) const { #if __CUDA_ARCH__ >= 800 if (__isShared(&__barrier)) { int __ready = 0; @@ -131,7 +136,7 @@ public: else #endif { - return __barrier.__try_wait(std::move(__phase)); + return __barrier.try_wait(std::move(__phase)); } } diff --git a/libcxx/include/barrier b/libcxx/include/barrier index 9b4a240894..125fa6625d 100644 --- a/libcxx/include/barrier +++ b/libcxx/include/barrier @@ -209,6 +209,12 @@ class __barrier_base { _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base __expected, __arrived; _LIBCUDACXX_BARRIER_ALIGNMENTS _CompletionF __completion; _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base __phase; + + _LIBCUDACXX_INLINE_VISIBILITY + bool __try_wait_phase(bool __old_phase) const + { + return __phase.load(memory_order_acquire) != __old_phase; + } public: using arrival_token = bool; @@ -241,11 +247,15 @@ public: return __old_phase; } _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait(arrival_token __old_phase) const + bool try_wait(arrival_token __old) const { - return __phase != __old_phase; + return __try_wait_phase(__old); + } + _LIBCUDACXX_INLINE_VISIBILITY + bool __try_wait_parity(bool __parity) const + { + return __try_wait_phase(__parity); } - _LIBCUDACXX_INLINE_VISIBILITY void wait(arrival_token&& __old_phase) const { @@ -281,10 +291,10 @@ struct __barrier_poll_tester { , __phase(_CUDA_VSTD::move(__phase_)) {} - inline _LIBCUDACXX_INLINE_VISIBILITY + _LIBCUDACXX_INLINE_VISIBILITY bool operator()() const { - return __this->__try_wait(__phase); + return __this->try_wait(__phase); } }; @@ -303,12 +313,18 @@ public: using arrival_token = uint64_t; private: - static inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR uint64_t __init(ptrdiff_t __count) _NOEXCEPT { return (((1u << 31) - __count) << 32) | ((1u << 31) - __count); } + _LIBCUDACXX_INLINE_VISIBILITY + bool __try_wait_phase(uint64_t __phase) const + { + uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire); + return ((__current & __phase_bit) != __phase); + } public: __barrier_base() = default; @@ -323,19 +339,13 @@ public: __barrier_base(__barrier_base const&) = delete; __barrier_base& operator=(__barrier_base const&) = delete; - inline _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait_phase(uint64_t __phase) const - { - uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire); - return ((__current & __phase_bit) != __phase); - } - inline _LIBCUDACXX_INLINE_VISIBILITY + _LIBCUDACXX_INLINE_VISIBILITY bool __try_wait_parity(bool __parity) const { return __try_wait_phase(__parity ? __phase_bit : 0); } - inline _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait(arrival_token __old) const + _LIBCUDACXX_INLINE_VISIBILITY + bool try_wait(arrival_token __old) const { return __try_wait_phase(__old & __phase_bit); } @@ -351,17 +361,17 @@ public: } return __old & __phase_bit; } - inline _LIBCUDACXX_INLINE_VISIBILITY + _LIBCUDACXX_INLINE_VISIBILITY void wait(arrival_token&& __phase) const { __libcpp_thread_poll_with_backoff(__barrier_poll_tester<__barrier_base<__empty_completion, _Sco>>(this, _CUDA_VSTD::move(__phase))); } - inline _LIBCUDACXX_INLINE_VISIBILITY + _LIBCUDACXX_INLINE_VISIBILITY void arrive_and_wait() { wait(arrive()); } - inline _LIBCUDACXX_INLINE_VISIBILITY + _LIBCUDACXX_INLINE_VISIBILITY void arrive_and_drop() { __phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed); From 1464783cccad0fc8387627deb4c02ab36a4539a7 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 22 Jul 2021 22:25:53 -0700 Subject: [PATCH 3/5] Add a small parity wait test --- .../heterogeneous/barrier_parity.pass.cpp | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 .upstream-tests/test/heterogeneous/barrier_parity.pass.cpp diff --git a/.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp b/.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp new file mode 100644 index 0000000000..fbf76ddf3f --- /dev/null +++ b/.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp @@ -0,0 +1,104 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: nvrtc, pre-sm-70 + +// uncomment for a really verbose output detailing what test steps are being launched +// #define DEBUG_TESTERS + +#include "helpers.h" + +#include +#include + +template +struct barrier_and_token +{ + using barrier_t = Barrier; + using token_t = typename barrier_t::arrival_token; + + barrier_t barrier; + cuda::std::atomic parity_waiting{false}; + + template + __host__ __device__ + barrier_and_token(Args && ...args) : barrier{ cuda::std::forward(args)... } + { + } +}; + +struct barrier_arrive_and_wait +{ + using async = cuda::std::true_type; + + template + __host__ __device__ + static void perform(Data & data) + { + while (data.parity_waiting.load(cuda::std::memory_order_acquire) == false) + { + data.parity_waiting.wait(false); + } + data.barrier.arrive_and_wait(); + } +}; + +template +struct barrier_arrive_parity_wait +{ + using async = cuda::std::true_type; + + template + __host__ __device__ + static void perform(Data & data) + { + data.parity_waiting.store(true, cuda::std::memory_order_release); + data.parity_waiting.notify_all(); + cuda::barrier_wait_parity(&data.barrier, Phase); + } +}; + +struct clear_token +{ + template + __host__ __device__ + static void perform(Data & data) + { + data.parity_waiting.store(false, cuda::std::memory_order_release); + } +}; + +using aw_aw_pw = performer_list< + barrier_arrive_and_wait, + barrier_arrive_and_wait, + barrier_arrive_parity_wait, + async_tester_fence, + clear_token +>; + +void kernel_invoker() +{ + validate_not_movable< + barrier_and_token>, + aw_aw_pw + >(2); + validate_not_movable< + barrier_and_token>, + aw_aw_pw + >(2); +} + +int main(int arg, char ** argv) +{ +#ifndef __CUDA_ARCH__ + kernel_invoker(); +#endif + + return 0; +} + From bc80e36826fc8a8369cff8be26f31988ce8765ba Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 23 Jul 2021 21:27:54 -0700 Subject: [PATCH 4/5] Extend the test to measure both phases, make barrier_(try_)wait_parity member functions --- .../heterogeneous/barrier_parity.pass.cpp | 11 +++-- include/cuda/std/barrier | 46 ++++++------------- libcxx/include/barrier | 34 ++++++++++++-- 3 files changed, 52 insertions(+), 39 deletions(-) diff --git a/.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp b/.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp index fbf76ddf3f..2c4460995b 100644 --- a/.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp +++ b/.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp @@ -49,7 +49,7 @@ struct barrier_arrive_and_wait }; template -struct barrier_arrive_parity_wait +struct barrier_parity_wait { using async = cuda::std::true_type; @@ -59,7 +59,7 @@ struct barrier_arrive_parity_wait { data.parity_waiting.store(true, cuda::std::memory_order_release); data.parity_waiting.notify_all(); - cuda::barrier_wait_parity(&data.barrier, Phase); + data.barrier.wait_parity(Phase); } }; @@ -74,9 +74,14 @@ struct clear_token }; using aw_aw_pw = performer_list< + barrier_parity_wait, + barrier_arrive_and_wait, + barrier_arrive_and_wait, + async_tester_fence, + clear_token, + barrier_parity_wait, barrier_arrive_and_wait, barrier_arrive_and_wait, - barrier_arrive_parity_wait, async_tester_fence, clear_token >; diff --git a/include/cuda/std/barrier b/include/cuda/std/barrier index f9f31b93ab..d24b2a7838 100644 --- a/include/cuda/std/barrier +++ b/include/cuda/std/barrier @@ -75,38 +75,6 @@ _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE _LIBCUDACXX_BEGIN_NAMESPACE_CUDA -template -inline _LIBCUDACXX_INLINE_VISIBILITY -bool barrier_try_wait_parity(__Barrier const* __this, bool __parity) -{ - return __this->__try_wait_parity(__parity); -} - -template -struct __barrier_poll_tester_parity { - __Barrier const* __this; - bool __parity; - - _LIBCUDACXX_INLINE_VISIBILITY - __barrier_poll_tester_parity(__Barrier const* __this_, bool __parity_) - : __this(__this_) - , __parity(__parity_) - {} - - inline _LIBCUDACXX_INLINE_VISIBILITY - bool operator()() const - { - return barrier_try_wait_parity(__this, __parity); - } -}; - -template -inline _LIBCUDACXX_INLINE_VISIBILITY -void barrier_wait_parity(__Barrier const* __this, bool __parity) -{ - _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__Barrier>(__this, __parity)); -} - template<> class barrier : public __block_scope_barrier_base { using __barrier_base = std::__barrier_base; @@ -161,7 +129,7 @@ public: else #endif { - return __barrier.__try_wait_parity(__parity); + return __barrier.try_wait_parity(__parity); } } @@ -252,6 +220,18 @@ public: _CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester(this, _CUDA_VSTD::move(__phase))); } + inline _LIBCUDACXX_INLINE_VISIBILITY + bool try_wait_parity(bool __parity) const + { + return __try_wait_parity(__parity); + } + + inline _LIBCUDACXX_INLINE_VISIBILITY + void wait_parity(bool __parity) const + { + _CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester_parity(this, __parity)); + } + inline _LIBCUDACXX_INLINE_VISIBILITY void arrive_and_wait() { diff --git a/libcxx/include/barrier b/libcxx/include/barrier index 125fa6625d..83abb95697 100644 --- a/libcxx/include/barrier +++ b/libcxx/include/barrier @@ -298,6 +298,24 @@ struct __barrier_poll_tester { } }; +template +struct __barrier_poll_tester_parity { + __Barrier const* __this; + bool __parity; + + _LIBCUDACXX_INLINE_VISIBILITY + __barrier_poll_tester_parity(__Barrier const* __this_, bool __parity_) + : __this(__this_) + , __parity(__parity_) + {} + + inline _LIBCUDACXX_INLINE_VISIBILITY + bool operator()() const + { + return __this->try_wait_parity(__parity); + } +}; + template class __barrier_base<__empty_completion, _Sco> { @@ -325,6 +343,11 @@ private: uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire); return ((__current & __phase_bit) != __phase); } + _LIBCUDACXX_INLINE_VISIBILITY + bool __try_wait_parity(bool __parity) const + { + return __try_wait_phase(__parity ? __phase_bit : 0); + } public: __barrier_base() = default; @@ -340,9 +363,9 @@ public: __barrier_base& operator=(__barrier_base const&) = delete; _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait_parity(bool __parity) const + bool try_wait_parity(bool __parity) const { - return __try_wait_phase(__parity ? __phase_bit : 0); + return __try_wait_parity(__parity); } _LIBCUDACXX_INLINE_VISIBILITY bool try_wait(arrival_token __old) const @@ -364,7 +387,12 @@ public: _LIBCUDACXX_INLINE_VISIBILITY void wait(arrival_token&& __phase) const { - __libcpp_thread_poll_with_backoff(__barrier_poll_tester<__barrier_base<__empty_completion, _Sco>>(this, _CUDA_VSTD::move(__phase))); + __libcpp_thread_poll_with_backoff(__barrier_poll_tester<__barrier_base>(this, _CUDA_VSTD::move(__phase))); + } + _LIBCUDACXX_INLINE_VISIBILITY + void wait_parity(bool __parity) const + { + __libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__barrier_base>(this, __parity)); } _LIBCUDACXX_INLINE_VISIBILITY void arrive_and_wait() From f882b5dec56229cc28912b59bba728328d679df6 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 27 Jul 2021 15:29:57 -0700 Subject: [PATCH 5/5] Re-add the __try_wait API as pipeline uses internal symbols of barrier for some reason --- include/cuda/std/barrier | 7 ++++++- libcxx/include/barrier | 8 +++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/include/cuda/std/barrier b/include/cuda/std/barrier index d24b2a7838..e7af6f138c 100644 --- a/include/cuda/std/barrier +++ b/include/cuda/std/barrier @@ -87,7 +87,7 @@ public: using arrival_token = typename __barrier_base::arrival_token; _LIBCUDACXX_INLINE_VISIBILITY - bool try_wait(arrival_token __phase) const { + bool __try_wait(arrival_token __phase) const { #if __CUDA_ARCH__ >= 800 if (__isShared(&__barrier)) { int __ready = 0; @@ -220,6 +220,11 @@ public: _CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester(this, _CUDA_VSTD::move(__phase))); } + _LIBCUDACXX_INLINE_VISIBILITY + bool try_wait(arrival_token __phase) const { + return __try_wait(_CUDA_VSTD::move(__phase)); + } + inline _LIBCUDACXX_INLINE_VISIBILITY bool try_wait_parity(bool __parity) const { diff --git a/libcxx/include/barrier b/libcxx/include/barrier index 83abb95697..d8be55452c 100644 --- a/libcxx/include/barrier +++ b/libcxx/include/barrier @@ -362,6 +362,12 @@ public: __barrier_base(__barrier_base const&) = delete; __barrier_base& operator=(__barrier_base const&) = delete; + _LIBCUDACXX_INLINE_VISIBILITY + bool __try_wait(arrival_token __old) const + { + return __try_wait_phase(__old & __phase_bit); + } + _LIBCUDACXX_INLINE_VISIBILITY bool try_wait_parity(bool __parity) const { @@ -370,7 +376,7 @@ public: _LIBCUDACXX_INLINE_VISIBILITY bool try_wait(arrival_token __old) const { - return __try_wait_phase(__old & __phase_bit); + return __try_wait(__old); } _LIBCUDACXX_NODISCARD_ATTRIBUTE inline _LIBCUDACXX_INLINE_VISIBILITY