From 98d40457253e3459ecbb0e52c40ed5d2834c34d8 Mon Sep 17 00:00:00 2001 From: Olivier Giroux Date: Mon, 2 Nov 2020 17:46:20 -0800 Subject: [PATCH 1/5] Barrier parity waiting and extendable phases --- include/cuda/std/detail/libcxx/include | 2 +- libcxx/include/barrier | 113 ++++++++++++++++++++----- 2 files changed, 94 insertions(+), 21 deletions(-) diff --git a/include/cuda/std/detail/libcxx/include b/include/cuda/std/detail/libcxx/include index 88410570f5..17935b7db1 120000 --- a/include/cuda/std/detail/libcxx/include +++ b/include/cuda/std/detail/libcxx/include @@ -1 +1 @@ -../../../../../libcxx/include \ No newline at end of file +C:/Users/Andromeda/Documents/Git/nvidia/libcudacxx/libcxx/include \ No newline at end of file diff --git a/libcxx/include/barrier b/libcxx/include/barrier index dda2f359ff..7c4aad2b47 100644 --- a/libcxx/include/barrier +++ b/libcxx/include/barrier @@ -273,13 +273,20 @@ public: template class __barrier_base<__empty_completion, _Sco> { + static constexpr uint64_t __arrived_shift = 32; + static constexpr uint64_t __guard_shift = 31; + static constexpr uint64_t __phase_shift = 63; + static constexpr uint64_t __expected_unit = 1ull; - static constexpr uint64_t __arrived_unit = 1ull << 32; - static constexpr uint64_t __expected_mask = __arrived_unit - 1; - static constexpr uint64_t __phase_bit = 1ull << 63; - static constexpr uint64_t __arrived_mask = (__phase_bit - 1) & ~__expected_mask; + static constexpr uint64_t __arrived_unit = 1ull << __arrived_shift; + + static constexpr uint64_t __phase_bit = 1ull << __phase_shift; + static constexpr uint64_t __arrived_sign_bit = __phase_bit >> 1; + static constexpr uint64_t __guard_bit = 1ull << __guard_shift; + + static constexpr uint64_t __expected_mask = __guard_bit - 1; - _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base __phase_arrived_expected; + mutable _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base __phase_arrived_expected; public: using arrival_token = uint64_t; @@ -301,12 +308,55 @@ private: return __this->__try_wait(__phase); } }; + struct __poll_tester_parity { + __barrier_base const* __this; + bool __parity; + + _LIBCUDACXX_INLINE_VISIBILITY + __poll_tester_parity(__barrier_base const* __this_, bool __parity_) + : __this(__this_) + , __parity(__parity_) + {} + + inline _LIBCUDACXX_INLINE_VISIBILITY + bool operator()() const + { + return __this->__try_wait_parity(__parity); + } + }; static inline _LIBCUDACXX_INLINE_VISIBILITY constexpr uint64_t __init(ptrdiff_t __count) _NOEXCEPT { - return (((1u << 31) - __count) << 32) - | ((1u << 31) - __count); + return ((__guard_bit - __count) << __arrived_shift) + | (__guard_bit - __count); + } + + _LIBCUDACXX_NODISCARD_ATTRIBUTE inline _LIBCUDACXX_INLINE_VISIBILITY + uint64_t __update_fetch(uint64_t __with_inc, memory_order __order) const + { + uint64_t __old = __phase_arrived_expected.load(__order == memory_order_acquire ? memory_order_acquire : memory_order_relaxed); + uint64_t __new; + while(1) + { + if((__old & __arrived_sign_bit) == 0) //[[unlikely]] + { + auto const __shifted_expected = (__old & __expected_mask) << __arrived_shift; + __new = __old + __with_inc + __shifted_expected; + if(__phase_arrived_expected.compare_exchange_weak(__old, __new, __order)) + break; + } + else + { + if(__with_inc != 0) + __old = __phase_arrived_expected.fetch_add(__with_inc, __order); + __new = __old + __with_inc; + break; + } + } + if((__old ^ __new) & __phase_bit) + __phase_arrived_expected.notify_all(); + return __new; } public: @@ -322,28 +372,46 @@ public: __barrier_base(__barrier_base const&) = delete; __barrier_base& operator=(__barrier_base const&) = delete; - _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait(arrival_token __phase) const + inline _LIBCUDACXX_INLINE_VISIBILITY + bool __try_wait(arrival_token __old) const + { + auto const __new = __update_fetch(0, memory_order_acquire); + return (__old ^ __new) & __phase_bit; + } + inline _LIBCUDACXX_INLINE_VISIBILITY + bool __try_wait_parity(bool __parity) const + { + auto const __new = __update_fetch(0, memory_order_acquire); + return (__new & __phase_bit) != (__parity ? __phase_bit : 0); + } + +#ifndef _LIBCUDACXX_HAS_PLATFORM_WAIT + inline _LIBCUDACXX_INLINE_VISIBILITY + friend void __expect_extra_arrive(__barrier_base& __self, uint64_t __inc = __arrived_unit) { - uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire); - return ((__current & __phase_bit) != __phase); + (void)__self.__update_fetch(~__inc + 1, memory_order_relaxed); } + inline _LIBCUDACXX_INLINE_VISIBILITY + friend void __extra_arrive(__barrier_base& __self, uint64_t __inc = __arrived_unit) + { + __self.__phase_arrived_expected.fetch_add(__inc, memory_order_release); + } +#endif _LIBCUDACXX_NODISCARD_ATTRIBUTE inline _LIBCUDACXX_INLINE_VISIBILITY arrival_token arrive(ptrdiff_t __update = 1) { - auto const __inc = __arrived_unit * __update; + auto const __inc = __update << __arrived_shift; auto const __old = __phase_arrived_expected.fetch_add(__inc, memory_order_acq_rel); - if((__old ^ (__old + __inc)) & __phase_bit) { - __phase_arrived_expected.fetch_add((__old & __expected_mask) << 32, memory_order_relaxed); + auto const __new = __old + __inc; + if((__old ^ __new) & __phase_bit) __phase_arrived_expected.notify_all(); - } - return __old & __phase_bit; + return __old; } inline _LIBCUDACXX_INLINE_VISIBILITY - void wait(arrival_token&& __phase) const + void wait(arrival_token&& __old) const { - __libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase))); + __libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__old))); } inline _LIBCUDACXX_INLINE_VISIBILITY void arrive_and_wait() @@ -353,8 +421,7 @@ public: inline _LIBCUDACXX_INLINE_VISIBILITY void arrive_and_drop() { - __phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed); - (void)arrive(); + (void)__update_fetch(__arrived_unit + __expected_unit, memory_order_release); } _LIBCUDACXX_INLINE_VISIBILITY @@ -362,6 +429,12 @@ public: { return numeric_limits::max(); } + + inline _LIBCUDACXX_INLINE_VISIBILITY + friend void wait_for_parity(__barrier_base const* __self, bool __parity) + { + __libcpp_thread_poll_with_backoff(__poll_tester_parity(__self, __parity)); + } }; #endif //_LIBCUDACXX_HAS_NO_TREE_BARRIER From 936f6f76ac58c06f1afb5dbafd1a1d76d9125359 Mon Sep 17 00:00:00 2001 From: Olivier Giroux Date: Mon, 2 Nov 2020 17:51:25 -0800 Subject: [PATCH 2/5] Cuda namespace layer --- include/cuda/std/barrier | 44 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/include/cuda/std/barrier b/include/cuda/std/barrier index 2817ea5094..9609e7feca 100644 --- a/include/cuda/std/barrier +++ b/include/cuda/std/barrier @@ -104,6 +104,22 @@ private: return __this->__try_wait(__phase); } }; + struct __poll_tester_parity { + barrier const* __this; + bool __parity; + + _LIBCUDACXX_INLINE_VISIBILITY + __poll_tester_parity(barrier const* __this_, bool __parity_) + : __this(__this_) + , __parity(__parity_) + {} + + inline _LIBCUDACXX_INLINE_VISIBILITY + bool operator()() const + { + return __this->__try_wait_parity(__parity); + } + }; _LIBCUDACXX_INLINE_VISIBILITY bool __try_wait(arrival_token __phase) const { @@ -127,6 +143,28 @@ private: } } + _LIBCUDACXX_INLINE_VISIBILITY + bool __try_wait_parity(bool __parity) const { +#if __CUDA_ARCH__ >= 800 + if (__isShared(&__barrier)) { + int __ready = 0; + asm volatile ("{\n\t" + ".reg .pred p;\n\t" + "mbarrier.test_wait.parity.shared.b64 p, [%1], %2;\n\t" + "selp.b32 %0, 1, 0, p;\n\t" + "}" + : "=r"(__ready) + : "r"(static_cast(__cvta_generic_to_shared(&__barrier))), "r"(static_cast(__parity)) + : "memory"); + return bool(__ready); + } + else +#endif + { + return __barrier.__try_wait_parity(__parity); + } + } + template friend class pipeline; @@ -245,6 +283,12 @@ public: { return (1 << 20) - 1; } + + inline _LIBCUDACXX_INLINE_VISIBILITY + friend void wait_for_parity(barrier const* __self, bool __parity) + { + _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester_parity(__self, __parity)); + } }; _LIBCUDACXX_END_NAMESPACE_CUDA From c155cbb49803b22cd5b90aea0c466b935e5f9a2a Mon Sep 17 00:00:00 2001 From: Olivier Giroux Date: Mon, 2 Nov 2020 17:53:44 -0800 Subject: [PATCH 3/5] Reverted the include symlink --- include/cuda/std/detail/libcxx/include | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuda/std/detail/libcxx/include b/include/cuda/std/detail/libcxx/include index 17935b7db1..88410570f5 120000 --- a/include/cuda/std/detail/libcxx/include +++ b/include/cuda/std/detail/libcxx/include @@ -1 +1 @@ -C:/Users/Andromeda/Documents/Git/nvidia/libcudacxx/libcxx/include \ No newline at end of file +../../../../../libcxx/include \ No newline at end of file From a24a5f3765f77e623e95a7df638da3d8a54906cf Mon Sep 17 00:00:00 2001 From: Olivier Giroux Date: Mon, 2 Nov 2020 17:55:03 -0800 Subject: [PATCH 4/5] Renamed the API to have barrier in the name --- include/cuda/std/barrier | 2 +- libcxx/include/barrier | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cuda/std/barrier b/include/cuda/std/barrier index 9609e7feca..d1acfc57d0 100644 --- a/include/cuda/std/barrier +++ b/include/cuda/std/barrier @@ -285,7 +285,7 @@ public: } inline _LIBCUDACXX_INLINE_VISIBILITY - friend void wait_for_parity(barrier const* __self, bool __parity) + friend void barrier_wait_for_parity(barrier const* __self, bool __parity) { _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester_parity(__self, __parity)); } diff --git a/libcxx/include/barrier b/libcxx/include/barrier index 7c4aad2b47..7588cce5fc 100644 --- a/libcxx/include/barrier +++ b/libcxx/include/barrier @@ -431,7 +431,7 @@ public: } inline _LIBCUDACXX_INLINE_VISIBILITY - friend void wait_for_parity(__barrier_base const* __self, bool __parity) + friend void barrier_wait_for_parity(__barrier_base const* __self, bool __parity) { __libcpp_thread_poll_with_backoff(__poll_tester_parity(__self, __parity)); } From 57361ef1769516a4417a2841dd66bb4a4f37c535 Mon Sep 17 00:00:00 2001 From: Olivier Giroux Date: Mon, 2 Nov 2020 21:03:03 -0800 Subject: [PATCH 5/5] Improved tuning --- libcxx/include/barrier | 83 ++++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 39 deletions(-) diff --git a/libcxx/include/barrier b/libcxx/include/barrier index 7588cce5fc..b752c3bd79 100644 --- a/libcxx/include/barrier +++ b/libcxx/include/barrier @@ -332,31 +332,29 @@ private: | (__guard_bit - __count); } - _LIBCUDACXX_NODISCARD_ATTRIBUTE inline _LIBCUDACXX_INLINE_VISIBILITY - uint64_t __update_fetch(uint64_t __with_inc, memory_order __order) const + inline _LIBCUDACXX_INLINE_VISIBILITY + void __update(uint64_t __old, uint64_t __with_inc, memory_order __order) const { - uint64_t __old = __phase_arrived_expected.load(__order == memory_order_acquire ? memory_order_acquire : memory_order_relaxed); - uint64_t __new; - while(1) - { - if((__old & __arrived_sign_bit) == 0) //[[unlikely]] - { - auto const __shifted_expected = (__old & __expected_mask) << __arrived_shift; - __new = __old + __with_inc + __shifted_expected; - if(__phase_arrived_expected.compare_exchange_weak(__old, __new, __order)) - break; + while((__old & __arrived_sign_bit) == 0) { + auto const __shifted_expected = (__old & __expected_mask) << __arrived_shift; + auto const __new = __old + __with_inc + __shifted_expected; + if(__phase_arrived_expected.compare_exchange_weak(__old, __new, __order)) { + if((__old ^ __new) & __phase_bit) { + if(__with_inc) + __update(__new, 0, memory_order_relaxed); + __phase_arrived_expected.notify_all(); + } + return; } - else - { - if(__with_inc != 0) - __old = __phase_arrived_expected.fetch_add(__with_inc, __order); - __new = __old + __with_inc; - break; + } + if(__with_inc) { + __old = __phase_arrived_expected.fetch_add(__with_inc, __order); + auto const __new = __old + __with_inc; + if((__old ^ __new) & __phase_bit) { + __update(__new, 0, memory_order_relaxed); + __phase_arrived_expected.notify_all(); } } - if((__old ^ __new) & __phase_bit) - __phase_arrived_expected.notify_all(); - return __new; } public: @@ -373,39 +371,45 @@ public: __barrier_base& operator=(__barrier_base const&) = delete; inline _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait(arrival_token __old) const + bool __try_wait_phase(uint64_t __phase) const { - auto const __new = __update_fetch(0, memory_order_acquire); - return (__old ^ __new) & __phase_bit; + auto const __current = __phase_arrived_expected.load(memory_order_acquire); + if((__current & __arrived_sign_bit) == 0) + __update(__current, 0, memory_order_acquire); + return (__current & __phase_bit) != __phase; } inline _LIBCUDACXX_INLINE_VISIBILITY bool __try_wait_parity(bool __parity) const { - auto const __new = __update_fetch(0, memory_order_acquire); - return (__new & __phase_bit) != (__parity ? __phase_bit : 0); + return __try_wait_phase(__parity ? __phase_bit : 0); + } + inline _LIBCUDACXX_INLINE_VISIBILITY + bool __try_wait(arrival_token __old) const + { + return __try_wait_phase(__old & __phase_bit); } #ifndef _LIBCUDACXX_HAS_PLATFORM_WAIT inline _LIBCUDACXX_INLINE_VISIBILITY - friend void __expect_extra_arrive(__barrier_base& __self, uint64_t __inc = __arrived_unit) + friend void __expect_extra_arrive(__barrier_base* __self, uint64_t __count = 1) { - (void)__self.__update_fetch(~__inc + 1, memory_order_relaxed); + auto const __inc = __count << __arrived_shift; + auto const __old = __self->__phase_arrived_expected.load(memory_order_relaxed); + __self->__update(__old, ~__inc + 1, memory_order_relaxed); } inline _LIBCUDACXX_INLINE_VISIBILITY - friend void __extra_arrive(__barrier_base& __self, uint64_t __inc = __arrived_unit) + friend uint64_t __extra_arrive(__barrier_base* __self, uint64_t __count = 1) { - __self.__phase_arrived_expected.fetch_add(__inc, memory_order_release); + auto const __inc = __count << __arrived_shift; + return __self->__phase_arrived_expected.fetch_add(__inc, memory_order_release); } #endif _LIBCUDACXX_NODISCARD_ATTRIBUTE inline _LIBCUDACXX_INLINE_VISIBILITY - arrival_token arrive(ptrdiff_t __update = 1) + arrival_token arrive(ptrdiff_t __count = 1) { - auto const __inc = __update << __arrived_shift; - auto const __old = __phase_arrived_expected.fetch_add(__inc, memory_order_acq_rel); - auto const __new = __old + __inc; - if((__old ^ __new) & __phase_bit) - __phase_arrived_expected.notify_all(); + auto const __old = __phase_arrived_expected.load(memory_order_relaxed); + __update(__old, __count << __arrived_shift, memory_order_release); return __old; } inline _LIBCUDACXX_INLINE_VISIBILITY @@ -421,17 +425,18 @@ public: inline _LIBCUDACXX_INLINE_VISIBILITY void arrive_and_drop() { - (void)__update_fetch(__arrived_unit + __expected_unit, memory_order_release); + auto const __old = __phase_arrived_expected.load(memory_order_relaxed); + __update(__old, __arrived_unit + __expected_unit, memory_order_release); } _LIBCUDACXX_INLINE_VISIBILITY static constexpr ptrdiff_t max() noexcept { - return numeric_limits::max(); + return numeric_limits::max() >> 1; } inline _LIBCUDACXX_INLINE_VISIBILITY - friend void barrier_wait_for_parity(__barrier_base const* __self, bool __parity) + friend void wait_for_parity(__barrier_base const* __self, bool __parity) { __libcpp_thread_poll_with_backoff(__poll_tester_parity(__self, __parity)); }