
Merge pull request #111 from NVIDIA/barrier-parity
Add parity waiting for `cuda::std::barrier`
wmaxey authored Jul 29, 2021
2 parents 5fe780d + f882b5d commit 48d213a
Showing 3 changed files with 233 additions and 51 deletions.
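
In short, parity waiting lets a thread block until the barrier's current phase completes, identified only by a parity bit that flips each phase, so the waiter does not need to hold an arrival_token from arrive(). A minimal host-side sketch of the new interface, assuming (as the test ordering below suggests) that wait_parity(false) is satisfied by completion of the first, even-numbered phase and wait_parity(true) by the one after it:

#include <cuda/std/barrier>
#include <thread>

int main()
{
    cuda::std::barrier<> bar(2);       // two expected arrivals per phase

    // The observer never arrives and holds no arrival_token; it waits on the
    // phase parity instead (assumption: false = first/even phase, true = next).
    std::thread observer([&] {
        bar.wait_parity(false);        // returns once phase 0 has completed
        bar.wait_parity(true);         // returns once phase 1 has completed
    });

    std::thread worker([&] {
        bar.arrive_and_wait();         // completes phase 0 together with main
        bar.arrive_and_wait();         // completes phase 1
    });

    bar.arrive_and_wait();             // main thread is the second participant
    bar.arrive_and_wait();

    worker.join();
    observer.join();
    return 0;
}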
109 changes: 109 additions & 0 deletions .upstream-tests/test/heterogeneous/barrier_parity.pass.cpp
@@ -0,0 +1,109 @@
//===----------------------------------------------------------------------===//
//
// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// UNSUPPORTED: nvrtc, pre-sm-70

// uncomment for a really verbose output detailing what test steps are being launched
// #define DEBUG_TESTERS

#include "helpers.h"

#include <atomic>
#include <cuda/barrier>

template<typename Barrier>
struct barrier_and_token
{
using barrier_t = Barrier;
using token_t = typename barrier_t::arrival_token;

barrier_t barrier;
cuda::std::atomic<bool> parity_waiting{false};

template<typename ...Args>
__host__ __device__
barrier_and_token(Args && ...args) : barrier{ cuda::std::forward<Args>(args)... }
{
}
};

struct barrier_arrive_and_wait
{
using async = cuda::std::true_type;

template<typename Data>
__host__ __device__
static void perform(Data & data)
{
while (data.parity_waiting.load(cuda::std::memory_order_acquire) == false)
{
data.parity_waiting.wait(false);
}
data.barrier.arrive_and_wait();
}
};

template <bool Phase>
struct barrier_parity_wait
{
using async = cuda::std::true_type;

template<typename Data>
__host__ __device__
static void perform(Data & data)
{
data.parity_waiting.store(true, cuda::std::memory_order_release);
data.parity_waiting.notify_all();
data.barrier.wait_parity(Phase);
}
};

struct clear_token
{
template<typename Data>
__host__ __device__
static void perform(Data & data)
{
data.parity_waiting.store(false, cuda::std::memory_order_release);
}
};

using aw_aw_pw = performer_list<
barrier_parity_wait<false>,
barrier_arrive_and_wait,
barrier_arrive_and_wait,
async_tester_fence,
clear_token,
barrier_parity_wait<true>,
barrier_arrive_and_wait,
barrier_arrive_and_wait,
async_tester_fence,
clear_token
>;

void kernel_invoker()
{
validate_not_movable<
barrier_and_token<cuda::std::barrier<>>,
aw_aw_pw
>(2);
validate_not_movable<
barrier_and_token<cuda::barrier<cuda::thread_scope_system>>,
aw_aw_pw
>(2);
}

int main(int argc, char ** argv)
{
#ifndef __CUDA_ARCH__
kernel_invoker();
#endif

return 0;
}
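
The cuda::std::atomic<bool> parity_waiting flag in this test appears to exist purely for sequencing: the parity waiter publishes it and notifies before blocking in wait_parity(), and every arriving performer waits on it first, so the arrivals that complete the phase are issued only after the parity wait has been set up. A host-only sketch of just that handshake (illustrative; the names mirror the performers above):

#include <cuda/std/atomic>
#include <thread>

int main()
{
    cuda::std::atomic<bool> parity_waiting{false};

    std::thread arriver([&] {
        // Same loop as barrier_arrive_and_wait::perform.
        while (parity_waiting.load(cuda::std::memory_order_acquire) == false)
            parity_waiting.wait(false);
        // ... barrier.arrive_and_wait() would run here ...
    });

    // Same publication as barrier_parity_wait::perform.
    parity_waiting.store(true, cuda::std::memory_order_release);
    parity_waiting.notify_all();
    // ... barrier.wait_parity(Phase) would run here ...

    arriver.join();
    return 0;
}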

64 changes: 41 additions & 23 deletions include/cuda/std/barrier
@@ -40,8 +40,6 @@ class barrier : public std::__barrier_base<_CompletionF, _Sco> {
template<thread_scope>
friend class pipeline;

using std::__barrier_base<_CompletionF, _Sco>::__try_wait;

public:
barrier() = default;

@@ -88,24 +86,6 @@ class barrier<thread_scope_block, std::__empty_completion> : public __block_scop
public:
using arrival_token = typename __barrier_base::arrival_token;

private:
struct __poll_tester {
barrier const* __this;
arrival_token __phase;

_LIBCUDACXX_INLINE_VISIBILITY
__poll_tester(barrier const* __this_, arrival_token&& __phase_)
: __this(__this_)
, __phase(_CUDA_VSTD::move(__phase_))
{}

inline _LIBCUDACXX_INLINE_VISIBILITY
bool operator()() const
{
return __this->__try_wait(__phase);
}
};

_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait(arrival_token __phase) const {
#if __CUDA_ARCH__ >= 800
@@ -124,14 +104,35 @@ private:
else
#endif
{
return __barrier.__try_wait(std::move(__phase));
return __barrier.try_wait(std::move(__phase));
}
}

template<thread_scope>
friend class pipeline;

public:
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_parity(bool __parity) const {
#if __CUDA_ARCH__ >= 800
if (__isShared(&__barrier)) {
int __ready = 0;
asm volatile ("{\n\t"
".reg .pred p;\n\t"
"mbarrier.test_wait.parity.shared.b64 p, [%1], %2;\n\t"
"selp.b32 %0, 1, 0, p;\n\t"
"}"
: "=r"(__ready)
: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))), "r"(static_cast<std::uint32_t>(__parity))
: "memory");
return bool(__ready);
}
else
#endif
{
return __barrier.try_wait_parity(__parity);
}
}

barrier() = default;

barrier(const barrier &) = delete;
@@ -216,7 +217,24 @@ public:
_LIBCUDACXX_INLINE_VISIBILITY
void wait(arrival_token && __phase) const
{
_CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
_CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester<barrier>(this, _CUDA_VSTD::move(__phase)));
}

_LIBCUDACXX_INLINE_VISIBILITY
bool try_wait(arrival_token __phase) const {
return __try_wait(_CUDA_VSTD::move(__phase));
}

inline _LIBCUDACXX_INLINE_VISIBILITY
bool try_wait_parity(bool __parity) const
{
return __try_wait_parity(__parity);
}

inline _LIBCUDACXX_INLINE_VISIBILITY
void wait_parity(bool __parity) const
{
_CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester_parity<barrier>(this, __parity));
}

inline _LIBCUDACXX_INLINE_VISIBILITY
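
On SM 8.0+ the shared-memory path above tests the phase parity directly with mbarrier.test_wait.parity; everywhere else, wait() and the new wait_parity() fall back to polling try_wait()/try_wait_parity() through _CUDA_VSTD::__libcpp_thread_poll_with_backoff and the poll-tester functors defined in libcxx/include/barrier below. A rough, host-only stand-in for that polling pattern (illustrative only; the real routine's spin/yield/sleep thresholds and device-side behaviour differ):

#include <chrono>
#include <thread>

// Spin on a predicate, backing off harder the longer it stays false.
template <class Predicate>
void poll_with_backoff(Predicate pred)
{
    for (int attempt = 0; !pred(); ++attempt) {
        if (attempt < 64) {
            // brief busy-spin while the wait is expected to be short
        } else if (attempt < 128) {
            std::this_thread::yield();                                    // give up the time slice
        } else {
            std::this_thread::sleep_for(std::chrono::microseconds(100));  // sleep between probes
        }
    }
}

// Usage shape matching __barrier_poll_tester_parity:
//   poll_with_backoff([&] { return bar.try_wait_parity(parity); });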
111 changes: 83 additions & 28 deletions libcxx/include/barrier
@@ -209,6 +209,12 @@ class __barrier_base {
_LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base<ptrdiff_t, _Sco> __expected, __arrived;
_LIBCUDACXX_BARRIER_ALIGNMENTS _CompletionF __completion;
_LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base<bool, _Sco> __phase;

_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_phase(bool __old_phase) const
{
return __phase.load(memory_order_acquire) != __old_phase;
}
public:
using arrival_token = bool;

@@ -241,11 +247,15 @@ public:
return __old_phase;
}
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait(arrival_token __old_phase) const
bool try_wait(arrival_token __old) const
{
return __phase != __old_phase;
return __try_wait_phase(__old);
}
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_parity(bool __parity) const
{
return __try_wait_phase(__parity);
}

_LIBCUDACXX_INLINE_VISIBILITY
void wait(arrival_token&& __old_phase) const
{
@@ -270,6 +280,42 @@ public:
}
};

template<class __Barrier>
struct __barrier_poll_tester {
__Barrier const* __this;
typename __Barrier::arrival_token __phase;

_LIBCUDACXX_INLINE_VISIBILITY
__barrier_poll_tester(__Barrier const* __this_, typename __Barrier::arrival_token&& __phase_)
: __this(__this_)
, __phase(_CUDA_VSTD::move(__phase_))
{}

_LIBCUDACXX_INLINE_VISIBILITY
bool operator()() const
{
return __this->try_wait(__phase);
}
};

template<class __Barrier>
struct __barrier_poll_tester_parity {
__Barrier const* __this;
bool __parity;

_LIBCUDACXX_INLINE_VISIBILITY
__barrier_poll_tester_parity(__Barrier const* __this_, bool __parity_)
: __this(__this_)
, __parity(__parity_)
{}

inline _LIBCUDACXX_INLINE_VISIBILITY
bool operator()() const
{
return __this->try_wait_parity(__parity);
}
};

template<int _Sco>
class __barrier_base<__empty_completion, _Sco> {

@@ -285,29 +331,23 @@ public:
using arrival_token = uint64_t;

private:
struct __poll_tester {
__barrier_base const* __this;
arrival_token __phase;

_LIBCUDACXX_INLINE_VISIBILITY
__poll_tester(__barrier_base const* __this_, arrival_token&& __phase_)
: __this(__this_)
, __phase(_CUDA_VSTD::move(__phase_))
{}

inline _LIBCUDACXX_INLINE_VISIBILITY
bool operator()() const
{
return __this->__try_wait(__phase);
}
};

static inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
uint64_t __init(ptrdiff_t __count) _NOEXCEPT
{
return (((1u << 31) - __count) << 32)
| ((1u << 31) - __count);
}
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_phase(uint64_t __phase) const
{
uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
return ((__current & __phase_bit) != __phase);
}
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_parity(bool __parity) const
{
return __try_wait_phase(__parity ? __phase_bit : 0);
}

public:
__barrier_base() = default;
@@ -323,10 +363,20 @@ public:
__barrier_base& operator=(__barrier_base const&) = delete;

_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait(arrival_token __phase) const
bool __try_wait(arrival_token __old) const
{
uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
return ((__current & __phase_bit) != __phase);
return __try_wait_phase(__old & __phase_bit);
}

_LIBCUDACXX_INLINE_VISIBILITY
bool try_wait_parity(bool __parity) const
{
return __try_wait_parity(__parity);
}
_LIBCUDACXX_INLINE_VISIBILITY
bool try_wait(arrival_token __old) const
{
return __try_wait(__old);
}

_LIBCUDACXX_NODISCARD_ATTRIBUTE inline _LIBCUDACXX_INLINE_VISIBILITY
@@ -340,17 +390,22 @@ public:
}
return __old & __phase_bit;
}
inline _LIBCUDACXX_INLINE_VISIBILITY
_LIBCUDACXX_INLINE_VISIBILITY
void wait(arrival_token&& __phase) const
{
__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
__libcpp_thread_poll_with_backoff(__barrier_poll_tester<__barrier_base>(this, _CUDA_VSTD::move(__phase)));
}
inline _LIBCUDACXX_INLINE_VISIBILITY
_LIBCUDACXX_INLINE_VISIBILITY
void wait_parity(bool __parity) const
{
__libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__barrier_base>(this, __parity));
}
_LIBCUDACXX_INLINE_VISIBILITY
void arrive_and_wait()
{
wait(arrive());
}
inline _LIBCUDACXX_INLINE_VISIBILITY
_LIBCUDACXX_INLINE_VISIBILITY
void arrive_and_drop()
{
__phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed);
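
For reference, the fallback __barrier_base<__empty_completion> packs its whole state into the single 64-bit word initialized by __init() above. The standalone arithmetic below illustrates one consistent reading of that encoding; only __init()'s expression is taken from the diff, while the phase-bit position and the per-arrival increment are assumptions for illustration:

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main()
{
    std::ptrdiff_t count = 2;

    // __init(): both 32-bit halves start at 2^31 - count.
    std::uint64_t word = (((1ull << 31) - count) << 32) | ((1ull << 31) - count);

    std::uint64_t const phase_bit    = 1ull << 63;  // assumed: top bit is the phase parity
    std::uint64_t const arrived_unit = 1ull << 32;  // assumed: one arrival bumps the upper half

    std::printf("initial parity: %d\n", int((word & phase_bit) != 0));              // prints 0
    word += std::uint64_t(count) * arrived_unit;    // `count` arrivals carry into the top bit
    std::printf("parity after one full phase: %d\n", int((word & phase_bit) != 0)); // prints 1
    return 0;
}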
