Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Add parity waiting #111

Merged
merged 5 commits into from
Jul 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 109 additions & 0 deletions .upstream-tests/test/heterogeneous/barrier_parity.pass.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
//===----------------------------------------------------------------------===//
//
// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// UNSUPPORTED: nvrtc, pre-sm-70

// uncomment for a really verbose output detailing what test steps are being launched
// #define DEBUG_TESTERS

#include "helpers.h"

#include <atomic>
#include <cuda/barrier>

template<typename Barrier>
struct barrier_and_token
{
using barrier_t = Barrier;
using token_t = typename barrier_t::arrival_token;

barrier_t barrier;
cuda::std::atomic<bool> parity_waiting{false};

template<typename ...Args>
__host__ __device__
barrier_and_token(Args && ...args) : barrier{ cuda::std::forward<Args>(args)... }
{
}
};

struct barrier_arrive_and_wait
{
using async = cuda::std::true_type;

template<typename Data>
__host__ __device__
static void perform(Data & data)
{
while (data.parity_waiting.load(cuda::std::memory_order_acquire) == false)
{
data.parity_waiting.wait(false);
}
data.barrier.arrive_and_wait();
}
};

template <bool Phase>
struct barrier_parity_wait
{
using async = cuda::std::true_type;

template<typename Data>
__host__ __device__
static void perform(Data & data)
{
data.parity_waiting.store(true, cuda::std::memory_order_release);
data.parity_waiting.notify_all();
data.barrier.wait_parity(Phase);
}
};

struct clear_token
{
template<typename Data>
__host__ __device__
static void perform(Data & data)
{
data.parity_waiting.store(false, cuda::std::memory_order_release);
}
};

using aw_aw_pw = performer_list<
barrier_parity_wait<false>,
barrier_arrive_and_wait,
barrier_arrive_and_wait,
async_tester_fence,
clear_token,
barrier_parity_wait<true>,
barrier_arrive_and_wait,
barrier_arrive_and_wait,
async_tester_fence,
clear_token
>;

void kernel_invoker()
{
validate_not_movable<
barrier_and_token<cuda::std::barrier<>>,
aw_aw_pw
>(2);
validate_not_movable<
barrier_and_token<cuda::barrier<cuda::thread_scope_system>>,
aw_aw_pw
>(2);
}

int main(int arg, char ** argv)
{
#ifndef __CUDA_ARCH__
kernel_invoker();
#endif

return 0;
}

64 changes: 41 additions & 23 deletions include/cuda/std/barrier
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@ class barrier : public std::__barrier_base<_CompletionF, _Sco> {
template<thread_scope>
friend class pipeline;

using std::__barrier_base<_CompletionF, _Sco>::__try_wait;

public:
barrier() = default;

Expand Down Expand Up @@ -88,24 +86,6 @@ class barrier<thread_scope_block, std::__empty_completion> : public __block_scop
public:
using arrival_token = typename __barrier_base::arrival_token;

private:
struct __poll_tester {
barrier const* __this;
arrival_token __phase;

_LIBCUDACXX_INLINE_VISIBILITY
__poll_tester(barrier const* __this_, arrival_token&& __phase_)
: __this(__this_)
, __phase(_CUDA_VSTD::move(__phase_))
{}

inline _LIBCUDACXX_INLINE_VISIBILITY
bool operator()() const
{
return __this->__try_wait(__phase);
}
};

_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait(arrival_token __phase) const {
#if __CUDA_ARCH__ >= 800
Expand All @@ -124,14 +104,35 @@ private:
else
#endif
{
return __barrier.__try_wait(std::move(__phase));
return __barrier.try_wait(std::move(__phase));
}
}

template<thread_scope>
friend class pipeline;

public:
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_parity(bool __parity) const {
#if __CUDA_ARCH__ >= 800
if (__isShared(&__barrier)) {
int __ready = 0;
asm volatile ("{\n\t"
".reg .pred p;\n\t"
"mbarrier.test_wait.parity.shared.b64 p, [%1], %2;\n\t"
"selp.b32 %0, 1, 0, p;\n\t"
"}"
: "=r"(__ready)
: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))), "r"(static_cast<std::uint32_t>(__parity))
: "memory");
return bool(__ready);
}
else
#endif
{
return __barrier.try_wait_parity(__parity);
}
}

barrier() = default;

barrier(const barrier &) = delete;
Expand Down Expand Up @@ -216,7 +217,24 @@ public:
_LIBCUDACXX_INLINE_VISIBILITY
void wait(arrival_token && __phase) const
{
_CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
_CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester<barrier>(this, _CUDA_VSTD::move(__phase)));
}

_LIBCUDACXX_INLINE_VISIBILITY
bool try_wait(arrival_token __phase) const {
return __try_wait(_CUDA_VSTD::move(__phase));
}

inline _LIBCUDACXX_INLINE_VISIBILITY
bool try_wait_parity(bool __parity) const
{
return __try_wait_parity(__parity);
}

inline _LIBCUDACXX_INLINE_VISIBILITY
void wait_parity(bool __parity) const
{
_CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester_parity<barrier>(this, __parity));
}

inline _LIBCUDACXX_INLINE_VISIBILITY
Expand Down
111 changes: 83 additions & 28 deletions libcxx/include/barrier
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,12 @@ class __barrier_base {
_LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base<ptrdiff_t, _Sco> __expected, __arrived;
_LIBCUDACXX_BARRIER_ALIGNMENTS _CompletionF __completion;
_LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base<bool, _Sco> __phase;

_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_phase(bool __old_phase) const
{
return __phase.load(memory_order_acquire) != __old_phase;
}
public:
using arrival_token = bool;

Expand Down Expand Up @@ -241,11 +247,15 @@ public:
return __old_phase;
}
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait(arrival_token __old_phase) const
bool try_wait(arrival_token __old) const
{
return __phase != __old_phase;
return __try_wait_phase(__old);
}
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_parity(bool __parity) const
{
return __try_wait_phase(__parity);
}

_LIBCUDACXX_INLINE_VISIBILITY
void wait(arrival_token&& __old_phase) const
{
Expand All @@ -270,6 +280,42 @@ public:
}
};

template<class __Barrier>
struct __barrier_poll_tester {
__Barrier const* __this;
typename __Barrier::arrival_token __phase;

_LIBCUDACXX_INLINE_VISIBILITY
__barrier_poll_tester(__Barrier const* __this_, typename __Barrier::arrival_token&& __phase_)
: __this(__this_)
, __phase(_CUDA_VSTD::move(__phase_))
{}

_LIBCUDACXX_INLINE_VISIBILITY
bool operator()() const
{
return __this->try_wait(__phase);
}
};

template<class __Barrier>
struct __barrier_poll_tester_parity {
__Barrier const* __this;
bool __parity;

_LIBCUDACXX_INLINE_VISIBILITY
__barrier_poll_tester_parity(__Barrier const* __this_, bool __parity_)
: __this(__this_)
, __parity(__parity_)
{}

inline _LIBCUDACXX_INLINE_VISIBILITY
bool operator()() const
{
return __this->try_wait_parity(__parity);
}
};

template<int _Sco>
class __barrier_base<__empty_completion, _Sco> {

Expand All @@ -285,29 +331,23 @@ public:
using arrival_token = uint64_t;

private:
struct __poll_tester {
__barrier_base const* __this;
arrival_token __phase;

_LIBCUDACXX_INLINE_VISIBILITY
__poll_tester(__barrier_base const* __this_, arrival_token&& __phase_)
: __this(__this_)
, __phase(_CUDA_VSTD::move(__phase_))
{}

inline _LIBCUDACXX_INLINE_VISIBILITY
bool operator()() const
{
return __this->__try_wait(__phase);
}
};

static inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
uint64_t __init(ptrdiff_t __count) _NOEXCEPT
{
return (((1u << 31) - __count) << 32)
| ((1u << 31) - __count);
}
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_phase(uint64_t __phase) const
{
uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
return ((__current & __phase_bit) != __phase);
}
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_parity(bool __parity) const
{
return __try_wait_phase(__parity ? __phase_bit : 0);
}

public:
__barrier_base() = default;
Expand All @@ -323,10 +363,20 @@ public:
__barrier_base& operator=(__barrier_base const&) = delete;

_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait(arrival_token __phase) const
bool __try_wait(arrival_token __old) const
{
uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
return ((__current & __phase_bit) != __phase);
return __try_wait_phase(__old & __phase_bit);
}

_LIBCUDACXX_INLINE_VISIBILITY
bool try_wait_parity(bool __parity) const
{
return __try_wait_parity(__parity);
}
_LIBCUDACXX_INLINE_VISIBILITY
bool try_wait(arrival_token __old) const
{
return __try_wait(__old);
}

_LIBCUDACXX_NODISCARD_ATTRIBUTE inline _LIBCUDACXX_INLINE_VISIBILITY
Expand All @@ -340,17 +390,22 @@ public:
}
return __old & __phase_bit;
}
inline _LIBCUDACXX_INLINE_VISIBILITY
_LIBCUDACXX_INLINE_VISIBILITY
void wait(arrival_token&& __phase) const
{
__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
__libcpp_thread_poll_with_backoff(__barrier_poll_tester<__barrier_base>(this, _CUDA_VSTD::move(__phase)));
}
inline _LIBCUDACXX_INLINE_VISIBILITY
_LIBCUDACXX_INLINE_VISIBILITY
void wait_parity(bool __parity) const
{
__libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__barrier_base>(this, __parity));
}
_LIBCUDACXX_INLINE_VISIBILITY
void arrive_and_wait()
{
wait(arrive());
}
inline _LIBCUDACXX_INLINE_VISIBILITY
_LIBCUDACXX_INLINE_VISIBILITY
void arrive_and_drop()
{
__phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed);
Expand Down