Add cuda::ptx::red_async (#1080)
* Add red.async

* Change spelling in test

* Create separate overload for each variant

* Remove redundant include

* Add red.async.add.s64 emulation
ahendriksen authored Nov 13, 2023
1 parent 22a570d commit 9bbec0c
Showing 3 changed files with 871 additions and 18 deletions.
@@ -0,0 +1,183 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads

// <cuda/ptx>

#include <cuda/ptx>
#include <cuda/std/utility>

/*
* We use a special strategy to force the generation of the PTX. This is mainly
* a fight against dead-code-elimination in the NVVM layer.
*
 * We need this strategy because certain older versions of ptxas segfault when
 * a nonsensical sequence of PTX is generated. So instead, we force the
 * instantiation and compilation to PTX of all the overloads of the PTX
 * wrapping functions.
 *
 * We do this by writing a pointer to each overload to the `__device__`
 * variable `fn_ptr`. Because weak stores from a single thread may be elided,
 * we also wrap each store in an if branch that cannot be removed.
 *
 * To prevent dead-code-elimination of the if branch, we use
 * `non_eliminated_false`, which uses inline assembly to hide from NVVM the
 * fact that it is always false.
 *
 * This is how we ensure that none of the function pointer stores are elided.
 * Because `fn_ptr` is potentially visible outside this translation unit, the
 * compiler must compile all the functions whose addresses are stored.
*
*/

__device__ void* fn_ptr = nullptr;

__device__ bool non_eliminated_false(void) {
  int ret = 0;
  asm("" : "=r"(ret)::);
  return ret != 0;
}

__global__ void test_compilation() {
#if __cccl_ptx_isa >= 810
NV_IF_TARGET(NV_PROVIDES_SM_90, (
if (non_eliminated_false()) {
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [dest], value, [remote_bar];
auto overload = static_cast<void (*)(cuda::ptx::op_inc_t, uint32_t* , const uint32_t& , uint64_t* )>(cuda::ptx::red_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
));
#endif // __cccl_ptx_isa >= 810

#if __cccl_ptx_isa >= 810
NV_IF_TARGET(NV_PROVIDES_SM_90, (
if (non_eliminated_false()) {
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [dest], value, [remote_bar];
auto overload = static_cast<void (*)(cuda::ptx::op_dec_t, uint32_t* , const uint32_t& , uint64_t* )>(cuda::ptx::red_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
));
#endif // __cccl_ptx_isa >= 810

#if __cccl_ptx_isa >= 810
NV_IF_TARGET(NV_PROVIDES_SM_90, (
if (non_eliminated_false()) {
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [dest], value, [remote_bar];
auto overload = static_cast<void (*)(cuda::ptx::op_min_t, uint32_t* , const uint32_t& , uint64_t* )>(cuda::ptx::red_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
));
#endif // __cccl_ptx_isa >= 810

#if __cccl_ptx_isa >= 810
NV_IF_TARGET(NV_PROVIDES_SM_90, (
if (non_eliminated_false()) {
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [dest], value, [remote_bar];
auto overload = static_cast<void (*)(cuda::ptx::op_max_t, uint32_t* , const uint32_t& , uint64_t* )>(cuda::ptx::red_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
));
#endif // __cccl_ptx_isa >= 810

#if __cccl_ptx_isa >= 810
NV_IF_TARGET(NV_PROVIDES_SM_90, (
if (non_eliminated_false()) {
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [dest], value, [remote_bar];
auto overload = static_cast<void (*)(cuda::ptx::op_add_t, uint32_t* , const uint32_t& , uint64_t* )>(cuda::ptx::red_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
));
#endif // __cccl_ptx_isa >= 810

#if __cccl_ptx_isa >= 810
NV_IF_TARGET(NV_PROVIDES_SM_90, (
if (non_eliminated_false()) {
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [dest], value, [remote_bar];
auto overload = static_cast<void (*)(cuda::ptx::op_min_t, uint32_t* , const int32_t& , uint64_t* )>(cuda::ptx::red_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
));
#endif // __cccl_ptx_isa >= 810

#if __cccl_ptx_isa >= 810
NV_IF_TARGET(NV_PROVIDES_SM_90, (
if (non_eliminated_false()) {
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [dest], value, [remote_bar];
auto overload = static_cast<void (*)(cuda::ptx::op_max_t, uint32_t* , const int32_t& , uint64_t* )>(cuda::ptx::red_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
));
#endif // __cccl_ptx_isa >= 810

#if __cccl_ptx_isa >= 810
NV_IF_TARGET(NV_PROVIDES_SM_90, (
if (non_eliminated_false()) {
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [dest], value, [remote_bar];
auto overload = static_cast<void (*)(cuda::ptx::op_add_t, uint32_t* , const int32_t& , uint64_t* )>(cuda::ptx::red_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
));
#endif // __cccl_ptx_isa >= 810

#if __cccl_ptx_isa >= 810
NV_IF_TARGET(NV_PROVIDES_SM_90, (
if (non_eliminated_false()) {
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [dest], value, [remote_bar];
auto overload = static_cast<void (*)(cuda::ptx::op_and_op_t, int32_t* , const int32_t& , uint64_t* )>(cuda::ptx::red_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
));
#endif // __cccl_ptx_isa >= 810

#if __cccl_ptx_isa >= 810
NV_IF_TARGET(NV_PROVIDES_SM_90, (
if (non_eliminated_false()) {
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [dest], value, [remote_bar];
auto overload = static_cast<void (*)(cuda::ptx::op_or_op_t, int32_t* , const int32_t& , uint64_t* )>(cuda::ptx::red_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
));
#endif // __cccl_ptx_isa >= 810

#if __cccl_ptx_isa >= 810
NV_IF_TARGET(NV_PROVIDES_SM_90, (
if (non_eliminated_false()) {
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [dest], value, [remote_bar];
auto overload = static_cast<void (*)(cuda::ptx::op_xor_op_t, int32_t* , const int32_t& , uint64_t* )>(cuda::ptx::red_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
));
#endif // __cccl_ptx_isa >= 810

#if __cccl_ptx_isa >= 810
NV_IF_TARGET(NV_PROVIDES_SM_90, (
if (non_eliminated_false()) {
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar];
auto overload = static_cast<void (*)(cuda::ptx::op_add_t, uint64_t* , const uint64_t& , uint64_t* )>(cuda::ptx::red_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
));
#endif // __cccl_ptx_isa >= 810

#if __cccl_ptx_isa >= 810
NV_IF_TARGET(NV_PROVIDES_SM_90, (
if (non_eliminated_false()) {
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; // .u64 intentional
auto overload = static_cast<void (*)(cuda::ptx::op_add_t, int64_t* , const int64_t& , int64_t* )>(cuda::ptx::red_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
));
#endif // __cccl_ptx_isa >= 810
}

int main(int, char**)
{
return 0;
}
183 changes: 166 additions & 17 deletions libcudacxx/docs/extended_api/ptx.md
@@ -490,30 +490,30 @@ int main() {

### [9.7.12. Parallel Synchronization and Communication Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions)

| Instruction | Available in libcu++ |
|-----------------------|-------------------------|
| [`bar, barrier`] | No |
| [`bar.warp.sync`] | No |
| [`barrier.cluster`] | No |
| [`membar/fence`] | No |
| [`atom`] | No |
| [`red`] | No |
| [`red.async`] | CTK-FUTURE, CCCL v2.3.0 |
| [`vote (deprecated)`] | No |
| [`vote.sync`] | No |
| [`match.sync`] | No |
| [`activemask`] | No |
| [`redux.sync`] | No |
| [`griddepcontrol`] | No |
| [`elect.sync`] | No |

[`bar, barrier`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier
[`bar.warp.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync
[`barrier.cluster`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster
[`membar/fence`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence
[`atom`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
[`red`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red
[`red.async`]: #redasync
[`vote (deprecated)`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated
[`vote.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync
[`match.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync
@@ -522,6 +522,155 @@ int main() {
[`griddepcontrol`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol
[`elect.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync

#### `red.async`

- PTX ISA: [`red.async`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async)

PTX does not currently (CTK 12.3) expose `red.async.add.s64`. In `cuda::ptx`, this operation is emulated with the `.u64` variant (two's-complement addition is bit-identical for signed and unsigned operands), using the following overload:

```cuda
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90
// .op = { .add }
template <typename=void>
__device__ static inline void red_async(
cuda::ptx::op_add_t,
int64_t* dest,
const int64_t& value,
int64_t* remote_bar);
```
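
For illustration (not part of this commit), the hedged sketch below shows a call site for this emulated overload. Because the wrapper reuses the unsigned `red.async...add.u64` instruction, the call looks the same as for the other overloads; `remote_count` and `remote_bar` are hypothetical names for pointers into a peer CTA's shared memory (a fuller usage sketch follows at the end of this section).

```cuda
#include <cuda/ptx>
#include <cstdint>

// Hedged sketch: `remote_count` and `remote_bar` are assumed to point into
// the shared memory of a peer CTA in the cluster, obtained elsewhere.
__device__ void add_one_s64(int64_t* remote_count, int64_t* remote_bar)
{
  // Dispatches to the emulated .s64 add, which emits red.async...add.u64.
  cuda::ptx::red_async(cuda::ptx::op_add_t{}, remote_count, int64_t{1}, remote_bar);
}
```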

**red_async**:
```cuda
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90
// .type = { .u32 }
// .op = { .inc }
template <typename=void>
__device__ static inline void red_async(
cuda::ptx::op_inc_t,
uint32_t* dest,
const uint32_t& value,
uint64_t* remote_bar);
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90
// .type = { .u32 }
// .op = { .dec }
template <typename=void>
__device__ static inline void red_async(
cuda::ptx::op_dec_t,
uint32_t* dest,
const uint32_t& value,
uint64_t* remote_bar);
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90
// .type = { .u32 }
// .op = { .min }
template <typename=void>
__device__ static inline void red_async(
cuda::ptx::op_min_t,
uint32_t* dest,
const uint32_t& value,
uint64_t* remote_bar);
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90
// .type = { .u32 }
// .op = { .max }
template <typename=void>
__device__ static inline void red_async(
cuda::ptx::op_max_t,
uint32_t* dest,
const uint32_t& value,
uint64_t* remote_bar);
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90
// .type = { .u32 }
// .op = { .add }
template <typename=void>
__device__ static inline void red_async(
cuda::ptx::op_add_t,
uint32_t* dest,
const uint32_t& value,
uint64_t* remote_bar);
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90
// .type = { .s32 }
// .op = { .min }
template <typename=void>
__device__ static inline void red_async(
cuda::ptx::op_min_t,
uint32_t* dest,
const int32_t& value,
uint64_t* remote_bar);
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90
// .type = { .s32 }
// .op = { .max }
template <typename=void>
__device__ static inline void red_async(
cuda::ptx::op_max_t,
uint32_t* dest,
const int32_t& value,
uint64_t* remote_bar);
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90
// .type = { .s32 }
// .op = { .add }
template <typename=void>
__device__ static inline void red_async(
cuda::ptx::op_add_t,
uint32_t* dest,
const int32_t& value,
uint64_t* remote_bar);
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90
// .type = { .b32 }
// .op = { .and }
template <typename B32>
__device__ static inline void red_async(
cuda::ptx::op_and_op_t,
B32* dest,
const B32& value,
uint64_t* remote_bar);
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90
// .type = { .b32 }
// .op = { .or }
template <typename B32>
__device__ static inline void red_async(
cuda::ptx::op_or_op_t,
B32* dest,
const B32& value,
uint64_t* remote_bar);
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90
// .type = { .b32 }
// .op = { .xor }
template <typename B32>
__device__ static inline void red_async(
cuda::ptx::op_xor_op_t,
B32* dest,
const B32& value,
uint64_t* remote_bar);
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90
// .type = { .u64 }
// .op = { .add }
template <typename=void>
__device__ static inline void red_async(
cuda::ptx::op_add_t,
uint64_t* dest,
const uint64_t& value,
uint64_t* remote_bar);
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90
// .op = { .add }
template <typename=void>
__device__ static inline void red_async(
cuda::ptx::op_add_t,
int64_t* dest,
const int64_t& value,
int64_t* remote_bar);
```
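
As a usage illustration (not part of this commit), the sketch below shows one way a kernel might issue an asynchronous add into a peer CTA's shared memory. It assumes an SM_90 cluster launch, and it relies on `cooperative_groups::cluster_group::map_shared_rank` and `cuda::device::barrier_native_handle` to obtain the remote pointers; the kernel name and variable names are placeholders, and mbarrier transaction accounting plus the completion wait are omitted for brevity.

```cuda
#include <cuda/ptx>
#include <cuda/barrier>
#include <cooperative_groups.h>
#include <cstdint>

namespace cg = cooperative_groups;

// Each CTA in a 2-CTA cluster asynchronously increments a counter that lives
// in the *other* CTA's shared memory, signaling the peer's mbarrier.
__global__ void __cluster_dims__(2, 1, 1) increment_peer_counter()
{
  __shared__ uint32_t counter;
  __shared__ cuda::barrier<cuda::thread_scope_block> bar; // backing mbarrier

  if (threadIdx.x == 0)
  {
    counter = 0;
    init(&bar, blockDim.x);
  }
  __syncthreads();

  cg::cluster_group cluster = cg::this_cluster();
  cluster.sync(); // ensure both CTAs have initialized their shared state

  // Map our shared variables into the address window of the neighboring CTA.
  const unsigned peer      = cluster.block_rank() ^ 1;
  uint32_t* remote_counter = cluster.map_shared_rank(&counter, peer);
  uint64_t* remote_bar     = cluster.map_shared_rank(
      cuda::device::barrier_native_handle(bar), peer);

  // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32
  cuda::ptx::red_async(cuda::ptx::op_add_t{}, remote_counter, 1u, remote_bar);

  // Waiting on the peer's mbarrier for the transaction to complete is omitted.
  cluster.sync();
}
```

The mbarrier passed as `remote_bar` is how the receiving CTA can later observe completion of the asynchronous reduction, which is why every `red_async` overload takes a barrier pointer alongside the destination.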

### [9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier)

| Instruction | Available in libcu++ |