Skip to content

Commit

Permalink
PTX: Add cuda::ptx::elect_sync (NVIDIA#1537)
Browse files Browse the repository at this point in the history
* PTX: Add `cuda::ptx::elect_sync`

* Add link to invoke_one API
  • Loading branch information
ahendriksen authored Mar 14, 2024
1 parent 0ff0f61 commit 3d629a9
Show file tree
Hide file tree
Showing 5 changed files with 145 additions and 3 deletions.
4 changes: 2 additions & 2 deletions libcudacxx/docs/ptx.md
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
| [`activemask`] | No |
| [`redux.sync`] | No |
| [`griddepcontrol`] | No |
| [`elect.sync`] | No |
| [`elect.sync`] | CTK-FUTURE, CCCL v2.5.0 |

[`bar, barrier`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier
[`bar.warp.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync
Expand All @@ -416,7 +416,7 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
[`activemask`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask
[`redux.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync
[`griddepcontrol`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol
[`elect.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync
[`elect.sync`]: ptx/instructions/elect.sync.md

### [Parallel Synchronization and Communication Instructions: mbarrier](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier)

Expand Down
24 changes: 24 additions & 0 deletions libcudacxx/docs/ptx/instructions/elect.sync.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# elect.sync

- PTX ISA: [`elect.sync`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync)

*Implementation note:* Since C++ does not support returning multiple values, the
variant of the instruction that returns both a predicate and an updated
membermask is not supported.

This instruction can also be accessed through the cooperative groups
~invoke_one~
[API](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#invoke-one-and-invoke-one-broadcast).

| C++ | PTX |
| [(0)](#0-elect_sync) `cuda::ptx::elect_sync`| `elect.sync` |


### [(0)](#0-elect_sync) `elect_sync`
{: .no_toc }
```cuda
// elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90
template <typename=void>
__device__ static inline bool elect_sync(
const uint32_t& membermask);
```
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
//===----------------------------------------------------------------------===//

#ifndef _LIBCUDACXX___CUDA_PTX_H
#define _LIBCUDACXX___CUDA_PTX_H
#define _LIBCUDACXX___CUDA_PTX_H

#ifndef __cuda_std__
#error "<__cuda/ptx.h> should only be included in from <cuda/ptx>"
Expand Down Expand Up @@ -78,6 +78,7 @@
#include "ptx/instructions/cp_async_bulk_wait_group.h"
#include "ptx/instructions/cp_reduce_async_bulk.h"
#include "ptx/instructions/cp_reduce_async_bulk_tensor.h"
#include "ptx/instructions/elect_sync.h"
#include "ptx/instructions/fence.h"
#include "ptx/instructions/get_sreg.h"
#include "ptx/instructions/getctarank.h"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_PTX_ELECT_SYNC_H_
#define _CUDA_PTX_ELECT_SYNC_H_

#ifndef __cuda_std__
# include <__config>
#endif // __cuda_std__

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends

#include "../ptx_dot_variants.h"
#include "../ptx_helper_functions.h"
#include "../../../cstdint"

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX

// elect_sync
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync
/*
// elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90
template <typename=void>
__device__ static inline bool elect_sync(
const uint32_t& membermask);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_elect_sync_is_not_supported_before_SM_90__();
template <typename=void>
_CCCL_DEVICE static inline bool elect_sync(
const _CUDA_VSTD::uint32_t& __membermask)
{
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
_CUDA_VSTD::uint32_t __is_elected;
asm volatile (
"{\n\t .reg .pred P_OUT; \n\t"
"elect.sync _|P_OUT, %1;\n\t"
"selp.b32 %0, 1, 0, P_OUT; \n"
"}"
: "=r"(__is_elected)
: "r"(__membermask)
:
);
return static_cast<bool>(__is_elected);
),(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_elect_sync_is_not_supported_before_SM_90__();
return false;
));
}
#endif // __cccl_ptx_isa >= 800

_LIBCUDACXX_END_NAMESPACE_CUDA_PTX

#endif // _CUDA_PTX_ELECT_SYNC_H_
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads

// <cuda/ptx>

#include <cuda/ptx>
#include <cuda/std/utility>

/*
* We use a special strategy to force the generation of the PTX. This is mainly
* a fight against dead-code-elimination in the NVVM layer.
*
* The reason we need this strategy is because certain older versions of ptxas
* segfault when a non-sensical sequence of PTX is generated. So instead, we try
* to force the instantiation and compilation to PTX of all the overloads of the
* PTX wrapping functions.
*
* We do this by writing a function pointer of each overload to the kernel
* parameter `fn_ptr`.
*
* Because `fn_ptr` is possibly visible outside this translation unit, the
* compiler must compile all the functions which are stored.
*
*/

__global__ void test_elect_sync(void ** fn_ptr) {
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// elect.sync _|is_elected, membermask;
*fn_ptr++ = reinterpret_cast<void*>(static_cast<bool (*)(const uint32_t& )>(cuda::ptx::elect_sync));
));
#endif // __cccl_ptx_isa >= 800
}

int main(int, char**)
{
return 0;
}

0 comments on commit 3d629a9

Please sign in to comment.