forked from NVIDIA/cccl
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
PTX: Add
cuda::ptx::elect_sync
(NVIDIA#1537)
* PTX: Add `cuda::ptx::elect_sync` * Add link to invoke_one API
- Loading branch information
1 parent
0ff0f61
commit 3d629a9
Showing
5 changed files
with
145 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# elect.sync | ||
|
||
- PTX ISA: [`elect.sync`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync) | ||
|
||
*Implementation note:* Since C++ does not support returning multiple values, the | ||
variant of the instruction that returns both a predicate and an updated | ||
membermask is not supported. | ||
|
||
This instruction can also be accessed through the cooperative groups | ||
~invoke_one~ | ||
[API](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#invoke-one-and-invoke-one-broadcast). | ||
|
||
| C++ | PTX | | ||
| [(0)](#0-elect_sync) `cuda::ptx::elect_sync`| `elect.sync` | | ||
|
||
|
||
### [(0)](#0-elect_sync) `elect_sync` | ||
{: .no_toc } | ||
```cuda | ||
// elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90 | ||
template <typename=void> | ||
__device__ static inline bool elect_sync( | ||
const uint32_t& membermask); | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
71 changes: 71 additions & 0 deletions
71
libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/elect_sync.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
// -*- C++ -*- | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of libcu++, the C++ Standard Library for your entire system, | ||
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#ifndef _CUDA_PTX_ELECT_SYNC_H_ | ||
#define _CUDA_PTX_ELECT_SYNC_H_ | ||
|
||
#ifndef __cuda_std__ | ||
# include <__config> | ||
#endif // __cuda_std__ | ||
|
||
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) | ||
# pragma GCC system_header | ||
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) | ||
# pragma clang system_header | ||
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) | ||
# pragma system_header | ||
#endif // no system header | ||
|
||
#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends | ||
|
||
#include "../ptx_dot_variants.h" | ||
#include "../ptx_helper_functions.h" | ||
#include "../../../cstdint" | ||
|
||
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX | ||
|
||
// elect_sync | ||
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync | ||
/* | ||
// elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90 | ||
template <typename=void> | ||
__device__ static inline bool elect_sync( | ||
const uint32_t& membermask); | ||
*/ | ||
#if __cccl_ptx_isa >= 800 | ||
extern "C" _CCCL_DEVICE void __cuda_ptx_elect_sync_is_not_supported_before_SM_90__(); | ||
template <typename=void> | ||
_CCCL_DEVICE static inline bool elect_sync( | ||
const _CUDA_VSTD::uint32_t& __membermask) | ||
{ | ||
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( | ||
_CUDA_VSTD::uint32_t __is_elected; | ||
asm volatile ( | ||
"{\n\t .reg .pred P_OUT; \n\t" | ||
"elect.sync _|P_OUT, %1;\n\t" | ||
"selp.b32 %0, 1, 0, P_OUT; \n" | ||
"}" | ||
: "=r"(__is_elected) | ||
: "r"(__membermask) | ||
: | ||
); | ||
return static_cast<bool>(__is_elected); | ||
),( | ||
// Unsupported architectures will have a linker error with a semi-decent error message | ||
__cuda_ptx_elect_sync_is_not_supported_before_SM_90__(); | ||
return false; | ||
)); | ||
} | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
_LIBCUDACXX_END_NAMESPACE_CUDA_PTX | ||
|
||
#endif // _CUDA_PTX_ELECT_SYNC_H_ |
46 changes: 46 additions & 0 deletions
46
libcudacxx/test/libcudacxx/cuda/ptx/ptx.elect.sync.compile.pass.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of libcu++, the C++ Standard Library for your entire system, | ||
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// UNSUPPORTED: libcpp-has-no-threads | ||
|
||
// <cuda/ptx> | ||
|
||
#include <cuda/ptx> | ||
#include <cuda/std/utility> | ||
|
||
/* | ||
* We use a special strategy to force the generation of the PTX. This is mainly | ||
* a fight against dead-code-elimination in the NVVM layer. | ||
* | ||
* The reason we need this strategy is because certain older versions of ptxas | ||
* segfault when a non-sensical sequence of PTX is generated. So instead, we try | ||
* to force the instantiation and compilation to PTX of all the overloads of the | ||
* PTX wrapping functions. | ||
* | ||
* We do this by writing a function pointer of each overload to the kernel | ||
* parameter `fn_ptr`. | ||
* | ||
* Because `fn_ptr` is possibly visible outside this translation unit, the | ||
* compiler must compile all the functions which are stored. | ||
* | ||
*/ | ||
|
||
__global__ void test_elect_sync(void ** fn_ptr) { | ||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET(NV_PROVIDES_SM_90, ( | ||
// elect.sync _|is_elected, membermask; | ||
*fn_ptr++ = reinterpret_cast<void*>(static_cast<bool (*)(const uint32_t& )>(cuda::ptx::elect_sync)); | ||
)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
} | ||
|
||
int main(int, char**) | ||
{ | ||
return 0; | ||
} |