PTX: Add cuda::ptx::elect_sync (NVIDIA#1537)

* PTX: Add `cuda::ptx::elect_sync` * Add link to invoke_one API
ahendriksen · Mar 14, 2024 · 3d629a9 · 3d629a9
1 parent 0ff0f61
commit 3d629a9
Show file tree

Hide file tree

Showing 5 changed files with 145 additions and 3 deletions.
diff --git a/libcudacxx/docs/ptx.md b/libcudacxx/docs/ptx.md
@@ -400,7 +400,7 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
 | [`activemask`]        | No                      |
 | [`redux.sync`]        | No                      |
 | [`griddepcontrol`]    | No                      |
-| [`elect.sync`]        | No                      |
+| [`elect.sync`]        | CTK-FUTURE, CCCL v2.5.0 |
 
 [`bar, barrier`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier
 [`bar.warp.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync
@@ -416,7 +416,7 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
 [`activemask`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask
 [`redux.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync
 [`griddepcontrol`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol
-[`elect.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync
+[`elect.sync`]: ptx/instructions/elect.sync.md
 
 ### [Parallel Synchronization and Communication Instructions: mbarrier](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier)
 

diff --git a/libcudacxx/docs/ptx/instructions/elect.sync.md b/libcudacxx/docs/ptx/instructions/elect.sync.md
@@ -0,0 +1,24 @@
+# elect.sync
+
+- PTX ISA: [`elect.sync`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync)
+
+*Implementation note:* Since C++ does not support returning multiple values, the
+variant of the instruction that returns both a predicate and an updated
+membermask is not supported.
+
+This instruction can also be accessed through the cooperative groups
+~invoke_one~
+[API](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#invoke-one-and-invoke-one-broadcast).
+
+| C++ | PTX |
+| [(0)](#0-elect_sync) `cuda::ptx::elect_sync`| `elect.sync` |
+
+
+### [(0)](#0-elect_sync) `elect_sync`
+{: .no_toc }
+```cuda
+// elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90
+template <typename=void>
+__device__ static inline bool elect_sync(
+  const uint32_t& membermask);
+```
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
@@ -10,7 +10,7 @@
 //===----------------------------------------------------------------------===//
 
 #ifndef _LIBCUDACXX___CUDA_PTX_H
-#define  _LIBCUDACXX___CUDA_PTX_H
+#define _LIBCUDACXX___CUDA_PTX_H
 
 #ifndef __cuda_std__
 #error "<__cuda/ptx.h> should only be included in from <cuda/ptx>"
@@ -78,6 +78,7 @@
 #include "ptx/instructions/cp_async_bulk_wait_group.h"
 #include "ptx/instructions/cp_reduce_async_bulk.h"
 #include "ptx/instructions/cp_reduce_async_bulk_tensor.h"
+#include "ptx/instructions/elect_sync.h"
 #include "ptx/instructions/fence.h"
 #include "ptx/instructions/get_sreg.h"
 #include "ptx/instructions/getctarank.h"

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/elect_sync.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/elect_sync.h
@@ -0,0 +1,71 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_ELECT_SYNC_H_
+#define _CUDA_PTX_ELECT_SYNC_H_
+
+#ifndef __cuda_std__
+#  include <__config>
+#endif // __cuda_std__
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+#include "../ptx_dot_variants.h"
+#include "../ptx_helper_functions.h"
+#include "../../../cstdint"
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+// elect_sync
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync
+/*
+// elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90
+template <typename=void>
+__device__ static inline bool elect_sync(
+  const uint32_t& membermask);
+*/
+#if __cccl_ptx_isa >= 800
+extern "C" _CCCL_DEVICE void __cuda_ptx_elect_sync_is_not_supported_before_SM_90__();
+template <typename=void>
+_CCCL_DEVICE static inline bool elect_sync(
+  const _CUDA_VSTD::uint32_t& __membermask)
+{
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    _CUDA_VSTD::uint32_t __is_elected;
+    asm volatile (
+      "{\n\t .reg .pred P_OUT; \n\t"
+      "elect.sync _|P_OUT, %1;\n\t"
+      "selp.b32 %0, 1, 0, P_OUT; \n"
+      "}"
+      : "=r"(__is_elected)
+      : "r"(__membermask)
+      :
+    );
+    return static_cast<bool>(__is_elected);
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_elect_sync_is_not_supported_before_SM_90__();
+    return false;
+  ));
+}
+#endif // __cccl_ptx_isa >= 800
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_ELECT_SYNC_H_
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.elect.sync.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.elect.sync.compile.pass.cpp
@@ -0,0 +1,46 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+/*
+ * We use a special strategy to force the generation of the PTX. This is mainly
+ * a fight against dead-code-elimination in the NVVM layer.
+ *
+ * The reason we need this strategy is because certain older versions of ptxas
+ * segfault when a non-sensical sequence of PTX is generated. So instead, we try
+ * to force the instantiation and compilation to PTX of all the overloads of the
+ * PTX wrapping functions.
+ *
+ * We do this by writing a function pointer of each overload to the kernel
+ * parameter `fn_ptr`.
+ *
+ * Because `fn_ptr` is possibly visible outside this translation unit, the
+ * compiler must compile all the functions which are stored.
+ *
+ */
+
+__global__ void test_elect_sync(void ** fn_ptr) {
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // elect.sync _|is_elected, membermask;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<bool (*)(const uint32_t& )>(cuda::ptx::elect_sync));
+  ));
+#endif // __cccl_ptx_isa >= 800
+}
+
+int main(int, char**)
+{
+    return 0;
+}