From 16ad54ae7e99d0d4dc2338acc63a4d7c507bcb37 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Tue, 17 Oct 2023 16:27:24 +0200
Subject: [PATCH 01/49] Initial proof-of-concept for PTX header

---
 .../test/cuda/ptx/mbarrier_arrive_tx.pass.cpp |  52 +++
 libcudacxx/include/cuda/ptx                   | 352 ++++++++++++++++++
 .../cuda/std/detail/libcxx/include/__config   |   2 +
 3 files changed, 406 insertions(+)
 create mode 100644 libcudacxx/.upstream-tests/test/cuda/ptx/mbarrier_arrive_tx.pass.cpp
 create mode 100644 libcudacxx/include/cuda/ptx

diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/mbarrier_arrive_tx.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/mbarrier_arrive_tx.pass.cpp
new file mode 100644
index 00000000000..f72406bdeb2
--- /dev/null
+++ b/libcudacxx/.upstream-tests/test/cuda/ptx/mbarrier_arrive_tx.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+// UNSUPPORTED: pre-sm-90
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+
+#include <cuda/std/cstdint>
+
+#include "concurrent_agents.h"
+#include "cuda_space_selector.h"
+#include "test_macros.h"
+
+int main(int, char**)
+{
+  NV_DISPATCH_TARGET(
+    NV_IS_HOST, (
+      // Required by concurrent_agents_launch to know how many we're
+      // launching. This can only be an int, because the nvrtc tests use grep
+      // to figure out how many threads to launch.
+      cuda_thread_count = 1;
+    ),
+    NV_IS_DEVICE, (
+      // Do not execute. Just check if this compiles (that is: assembles) without error.
+      if (false) {
+        using cuda::ptx::sem_release;
+        using cuda::ptx::space_shared_cluster;
+        using cuda::ptx::space_shared;
+        using cuda::ptx::scope_cluster;
+        using cuda::ptx::scope_cta;
+
+        __shared__ uint64_t bar;
+        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1);
+        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1);
+
+        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1);
+        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1);
+      }
+    )
+  );
+
+  return 0;
+}
diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
new file mode 100644
index 00000000000..31d22a40231
--- /dev/null
+++ b/libcudacxx/include/cuda/ptx
@@ -0,0 +1,352 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX +#define _CUDA_PTX + +#include "std/detail/__config" // Macros +#include "std/type_traits" // std::integral_constant +#include "../nv/target" // __CUDA_MINIMUM_ARCH__ and friends + +/* + * The cuda::ptx namespace intends to provide PTX wrappers for new hardware + * features and new PTX instructions so that they can be experimented with + * before higher-level C++ APIs are designed and developed. + * + * The wrappers have the following responsibilities: + * + * - They must prevent any PTX assembler errors, that is: + * - They are defined only for versions of the CUDA Toolkit in which nvcc/ptxas + * actually recognizes the instruction. + * - Sizes and types of parameters are correct. + * - They must convert state spaces correctly. + * - They adhere to the libcu++ coding standards of using: + * - Double underscores for all parameters, variables + * - _CUDA_VSTD:: namespace for types + * + * The wrappers should not do the following: + * + * - Use any non-native types. For example, an mbarrier instruction wrapper + * takes the barrier address as a uint64_t pointer. + * + * This header is intended for: + * + * - internal consumption by higher-level APIs such as cuda::barrier, + * - outside developers who want to experiment with the latest features of the + * hardware. + * + */ + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +/* + * Integral constant types and values for + * + * - .sem + * - .space + * - .scope + * + * Skipping some steps in my reasoning: If we want to keep the PTX bindings + * relatively stable, and also be able to adapt to additions of semantics, + * space, and scope variants of a PTX instruction, then we must be able to add + * new overloads of an instruction with .sem, .space, or .scope as type-level + * parameters. + * + */ + +enum class dot_sem { + acq_rel, + acquire, + relaxed, + release, + sc, + weak + // mmio? + // volatile? 
+}; + +enum class dot_space { + reg, + sreg, + const_mem, // can't use const + global, + local, + param, + shared, + shared_cluster, + tex // deprecated +}; + +enum class dot_scope { + cta, + cluster, + gpu, + sys +}; + +template +using sem_t = std::integral_constant; +using sem_acq_rel_t = sem_t; +using sem_acquire_t = sem_t; +using sem_relaxed_t = sem_t; +using sem_release_t = sem_t; +using sem_sc_t = sem_t; +using sem_weak_t = sem_t; + +static constexpr sem_acq_rel_t sem_acq_rel{}; +static constexpr sem_acquire_t sem_acquire{}; +static constexpr sem_relaxed_t sem_relaxed{}; +static constexpr sem_release_t sem_release{}; +static constexpr sem_sc_t sem_sc{}; +static constexpr sem_weak_t sem_weak{}; + +template +using space_t = std::integral_constant; +using space_const_mem_t = std::integral_constant; +using space_global_t = std::integral_constant; +using space_local_t = std::integral_constant; +using space_param_t = std::integral_constant; +using space_reg_t = std::integral_constant; +using space_shared_t = std::integral_constant; +using space_shared_cluster_t = std::integral_constant; +using space_sreg_t = std::integral_constant; +using space_tex_t = std::integral_constant; + +static constexpr space_const_mem_t space_const_mem{}; +static constexpr space_global_t space_global{}; +static constexpr space_local_t space_local{}; +static constexpr space_param_t space_param{}; +static constexpr space_reg_t space_reg{}; +static constexpr space_shared_t space_shared{}; +static constexpr space_shared_cluster_t space_shared_cluster{}; +static constexpr space_sreg_t space_sreg{}; +static constexpr space_tex_t space_tex{}; + +template +using scope_t = std::integral_constant; +using scope_cluster_t = std::integral_constant; +using scope_cta_t = std::integral_constant; +using scope_gpu_t = std::integral_constant; +using scope_sys_t = std::integral_constant; + +static constexpr scope_cluster_t scope_cluster{}; +static constexpr scope_cta_t scope_cta{}; +static constexpr scope_gpu_t scope_gpu{}; +static constexpr scope_sys_t scope_sys{}; + + +inline __device__ _CUDA_VSTD::uint32_t __as_smem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); } +inline __device__ _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); } +inline __device__ _CUDA_VSTD::uint64_t __as_gmem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr)); } + + + +// SM 90 features +// -------------- + +/* + * TMA / cp.async.bulk + * + */ + +// cp.async.bulk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk + +// cp.reduce.async.bulk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk + +// cp.async.bulk.tensor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor + +// cp.reduce.async.bulk.tensor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor + +// cp.async.bulk.commit_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group + +// cp.async.bulk.wait_group +// 
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
+
+
+// Lower priority:
+
+// prefetch{.tensormap_space}.tensormap [a]; // prefetch the tensormap
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu
+
+// cp.async.bulk.prefetch
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
+
+// cp.async.bulk.prefetch.tensor
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor
+
+/*
+ * Shared memory barrier
+ *
+ */
+
+// mbarrier.expect_tx
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx
+
+// mbarrier.complete_tx
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx
+
+// mbarrier.arrive.expect_tx
+// Support for count argument without the modifier .noComplete requires sm_90 or higher.
+// Qualifier .expect_tx requires sm_90 or higher.
+// Sub-qualifier ::cluster requires sm_90 or higher.
+// Support for .cluster scope requires sm_90 or higher.
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
+
+
+
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
+#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
+template <dot_scope _Sco>
+_LIBCUDACXX_DEVICE inline
+_CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_t __spc, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count)
+{
+  // Arrive on local shared memory barrier
+  static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+  _CUDA_VSTD::uint64_t __token;
+
+  if constexpr (__scope == scope_cta) {
+    asm (
+      "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
+      : "=l"(__token)
+      : "r"(__as_smem_ptr(__addr)),
+        "r"(__tx_count)
+      : "memory");
+  } else {
+    asm (
+      "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;"
+      : "=l"(__token)
+      : "r"(__as_smem_ptr(__addr)),
+        "r"(__tx_count)
+      : "memory");
+  }
+  return __token;
+}
+
+template <dot_scope _Sco>
+_LIBCUDACXX_DEVICE inline
+void mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_cluster_t __spc, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count)
+{
+  // Arrive on remote cluster barrier
+  static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+  if constexpr (__scope == scope_cta) {
+    asm (
+      "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;"
+      :
+      : "r"(__as_smem_ptr(__addr)),
+        "r"(__tx_count)
+      : "memory");
+  } else {
+    asm (
+      "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;"
+      :
+      : "r"(__as_smem_ptr(__addr)),
+        "r"(__tx_count)
+      : "memory");
+  }
+}
+#endif // __CUDA_MINIMUM_ARCH__
+
+
+
+
+// mbarrier.test_wait/mbarrier.try_wait
+// mbarrier.try_wait requires sm_90 or higher.
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait + + +/* + * Cluster Basics: + * + * These instructions are already exposed at a higher level, so may not be necessary. + */ + +// mapa{.space}.type d, a, b; +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa + +// getctarank{.space}.type d, a; +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank + +// barrier.cluster +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster + +// atom .cluster +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom + +// red .cluster +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red + +/* + * Cluster async + * + */ + +// st.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async + +// red.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async + +/* + * + * Other instructions + */ + +// fence.proxy.async.{global, shared::{cta, cluster}} +// fence.mbarrier_init.release.cluster (may be a bit overkill??) +// fence.{sc, acq_rel}.cluster +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence + +// multimem.ld_reduce, multimem.st, multimem.red +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red + +// griddepcontrol +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol + +// elect.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync + +// stmatrix +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix + +/* + * Special registers (cluster-related) + * + */ + +// 10.12. Special Registers: %clusterid +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clusterid + +// 10.13. Special Registers: %nclusterid +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nclusterid + +// 10.14. Special Registers: %cluster_ctaid +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctaid + +// 10.15. Special Registers: %cluster_nctaid +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctaid + +// 10.16. Special Registers: %cluster_ctarank +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctarank + +// 10.17. Special Registers: %cluster_nctarank +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctarank + +// 10.31. 
Special Registers: %aggr_smem_size +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-aggr-smem-size + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config index 79fe46c5a05..65db3322031 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config @@ -1493,6 +1493,8 @@ typedef __char32_t char32_t; #define _LIBCUDACXX_END_NAMESPACE_CUDA } } #define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE namespace cuda { namespace device { inline namespace _LIBCUDACXX_ABI_NAMESPACE { #define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE } } } +#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX namespace cuda { namespace ptx { inline namespace _LIBCUDACXX_ABI_NAMESPACE { +#define _LIBCUDACXX_END_NAMESPACE_CUDA_PTX } } } #define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL namespace cuda { namespace device { namespace experimental { inline namespace _LIBCUDACXX_ABI_NAMESPACE { #define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL } } } } #endif From 9b31cc8b801b75d6f8a1b82eae95f8a4f98d7f08 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Tue, 17 Oct 2023 16:53:27 +0200 Subject: [PATCH 02/49] Add docs --- libcudacxx/docs/extended_api.md | 2 + libcudacxx/docs/extended_api/ptx.md | 70 +++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 libcudacxx/docs/extended_api/ptx.md diff --git a/libcudacxx/docs/extended_api.md b/libcudacxx/docs/extended_api.md index 952b7c81e51..6f71683edc7 100644 --- a/libcudacxx/docs/extended_api.md +++ b/libcudacxx/docs/extended_api.md @@ -21,6 +21,8 @@ nav_order: 3 {% include_relative extended_api/functional.md %} +{% include_relative extended_api/ptx.md %} + [Thread Scopes]: ./extended_api/memory_model.md#thread-scopes [Thread Groups]: ./extended_api/thread_groups.md diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md new file mode 100644 index 00000000000..56a3c519f5c --- /dev/null +++ b/libcudacxx/docs/extended_api/ptx.md @@ -0,0 +1,70 @@ +## PTX instructions + +The `cuda::ptx` namespace contains functions that map one-to-one to PTX +instructions. These can be used for maximal control of the generated code, or to +experiment with new hardware features before a high-level C++ API is available. 
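+
+As a minimal sketch of the calling convention (a hypothetical kernel, assuming
+an SM 9.0 target; a real kernel must first initialize the mbarrier, and the
+mbarrier section below shows a complete example):
+
+```cuda
+#include <cuda/ptx>
+
+__global__ void sketch_kernel() {
+  __shared__ uint64_t bar;
+  // ... initialize bar (mbarrier.init) and synchronize before arriving ...
+
+  // The .sem, .scope, and .space variants of the instruction are selected
+  // by passing tag values as the leading function arguments:
+  cuda::ptx::mbarrier_arrive_expect_tx(
+    cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared,
+    &bar, 1);
+}
+```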
+
+### Shared memory barrier (mbarrier)
+
+| Instruction                             | Compute capability | CUDA Toolkit |
+|-----------------------------------------|--------------------|--------------|
+| `cuda::ptx::mbarrier_arrive_expect_tx`  | 9.0                | CTK 12.4     |
+
+
+#### [`cuda::ptx::mbarrier_arrive_expect_tx`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
+
+```cuda
+template <dot_scope _Sco>
+__device__ inline
+uint64_t mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_t __spc, uint64_t* __addr, uint32_t __tx_count);
+
+template <dot_scope _Sco>
+__device__ inline
+void mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_cluster_t __spc, uint64_t* __addr, uint32_t __tx_count);
+```
+
+Usage:
+
+```cuda
+#include <cuda/ptx>
+#include <cuda/barrier>
+#include <cooperative_groups.h>
+
+__global__ void kernel() {
+  using cuda::ptx::sem_release;
+  using cuda::ptx::space_shared_cluster;
+  using cuda::ptx::space_shared;
+  using cuda::ptx::scope_cluster;
+  using cuda::ptx::scope_cta;
+
+  using barrier_t = cuda::barrier<cuda::thread_scope_block>;
+  __shared__ barrier_t bar;
+  init(&bar, blockDim.x);
+  __syncthreads();
+
+
+
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // Arrive on local shared memory barrier:
+    uint64_t token;
+    uint64_t * addr = cuda::device::barrier_native_handle(bar);
+    token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, addr, 1);
+    token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, addr, 1);
+
+    // Get address of remote cluster barrier:
+    namespace cg = cooperative_groups;
+    cg::cluster_group cluster = cg::this_cluster();
+    unsigned int other_block_rank = cluster.block_rank() ^ 1;
+    uint64_t * remote_bar = cluster.map_shared_rank(addr, other_block_rank);
+
+    // Sync cluster to ensure remote barrier is initialized.
+    cluster.sync();
+
+    // Arrive on remote cluster barrier:
+    cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, remote_bar, 1);
+    cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, remote_bar, 1);
+  ));
+}
+```
+
+
+
From 229704aa0a0f3d2f71977d60f3044b98a1fd3bee Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Tue, 17 Oct 2023 16:55:11 +0200
Subject: [PATCH 03/49] Reformat docs

---
 libcudacxx/docs/extended_api/ptx.md | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md
index 56a3c519f5c..26d3236b9bd 100644
--- a/libcudacxx/docs/extended_api/ptx.md
+++ b/libcudacxx/docs/extended_api/ptx.md
@@ -16,11 +16,11 @@ experiment with new hardware features before a high-level C++ API is available.
 ```cuda
 template <dot_scope _Sco>
 __device__ inline
-uint64_t mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_t __spc, uint64_t* __addr, uint32_t __tx_count);
+uint64_t mbarrier_arrive_expect_tx(sem_release_t sem, scope_t<_Sco> scope, space_shared_t spc, uint64_t* addr, uint32_t tx_count);
 
 template <dot_scope _Sco>
 __device__ inline
-void mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_cluster_t __spc, uint64_t* __addr, uint32_t __tx_count);
+void mbarrier_arrive_expect_tx(sem_release_t sem, scope_t<_Sco> scope, space_shared_cluster_t spc, uint64_t* addr, uint32_t tx_count);
 ```
 
 Usage:
 
 ```cuda
@@ -42,8 +42,6 @@ __global__ void kernel() {
   init(&bar, blockDim.x);
   __syncthreads();
 
-
-
   NV_IF_TARGET(NV_PROVIDES_SM_90, (
From dad93de59af47bf8003cea1c00d3c228255e6de2 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Tue, 17 Oct 2023 18:26:29 +0200
Subject: [PATCH 04/49] Use PTX wrapper in internal code

---
 .../cuda/std/detail/libcxx/include/__cuda/barrier.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
index da6b09b3e3d..6b3919f29d4 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
@@ -24,6 +24,7 @@
 #endif
 
 #include "../cstdlib" // _LIBCUDACXX_UNREACHABLE
+#include <cuda/ptx> // cuda::ptx::*
 
 #if defined(_LIBCUDACXX_COMPILER_NVRTC)
 #define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type *)0)->member))
@@ -586,14 +587,12 @@ barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
     // us in release builds. In debug builds, the error would be caught
     // by the asserts at the top of this function.
 
-    auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
+    auto __native_handle = barrier_native_handle(__b);
+    auto __bh = __cvta_generic_to_shared(__native_handle);
     if (__arrive_count_update == 1) {
-        asm (
-            "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
-            : "=l"(__token)
-            : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
-              "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
-            : "memory");
+        __token = cuda::ptx::mbarrier_arrive_expect_tx(
+            cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, __native_handle, __transaction_count_update
+        );
     } else {
         asm (
             "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
From 220d4758eebb9504084fd80c9075fff89a24a570 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 18 Oct 2023 10:38:12 +0200
Subject: [PATCH 05/49] Apply suggestions from code review

Co-authored-by: Michael Schellenberger Costa
---
 libcudacxx/docs/extended_api/ptx.md | 2 +-
 libcudacxx/include/cuda/ptx         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md
index 26d3236b9bd..e8ab487d16a 100644
--- a/libcudacxx/docs/extended_api/ptx.md
+++ b/libcudacxx/docs/extended_api/ptx.md
@@ -1,7 +1,7 @@
 ## PTX instructions
 
 The `cuda::ptx` namespace contains functions that map one-to-one to PTX
-instructions. These can be used for maximal control of the generated code, or to
+[instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html). These can be used for maximal control of the generated code, or to
 experiment with new hardware features before a high-level C++ API is available.
### Shared memory barrier (mbarrier) diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index 31d22a40231..ff2c26bb05b 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -29,7 +29,7 @@ * - Sizes and types of parameters are correct. * - They must convert state spaces correctly. * - They adhere to the libcu++ coding standards of using: - * - Double underscores for all parameters, variables + * - Reserved identifiers for all parameters, variables. E.g. `__meow` or `_Woof` * - _CUDA_VSTD:: namespace for types * * The wrappers should not do the following: From ae1a0846fe6fe652fe9a7b47a57000d89267370d Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 18 Oct 2023 10:38:50 +0200 Subject: [PATCH 06/49] Address review comments --- libcudacxx/include/cuda/ptx | 59 +++++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 22 deletions(-) diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index ff2c26bb05b..c64b20fbfbf 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -43,12 +43,25 @@ * - outside developers who want to experiment with the latest features of the * hardware. * + * Stability: + * + * - These headers are intended to present a stable API (not ABI) within one + * major version of the CTK. This means that: + * - All functions are marked inline + * - The type of a function parameter can be changed to be more generic if + * that means that code that called the original version can still be + * compiled. + * + * - Good exposure of the PTX should be high priority. If, at a new major + * version, we face a difficult choice between breaking backward-compatibility + * and an improvement of the PTX exposure, we will tend to the latter option + * more easily than in other parts of libcu++. */ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX /* - * Integral constant types and values for + * Public integral constant types and values for * * - .sem * - .space @@ -76,13 +89,14 @@ enum class dot_sem { enum class dot_space { reg, sreg, - const_mem, // can't use const + const_mem, // Using const_mem as `const` is reserved in C++. global, local, param, - shared, - shared_cluster, + shared, // The PTX spelling is shared::cta + shared_cluster, // The PTX spelling is shared::cluster, but we might want to go for cluster here. tex // deprecated + // generic? 
 };
 
 enum class dot_scope {
   cta,
   cluster,
   gpu,
   sys
 };
 
 template <dot_sem _Sem>
-using sem_t = std::integral_constant<dot_sem, _Sem>;
+using sem_t = _CUDA_VSTD::integral_constant<dot_sem, _Sem>;
 using sem_acq_rel_t = sem_t<dot_sem::acq_rel>;
 using sem_acquire_t = sem_t<dot_sem::acquire>;
 using sem_relaxed_t = sem_t<dot_sem::relaxed>;
 using sem_release_t = sem_t<dot_sem::release>;
-using sem_sc_t = sem_t<dot_sem::sc>;
-using sem_weak_t = sem_t<dot_sem::weak>;
+using sem_sc_t = sem_t<dot_sem::sc>;
+using sem_weak_t = sem_t<dot_sem::weak>;
 
 static constexpr sem_acq_rel_t sem_acq_rel{};
 static constexpr sem_acquire_t sem_acquire{};
@@ -123,16 +126,16 @@ static constexpr sem_sc_t sem_sc{};
 static constexpr sem_weak_t sem_weak{};
 
 template <dot_space _Spc>
-using space_t = std::integral_constant<dot_space, _Spc>;
-using space_const_mem_t = std::integral_constant<dot_space, dot_space::const_mem>;
-using space_global_t = std::integral_constant<dot_space, dot_space::global>;
-using space_local_t = std::integral_constant<dot_space, dot_space::local>;
-using space_param_t = std::integral_constant<dot_space, dot_space::param>;
-using space_reg_t = std::integral_constant<dot_space, dot_space::reg>;
-using space_shared_t = std::integral_constant<dot_space, dot_space::shared>;
+using space_t = _CUDA_VSTD::integral_constant<dot_space, _Spc>;
+using space_const_mem_t = space_t<dot_space::const_mem>;
+using space_global_t = space_t<dot_space::global>;
+using space_local_t = space_t<dot_space::local>;
+using space_param_t = space_t<dot_space::param>;
+using space_reg_t = space_t<dot_space::reg>;
+using space_shared_t = space_t<dot_space::shared>;
 using space_shared_cluster_t = space_t<dot_space::shared_cluster>;
-using space_sreg_t = space_t<dot_space::sreg>;
-using space_tex_t = space_t<dot_space::tex>;
+using space_sreg_t = space_t<dot_space::sreg>;
+using space_tex_t = space_t<dot_space::tex>;
 
 static constexpr space_const_mem_t space_const_mem{};
 static constexpr space_global_t space_global{};
@@ -145,23 +148,30 @@ static constexpr space_sreg_t space_sreg{};
 static constexpr space_tex_t space_tex{};
 
 template <dot_scope _Sco>
-using scope_t = std::integral_constant<dot_scope, _Sco>;
-using scope_cluster_t = std::integral_constant<dot_scope, dot_scope::cluster>;
-using scope_cta_t = std::integral_constant<dot_scope, dot_scope::cta>;
-using scope_gpu_t = std::integral_constant<dot_scope, dot_scope::gpu>;
-using scope_sys_t = std::integral_constant<dot_scope, dot_scope::sys>;
+using scope_t = _CUDA_VSTD::integral_constant<dot_scope, _Sco>;
+using scope_cluster_t = scope_t<dot_scope::cluster>;
+using scope_cta_t = scope_t<dot_scope::cta>;
+using scope_gpu_t = scope_t<dot_scope::gpu>;
+using scope_sys_t = scope_t<dot_scope::sys>;
 
 static constexpr scope_cluster_t scope_cluster{};
 static constexpr scope_cta_t scope_cta{};
 static constexpr scope_gpu_t scope_gpu{};
 static constexpr scope_sys_t scope_sys{};
 
-
+// Private helper functions
 inline __device__ _CUDA_VSTD::uint32_t __as_smem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); }
 inline __device__ _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); }
 inline __device__ _CUDA_VSTD::uint64_t __as_gmem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr)); }
@@ -217,7 +232,7 @@ _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco
   static_assert(__scope == scope_cta || __scope == scope_cluster, "");
   _CUDA_VSTD::uint64_t __token;
 
-  if constexpr (__scope == scope_cta) {
+  if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) {
     asm (
       "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
       : "=l"(__token)
@@ -241,7 +256,7 @@ void mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space
 {
   // Arrive on remote cluster barrier
   static_assert(__scope == scope_cta || __scope == scope_cluster, "");
-  if constexpr (__scope == scope_cta) {
+  if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) {
     asm (
       "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;"
       :
From ecbb6fea762f49ed9793fff64abe1a0ffc39de08 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 18 Oct 2023 11:51:36 +0200
Subject: [PATCH 07/49] Apply suggestions from code review

Co-authored-by: Michael Schellenberger Costa
---
 libcudacxx/docs/extended_api/ptx.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md
index e8ab487d16a..d5092dc030f 100644
--- a/libcudacxx/docs/extended_api/ptx.md
+++ b/libcudacxx/docs/extended_api/ptx.md
@@ -1,7 +1,7 @@
 ## PTX instructions
 
-The `cuda::ptx` namespace contains functions that map one-to-one to PTX
-[instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html). These can be used for maximal control of the generated code, or to
+The `cuda::ptx` namespace contains functions that map one-to-one to
+[PTX instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html). These can be used for maximal control of the generated code, or to
 experiment with new hardware features before a high-level C++ API is available.
 
 ### Shared memory barrier (mbarrier)
From cf19e539c8ddb344d1c6dcf3aa9b24d0f44f0e00 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 18 Oct 2023 11:52:18 +0200
Subject: [PATCH 08/49] Address review comments

---
 .../test/cuda/ptx/mbarrier_arrive_tx.pass.cpp | 52 ------------
 .../test/cuda/ptx/sm90.ptx.compile.pass.cpp   | 44 ++++++++++
 libcudacxx/include/cuda/ptx                   |  7 ++-
 3 files changed, 47 insertions(+), 56 deletions(-)
 delete mode 100644 libcudacxx/.upstream-tests/test/cuda/ptx/mbarrier_arrive_tx.pass.cpp
 create mode 100644 libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp

diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/mbarrier_arrive_tx.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/mbarrier_arrive_tx.pass.cpp
deleted file mode 100644
index f72406bdeb2..00000000000
--- a/libcudacxx/.upstream-tests/test/cuda/ptx/mbarrier_arrive_tx.pass.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of libcu++, the C++ Standard Library for your entire system,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-// UNSUPPORTED: libcpp-has-no-threads
-// UNSUPPORTED: pre-sm-90
-
-// <cuda/ptx>
-
-#include <cuda/ptx>
-
-#include <cuda/std/cstdint>
-
-#include "concurrent_agents.h"
-#include "cuda_space_selector.h"
-#include "test_macros.h"
-
-int main(int, char**)
-{
-  NV_DISPATCH_TARGET(
-    NV_IS_HOST, (
-      // Required by concurrent_agents_launch to know how many we're
-      // launching. This can only be an int, because the nvrtc tests use grep
-      // to figure out how many threads to launch.
-      cuda_thread_count = 1;
-    ),
-    NV_IS_DEVICE, (
-      // Do not execute. Just check if this compiles (that is: assembles) without error.
-      if (false) {
-        using cuda::ptx::sem_release;
-        using cuda::ptx::space_shared_cluster;
-        using cuda::ptx::space_shared;
-        using cuda::ptx::scope_cluster;
-        using cuda::ptx::scope_cta;
-
-        __shared__ uint64_t bar;
-        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1);
-        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1);
-
-        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1);
-        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1);
-      }
-    )
-  );
-
-  return 0;
-}
diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp
new file mode 100644
index 00000000000..27b5af8e6f2
--- /dev/null
+++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp
@@ -0,0 +1,44 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+// UNSUPPORTED: pre-sm-90

+// <cuda/ptx>
+
+#include <cuda/ptx>
+
+#include <cuda/std/cstdint>
+
+#include "concurrent_agents.h"
+#include "cuda_space_selector.h"
+#include "test_macros.h"
+
+int main(int, char**)
+{
+  NV_IF_TARGET(NV_IS_DEVICE, (
+    // Do not execute. Just check that the PTX below compiles (that is: assembles) without error.
+    if (false) {
+      using cuda::ptx::sem_release;
+      using cuda::ptx::space_shared_cluster;
+      using cuda::ptx::space_shared;
+      using cuda::ptx::scope_cluster;
+      using cuda::ptx::scope_cta;
+
+      __shared__ uint64_t bar;
+      cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1);
+      cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1);
+
+      cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1);
+      cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1);
+    }
+  ));
+
+  return 0;
+}
diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
index c64b20fbfbf..c00bc8e9a6c 100644
--- a/libcudacxx/include/cuda/ptx
+++ b/libcudacxx/include/cuda/ptx
@@ -158,10 +158,9 @@ static constexpr scope_sys_t scope_sys{};
 
 // Private helper functions
-inline __device__ _CUDA_VSTD::uint32_t __as_smem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); }
-inline __device__ _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); }
-inline __device__ _CUDA_VSTD::uint64_t __as_gmem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr)); }
+inline _LIBCUDACXX_DEVICE CUDA_VSTD::uint32_t __as_smem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); }
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); }
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_gmem_ptr(const void * __ptr) { return
static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr)); } // SM 90 features From 1d57b022a4057df3cbf0bcf2bc115124762a38b4 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 25 Oct 2023 12:02:40 +0200 Subject: [PATCH 09/49] Fix typo --- libcudacxx/include/cuda/ptx | 203 +++++++++++++++++++----------------- 1 file changed, 106 insertions(+), 97 deletions(-) diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index c00bc8e9a6c..cf14026c01f 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -12,9 +12,9 @@ #ifndef _CUDA_PTX #define _CUDA_PTX -#include "std/detail/__config" // Macros -#include "std/type_traits" // std::integral_constant -#include "../nv/target" // __CUDA_MINIMUM_ARCH__ and friends +#include "std/cstdint" // uint32_t +#include "std/type_traits" // std::integral_constant +#include "../nv/target" // __CUDA_MINIMUM_ARCH__ and friends /* * The cuda::ptx namespace intends to provide PTX wrappers for new hardware @@ -75,45 +75,48 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX * */ -enum class dot_sem { - acq_rel, - acquire, - relaxed, - release, - sc, - weak - // mmio? - // volatile? +enum class dot_sem +{ + acq_rel, + acquire, + relaxed, + release, + sc, + weak + // mmio? + // volatile? }; -enum class dot_space { - reg, - sreg, - const_mem, // Using const_mem as `const` is reserved in C++. - global, - local, - param, - shared, // The PTX spelling is shared::cta - shared_cluster, // The PTX spelling is shared::cluster, but we might want to go for cluster here. - tex // deprecated - // generic? +enum class dot_space +{ + reg, + sreg, + const_mem, // Using const_mem as `const` is reserved in C++. + global, + local, + param, + shared, // The PTX spelling is shared::cta + shared_cluster, // The PTX spelling is shared::cluster, but we might want to go for cluster here. + tex // deprecated + // generic? 
 };
 
 template <dot_sem _Sem>
-using sem_t = _CUDA_VSTD::integral_constant<dot_sem, _Sem>;
+using sem_t = _CUDA_VSTD::integral_constant<dot_sem, _Sem>;
 using sem_acq_rel_t = sem_t<dot_sem::acq_rel>;
 using sem_acquire_t = sem_t<dot_sem::acquire>;
 using sem_relaxed_t = sem_t<dot_sem::relaxed>;
 using sem_release_t = sem_t<dot_sem::release>;
-using sem_sc_t = sem_t<dot_sem::sc>;
-using sem_weak_t = sem_t<dot_sem::weak>;
+using sem_sc_t = sem_t<dot_sem::sc>;
+using sem_weak_t = sem_t<dot_sem::weak>;
 
 static constexpr sem_acq_rel_t sem_acq_rel{};
 static constexpr sem_acquire_t sem_acquire{};
@@ -123,16 +126,16 @@ static constexpr sem_sc_t sem_sc{};
 static constexpr sem_weak_t sem_weak{};
 
 template <dot_space _Spc>
-using space_t = _CUDA_VSTD::integral_constant<dot_space, _Spc>;
-using space_const_mem_t = space_t<dot_space::const_mem>;
-using space_global_t = space_t<dot_space::global>;
-using space_local_t = space_t<dot_space::local>;
-using space_param_t = space_t<dot_space::param>;
-using space_reg_t = space_t<dot_space::reg>;
-using space_shared_t = space_t<dot_space::shared>;
+using space_t = _CUDA_VSTD::integral_constant<dot_space, _Spc>;
+using space_const_mem_t = space_t<dot_space::const_mem>;
+using space_global_t = space_t<dot_space::global>;
+using space_local_t = space_t<dot_space::local>;
+using space_param_t = space_t<dot_space::param>;
+using space_reg_t = space_t<dot_space::reg>;
+using space_shared_t = space_t<dot_space::shared>;
 using space_shared_cluster_t = space_t<dot_space::shared_cluster>;
-using space_sreg_t = space_t<dot_space::sreg>;
-using space_tex_t = space_t<dot_space::tex>;
+using space_sreg_t = space_t<dot_space::sreg>;
+using space_tex_t = space_t<dot_space::tex>;
 
 static constexpr space_const_mem_t space_const_mem{};
 static constexpr space_global_t space_global{};
@@ -145,23 +148,30 @@ static constexpr space_sreg_t space_sreg{};
 static constexpr space_tex_t space_tex{};
 
 template <dot_scope _Sco>
-using scope_t = _CUDA_VSTD::integral_constant<dot_scope, _Sco>;
+using scope_t = _CUDA_VSTD::integral_constant<dot_scope, _Sco>;
 using scope_cluster_t = scope_t<dot_scope::cluster>;
-using scope_cta_t = scope_t<dot_scope::cta>;
-using scope_gpu_t = scope_t<dot_scope::gpu>;
-using scope_sys_t = scope_t<dot_scope::sys>;
+using scope_cta_t = scope_t<dot_scope::cta>;
+using scope_gpu_t = scope_t<dot_scope::gpu>;
+using scope_sys_t = scope_t<dot_scope::sys>;
 
 static constexpr scope_cluster_t scope_cluster{};
 static constexpr scope_cta_t scope_cta{};
 static constexpr scope_gpu_t scope_gpu{};
 static constexpr scope_sys_t scope_sys{};
 
 // Private helper functions
-inline _LIBCUDACXX_DEVICE CUDA_VSTD::uint32_t __as_smem_ptr(const void* __ptr)
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_smem_ptr(const void* __ptr)
+{
+  return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
+}
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void* __ptr)
+{
+  return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
+}
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_gmem_ptr(const void* __ptr)
+{
+  return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr));
+}
 
 // SM 90 features
 // --------------
@@ -189,7 +199,6 @@
 // cp.async.bulk.wait_group
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
 
-
 // Lower priority:
 
 // prefetch{.tensormap_space}.tensormap [a]; // prefetch the tensormap
@@ -219,68 +228,68 @@
 // mbarrier.arrive.expect_tx
 // Support for count argument without the modifier .noComplete requires sm_90 or higher.
 // Qualifier .expect_tx requires sm_90 or higher.
 // Sub-qualifier ::cluster requires sm_90 or higher.
 // Support for .cluster scope requires sm_90 or higher.
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
 
-
-
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
 #if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
 template <dot_scope _Sco>
-_LIBCUDACXX_DEVICE inline
-_CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_t __spc, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count)
+_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(
+  sem_release_t __sem,
+  scope_t<_Sco> __scope,
+  space_shared_t __spc,
+  _CUDA_VSTD::uint64_t* __addr,
+  _CUDA_VSTD::uint32_t __tx_count)
 {
-  // Arrive on local shared memory barrier
-  static_assert(__scope == scope_cta || __scope == scope_cluster, "");
-  _CUDA_VSTD::uint64_t __token;
-
-  if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) {
-    asm (
-      "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
-      : "=l"(__token)
-      : "r"(__as_smem_ptr(__addr)),
-      "r"(__tx_count)
-      : "memory");
-  } else {
-    asm (
-      "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;"
-      : "=l"(__token)
-      : "r"(__as_smem_ptr(__addr)),
-      "r"(__tx_count)
-      : "memory");
-  }
-  return __token;
+  // Arrive on local shared memory barrier
+  static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+  _CUDA_VSTD::uint64_t __token;
+
+  if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta)
+  {
+    asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
+        : "=l"(__token)
+        : "r"(__as_smem_ptr(__addr)), "r"(__tx_count)
+        : "memory");
+  }
+  else
+  {
+    asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;"
+        : "=l"(__token)
+        : "r"(__as_smem_ptr(__addr)), "r"(__tx_count)
+        : "memory");
+  }
+  return __token;
 }
 
 template <dot_scope _Sco>
-_LIBCUDACXX_DEVICE inline
-void mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_cluster_t __spc, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count)
+_LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx(
+  sem_release_t __sem,
+  scope_t<_Sco> __scope,
+  space_shared_cluster_t __spc,
+  _CUDA_VSTD::uint64_t* __addr,
+  _CUDA_VSTD::uint32_t __tx_count)
 {
-  // Arrive on remote cluster barrier
-  static_assert(__scope == scope_cta || __scope == scope_cluster, "");
-  if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) {
-    asm (
-      "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;"
-      :
-      : "r"(__as_smem_ptr(__addr)),
-      "r"(__tx_count)
-      : "memory");
-  } else {
-    asm (
-      "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;"
-      :
-      : "r"(__as_smem_ptr(__addr)),
-      "r"(__tx_count)
-      : "memory");
-  }
+  // Arrive on remote cluster barrier
+  static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+  if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta)
+  {
+    asm("mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;"
+        :
+        : "r"(__as_smem_ptr(__addr)), "r"(__tx_count)
+        : "memory");
+  }
+  else
+  {
+    asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;"
+        :
+        : "r"(__as_smem_ptr(__addr)), "r"(__tx_count)
+        : "memory");
+  }
 }
 #endif // __CUDA_MINIMUM_ARCH__
 
-
-
-
 // mbarrier.test_wait/mbarrier.try_wait
 // mbarrier.try_wait requires sm_90 or higher.
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait
 
 /*
  * Cluster Basics:
  *
From 21050e82e2a89dea7a034ab1f61781b9ac094be3 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 18 Oct 2023 17:08:29 +0200
Subject: [PATCH 10/49] Add targeting macros and a few more helper functions

---
 libcudacxx/include/cuda/ptx | 51 +++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
index cf14026c01f..c499fd57aed 100644
--- a/libcudacxx/include/cuda/ptx
+++ b/libcudacxx/include/cuda/ptx
@@ -58,6 +58,45 @@
  * more easily than in other parts of libcu++.
  */
 
+
+/*
+ * Targeting macros
+ *
+ */
+
+#if (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
+#  define _LIBCUDACXX_PTX_SM_80_AVAILABLE
+#endif
+
+#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
+#  define _LIBCUDACXX_PTX_SM_90_AVAILABLE
+#endif
+
+// PTX ISA 7.8 is available from CTK 11.8, driver r520
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__))
+#  define _LIBCUDACXX_PTX_ISA_78_AVAILABLE
+#endif
+
+// PTX ISA 7.8 is available from CTK 11.8, driver r520 (so also from CTK 12.0 onwards)
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
+#  define _LIBCUDACXX_PTX_ISA_78_AVAILABLE
+#endif
+
+// PTX ISA 8.0 is available from CTK 12.0, driver r525
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
+#  define _LIBCUDACXX_PTX_ISA_80_AVAILABLE
+#endif
+
+// PTX ISA 8.1 is available from CTK 12.1, driver r530
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__))
+#  define _LIBCUDACXX_PTX_ISA_81_AVAILABLE
+#endif
+
+// PTX ISA 8.2 is available from CTK 12.2, driver r535
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__))
+#  define _LIBCUDACXX_PTX_ISA_82_AVAILABLE
+#endif
+
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
@@ -173,6 +212,18 @@ inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_gmem_ptr(const void* __ptr)
   return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr));
 }
 
+template <typename _Tp>
+inline _LIBCUDACXX_DEVICE int __as_b32(_Tp __val) {
+  static_assert(sizeof(_Tp) == 4, "");
+  return *reinterpret_cast<int*>(&__val);
+}
+
+template <typename _Tp>
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val) {
+  static_assert(sizeof(_Tp) == 8, "");
+  return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val);
+}
+
 // SM 90 features
 // --------------
From 986d990073c06def2174d46bc396bd62ebdbc18e Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 12:32:22 +0200
Subject: [PATCH 11/49] Add PTX ISA 8.3 macro

---
 libcudacxx/include/cuda/ptx | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
index c499fd57aed..c5543c4bf59 100644
--- a/libcudacxx/include/cuda/ptx
+++ b/libcudacxx/include/cuda/ptx
@@ -97,6 +97,11 @@
 #  define _LIBCUDACXX_PTX_ISA_82_AVAILABLE
 #endif
 
+// PTX ISA 8.3 is available from CTK 12.3, driver r545
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 &&
__CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +# define _LIBCUDACXX_PTX_ISA_83_AVAILABLE +#endif + _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX /* From 82d1b859d0d99df919c54e45320f11f35808f891 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 25 Oct 2023 13:05:43 +0200 Subject: [PATCH 12/49] Improve code organization --- libcudacxx/include/cuda/ptx | 845 ++++++++++++------ ..._and_communication_instructions_mbarrier.h | 105 +++ .../include/__cuda/ptx/ptx_dot_variants.h | 136 +++ .../include/__cuda/ptx/ptx_helper_functions.h | 43 + .../__cuda/ptx/ptx_isa_target_macros.h | 63 ++ 5 files changed, 909 insertions(+), 283 deletions(-) create mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h create mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h create mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h create mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index c5543c4bf59..ea319195134 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -16,6 +16,11 @@ #include "std/type_traits" // std::integral_constant #include "../nv/target" // __CUDA_MINIMUM_ARCH__ and friends +#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h" +#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h" +#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h" +#include "cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h" + /* * The cuda::ptx namespace intends to provide PTX wrappers for new hardware * features and new PTX instructions so that they can be experimented with @@ -58,373 +63,647 @@ * more easily than in other parts of libcu++. */ +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +/* + * Instructions + * + * The organization of the instructions below follows that of the PTX ISA documentation: + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#instructions + * + * To improve code organization, some sections are separated into their own + * header. For instance, the mbarrier instructions are found in: + * __cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h + * + */ /* - * Targeting macros + * 9.7.1. Integer Arithmetic Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions * */ -#if (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) -# define _LIBCUDACXX_PTX_SM_80_AVAILABLE -#endif +// 9.7.1.7. Integer Arithmetic Instructions: sad +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sad -#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) -# define _LIBCUDACXX_PTX_SM_90_AVAILABLE -#endif +// 9.7.1.8. Integer Arithmetic Instructions: div +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-div -// PTX ISA 7.8 is available from CTK 11.8, driver r520 -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__)) -# define _LIBCUDACXX_PTX_ISA_78_AVAILABLE -#endif +// 9.7.1.9. 
Integer Arithmetic Instructions: rem +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-rem -// PTX ISA 7.8 is available from CTK 11.8, driver r520 (so also from CTK 12.0 onwards) -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) -# define _LIBCUDACXX_PTX_ISA_78_AVAILABLE -#endif +// 9.7.1.10. Integer Arithmetic Instructions: abs +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-abs -// PTX ISA 8.0 is available from CTK 12.0, driver r525 -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) -# define _LIBCUDACXX_PTX_ISA_80_AVAILABLE -#endif +// 9.7.1.11. Integer Arithmetic Instructions: neg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-neg -// PTX ISA 8.1 is available from CTK 12.1, driver r530 -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) -# define _LIBCUDACXX_PTX_ISA_81_AVAILABLE -#endif +// 9.7.1.12. Integer Arithmetic Instructions: min +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-min -// PTX ISA 8.2 is available from CTK 12.2, driver r535 -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) -# define _LIBCUDACXX_PTX_ISA_82_AVAILABLE -#endif +// 9.7.1.13. Integer Arithmetic Instructions: max +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-max -// PTX ISA 8.3 is available from CTK 12.3, driver r545 -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) -# define _LIBCUDACXX_PTX_ISA_83_AVAILABLE -#endif +// 9.7.1.14. Integer Arithmetic Instructions: popc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc + +// 9.7.1.15. Integer Arithmetic Instructions: clz +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz + +// 9.7.1.16. Integer Arithmetic Instructions: bfind +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind + +// 9.7.1.17. Integer Arithmetic Instructions: fns +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns + +// 9.7.1.18. Integer Arithmetic Instructions: brev +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev + +// 9.7.1.19. Integer Arithmetic Instructions: bfe +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe + +// 9.7.1.20. Integer Arithmetic Instructions: bfi +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi + +// 9.7.1.21. Integer Arithmetic Instructions: szext +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-szext + +// 9.7.1.22. Integer Arithmetic Instructions: bmsk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bmsk + +// 9.7.1.23. 
Integer Arithmetic Instructions: dp4a +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp4a + +// 9.7.1.24. Integer Arithmetic Instructions: dp2a +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp2a -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX /* - * Public integral constant types and values for + * 9.7.2. Extended-Precision Integer Arithmetic Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-integer-arithmetic-instructions * - * - .sem - * - .space - * - .scope + */ + +// 9.7.2.1. Extended-Precision Arithmetic Instructions: add.cc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-add-cc + +// 9.7.2.2. Extended-Precision Arithmetic Instructions: addc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-addc + +// 9.7.2.3. Extended-Precision Arithmetic Instructions: sub.cc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-sub-cc + +// 9.7.2.4. Extended-Precision Arithmetic Instructions: subc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-subc + +// 9.7.2.5. Extended-Precision Arithmetic Instructions: mad.cc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-mad-cc + +// 9.7.2.6. Extended-Precision Arithmetic Instructions: madc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc + + +/* + * 9.7.3. Floating-Point Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions * - * Skipping some steps in my reasoning: If we want to keep the PTX bindings - * relatively stable, and also be able to adapt to additions of semantics, - * space, and scope variants of a PTX instruction, then we must be able to add - * new overloads of an instruction with .sem, .space, or .scope as type-level - * parameters. + */ + +// 9.7.3.1. Floating Point Instructions: testp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-testp + +// 9.7.3.2. Floating Point Instructions: copysign +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign + +// 9.7.3.3. Floating Point Instructions: add +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add + +// 9.7.3.4. Floating Point Instructions: sub +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sub + +// 9.7.3.5. Floating Point Instructions: mul +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul + +// 9.7.3.6. Floating Point Instructions: fma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma + +// 9.7.3.7. Floating Point Instructions: mad +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad + +// 9.7.3.8. Floating Point Instructions: div +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div + +// 9.7.3.9. 
Floating Point Instructions: abs +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-abs + +// 9.7.3.10. Floating Point Instructions: neg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg + +// 9.7.3.11. Floating Point Instructions: min +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min + +// 9.7.3.12. Floating Point Instructions: max +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max + +// 9.7.3.13. Floating Point Instructions: rcp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp + +// 9.7.3.14. Floating Point Instructions: rcp.approx.ftz.f64 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp-approx-ftz-f64 + +// 9.7.3.15. Floating Point Instructions: sqrt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sqrt + +// 9.7.3.16. Floating Point Instructions: rsqrt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt + +// 9.7.3.17. Floating Point Instructions: rsqrt.approx.ftz.f64 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt-approx-ftz-f64 + +// 9.7.3.18. Floating Point Instructions: sin +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sin + +// 9.7.3.19. Floating Point Instructions: cos +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos + +// 9.7.3.20. Floating Point Instructions: lg2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2 + +// 9.7.3.21. Floating Point Instructions: ex2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2 + +// 9.7.3.22. Floating Point Instructions: tanh +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-tanh + + +/* + * 9.7.4. Half Precision Floating-Point Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions * */ -enum class dot_sem -{ - acq_rel, - acquire, - relaxed, - release, - sc, - weak - // mmio? - // volatile? -}; - -enum class dot_space -{ - reg, - sreg, - const_mem, // Using const_mem as `const` is reserved in C++. - global, - local, - param, - shared, // The PTX spelling is shared::cta - shared_cluster, // The PTX spelling is shared::cluster, but we might want to go for cluster here. - tex // deprecated - // generic? 
-}; - -enum class dot_scope -{ - cta, - cluster, - gpu, - sys -}; - -template -using sem_t = _CUDA_VSTD::integral_constant; -using sem_acq_rel_t = sem_t; -using sem_acquire_t = sem_t; -using sem_relaxed_t = sem_t; -using sem_release_t = sem_t; -using sem_sc_t = sem_t; -using sem_weak_t = sem_t; - -static constexpr sem_acq_rel_t sem_acq_rel{}; -static constexpr sem_acquire_t sem_acquire{}; -static constexpr sem_relaxed_t sem_relaxed{}; -static constexpr sem_release_t sem_release{}; -static constexpr sem_sc_t sem_sc{}; -static constexpr sem_weak_t sem_weak{}; - -template -using space_t = _CUDA_VSTD::integral_constant; -using space_const_mem_t = space_t; -using space_global_t = space_t; -using space_local_t = space_t; -using space_param_t = space_t; -using space_reg_t = space_t; -using space_shared_t = space_t; -using space_shared_cluster_t = space_t; -using space_sreg_t = space_t; -using space_tex_t = space_t; - -static constexpr space_const_mem_t space_const_mem{}; -static constexpr space_global_t space_global{}; -static constexpr space_local_t space_local{}; -static constexpr space_param_t space_param{}; -static constexpr space_reg_t space_reg{}; -static constexpr space_shared_t space_shared{}; -static constexpr space_shared_cluster_t space_shared_cluster{}; -static constexpr space_sreg_t space_sreg{}; -static constexpr space_tex_t space_tex{}; - -template -using scope_t = _CUDA_VSTD::integral_constant; -using scope_cluster_t = scope_t; -using scope_cta_t = scope_t; -using scope_gpu_t = scope_t; -using scope_sys_t = scope_t; - -static constexpr scope_cluster_t scope_cluster{}; -static constexpr scope_cta_t scope_cta{}; -static constexpr scope_gpu_t scope_gpu{}; -static constexpr scope_sys_t scope_sys{}; - -// Private helper functions -inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_smem_ptr(const void* __ptr) -{ - return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); -} -inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void* __ptr) -{ - return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); -} -inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_gmem_ptr(const void* __ptr) -{ - return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr)); -} - -template -inline _LIBCUDACXX_DEVICE int __as_b32(_Tp __val) { - static_assert(sizeof(_Tp) == 4, ""); - return *reinterpret_cast(&__val); -} - -template -inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val) { - static_assert(sizeof(_Tp) == 8, ""); - return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val); -} - -// SM 90 features -// -------------- +// 9.7.4.1. Half Precision Floating Point Instructions: add +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add + +// 9.7.4.2. Half Precision Floating Point Instructions: sub +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-sub + +// 9.7.4.3. Half Precision Floating Point Instructions: mul +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-mul + +// 9.7.4.4. Half Precision Floating Point Instructions: fma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-fma + +// 9.7.4.5. Half Precision Floating Point Instructions: neg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-neg + +// 9.7.4.6. 
Half Precision Floating Point Instructions: abs +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs + +// 9.7.4.7. Half Precision Floating Point Instructions: min +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-min + +// 9.7.4.8. Half Precision Floating Point Instructions: max +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-max + +// 9.7.4.9. Half Precision Floating Point Instructions: tanh +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-tanh + +// 9.7.4.10. Half Precision Floating Point Instructions: ex2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-ex2 + + +/* + * 9.7.5. Comparison and Selection Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions + * + */ + +// 9.7.5.1. Comparison and Selection Instructions: set +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-set + +// 9.7.5.2. Comparison and Selection Instructions: setp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-setp + +// 9.7.5.3. Comparison and Selection Instructions: selp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp + +// 9.7.5.4. Comparison and Selection Instructions: slct +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-slct + + +/* + * 9.7.6. Half Precision Comparison Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions + * + */ + +// 9.7.6.1. Half Precision Comparison Instructions: set +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-set + +// 9.7.6.2. Half Precision Comparison Instructions: setp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-setp + + +/* + * 9.7.7. Logic and Shift Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions + * + */ + +// 9.7.7.1. Logic and Shift Instructions: and +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and + +// 9.7.7.2. Logic and Shift Instructions: or +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or + +// 9.7.7.3. Logic and Shift Instructions: xor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-xor + +// 9.7.7.4. Logic and Shift Instructions: not +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not + +// 9.7.7.5. Logic and Shift Instructions: cnot +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-cnot + +// 9.7.7.6. Logic and Shift Instructions: lop3 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3 + +// 9.7.7.7. Logic and Shift Instructions: shf +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shf + +// 9.7.7.8. 
Logic and Shift Instructions: shl +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shl + +// 9.7.7.9. Logic and Shift Instructions: shr +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shr + /* - * TMA / cp.async.bulk + * 9.7.8. Data Movement and Conversion Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions * */ -// cp.async.bulk +// 9.7.8.3. Data Movement and Conversion Instructions: mov +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov + +// 9.7.8.4. Data Movement and Conversion Instructions: mov +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2 + +// 9.7.8.5. Data Movement and Conversion Instructions: shfl (deprecated) +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated + +// 9.7.8.6. Data Movement and Conversion Instructions: shfl.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync + +// 9.7.8.7. Data Movement and Conversion Instructions: prmt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt + +// 9.7.8.8. Data Movement and Conversion Instructions: ld +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld + +// 9.7.8.9. Data Movement and Conversion Instructions: ld.global.nc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc + +// 9.7.8.10. Data Movement and Conversion Instructions: ldu +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu + +// 9.7.8.11. Data Movement and Conversion Instructions: st +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st + +// 9.7.8.12. Data Movement and Conversion Instructions: st.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async + +// 9.7.8.13. Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red + +// 9.7.8.14. Data Movement and Conversion Instructions: prefetch, prefetchu +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu + +// 9.7.8.15. Data Movement and Conversion Instructions: applypriority +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority + +// 9.7.8.16. Data Movement and Conversion Instructions: discard +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-discard + +// 9.7.8.17. Data Movement and Conversion Instructions: createpolicy +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-createpolicy + +// 9.7.8.18. 
Data Movement and Conversion Instructions: isspacep +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep + +// 9.7.8.19. Data Movement and Conversion Instructions: cvta +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta + +// 9.7.8.20. Data Movement and Conversion Instructions: cvt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt + +// 9.7.8.21. Data Movement and Conversion Instructions: cvt.pack +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack + +// 9.7.8.22. Data Movement and Conversion Instructions: mapa +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa + +// 9.7.8.23. Data Movement and Conversion Instructions: getctarank +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank + + +/* + * 9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy + * + */ + +// 9.7.8.24.3. Data Movement and Conversion Instructions: cp.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async + +// 9.7.8.24.4. Data Movement and Conversion Instructions: cp.async.commit_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group + +// 9.7.8.24.5. Data Movement and Conversion Instructions: cp.async.wait_group / cp.async.wait_all +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all + +// 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk -// cp.reduce.async.bulk +// 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk -// cp.async.bulk.tensor +// 9.7.8.24.8. Data Movement and Conversion Instructions: cp.async.bulk.prefetch +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch + +// 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -// cp.reduce.async.bulk.tensor +// 9.7.8.24.10. Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor -// cp.async.bulk.commit_group +// 9.7.8.24.11. Data Movement and Conversion Instructions: cp.async.bulk.prefetch.tensor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor + +// 9.7.8.24.12. 
Data Movement and Conversion Instructions: cp.async.bulk.commit_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group -// cp.async.bulk.wait_group +// 9.7.8.24.13. Data Movement and Conversion Instructions: cp.async.bulk.wait_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group -// Lower priority: +// 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace -// prefetch{.tensormap_space}.tensormap [a]; // prefetch the tensormap -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu -// cp.async.bulk.prefetch -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk +/* + * 9.7.9. Texture Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions + * + */ + +// 9.7.9.3. Texture Instructions: tex +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex + +// 9.7.9.4. Texture Instructions: tld4 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tld4 + +// 9.7.9.5. Texture Instructions: txq +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-txq + +// 9.7.9.6. Texture Instructions: istypep +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-istypep -// cp.async.bulk.prefetch.tensor -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor /* - * Shared memory barrier + * 9.7.10. Surface Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions * */ -// mbarrier.expect_tx -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx - -// mbarrier.complete_tx -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx - -// mbarrier.arrive.expect_tx -// Support for count argument without the modifier .noComplete requires sm_90 or higher. -// Qualifier .expect_tx requires sm_90 or higher. -// Sub-qualifier ::cluster requires sm_90 or higher. -// Support for .cluster scope requires sm_90 or higher. 
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive - -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive -#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) -template -_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( - sem_release_t __sem, - scope_t<_Sco> __scope, - space_shared_t __spc, - _CUDA_VSTD::uint64_t* __addr, - _CUDA_VSTD::uint32_t __tx_count) -{ - // Arrive on local shared memory barrier - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - _CUDA_VSTD::uint64_t __token; - - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) - { - asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(__as_smem_ptr(__addr)), "r"(__tx_count) - : "memory"); - } - else - { - asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(__as_smem_ptr(__addr)), "r"(__tx_count) - : "memory"); - } - return __token; -} - -template -_LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( - sem_release_t __sem, - scope_t<_Sco> __scope, - space_shared_cluster_t __spc, - _CUDA_VSTD::uint64_t* __addr, - _CUDA_VSTD::uint32_t __tx_count) -{ - // Arrive on remote cluster barrier - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) - { - asm("mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;" - : - : "r"(__as_smem_ptr(__addr)), "r"(__tx_count) - : "memory"); - } - else - { - asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;" - : - : "r"(__as_smem_ptr(__addr)), "r"(__tx_count) - : "memory"); - } -} -#endif // __CUDA_MINIMUM_ARCH__ - -// mbarrier.test_wait/mbarrier.try_wait -// mbarrier.try_wait requires sm_90 or higher. -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait +// 9.7.10.1. Surface Instructions: suld +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suld + +// 9.7.10.2. Surface Instructions: sust +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust + +// 9.7.10.3. Surface Instructions: sured +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sured + +// 9.7.10.4. Surface Instructions: suq +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suq + /* - * Cluster Basics: + * 9.7.11. Control Flow Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions * - * These instructions are already exposed at a higher level, so may not be necessary. */ -// mapa{.space}.type d, a, b; -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa +// 9.7.11.1. Control Flow Instructions: {} +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-curly-braces -// getctarank{.space}.type d, a; -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank +// 9.7.11.2. 
Control Flow Instructions: @ +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-at + +// 9.7.11.3. Control Flow Instructions: bra +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra + +// 9.7.11.4. Control Flow Instructions: brx.idx +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-brx-idx -// barrier.cluster +// 9.7.11.5. Control Flow Instructions: call +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call + +// 9.7.11.6. Control Flow Instructions: ret +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret + +// 9.7.11.7. Control Flow Instructions: exit +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit + + +/* + * 9.7.12. Parallel Synchronization and Communication Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions + * + */ + +// 9.7.12.1. Parallel Synchronization and Communication Instructions: bar, barrier +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier + +// 9.7.12.2. Parallel Synchronization and Communication Instructions: bar.warp.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync + +// 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster -// atom .cluster +// 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence + +// 9.7.12.5. Parallel Synchronization and Communication Instructions: atom // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom -// red .cluster +// 9.7.12.6. Parallel Synchronization and Communication Instructions: red // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red +// 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async + +// 9.7.12.8. Parallel Synchronization and Communication Instructions: vote (deprecated) +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated + +// 9.7.12.9. Parallel Synchronization and Communication Instructions: vote.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync + +// 9.7.12.10. Parallel Synchronization and Communication Instructions: match.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync + +// 9.7.12.11. 
Parallel Synchronization and Communication Instructions: activemask +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask + +// 9.7.12.12. Parallel Synchronization and Communication Instructions: redux.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync + +// 9.7.12.13. Parallel Synchronization and Communication Instructions: griddepcontrol +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol + +// 9.7.12.14. Parallel Synchronization and Communication Instructions: elect.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync + /* - * Cluster async + * 9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier * + * Contained in: __cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h */ -// st.async -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async +// 9.7.12.15.18. Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy -// red.async -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async /* + * 9.7.13. Warp Level Matrix Multiply-Accumulate Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-accumulate-instructions * - * Other instructions */ -// fence.proxy.async.{global, shared::{cta, cluster}} -// fence.mbarrier_init.release.cluster (may be a bit overkill??) -// fence.{sc, acq_rel}.cluster -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence +// 9.7.13.3.3. Warp-level Matrix Load Instruction: wmma.load +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-wmma-load -// multimem.ld_reduce, multimem.st, multimem.red -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red +// 9.7.13.3.4. Warp-level Matrix Store Instruction: wmma.store +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-wmma-store -// griddepcontrol -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol +// 9.7.13.3.5. Warp-level Matrix Multiply-and-Accumulate Instruction: wmma.mma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-and-accumulate-instruction-wmma-mma -// elect.sync -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync +// 9.7.13.4.14. Multiply-and-Accumulate Instruction: mma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma -// stmatrix +// 9.7.13.4.15. 
Warp-level matrix load instruction: ldmatrix +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-ldmatrix + +// 9.7.13.4.16. Warp-level matrix store instruction: stmatrix // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix +// 9.7.13.4.17. Warp-level matrix transpose instruction: movmatrix +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-transpose-instruction-movmatrix + +// 9.7.13.5.3. Multiply-and-Accumulate Instruction: mma.sp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma-sp + + +/* + * 9.7.14. Asynchronous Warpgroup Level Matrix Multiply-Accumulate Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-multiply-accumulate-instructions + * + */ + +// 9.7.14.5.2. Asynchronous Multiply-and-Accumulate Instruction: wgmma.mma_async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async + +// 9.7.14.6.4. Asynchronous Multiply-and-Accumulate Instruction: wgmma.mma_async.sp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async-sp + +// 9.7.14.7.1. Asynchronous Multiply-and-Accumulate Instruction: wgmma.fence +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-fence + +// 9.7.14.7.2. Asynchronous Multiply-and-Accumulate Instruction: wgmma.commit_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-commit-group + +// 9.7.14.7.3. Asynchronous Multiply-and-Accumulate Instruction: wgmma.wait_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-wait-group + + +/* + * 9.7.15. Stack Manipulation Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions + * + */ + +// 9.7.15.1. Stack Manipulation Instructions: stacksave +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stacksave + +// 9.7.15.2. Stack Manipulation Instructions: stackrestore +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stackrestore + +// 9.7.15.3. Stack Manipulation Instructions: alloca +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca + + /* - * Special registers (cluster-related) + * 9.7.16. Video Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#video-instructions * */ -// 10.12. Special Registers: %clusterid -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clusterid +// 9.7.16.1.1. Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax + +// 9.7.16.1.2. Scalar Video Instructions: vshl, vshr +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr + +// 9.7.16.1.3. 
Scalar Video Instructions: vmad +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vmad + +// 9.7.16.1.4. Scalar Video Instructions: vset +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vset -// 10.13. Special Registers: %nclusterid -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nclusterid -// 10.14. Special Registers: %cluster_ctaid -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctaid +/* + * 9.7.16.2. SIMD Video Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions + * + */ + +// 9.7.16.2.1. SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2 + +// 9.7.16.2.2. SIMD Video Instructions: vset2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset2 + +// 9.7.16.2.3. SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4 + +// 9.7.16.2.4. SIMD Video Instructions: vset4 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset4 + + +/* + * 9.7.17. Miscellaneous Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions + * + */ + +// 9.7.17.1. Miscellaneous Instructions: brkpt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-brkpt -// 10.15. Special Registers: %cluster_nctaid -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctaid +// 9.7.17.2. Miscellaneous Instructions: nanosleep +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep -// 10.16. Special Registers: %cluster_ctarank -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctarank +// 9.7.17.3. Miscellaneous Instructions: pmevent +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent -// 10.17. Special Registers: %cluster_nctarank -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctarank +// 9.7.17.4. Miscellaneous Instructions: trap +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap -// 10.31. Special Registers: %aggr_smem_size -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-aggr-smem-size +// 9.7.17.5. 
Miscellaneous Instructions: setmaxnreg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h new file mode 100644 index 00000000000..10da1675226 --- /dev/null +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -0,0 +1,105 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + + +#ifndef PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ +#define PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ + +/* + * 9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier + * + */ + +// 9.7.12.15.9. Parallel Synchronization and Communication Instructions: mbarrier.init +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init + +// 9.7.12.15.10. Parallel Synchronization and Communication Instructions: mbarrier.inval +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval + +// 9.7.12.15.11. Parallel Synchronization and Communication Instructions: mbarrier.expect_tx +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx + +// 9.7.12.15.12. Parallel Synchronization and Communication Instructions: mbarrier.complete_tx +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx + +// 9.7.12.15.13. 
Parallel Synchronization and Communication Instructions: mbarrier.arrive
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
+
+#if defined(_LIBCUDACXX_PTX_ISA_78_AVAILABLE) && defined(_LIBCUDACXX_PTX_SM_90_AVAILABLE)
+template <dot_scope _Sco>
+_LIBCUDACXX_DEVICE inline
+_CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_t __spc, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count)
+{
+    // Arrive on local shared memory barrier
+    static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+    _CUDA_VSTD::uint64_t __token;
+
+    if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) {
+        asm (
+            "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
+            : "=l"(__token)
+            : "r"(__as_smem_ptr(__addr)),
+              "r"(__tx_count)
+            : "memory");
+    } else {
+        asm (
+            "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;"
+            : "=l"(__token)
+            : "r"(__as_smem_ptr(__addr)),
+              "r"(__tx_count)
+            : "memory");
+    }
+    return __token;
+}
+#ifdef // _LIBCUDACXX_PTX_ISA_78_AVAILABLE
+
+#if defined(_LIBCUDACXX_PTX_ISA_78_AVAILABLE) && defined(_LIBCUDACXX_PTX_SM_90_AVAILABLE)
+template <dot_scope _Sco>
+_LIBCUDACXX_DEVICE inline
+void mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_cluster_t __spc, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count)
+{
+    // Arrive on remote cluster barrier
+    static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+    if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) {
+        asm (
+            "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;"
+            :
+            : "r"(__as_smem_ptr(__addr)),
+              "r"(__tx_count)
+            : "memory");
+    } else {
+        asm (
+            "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;"
+            :
+            : "r"(__as_smem_ptr(__addr)),
+              "r"(__tx_count)
+            : "memory");
+    }
+}
+#ifdef // _LIBCUDACXX_PTX_ISA_78_AVAILABLE
+
+
+// 9.7.12.15.14. Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop
+
+// 9.7.12.15.15. Parallel Synchronization and Communication Instructions: cp.async.mbarrier.arrive
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive
+
+// 9.7.12.15.16. Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait
+
+// 9.7.12.15.17. Parallel Synchronization and Communication Instructions: mbarrier.pending_count
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-pending-count
+
+
+
+#endif // PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
new file mode 100644
index 00000000000..08a972492e7
--- /dev/null
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
@@ -0,0 +1,136 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef PTX_DOT_VARIANTS_H_
+#define PTX_DOT_VARIANTS_H_
+
+/*
+ * Public integral constant types and values for ".variant"s:
+ *
+ * - .sem
+ * - .space
+ * - .scope
+ *
+ * For each .variant, the code below defines:
+ * - An enum `dot_variant` with each possible value
+ * - A type template `variant_t<dot_variant>`
+ * - Types `variant_A_t`, ..., `variant_Z_t`
+ * - Constexpr values `variant_A` of type `variant_A_t`
+ *
+ * These types enable specifying fine-grained overloads of a PTX binding. If a
+ * binding can handle multiple variants, then it is defined as:
+ *
+ *   template <dot_variant __var>
+ *   [...] void ptx_binding(variant_t<__var> __v) { ... }
+ *
+ * If it only handles a single variant, then it is defined as:
+ *
+ *   [...] void ptx_binding(variant_A_t __v) { ... }
+ *
+ * If two variants have different behaviors or return types (see .space
+ * overloads of mbarrier.arrive.expect_tx for an example), then these can be
+ * provided as separate overloads of the same function:
+ *
+ *   [...] void ptx_binding(variant_A_t __v) { ... }
+ *   [...] int ptx_binding(variant_B_t __v) { ... }
+ *
+ */
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#operation-types
+enum class dot_sem {
+    acq_rel,
+    acquire,
+    relaxed,
+    release,
+    sc,
+    weak
+    // mmio?
+    // volatile?
+};
+
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#state-spaces
+enum class dot_space {
+    reg,
+    sreg,
+    const_mem,      // Using const_mem as `const` is reserved in C++.
+    global,
+    local,
+    param,
+    shared,         // The PTX spelling is shared::cta
+    shared_cluster, // The PTX spelling is shared::cluster, but we might want to go for cluster here.
+    tex // deprecated
+    // generic?
+};
+
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scope
+enum class dot_scope {
+    cta,
+    cluster,
+    gpu,
+    sys
+};
+
+template <dot_sem __sem>
+using sem_t = _CUDA_VSTD::integral_constant<dot_sem, __sem>;
+using sem_acq_rel_t = sem_t<dot_sem::acq_rel>;
+using sem_acquire_t = sem_t<dot_sem::acquire>;
+using sem_relaxed_t = sem_t<dot_sem::relaxed>;
+using sem_release_t = sem_t<dot_sem::release>;
+using sem_sc_t = sem_t<dot_sem::sc>;
+using sem_weak_t = sem_t<dot_sem::weak>;
+
+static constexpr sem_acq_rel_t sem_acq_rel{};
+static constexpr sem_acquire_t sem_acquire{};
+static constexpr sem_relaxed_t sem_relaxed{};
+static constexpr sem_release_t sem_release{};
+static constexpr sem_sc_t sem_sc{};
+static constexpr sem_weak_t sem_weak{};
+
+template <dot_space __space>
+using space_t = _CUDA_VSTD::integral_constant<dot_space, __space>;
+using space_const_mem_t = space_t<dot_space::const_mem>;
+using space_global_t = space_t<dot_space::global>;
+using space_local_t = space_t<dot_space::local>;
+using space_param_t = space_t<dot_space::param>;
+using space_reg_t = space_t<dot_space::reg>;
+using space_shared_t = space_t<dot_space::shared>;
+using space_shared_cluster_t = space_t<dot_space::shared_cluster>;
+using space_sreg_t = space_t<dot_space::sreg>;
+using space_tex_t = space_t<dot_space::tex>;
+
+static constexpr space_const_mem_t space_const_mem{};
+static constexpr space_global_t space_global{};
+static constexpr space_local_t space_local{};
+static constexpr space_param_t space_param{};
+static constexpr space_reg_t space_reg{};
+static constexpr space_shared_t space_shared{};
+static constexpr space_shared_cluster_t space_shared_cluster{};
+static constexpr space_sreg_t space_sreg{};
+static constexpr space_tex_t space_tex{};
+
+template <dot_scope __scope>
+using scope_t = _CUDA_VSTD::integral_constant<dot_scope, __scope>;
+using scope_cluster_t = scope_t<dot_scope::cluster>;
+using scope_cta_t = scope_t<dot_scope::cta>;
+using scope_gpu_t = scope_t<dot_scope::gpu>;
+using scope_sys_t = scope_t<dot_scope::sys>;
+
+static constexpr scope_cluster_t scope_cluster{};
+static constexpr scope_cta_t scope_cta{};
+static constexpr scope_gpu_t scope_gpu{};
+static constexpr scope_sys_t scope_sys{};
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // PTX_DOT_VARIANTS_H_
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
new file mode 100644
index 00000000000..50982232f66
--- /dev/null
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
@@ -0,0 +1,43 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTX_HELPER_FUNCTIONS_H_
+#define PTX_HELPER_FUNCTIONS_H_
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+// Private helper functions
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_smem_ptr(const void * __ptr) {
+    return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
+}
+
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void * __ptr) {
+    return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
+}
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_gmem_ptr(const void * __ptr) {
+    return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr));
+}
+
+template <typename _Tp>
+inline _LIBCUDACXX_DEVICE int __as_b32(_Tp __val) {
+    static_assert(sizeof(_Tp) == 4, "");
+    return *reinterpret_cast<int*>(&__val);
+}
+
+template <typename _Tp>
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val) {
+    static_assert(sizeof(_Tp) == 8, "");
+    return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val);
+}
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // PTX_HELPER_FUNCTIONS_H_
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
new file mode 100644
index 00000000000..89b519513ca
--- /dev/null
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
@@ -0,0 +1,63 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// + + +#ifndef PTX_ISA_TARGET_MACROS_H_ +#define PTX_ISA_TARGET_MACROS_H_ + + +/* + * Targeting macros + * + * Information from: + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes + */ + +#if (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) +# define _LIBCUDACXX_PTX_SM_80_AVAILABLE +#endif + +#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) +# define _LIBCUDACXX_PTX_SM_90_AVAILABLE +#endif + +// PTX ISA 7.8 is available from CTK 11.8, driver r520 +#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__)) +# define _LIBCUDACXX_PTX_ISA_78_AVAILABLE +#endif + +// PTX ISA 7.8 is available from CTK 11.8, driver r520 (so also from CTK 12.0 onwards) +#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +# define _LIBCUDACXX_PTX_ISA_78_AVAILABLE +#endif + +// PTX ISA 8.0 is available from CTK 12.0, driver r525 +#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +# define _LIBCUDACXX_PTX_ISA_80_AVAILABLE +#endif + +// PTX ISA 8.1 is available from CTK 12.1, driver r530 +#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) +# define _LIBCUDACXX_PTX_ISA_81_AVAILABLE +#endif + +// PTX ISA 8.2 is available from CTK 12.2, driver r535 +#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) +# define _LIBCUDACXX_PTX_ISA_82_AVAILABLE +#endif + +// PTX ISA 8.3 is available from CTK 12.3, driver r545 +#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +# define _LIBCUDACXX_PTX_ISA_83_AVAILABLE +#endif + + +#endif // PTX_ISA_TARGET_MACROS_H_ From e356271672c645c1956a5ac209b34ddc275b9341 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 25 Oct 2023 13:17:06 +0200 Subject: [PATCH 13/49] Format code --- libcudacxx/include/cuda/ptx | 1 + .../detail/libcxx/include/__cuda/barrier.h | 3 +- ..._and_communication_instructions_mbarrier.h | 86 ++++++++++--------- .../include/__cuda/ptx/ptx_dot_variants.h | 85 +++++++++--------- .../include/__cuda/ptx/ptx_helper_functions.h | 30 ++++--- 5 files changed, 110 insertions(+), 95 deletions(-) diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index ea319195134..1f3fb868679 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -16,6 +16,7 @@ #include "std/type_traits" // std::integral_constant #include "../nv/target" // __CUDA_MINIMUM_ARCH__ and friends +// The following includes depend on the includes above: #include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h" #include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h" #include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h" diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index d4b8fe45126..f5a65400d1e 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ 
b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -27,8 +27,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER #include "../cstdlib" // _LIBCUDACXX_UNREACHABLE #include "../__type_traits/void_t.h" // _CUDA_VSTD::__void_t - -#include // cuda::ptx::* +#include // cuda::ptx::* #if defined(_LIBCUDACXX_COMPILER_NVRTC) #define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type *)0)->member)) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 10da1675226..e56b6f2586a 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -36,54 +36,62 @@ #if defined(_LIBCUDACXX_PTX_ISA_78_AVAILABLE) && defined(_LIBCUDACXX_PTX_SM_90_AVAILABLE) template -_LIBCUDACXX_DEVICE inline -_CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_t __spc, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count) +_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( + sem_release_t __sem, + scope_t<_Sco> __scope, + space_shared_t __spc, + _CUDA_VSTD::uint64_t* __addr, + _CUDA_VSTD::uint32_t __tx_count) { - // Arrive on local shared memory barrier - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - _CUDA_VSTD::uint64_t __token; - - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(__as_smem_ptr(__addr)), - "r"(__tx_count) - : "memory"); + // Arrive on local shared memory barrier + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + _CUDA_VSTD::uint64_t __token; + + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;" + : "=l"(__token) + : "r"(__as_smem_ptr(__addr)), + "r"(__tx_count) + : "memory"); } else { - asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(__as_smem_ptr(__addr)), - "r"(__tx_count) - : "memory"); - } - return __token; + asm ( + "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;" + : "=l"(__token) + : "r"(__as_smem_ptr(__addr)), + "r"(__tx_count) + : "memory"); + } + return __token; } #ifdef // _LIBCUDACXX_PTX_ISA_78_AVAILABLE #if defined(_LIBCUDACXX_PTX_ISA_78_AVAILABLE) && defined(_LIBCUDACXX_PTX_SM_90_AVAILABLE) template -_LIBCUDACXX_DEVICE inline -void mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_cluster_t __spc, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count) +_LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( + sem_release_t __sem, + scope_t<_Sco> __scope, + space_shared_cluster_t __spc, + _CUDA_VSTD::uint64_t* __addr, + _CUDA_VSTD::uint32_t __tx_count) { - // Arrive on remote cluster barrier - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;" - : - : "r"(__as_smem_ptr(__addr)), - "r"(__tx_count) - : "memory"); + // Arrive on remote cluster 
barrier + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;" + : + : "r"(__as_smem_ptr(__addr)), + "r"(__tx_count) + : "memory"); } else { - asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;" - : - : "r"(__as_smem_ptr(__addr)), - "r"(__tx_count) - : "memory"); - } + asm ( + "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;" + : + : "r"(__as_smem_ptr(__addr)), + "r"(__tx_count) + : "memory"); + } } #ifdef // _LIBCUDACXX_PTX_ISA_78_AVAILABLE diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h index 08a972492e7..c91a2512847 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h @@ -48,47 +48,50 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#operation-types -enum class dot_sem { - acq_rel, - acquire, - relaxed, - release, - sc, - weak - // mmio? - // volatile? +enum class dot_sem +{ + acq_rel, + acquire, + relaxed, + release, + sc, + weak + // mmio? + // volatile? }; // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#state-spaces -enum class dot_space { - reg, - sreg, - const_mem, // Using const_mem as `const` is reserved in C++. - global, - local, - param, - shared, // The PTX spelling is shared::cta - shared_cluster, // The PTX spelling is shared::cluster, but we might want to go for cluster here. - tex // deprecated - // generic? +enum class dot_space +{ + reg, + sreg, + const_mem, // Using const_mem as `const` is reserved in C++. + global, + local, + param, + shared, // The PTX spelling is shared::cta + shared_cluster, // The PTX spelling is shared::cluster, but we might want to go for cluster here. + tex // deprecated + // generic? 
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
index 50982232f66..02ac1370d3d 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
@@ -15,27 +15,31 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 
 // Private helper functions
-inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_smem_ptr(const void * __ptr) {
-  return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_smem_ptr(const void* __ptr)
+{
+  return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
 }
-
-inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void * __ptr) {
-  return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void* __ptr)
+{
+  return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
 }
 
-inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_gmem_ptr(const void * __ptr) {
-  return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr));
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_gmem_ptr(const void* __ptr)
+{
+  return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr));
 }
 
 template <typename _Tp>
-inline _LIBCUDACXX_DEVICE int __as_b32(_Tp __val) {
-  static_assert(sizeof(_Tp) == 4, "");
-  return *reinterpret_cast<int*>(&__val);
+inline _LIBCUDACXX_DEVICE int __as_b32(_Tp __val)
+{
+  static_assert(sizeof(_Tp) == 4, "");
+  return *reinterpret_cast<int*>(&__val);
 }
 
 template <typename _Tp>
-inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val) {
-  static_assert(sizeof(_Tp) == 8, "");
-  return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val);
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val)
+{
+  static_assert(sizeof(_Tp) == 8, "");
+  return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val);
 }
 
 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX
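The `__as_smem_ptr` helper above exists because PTX addresses shared memory through a 32-bit window offset, while a C++ pointer is a 64-bit generic address. A minimal sketch of the same conversion feeding an inline-asm operand outside the library (the `arrive_on` name is illustrative; compiling for sm_90 is assumed):

```cuda
#include <cstdint>

__device__ std::uint64_t arrive_on(std::uint64_t* bar, std::uint32_t tx_count)
{
  // Same conversion __as_smem_ptr performs: generic address -> 32-bit
  // shared-memory offset, handed to the instruction as an "r" register.
  std::uint32_t bar_smem = static_cast<std::uint32_t>(__cvta_generic_to_shared(bar));
  std::uint64_t token;
  asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
      : "=l"(token)
      : "r"(bar_smem), "r"(tx_count)
      : "memory");
  return token;
}
```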
""); - return *reinterpret_cast(&__val); +inline _LIBCUDACXX_DEVICE int __as_b32(_Tp __val) +{ + static_assert(sizeof(_Tp) == 4, ""); + return *reinterpret_cast(&__val); } template -inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val) { - static_assert(sizeof(_Tp) == 8, ""); - return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val); +inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val) +{ + static_assert(sizeof(_Tp) == 8, ""); + return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val); } _LIBCUDACXX_END_NAMESPACE_CUDA_PTX From bb91eb741188e007becefb5e4c352edfa5c541bc Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 25 Oct 2023 13:34:04 +0200 Subject: [PATCH 14/49] Fix test and ifdefs The test would previously not fail when invalid ptx was present. Fixed now. --- .../test/cuda/ptx/sm90.ptx.compile.pass.cpp | 7 ++++++- ...chronization_and_communication_instructions_mbarrier.h | 8 +++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp index 27b5af8e6f2..787a9c1f327 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp @@ -24,7 +24,12 @@ int main(int, char**) { NV_IF_TARGET(NV_IS_DEVICE, ( // Do not execute. Just check if below PTX compiles (that is: assembles) without error. - if (false) { + + // This condition always evaluates to false, but the compiler does not + // reason through it. This avoids dead code elimination. + const bool non_eliminated_false = threadIdx.x > 1024; + + if (non_eliminated_false) { using cuda::ptx::sem_release; using cuda::ptx::space_shared_cluster; using cuda::ptx::space_shared; diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index e56b6f2586a..ade0cf411ca 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -13,6 +13,8 @@ #ifndef PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ #define PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + /* * 9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier @@ -64,7 +66,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( } return __token; } -#ifdef // _LIBCUDACXX_PTX_ISA_78_AVAILABLE +#endif // _LIBCUDACXX_PTX_ISA_78_AVAILABLE #if defined(_LIBCUDACXX_PTX_ISA_78_AVAILABLE) && defined(_LIBCUDACXX_PTX_SM_90_AVAILABLE) template @@ -93,7 +95,7 @@ _LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( : "memory"); } } -#ifdef // _LIBCUDACXX_PTX_ISA_78_AVAILABLE +#endif // _LIBCUDACXX_PTX_ISA_78_AVAILABLE // 9.7.12.15.14. Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop @@ -108,6 +110,6 @@ _LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( // 9.7.12.15.17. 
From b514e2d86ed400a57bb1dc8cee65e290c8c2a34f Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 14:35:59 +0200
Subject: [PATCH 15/49] Update ptx.md

---
 libcudacxx/docs/extended_api/ptx.md | 487 +++++++++++++++++++++++++++-
 1 file changed, 482 insertions(+), 5 deletions(-)

diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md
index d5092dc030f..feb4040d724 100644
--- a/libcudacxx/docs/extended_api/ptx.md
+++ b/libcudacxx/docs/extended_api/ptx.md
@@ -4,15 +4,381 @@
 The `cuda::ptx` namespace contains functions that map one-to-one to
 [PTX instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html). These can be used
 for maximal control of the generated code, or to experiment with new hardware
 features before a high-level C++ API is available.
 
-### Shared memory barrier (mbarrier)
+### [9.7.1. Integer Arithmetic Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions)
 
-| Instruction                             | Compute capability | CUDA Toolkit |
-|-----------------------------------------|--------------------|--------------|
-| `cuda::ptx::mbarrier_arrive_expect_tx`  | 9.0                | CTK 12.4     |
+| Instruction | Available in libcu++ |
+|------------------------------------------|----------------------|
| [`sad`] | No |
| [`div`] | No |
| [`rem`] | No |
| [`abs`] | No |
| [`neg`] | No |
| [`min`] | No |
| [`max`] | No |
| [`popc`] | No |
| [`clz`] | No |
| [`bfind`] | No |
| [`fns`] | No |
| [`brev`] | No |
| [`bfe`] | No |
| [`bfi`] | No |
| [`szext`] | No |
| [`bmsk`] | No |
| [`dp4a`] | No |
| [`dp2a`] | No |

[`sad`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sad
[`div`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-div
[`rem`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-rem
[`abs`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-abs
[`neg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-neg
[`min`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-min
[`max`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-max
[`popc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc
[`clz`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz
[`bfind`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind
[`fns`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns
[`brev`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev
[`bfe`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe
+[`bfi`]: 
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi +[`szext`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-szext +[`bmsk`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bmsk +[`dp4a`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp4a +[`dp2a`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp2a + +### [9.7.2. Extended-Precision Integer Arithmetic Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-integer-arithmetic-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`add.cc`] | No | +| [`addc`] | No | +| [`sub.cc`] | No | +| [`subc`] | No | +| [`mad.cc`] | No | +| [`madc`] | No | + +[`add.cc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-add-cc +[`addc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-addc +[`sub.cc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-sub-cc +[`subc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-subc +[`mad.cc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-mad-cc +[`madc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc + +### [9.7.3. Floating-Point Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`testp`] | No | +| [`copysign`] | No | +| [`add`] | No | +| [`sub`] | No | +| [`mul`] | No | +| [`fma`] | No | +| [`mad`] | No | +| [`div`] | No | +| [`abs`] | No | +| [`neg`] | No | +| [`min`] | No | +| [`max`] | No | +| [`rcp`] | No | +| [`rcp.approx.ftz.f64`] | No | +| [`sqrt`] | No | +| [`rsqrt`] | No | +| [`rsqrt.approx.ftz.f64`] | No | +| [`sin`] | No | +| [`cos`] | No | +| [`lg2`] | No | +| [`ex2`] | No | +| [`tanh`] | No | + +[`testp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-testp +[`copysign`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign +[`add`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add +[`sub`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sub +[`mul`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul +[`fma`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma +[`mad`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad +[`div`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div +[`abs`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-abs +[`neg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg +[`min`]: 
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min +[`max`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max +[`rcp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp +[`rcp.approx.ftz.f64`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp-approx-ftz-f64 +[`sqrt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sqrt +[`rsqrt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt +[`rsqrt.approx.ftz.f64`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt-approx-ftz-f64 +[`sin`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sin +[`cos`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos +[`lg2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2 +[`ex2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2 +[`tanh`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-tanh + +### [9.7.4. Half Precision Floating-Point Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`add`] | No | +| [`sub`] | No | +| [`mul`] | No | +| [`fma`] | No | +| [`neg`] | No | +| [`abs`] | No | +| [`min`] | No | +| [`max`] | No | +| [`tanh`] | No | +| [`ex2`] | No | + +[`add`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add +[`sub`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-sub +[`mul`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-mul +[`fma`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-fma +[`neg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-neg +[`abs`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs +[`min`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-min +[`max`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-max +[`tanh`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-tanh +[`ex2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-ex2 + +### [9.7.5. 
Comparison and Selection Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`set`] | No | +| [`setp`] | No | +| [`selp`] | No | +| [`slct`] | No | + +[`set`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-set +[`setp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-setp +[`selp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp +[`slct`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-slct + +### [9.7.6. Half Precision Comparison Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`set`] | No | +| [`setp`] | No | + +[`set`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-set +[`setp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-setp + +### [9.7.7. Logic and Shift Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`and`] | No | +| [`or`] | No | +| [`xor`] | No | +| [`not`] | No | +| [`cnot`] | No | +| [`lop3`] | No | +| [`shf`] | No | +| [`shl`] | No | +| [`shr`] | No | + +[`and`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and +[`or`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or +[`xor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-xor +[`not`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not +[`cnot`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-cnot +[`lop3`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3 +[`shf`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shf +[`shl`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shl +[`shr`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shr + +### [9.7.8. 
Data Movement and Conversion Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`mov`] | No | +| [`mov`] | No | +| [`shfl (deprecated)`] | No | +| [`shfl.sync`] | No | +| [`prmt`] | No | +| [`ld`] | No | +| [`ld.global.nc`] | No | +| [`ldu`] | No | +| [`st`] | No | +| [`st.async`] | No | +| [`multimem.ld_reduce, multimem.st, multimem.red`] | No | +| [`prefetch, prefetchu`] | No | +| [`applypriority`] | No | +| [`discard`] | No | +| [`createpolicy`] | No | +| [`isspacep`] | No | +| [`cvta`] | No | +| [`cvt`] | No | +| [`cvt.pack`] | No | +| [`mapa`] | No | +| [`getctarank`] | No | + +[`mov`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2 +[`shfl (deprecated)`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated +[`shfl.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync +[`prmt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt +[`ld`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld +[`ld.global.nc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc +[`ldu`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu +[`st`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st +[`st.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async +[`multimem.ld_reduce, multimem.st, multimem.red`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red +[`prefetch, prefetchu`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu +[`applypriority`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority +[`discard`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-discard +[`createpolicy`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-createpolicy +[`isspacep`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep +[`cvta`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta +[`cvt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt +[`cvt.pack`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack +[`mapa`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa +[`getctarank`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank + +### [9.7.8.24. 
Data Movement and Conversion Instructions: Asynchronous copy](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`cp.async`] | No | +| [`cp.async.commit_group`] | No | +| [`cp.async.wait_group / cp.async.wait_all`] | No | +| [`cp.async.bulk`] | No | +| [`cp.reduce.async.bulk`] | No | +| [`cp.async.bulk.prefetch`] | No | +| [`cp.async.bulk.tensor`] | No | +| [`cp.reduce.async.bulk.tensor`] | No | +| [`cp.async.bulk.prefetch.tensor`] | No | +| [`cp.async.bulk.commit_group`] | No | +| [`cp.async.bulk.wait_group`] | No | +| [`tensormap.replace`] | No | + +[`cp.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async +[`cp.async.commit_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group +[`cp.async.wait_group / cp.async.wait_all`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all +[`cp.async.bulk`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk +[`cp.reduce.async.bulk`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk +[`cp.async.bulk.prefetch`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch +[`cp.async.bulk.tensor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor +[`cp.reduce.async.bulk.tensor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor +[`cp.async.bulk.prefetch.tensor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor +[`cp.async.bulk.commit_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group +[`cp.async.bulk.wait_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group +[`tensormap.replace`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace + +### [9.7.9. Texture Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`tex`] | No | +| [`tld4`] | No | +| [`txq`] | No | +| [`istypep`] | No | + +[`tex`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex +[`tld4`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tld4 +[`txq`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-txq +[`istypep`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-istypep + +### [9.7.10. 
Surface Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`suld`] | No | +| [`sust`] | No | +| [`sured`] | No | +| [`suq`] | No | + +[`suld`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suld +[`sust`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust +[`sured`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sured +[`suq`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suq + +### [9.7.11. Control Flow Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`{}`] | No | +| [`@`] | No | +| [`bra`] | No | +| [`brx.idx`] | No | +| [`call`] | No | +| [`ret`] | No | +| [`exit`] | No | + +[`{}`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-curly-braces +[`@`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-at +[`bra`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra +[`brx.idx`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-brx-idx +[`call`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call +[`ret`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret +[`exit`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit + +### [9.7.12. 
Parallel Synchronization and Communication Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions)

| Instruction | Available in libcu++ |
|------------------------------------------|----------------------|
| [`bar, barrier`] | No |
| [`bar.warp.sync`] | No |
| [`barrier.cluster`] | No |
| [`membar/fence`] | No |
| [`atom`] | No |
| [`red`] | No |
| [`red.async`] | No |
| [`vote (deprecated)`] | No |
| [`vote.sync`] | No |
| [`match.sync`] | No |
| [`activemask`] | No |
| [`redux.sync`] | No |
| [`griddepcontrol`] | No |
| [`elect.sync`] | No |

[`bar, barrier`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier
[`bar.warp.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync
[`barrier.cluster`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster
[`membar/fence`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence
[`atom`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
[`red`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red
[`red.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async
[`vote (deprecated)`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated
[`vote.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync
[`match.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync
[`activemask`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask
[`redux.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync
[`griddepcontrol`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol
[`elect.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync

-#### [`cuda::ptx::mbarrier_arrive_expect_tx`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)

+### [9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier)

| Instruction | Available in libcu++ |
|------------------------------------------|----------------------|
| [`mbarrier.init`] | No |
| [`mbarrier.inval`] | No |
| [`mbarrier.expect_tx`] | No |
| [`mbarrier.complete_tx`] | No |
| [`mbarrier.arrive`] | CTK-FUTURE, CCCL v2.3.0 |
| [`mbarrier.arrive_drop`] | No |
| [`cp.async.mbarrier.arrive`] | No |
| [`mbarrier.test_wait/mbarrier.try_wait`] | No |
| [`mbarrier.pending_count`] | No |
| [`tensormap.cp_fenceproxy`] | No |

[`mbarrier.init`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init
[`mbarrier.inval`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval
[`mbarrier.expect_tx`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx
[`mbarrier.complete_tx`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx
[`mbarrier.arrive`]: #mbarrierarrive
[`mbarrier.arrive_drop`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop
[`cp.async.mbarrier.arrive`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive
[`mbarrier.test_wait/mbarrier.try_wait`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait
[`mbarrier.pending_count`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-pending-count
[`tensormap.cp_fenceproxy`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy


#### `mbarrier.arrive`

- PTX ISA: [mbarrier.arrive](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)

 ```cuda
 template <dot_scope _Sco>
 __device__ inline
@@ -63,6 +429,117 @@ __global__ void kernel() {
   )
 }
 ```
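A usage example (a hedged sketch, not from the patch: it assumes `bar` has already been initialized, e.g. via `mbarrier.init`, and that the kernel is compiled for sm_90 with a toolkit whose ptxas understands PTX ISA 7.8; the kernel name is illustrative):

```cuda
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void expect_tx_kernel()
{
  using cuda::ptx::sem_release;
  using cuda::ptx::scope_cta;
  using cuda::ptx::space_shared;

  // Assumed to be initialized elsewhere before use.
  __shared__ cuda::std::uint64_t bar;

  if (threadIdx.x == 0) {
    // Arrive on the CTA-scope barrier and raise its expected transaction
    // count by 64 bytes of asynchronous transfers for the current phase.
    cuda::std::uint64_t token =
      cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 64);
    (void) token;
  }
}
```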
### [9.7.13. Warp Level Matrix Multiply-Accumulate Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-accumulate-instructions)

| Instruction | Available in libcu++ |
|------------------------------------------|----------------------|
| [`wmma.load`] | No |
| [`wmma.store`] | No |
| [`wmma.mma`] | No |
| [`mma`] | No |
| [`ldmatrix`] | No |
| [`stmatrix`] | No |
| [`movmatrix`] | No |
| [`mma.sp`] | No |

[`wmma.load`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-wmma-load
[`wmma.store`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-wmma-store
[`wmma.mma`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-and-accumulate-instruction-wmma-mma
[`mma`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma
[`ldmatrix`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-ldmatrix
[`stmatrix`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix
[`movmatrix`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-transpose-instruction-movmatrix
[`mma.sp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma-sp

### [9.7.14. Asynchronous Warpgroup Level Matrix Multiply-Accumulate Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-multiply-accumulate-instructions)

| Instruction | Available in libcu++ |
|------------------------------------------|----------------------|
| [`wgmma.mma_async`] | No |
| [`wgmma.mma_async.sp`] | No |
| [`wgmma.fence`] | No |
| [`wgmma.commit_group`] | No |
| [`wgmma.wait_group`] | No |

[`wgmma.mma_async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async
[`wgmma.mma_async.sp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async-sp
[`wgmma.fence`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-fence
[`wgmma.commit_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-commit-group
[`wgmma.wait_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-wait-group

### [9.7.15. Stack Manipulation Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions)

| Instruction | Available in libcu++ |
|------------------------------------------|----------------------|
| [`stacksave`] | No |
| [`stackrestore`] | No |
| [`alloca`] | No |

[`stacksave`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stacksave
[`stackrestore`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stackrestore
[`alloca`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca

### [9.7.16. 
Video Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#video-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`vadd, vsub, vabsdiff, vmin, vmax`] | No | +| [`vshl, vshr`] | No | +| [`vmad`] | No | +| [`vset`] | No | + +[`vadd, vsub, vabsdiff, vmin, vmax`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax +[`vshl, vshr`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr +[`vmad`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vmad +[`vset`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vset + +### [9.7.16.2. SIMD Video Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2`] | No | +| [`vset2`] | No | +| [`vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4`] | No | +| [`vset4`] | No | + +[`vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2 +[`vset2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset2 +[`vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4 +[`vset4`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset4 + +### [9.7.17. 
Miscellaneous Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions)

| Instruction | Available in libcu++ |
|------------------------------------------|----------------------|
| [`brkpt`] | No |
| [`nanosleep`] | No |
| [`pmevent`] | No |
| [`trap`] | No |
| [`setmaxnreg`] | No |

[`brkpt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-brkpt
[`nanosleep`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep
[`pmevent`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent
[`trap`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap
[`setmaxnreg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg

### Shared memory barrier (mbarrier)

| Instruction                             | Compute capability | CUDA Toolkit |
|-----------------------------------------|--------------------|--------------|
| `cuda::ptx::mbarrier_arrive_expect_tx`  | 9.0                | CTK 12.4     |

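The table above pins `cuda::ptx::mbarrier_arrive_expect_tx` to compute capability 9.0, so code that also compiles for older architectures needs a target guard. A minimal sketch, assuming `<nv/target>` and a build that includes sm_90 (the `arrive_if_sm90` helper is hypothetical):

```cuda
#include <cuda/ptx>
#include <cuda/std/cstdint>
#include <nv/target>

__device__ void arrive_if_sm90(cuda::std::uint64_t* bar)
{
  NV_IF_TARGET(NV_PROVIDES_SM_90, (
    // Only emitted when compiling for sm_90 or newer; on older targets
    // this function compiles to an empty body.
    cuda::ptx::mbarrier_arrive_expect_tx(
      cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, bar, 1);
  ))
}
```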
From e351c799eaabcfa8bfd3c95031f8a07ab435d973 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 15:08:13 +0200
Subject: [PATCH 16/49] Use numerical PTX ISA/SM target macros

---
 ..._and_communication_instructions_mbarrier.h |  8 +--
 .../__cuda/ptx/ptx_isa_target_macros.h        | 57 +++++++++----------
 2 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
index ade0cf411ca..2bcfdc4bd33 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
@@ -36,7 +36,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 // 9.7.12.15.13. Parallel Synchronization and Communication Instructions: mbarrier.arrive
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
-#if defined(_LIBCUDACXX_PTX_ISA_78_AVAILABLE) && defined(_LIBCUDACXX_PTX_SM_90_AVAILABLE)
+#if __cccl_ptx_sm >= 900 && __cccl_ptx_isa >= 780
 template <dot_scope _Sco>
 _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(
@@ -64,7 +66,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(
   }
   return __token;
 }
-#endif // _LIBCUDACXX_PTX_ISA_78_AVAILABLE
+#endif // __cccl_ptx_isa
 
-#if defined(_LIBCUDACXX_PTX_ISA_78_AVAILABLE) && defined(_LIBCUDACXX_PTX_SM_90_AVAILABLE)
+#if __cccl_ptx_sm >= 900 && __cccl_ptx_isa >= 780
 template <dot_scope _Sco>
 _LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx(
@@ -93,7 +95,7 @@ _LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx(
       : "memory");
   }
 }
-#endif // _LIBCUDACXX_PTX_ISA_78_AVAILABLE
+#endif // __cccl_ptx_isa
 
 // 9.7.12.15.14. Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
index 89b519513ca..e0306dbf627 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
@@ -21,43 +21,42 @@
  * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes
  */
 
-#if (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
-# define _LIBCUDACXX_PTX_SM_80_AVAILABLE
-#endif
-#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
-# define _LIBCUDACXX_PTX_SM_90_AVAILABLE
-#endif
+// SM version
 
-// PTX ISA 7.8 is available from CTK 11.8, driver r520
-#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__))
-# define _LIBCUDACXX_PTX_ISA_78_AVAILABLE
-#endif
-// PTX ISA 7.8 is available from CTK 11.8, driver r520 (so also from CTK 12.0 onwards)
-#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
-# define _LIBCUDACXX_PTX_ISA_78_AVAILABLE
-#endif
-// PTX ISA 8.0 is available from CTK 12.0, driver r525
-#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
-# define _LIBCUDACXX_PTX_ISA_80_AVAILABLE
-#endif
-// PTX ISA 8.1 is available from CTK 12.1, driver r530
-#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__))
-# define _LIBCUDACXX_PTX_ISA_81_AVAILABLE
-#endif
-// PTX ISA 8.2 is available from CTK 12.2, driver r535
-#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__))
-# define _LIBCUDACXX_PTX_ISA_82_AVAILABLE
-#endif
-// PTX ISA 8.3 is available from CTK 12.3, driver r545
-#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__))
-# define _LIBCUDACXX_PTX_ISA_83_AVAILABLE
-#endif
+#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
+# define __cccl_ptx_sm 900ULL
+#elif (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
+# define __cccl_ptx_sm 800ULL
+// Fallback case. Define the SM version to be zero. This ensures that the macro is always defined.
+#else
+# define __cccl_ptx_sm 0ULL
+#endif
+
+// PTX ISA version
+
+// PTX ISA 8.3 is available from CTK 12.3, driver r545
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 830ULL
+// PTX ISA 8.2 is available from CTK 12.2, driver r535
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 820ULL
+// PTX ISA 8.1 is available from CTK 12.1, driver r530
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 810ULL
+// PTX ISA 8.0 is available from CTK 12.0, driver r525
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 800ULL
+// PTX ISA 7.8 is available from CTK 11.8, driver r520
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 780ULL
+// Fallback case. Define the ISA version to be zero. This ensures that the macro is always defined.
+#else
+# define __cccl_ptx_isa 0ULL
+#endif
 
 #endif // PTX_ISA_TARGET_MACROS_H_
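With numeric values, gating a wrapper on toolkit and architecture support becomes a pair of ordinary comparisons instead of a growing list of `defined()` checks. A minimal sketch of the intended pattern (the function is a hypothetical placeholder, not part of the patch):

```cuda
// Compiled only when ptxas understands PTX ISA 8.3 *and* the lowest
// targeted architecture is sm_90, mirroring the guards used above.
#if __cccl_ptx_sm >= 900 && __cccl_ptx_isa >= 830
_LIBCUDACXX_DEVICE inline void __future_wrapper()  // hypothetical
{
  // inline asm for an instruction introduced in PTX ISA 8.3 would go here
}
#endif // __cccl_ptx_isa
```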
From 9006317cdb2368eb74a0ac8780c15f3898e1570d Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 15:17:18 +0200
Subject: [PATCH 17/49] Move bulk of ptx header into detail/ptx.h

---
 libcudacxx/include/cuda/ptx                    | 696 +----------------
 .../std/detail/libcxx/include/__cuda/ptx.h     | 711 ++++++++++++++++++
 2 files changed, 715 insertions(+), 692 deletions(-)
 create mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h

diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
index 1f3fb868679..ab6ed62d9d2 100644
--- a/libcudacxx/include/cuda/ptx
+++ b/libcudacxx/include/cuda/ptx
@@ -12,700 +12,12 @@
 #ifndef _CUDA_PTX
 #define _CUDA_PTX
 
-#include "std/cstdint" // uint32_t
-#include "std/type_traits" // std::integral_constant
-#include "../nv/target" // __CUDA_MINIMUM_ARCH__ and friends
+#include "std/detail/__config"
 
-// The following includes depend on the includes above:
-#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h"
-#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h"
-#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h"
-#include "cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h"
+#include "std/detail/__pragma_push"
 
+#include "std/detail/libcxx/include/__cuda/ptx.h"
 
-/*
- * The cuda::ptx namespace intends to provide PTX wrappers for new hardware
- * features and new PTX instructions so that they can be experimented with
- * before higher-level C++ APIs are designed and developed.
- *
- * The wrappers have the following responsibilities:
- *
- * - They must prevent any PTX assembler errors, that is:
- *   - They are defined only for versions of the CUDA Toolkit in which nvcc/ptxas
- *     actually recognizes the instruction.
- *   - Sizes and types of parameters are correct.
- *   - They must convert state spaces correctly.
- * - They adhere to the libcu++ coding standards of using:
- *   - Reserved identifiers for all parameters, variables. E.g. `__meow` or `_Woof`
- *   - _CUDA_VSTD:: namespace for types
- *
- * The wrappers should not do the following:
- *
- * - Use any non-native types. For example, an mbarrier instruction wrapper
- *   takes the barrier address as a uint64_t pointer.
- *
- * This header is intended for:
- *
- * - internal consumption by higher-level APIs such as cuda::barrier,
- * - outside developers who want to experiment with the latest features of the
- *   hardware.
- *
- * Stability:
- *
- * - These headers are intended to present a stable API (not ABI) within one
- *   major version of the CTK. This means that:
- *   - All functions are marked inline
- *   - The type of a function parameter can be changed to be more generic if
- *     that means that code that called the original version can still be
- *     compiled. 
- * - * - Good exposure of the PTX should be high priority. If, at a new major - * version, we face a difficult choice between breaking backward-compatibility - * and an improvement of the PTX exposure, we will tend to the latter option - * more easily than in other parts of libcu++. - */ +#include "std/detail/libcxx/include/__cuda/ptx.h" -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX - -/* - * Instructions - * - * The organization of the instructions below follows that of the PTX ISA documentation: - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#instructions - * - * To improve code organization, some sections are separated into their own - * header. For instance, the mbarrier instructions are found in: - * __cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h - * - */ - -/* - * 9.7.1. Integer Arithmetic Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions - * - */ - -// 9.7.1.7. Integer Arithmetic Instructions: sad -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sad - -// 9.7.1.8. Integer Arithmetic Instructions: div -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-div - -// 9.7.1.9. Integer Arithmetic Instructions: rem -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-rem - -// 9.7.1.10. Integer Arithmetic Instructions: abs -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-abs - -// 9.7.1.11. Integer Arithmetic Instructions: neg -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-neg - -// 9.7.1.12. Integer Arithmetic Instructions: min -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-min - -// 9.7.1.13. Integer Arithmetic Instructions: max -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-max - -// 9.7.1.14. Integer Arithmetic Instructions: popc -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc - -// 9.7.1.15. Integer Arithmetic Instructions: clz -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz - -// 9.7.1.16. Integer Arithmetic Instructions: bfind -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind - -// 9.7.1.17. Integer Arithmetic Instructions: fns -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns - -// 9.7.1.18. Integer Arithmetic Instructions: brev -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev - -// 9.7.1.19. Integer Arithmetic Instructions: bfe -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe - -// 9.7.1.20. Integer Arithmetic Instructions: bfi -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi - -// 9.7.1.21. Integer Arithmetic Instructions: szext -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-szext - -// 9.7.1.22. Integer Arithmetic Instructions: bmsk -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bmsk - -// 9.7.1.23. 
Integer Arithmetic Instructions: dp4a -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp4a - -// 9.7.1.24. Integer Arithmetic Instructions: dp2a -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp2a - - -/* - * 9.7.2. Extended-Precision Integer Arithmetic Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-integer-arithmetic-instructions - * - */ - -// 9.7.2.1. Extended-Precision Arithmetic Instructions: add.cc -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-add-cc - -// 9.7.2.2. Extended-Precision Arithmetic Instructions: addc -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-addc - -// 9.7.2.3. Extended-Precision Arithmetic Instructions: sub.cc -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-sub-cc - -// 9.7.2.4. Extended-Precision Arithmetic Instructions: subc -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-subc - -// 9.7.2.5. Extended-Precision Arithmetic Instructions: mad.cc -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-mad-cc - -// 9.7.2.6. Extended-Precision Arithmetic Instructions: madc -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc - - -/* - * 9.7.3. Floating-Point Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions - * - */ - -// 9.7.3.1. Floating Point Instructions: testp -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-testp - -// 9.7.3.2. Floating Point Instructions: copysign -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign - -// 9.7.3.3. Floating Point Instructions: add -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add - -// 9.7.3.4. Floating Point Instructions: sub -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sub - -// 9.7.3.5. Floating Point Instructions: mul -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul - -// 9.7.3.6. Floating Point Instructions: fma -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma - -// 9.7.3.7. Floating Point Instructions: mad -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad - -// 9.7.3.8. Floating Point Instructions: div -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div - -// 9.7.3.9. Floating Point Instructions: abs -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-abs - -// 9.7.3.10. Floating Point Instructions: neg -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg - -// 9.7.3.11. Floating Point Instructions: min -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min - -// 9.7.3.12. 
Floating Point Instructions: max -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max - -// 9.7.3.13. Floating Point Instructions: rcp -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp - -// 9.7.3.14. Floating Point Instructions: rcp.approx.ftz.f64 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp-approx-ftz-f64 - -// 9.7.3.15. Floating Point Instructions: sqrt -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sqrt - -// 9.7.3.16. Floating Point Instructions: rsqrt -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt - -// 9.7.3.17. Floating Point Instructions: rsqrt.approx.ftz.f64 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt-approx-ftz-f64 - -// 9.7.3.18. Floating Point Instructions: sin -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sin - -// 9.7.3.19. Floating Point Instructions: cos -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos - -// 9.7.3.20. Floating Point Instructions: lg2 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2 - -// 9.7.3.21. Floating Point Instructions: ex2 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2 - -// 9.7.3.22. Floating Point Instructions: tanh -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-tanh - - -/* - * 9.7.4. Half Precision Floating-Point Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions - * - */ - -// 9.7.4.1. Half Precision Floating Point Instructions: add -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add - -// 9.7.4.2. Half Precision Floating Point Instructions: sub -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-sub - -// 9.7.4.3. Half Precision Floating Point Instructions: mul -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-mul - -// 9.7.4.4. Half Precision Floating Point Instructions: fma -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-fma - -// 9.7.4.5. Half Precision Floating Point Instructions: neg -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-neg - -// 9.7.4.6. Half Precision Floating Point Instructions: abs -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs - -// 9.7.4.7. Half Precision Floating Point Instructions: min -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-min - -// 9.7.4.8. Half Precision Floating Point Instructions: max -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-max - -// 9.7.4.9. Half Precision Floating Point Instructions: tanh -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-tanh - -// 9.7.4.10. 
Half Precision Floating Point Instructions: ex2 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-ex2 - - -/* - * 9.7.5. Comparison and Selection Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions - * - */ - -// 9.7.5.1. Comparison and Selection Instructions: set -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-set - -// 9.7.5.2. Comparison and Selection Instructions: setp -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-setp - -// 9.7.5.3. Comparison and Selection Instructions: selp -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp - -// 9.7.5.4. Comparison and Selection Instructions: slct -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-slct - - -/* - * 9.7.6. Half Precision Comparison Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions - * - */ - -// 9.7.6.1. Half Precision Comparison Instructions: set -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-set - -// 9.7.6.2. Half Precision Comparison Instructions: setp -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-setp - - -/* - * 9.7.7. Logic and Shift Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions - * - */ - -// 9.7.7.1. Logic and Shift Instructions: and -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and - -// 9.7.7.2. Logic and Shift Instructions: or -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or - -// 9.7.7.3. Logic and Shift Instructions: xor -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-xor - -// 9.7.7.4. Logic and Shift Instructions: not -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not - -// 9.7.7.5. Logic and Shift Instructions: cnot -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-cnot - -// 9.7.7.6. Logic and Shift Instructions: lop3 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3 - -// 9.7.7.7. Logic and Shift Instructions: shf -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shf - -// 9.7.7.8. Logic and Shift Instructions: shl -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shl - -// 9.7.7.9. Logic and Shift Instructions: shr -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shr - - -/* - * 9.7.8. Data Movement and Conversion Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions - * - */ - -// 9.7.8.3. Data Movement and Conversion Instructions: mov -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov - -// 9.7.8.4. 
Data Movement and Conversion Instructions: mov -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2 - -// 9.7.8.5. Data Movement and Conversion Instructions: shfl (deprecated) -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated - -// 9.7.8.6. Data Movement and Conversion Instructions: shfl.sync -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync - -// 9.7.8.7. Data Movement and Conversion Instructions: prmt -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt - -// 9.7.8.8. Data Movement and Conversion Instructions: ld -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld - -// 9.7.8.9. Data Movement and Conversion Instructions: ld.global.nc -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc - -// 9.7.8.10. Data Movement and Conversion Instructions: ldu -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu - -// 9.7.8.11. Data Movement and Conversion Instructions: st -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st - -// 9.7.8.12. Data Movement and Conversion Instructions: st.async -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async - -// 9.7.8.13. Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red - -// 9.7.8.14. Data Movement and Conversion Instructions: prefetch, prefetchu -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu - -// 9.7.8.15. Data Movement and Conversion Instructions: applypriority -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority - -// 9.7.8.16. Data Movement and Conversion Instructions: discard -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-discard - -// 9.7.8.17. Data Movement and Conversion Instructions: createpolicy -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-createpolicy - -// 9.7.8.18. Data Movement and Conversion Instructions: isspacep -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep - -// 9.7.8.19. Data Movement and Conversion Instructions: cvta -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta - -// 9.7.8.20. Data Movement and Conversion Instructions: cvt -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt - -// 9.7.8.21. Data Movement and Conversion Instructions: cvt.pack -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack - -// 9.7.8.22. 
Data Movement and Conversion Instructions: mapa -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa - -// 9.7.8.23. Data Movement and Conversion Instructions: getctarank -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank - - -/* - * 9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy - * - */ - -// 9.7.8.24.3. Data Movement and Conversion Instructions: cp.async -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async - -// 9.7.8.24.4. Data Movement and Conversion Instructions: cp.async.commit_group -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group - -// 9.7.8.24.5. Data Movement and Conversion Instructions: cp.async.wait_group / cp.async.wait_all -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all - -// 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk - -// 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk - -// 9.7.8.24.8. Data Movement and Conversion Instructions: cp.async.bulk.prefetch -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch - -// 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor - -// 9.7.8.24.10. Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor - -// 9.7.8.24.11. Data Movement and Conversion Instructions: cp.async.bulk.prefetch.tensor -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor - -// 9.7.8.24.12. Data Movement and Conversion Instructions: cp.async.bulk.commit_group -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group - -// 9.7.8.24.13. Data Movement and Conversion Instructions: cp.async.bulk.wait_group -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group - -// 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace - - -/* - * 9.7.9. Texture Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions - * - */ - -// 9.7.9.3. Texture Instructions: tex -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex - -// 9.7.9.4. 
Texture Instructions: tld4 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tld4 - -// 9.7.9.5. Texture Instructions: txq -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-txq - -// 9.7.9.6. Texture Instructions: istypep -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-istypep - - -/* - * 9.7.10. Surface Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions - * - */ - -// 9.7.10.1. Surface Instructions: suld -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suld - -// 9.7.10.2. Surface Instructions: sust -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust - -// 9.7.10.3. Surface Instructions: sured -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sured - -// 9.7.10.4. Surface Instructions: suq -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suq - - -/* - * 9.7.11. Control Flow Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions - * - */ - -// 9.7.11.1. Control Flow Instructions: {} -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-curly-braces - -// 9.7.11.2. Control Flow Instructions: @ -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-at - -// 9.7.11.3. Control Flow Instructions: bra -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra - -// 9.7.11.4. Control Flow Instructions: brx.idx -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-brx-idx - -// 9.7.11.5. Control Flow Instructions: call -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call - -// 9.7.11.6. Control Flow Instructions: ret -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret - -// 9.7.11.7. Control Flow Instructions: exit -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit - - -/* - * 9.7.12. Parallel Synchronization and Communication Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions - * - */ - -// 9.7.12.1. Parallel Synchronization and Communication Instructions: bar, barrier -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier - -// 9.7.12.2. Parallel Synchronization and Communication Instructions: bar.warp.sync -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync - -// 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster - -// 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence - -// 9.7.12.5. 
Parallel Synchronization and Communication Instructions: atom -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom - -// 9.7.12.6. Parallel Synchronization and Communication Instructions: red -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red - -// 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async - -// 9.7.12.8. Parallel Synchronization and Communication Instructions: vote (deprecated) -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated - -// 9.7.12.9. Parallel Synchronization and Communication Instructions: vote.sync -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync - -// 9.7.12.10. Parallel Synchronization and Communication Instructions: match.sync -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync - -// 9.7.12.11. Parallel Synchronization and Communication Instructions: activemask -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask - -// 9.7.12.12. Parallel Synchronization and Communication Instructions: redux.sync -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync - -// 9.7.12.13. Parallel Synchronization and Communication Instructions: griddepcontrol -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol - -// 9.7.12.14. Parallel Synchronization and Communication Instructions: elect.sync -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync - -/* - * 9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier - * - * Contained in: __cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h - */ - -// 9.7.12.15.18. Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy - - -/* - * 9.7.13. Warp Level Matrix Multiply-Accumulate Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-accumulate-instructions - * - */ - -// 9.7.13.3.3. Warp-level Matrix Load Instruction: wmma.load -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-wmma-load - -// 9.7.13.3.4. Warp-level Matrix Store Instruction: wmma.store -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-wmma-store - -// 9.7.13.3.5. 
Warp-level Matrix Multiply-and-Accumulate Instruction: wmma.mma -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-and-accumulate-instruction-wmma-mma - -// 9.7.13.4.14. Multiply-and-Accumulate Instruction: mma -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma - -// 9.7.13.4.15. Warp-level matrix load instruction: ldmatrix -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-ldmatrix - -// 9.7.13.4.16. Warp-level matrix store instruction: stmatrix -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix - -// 9.7.13.4.17. Warp-level matrix transpose instruction: movmatrix -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-transpose-instruction-movmatrix - -// 9.7.13.5.3. Multiply-and-Accumulate Instruction: mma.sp -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma-sp - - -/* - * 9.7.14. Asynchronous Warpgroup Level Matrix Multiply-Accumulate Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-multiply-accumulate-instructions - * - */ - -// 9.7.14.5.2. Asynchronous Multiply-and-Accumulate Instruction: wgmma.mma_async -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async - -// 9.7.14.6.4. Asynchronous Multiply-and-Accumulate Instruction: wgmma.mma_async.sp -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async-sp - -// 9.7.14.7.1. Asynchronous Multiply-and-Accumulate Instruction: wgmma.fence -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-fence - -// 9.7.14.7.2. Asynchronous Multiply-and-Accumulate Instruction: wgmma.commit_group -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-commit-group - -// 9.7.14.7.3. Asynchronous Multiply-and-Accumulate Instruction: wgmma.wait_group -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-wait-group - - -/* - * 9.7.15. Stack Manipulation Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions - * - */ - -// 9.7.15.1. Stack Manipulation Instructions: stacksave -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stacksave - -// 9.7.15.2. Stack Manipulation Instructions: stackrestore -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stackrestore - -// 9.7.15.3. Stack Manipulation Instructions: alloca -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca - - -/* - * 9.7.16. Video Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#video-instructions - * - */ - -// 9.7.16.1.1. Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax - -// 9.7.16.1.2. 
Scalar Video Instructions: vshl, vshr -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr - -// 9.7.16.1.3. Scalar Video Instructions: vmad -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vmad - -// 9.7.16.1.4. Scalar Video Instructions: vset -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vset - - -/* - * 9.7.16.2. SIMD Video Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions - * - */ - -// 9.7.16.2.1. SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2 - -// 9.7.16.2.2. SIMD Video Instructions: vset2 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset2 - -// 9.7.16.2.3. SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4 - -// 9.7.16.2.4. SIMD Video Instructions: vset4 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset4 - - -/* - * 9.7.17. Miscellaneous Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions - * - */ - -// 9.7.17.1. Miscellaneous Instructions: brkpt -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-brkpt - -// 9.7.17.2. Miscellaneous Instructions: nanosleep -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep - -// 9.7.17.3. Miscellaneous Instructions: pmevent -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent - -// 9.7.17.4. Miscellaneous Instructions: trap -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap - -// 9.7.17.5. Miscellaneous Instructions: setmaxnreg -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg - -_LIBCUDACXX_END_NAMESPACE_CUDA_PTX +#include "std/detail/__pragma_pop" #endif // _CUDA_PTX diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h new file mode 100644 index 00000000000..efa73e15f09 --- /dev/null +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -0,0 +1,711 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCUDACXX___CUDA_PTX_H
+#define _LIBCUDACXX___CUDA_PTX_H
+
+#include "../cstdint" // uint32_t
+#include "../type_traits" // std::integral_constant
+#include "../../../../../../nv/target" // __CUDA_MINIMUM_ARCH__ and friends
+
+// The following includes depend on the includes above:
+#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h"
+#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h"
+#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h"
+#include "cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h"
+
+/*
+ * The cuda::ptx namespace intends to provide PTX wrappers for new hardware
+ * features and new PTX instructions so that they can be experimented with
+ * before higher-level C++ APIs are designed and developed.
+ *
+ * The wrappers have the following responsibilities:
+ *
+ * - They must prevent any PTX assembler errors, that is:
+ *   - They are defined only for versions of the CUDA Toolkit in which
+ *     nvcc/ptxas actually recognizes the instruction.
+ *   - Sizes and types of parameters are correct.
+ * - They must convert state spaces correctly.
+ * - They adhere to the libcu++ coding standards of using:
+ *   - Reserved identifiers for all parameters and variables, e.g. `__meow`
+ *     or `_Woof`
+ *   - _CUDA_VSTD:: namespace for types
+ *
+ * The wrappers should not do the following:
+ *
+ * - Use any non-native types. For example, an mbarrier instruction wrapper
+ *   takes the barrier address as a uint64_t pointer.
+ *
+ * This header is intended for:
+ *
+ * - internal consumption by higher-level APIs such as cuda::barrier,
+ * - outside developers who want to experiment with the latest features of
+ *   the hardware.
+ *
+ * Stability:
+ *
+ * - These headers are intended to present a stable API (not ABI) within one
+ *   major version of the CTK. This means that:
+ *   - All functions are marked inline.
+ *   - The type of a function parameter can be changed to be more generic as
+ *     long as code that called the original version can still be compiled.
+ *
+ * - Good exposure of the PTX should be a high priority. If, at a new major
+ *   version, we face a difficult choice between breaking backward
+ *   compatibility and improving the PTX exposure, we will lean toward the
+ *   latter more readily than in other parts of libcu++.
+ */
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+/*
+ * Instructions
+ *
+ * The organization of the instructions below follows that of the PTX ISA documentation:
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#instructions
+ *
+ * To improve code organization, some sections are separated into their own
+ * header. For instance, the mbarrier instructions are found in:
+ * __cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
+ *
+ */
+
+/*
+ * 9.7.1. Integer Arithmetic Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions
+ *
+ */
+
+// 9.7.1.7. Integer Arithmetic Instructions: sad
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sad
+
+// 9.7.1.8. Integer Arithmetic Instructions: div
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-div
+
+// 9.7.1.9.
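
Before the table of contents continues, a minimal sketch of the shape such a wrapper takes, for concreteness. It is illustrative rather than this patch's actual code: the function name is hypothetical, and the real wrappers additionally take the .sem/.scope/.space tag types as parameters and spell identifiers with reserved names, per the coding standards above. It shows the essential pattern: an inline device function over native types wrapping a single inline-asm statement, compiled only where the instruction exists (mbarrier.arrive.expect_tx assumes sm_90 and PTX ISA 8.0):

    #include <cstdint>

    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
    // Hypothetical simplified wrapper: fixed .release semantics, .cta scope,
    // and .shared::cta state space.
    __device__ inline std::uint64_t example_mbarrier_arrive_expect_tx(
        std::uint64_t* bar,      // generic pointer to an mbarrier in shared memory
        std::uint32_t tx_count)  // transaction count to expect
    {
      std::uint64_t state;
      asm volatile(
          "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
          : "=l"(state)
          : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(bar))),
            "r"(tx_count)
          : "memory");
      return state;
    }
    #endif

Everything else in this series (the dot-variant tag types, the ISA/target macros, helpers like __as_b64) exists so that a family of such asm statements can be selected safely at compile time.
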
Integer Arithmetic Instructions: rem +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-rem + +// 9.7.1.10. Integer Arithmetic Instructions: abs +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-abs + +// 9.7.1.11. Integer Arithmetic Instructions: neg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-neg + +// 9.7.1.12. Integer Arithmetic Instructions: min +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-min + +// 9.7.1.13. Integer Arithmetic Instructions: max +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-max + +// 9.7.1.14. Integer Arithmetic Instructions: popc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc + +// 9.7.1.15. Integer Arithmetic Instructions: clz +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz + +// 9.7.1.16. Integer Arithmetic Instructions: bfind +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind + +// 9.7.1.17. Integer Arithmetic Instructions: fns +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns + +// 9.7.1.18. Integer Arithmetic Instructions: brev +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev + +// 9.7.1.19. Integer Arithmetic Instructions: bfe +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe + +// 9.7.1.20. Integer Arithmetic Instructions: bfi +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi + +// 9.7.1.21. Integer Arithmetic Instructions: szext +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-szext + +// 9.7.1.22. Integer Arithmetic Instructions: bmsk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bmsk + +// 9.7.1.23. Integer Arithmetic Instructions: dp4a +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp4a + +// 9.7.1.24. Integer Arithmetic Instructions: dp2a +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp2a + + +/* + * 9.7.2. Extended-Precision Integer Arithmetic Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-integer-arithmetic-instructions + * + */ + +// 9.7.2.1. Extended-Precision Arithmetic Instructions: add.cc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-add-cc + +// 9.7.2.2. Extended-Precision Arithmetic Instructions: addc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-addc + +// 9.7.2.3. Extended-Precision Arithmetic Instructions: sub.cc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-sub-cc + +// 9.7.2.4. Extended-Precision Arithmetic Instructions: subc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-subc + +// 9.7.2.5. 
Extended-Precision Arithmetic Instructions: mad.cc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-mad-cc + +// 9.7.2.6. Extended-Precision Arithmetic Instructions: madc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc + + +/* + * 9.7.3. Floating-Point Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions + * + */ + +// 9.7.3.1. Floating Point Instructions: testp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-testp + +// 9.7.3.2. Floating Point Instructions: copysign +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign + +// 9.7.3.3. Floating Point Instructions: add +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add + +// 9.7.3.4. Floating Point Instructions: sub +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sub + +// 9.7.3.5. Floating Point Instructions: mul +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul + +// 9.7.3.6. Floating Point Instructions: fma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma + +// 9.7.3.7. Floating Point Instructions: mad +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad + +// 9.7.3.8. Floating Point Instructions: div +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div + +// 9.7.3.9. Floating Point Instructions: abs +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-abs + +// 9.7.3.10. Floating Point Instructions: neg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg + +// 9.7.3.11. Floating Point Instructions: min +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min + +// 9.7.3.12. Floating Point Instructions: max +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max + +// 9.7.3.13. Floating Point Instructions: rcp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp + +// 9.7.3.14. Floating Point Instructions: rcp.approx.ftz.f64 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp-approx-ftz-f64 + +// 9.7.3.15. Floating Point Instructions: sqrt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sqrt + +// 9.7.3.16. Floating Point Instructions: rsqrt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt + +// 9.7.3.17. Floating Point Instructions: rsqrt.approx.ftz.f64 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt-approx-ftz-f64 + +// 9.7.3.18. Floating Point Instructions: sin +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sin + +// 9.7.3.19. Floating Point Instructions: cos +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos + +// 9.7.3.20. 
Floating Point Instructions: lg2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2 + +// 9.7.3.21. Floating Point Instructions: ex2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2 + +// 9.7.3.22. Floating Point Instructions: tanh +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-tanh + + +/* + * 9.7.4. Half Precision Floating-Point Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions + * + */ + +// 9.7.4.1. Half Precision Floating Point Instructions: add +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add + +// 9.7.4.2. Half Precision Floating Point Instructions: sub +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-sub + +// 9.7.4.3. Half Precision Floating Point Instructions: mul +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-mul + +// 9.7.4.4. Half Precision Floating Point Instructions: fma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-fma + +// 9.7.4.5. Half Precision Floating Point Instructions: neg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-neg + +// 9.7.4.6. Half Precision Floating Point Instructions: abs +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs + +// 9.7.4.7. Half Precision Floating Point Instructions: min +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-min + +// 9.7.4.8. Half Precision Floating Point Instructions: max +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-max + +// 9.7.4.9. Half Precision Floating Point Instructions: tanh +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-tanh + +// 9.7.4.10. Half Precision Floating Point Instructions: ex2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-ex2 + + +/* + * 9.7.5. Comparison and Selection Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions + * + */ + +// 9.7.5.1. Comparison and Selection Instructions: set +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-set + +// 9.7.5.2. Comparison and Selection Instructions: setp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-setp + +// 9.7.5.3. Comparison and Selection Instructions: selp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp + +// 9.7.5.4. Comparison and Selection Instructions: slct +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-slct + + +/* + * 9.7.6. Half Precision Comparison Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions + * + */ + +// 9.7.6.1. 
Half Precision Comparison Instructions: set +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-set + +// 9.7.6.2. Half Precision Comparison Instructions: setp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-setp + + +/* + * 9.7.7. Logic and Shift Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions + * + */ + +// 9.7.7.1. Logic and Shift Instructions: and +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and + +// 9.7.7.2. Logic and Shift Instructions: or +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or + +// 9.7.7.3. Logic and Shift Instructions: xor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-xor + +// 9.7.7.4. Logic and Shift Instructions: not +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not + +// 9.7.7.5. Logic and Shift Instructions: cnot +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-cnot + +// 9.7.7.6. Logic and Shift Instructions: lop3 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3 + +// 9.7.7.7. Logic and Shift Instructions: shf +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shf + +// 9.7.7.8. Logic and Shift Instructions: shl +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shl + +// 9.7.7.9. Logic and Shift Instructions: shr +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shr + + +/* + * 9.7.8. Data Movement and Conversion Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions + * + */ + +// 9.7.8.3. Data Movement and Conversion Instructions: mov +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov + +// 9.7.8.4. Data Movement and Conversion Instructions: mov +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2 + +// 9.7.8.5. Data Movement and Conversion Instructions: shfl (deprecated) +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated + +// 9.7.8.6. Data Movement and Conversion Instructions: shfl.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync + +// 9.7.8.7. Data Movement and Conversion Instructions: prmt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt + +// 9.7.8.8. Data Movement and Conversion Instructions: ld +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld + +// 9.7.8.9. Data Movement and Conversion Instructions: ld.global.nc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc + +// 9.7.8.10. Data Movement and Conversion Instructions: ldu +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu + +// 9.7.8.11. 
Data Movement and Conversion Instructions: st +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st + +// 9.7.8.12. Data Movement and Conversion Instructions: st.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async + +// 9.7.8.13. Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red + +// 9.7.8.14. Data Movement and Conversion Instructions: prefetch, prefetchu +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu + +// 9.7.8.15. Data Movement and Conversion Instructions: applypriority +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority + +// 9.7.8.16. Data Movement and Conversion Instructions: discard +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-discard + +// 9.7.8.17. Data Movement and Conversion Instructions: createpolicy +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-createpolicy + +// 9.7.8.18. Data Movement and Conversion Instructions: isspacep +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep + +// 9.7.8.19. Data Movement and Conversion Instructions: cvta +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta + +// 9.7.8.20. Data Movement and Conversion Instructions: cvt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt + +// 9.7.8.21. Data Movement and Conversion Instructions: cvt.pack +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack + +// 9.7.8.22. Data Movement and Conversion Instructions: mapa +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa + +// 9.7.8.23. Data Movement and Conversion Instructions: getctarank +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank + + +/* + * 9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy + * + */ + +// 9.7.8.24.3. Data Movement and Conversion Instructions: cp.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async + +// 9.7.8.24.4. Data Movement and Conversion Instructions: cp.async.commit_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group + +// 9.7.8.24.5. Data Movement and Conversion Instructions: cp.async.wait_group / cp.async.wait_all +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all + +// 9.7.8.24.6. 
Data Movement and Conversion Instructions: cp.async.bulk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk + +// 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk + +// 9.7.8.24.8. Data Movement and Conversion Instructions: cp.async.bulk.prefetch +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch + +// 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor + +// 9.7.8.24.10. Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor + +// 9.7.8.24.11. Data Movement and Conversion Instructions: cp.async.bulk.prefetch.tensor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor + +// 9.7.8.24.12. Data Movement and Conversion Instructions: cp.async.bulk.commit_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group + +// 9.7.8.24.13. Data Movement and Conversion Instructions: cp.async.bulk.wait_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group + +// 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace + + +/* + * 9.7.9. Texture Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions + * + */ + +// 9.7.9.3. Texture Instructions: tex +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex + +// 9.7.9.4. Texture Instructions: tld4 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tld4 + +// 9.7.9.5. Texture Instructions: txq +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-txq + +// 9.7.9.6. Texture Instructions: istypep +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-istypep + + +/* + * 9.7.10. Surface Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions + * + */ + +// 9.7.10.1. Surface Instructions: suld +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suld + +// 9.7.10.2. Surface Instructions: sust +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust + +// 9.7.10.3. Surface Instructions: sured +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sured + +// 9.7.10.4. Surface Instructions: suq +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suq + + +/* + * 9.7.11. Control Flow Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions + * + */ + +// 9.7.11.1. 
Control Flow Instructions: {} +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-curly-braces + +// 9.7.11.2. Control Flow Instructions: @ +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-at + +// 9.7.11.3. Control Flow Instructions: bra +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra + +// 9.7.11.4. Control Flow Instructions: brx.idx +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-brx-idx + +// 9.7.11.5. Control Flow Instructions: call +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call + +// 9.7.11.6. Control Flow Instructions: ret +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret + +// 9.7.11.7. Control Flow Instructions: exit +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit + + +/* + * 9.7.12. Parallel Synchronization and Communication Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions + * + */ + +// 9.7.12.1. Parallel Synchronization and Communication Instructions: bar, barrier +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier + +// 9.7.12.2. Parallel Synchronization and Communication Instructions: bar.warp.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync + +// 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster + +// 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence + +// 9.7.12.5. Parallel Synchronization and Communication Instructions: atom +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom + +// 9.7.12.6. Parallel Synchronization and Communication Instructions: red +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red + +// 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async + +// 9.7.12.8. Parallel Synchronization and Communication Instructions: vote (deprecated) +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated + +// 9.7.12.9. Parallel Synchronization and Communication Instructions: vote.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync + +// 9.7.12.10. Parallel Synchronization and Communication Instructions: match.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync + +// 9.7.12.11. 
Parallel Synchronization and Communication Instructions: activemask +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask + +// 9.7.12.12. Parallel Synchronization and Communication Instructions: redux.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync + +// 9.7.12.13. Parallel Synchronization and Communication Instructions: griddepcontrol +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol + +// 9.7.12.14. Parallel Synchronization and Communication Instructions: elect.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync + +/* + * 9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier + * + * Contained in: __cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h + */ + +// 9.7.12.15.18. Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy + + +/* + * 9.7.13. Warp Level Matrix Multiply-Accumulate Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-accumulate-instructions + * + */ + +// 9.7.13.3.3. Warp-level Matrix Load Instruction: wmma.load +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-wmma-load + +// 9.7.13.3.4. Warp-level Matrix Store Instruction: wmma.store +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-wmma-store + +// 9.7.13.3.5. Warp-level Matrix Multiply-and-Accumulate Instruction: wmma.mma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-and-accumulate-instruction-wmma-mma + +// 9.7.13.4.14. Multiply-and-Accumulate Instruction: mma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma + +// 9.7.13.4.15. Warp-level matrix load instruction: ldmatrix +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-ldmatrix + +// 9.7.13.4.16. Warp-level matrix store instruction: stmatrix +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix + +// 9.7.13.4.17. Warp-level matrix transpose instruction: movmatrix +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-transpose-instruction-movmatrix + +// 9.7.13.5.3. Multiply-and-Accumulate Instruction: mma.sp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma-sp + + +/* + * 9.7.14. Asynchronous Warpgroup Level Matrix Multiply-Accumulate Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-multiply-accumulate-instructions + * + */ + +// 9.7.14.5.2. 
Asynchronous Multiply-and-Accumulate Instruction: wgmma.mma_async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async + +// 9.7.14.6.4. Asynchronous Multiply-and-Accumulate Instruction: wgmma.mma_async.sp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async-sp + +// 9.7.14.7.1. Asynchronous Multiply-and-Accumulate Instruction: wgmma.fence +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-fence + +// 9.7.14.7.2. Asynchronous Multiply-and-Accumulate Instruction: wgmma.commit_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-commit-group + +// 9.7.14.7.3. Asynchronous Multiply-and-Accumulate Instruction: wgmma.wait_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-wait-group + + +/* + * 9.7.15. Stack Manipulation Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions + * + */ + +// 9.7.15.1. Stack Manipulation Instructions: stacksave +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stacksave + +// 9.7.15.2. Stack Manipulation Instructions: stackrestore +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stackrestore + +// 9.7.15.3. Stack Manipulation Instructions: alloca +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca + + +/* + * 9.7.16. Video Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#video-instructions + * + */ + +// 9.7.16.1.1. Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax + +// 9.7.16.1.2. Scalar Video Instructions: vshl, vshr +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr + +// 9.7.16.1.3. Scalar Video Instructions: vmad +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vmad + +// 9.7.16.1.4. Scalar Video Instructions: vset +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vset + + +/* + * 9.7.16.2. SIMD Video Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions + * + */ + +// 9.7.16.2.1. SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2 + +// 9.7.16.2.2. SIMD Video Instructions: vset2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset2 + +// 9.7.16.2.3. SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4 + +// 9.7.16.2.4. SIMD Video Instructions: vset4 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset4 + + +/* + * 9.7.17. 
Miscellaneous Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions + * + */ + +// 9.7.17.1. Miscellaneous Instructions: brkpt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-brkpt + +// 9.7.17.2. Miscellaneous Instructions: nanosleep +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep + +// 9.7.17.3. Miscellaneous Instructions: pmevent +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent + +// 9.7.17.4. Miscellaneous Instructions: trap +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap + +// 9.7.17.5. Miscellaneous Instructions: setmaxnreg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _LIBCUDACXX___CUDA_PTX_H From 42710f931ee6e3c7da02bafd04978aa08fdf8c7f Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 25 Oct 2023 15:22:00 +0200 Subject: [PATCH 18/49] Rename include guards --- ...ynchronization_and_communication_instructions_mbarrier.h | 6 +++--- .../std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h | 6 +++--- .../detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h | 6 +++--- .../libcxx/include/__cuda/ptx/ptx_isa_target_macros.h | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 2bcfdc4bd33..ca5f9f09f42 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -10,8 +10,8 @@ //===----------------------------------------------------------------------===// -#ifndef PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ -#define PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ +#ifndef _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ +#define _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX @@ -112,4 +112,4 @@ _LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( _LIBCUDACXX_END_NAMESPACE_CUDA_PTX -#endif // PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ +#endif // _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h index c91a2512847..13f48692e4f 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h @@ -10,8 +10,8 @@ //===----------------------------------------------------------------------===// -#ifndef PTX_DOT_VARIANTS_H_ -#define PTX_DOT_VARIANTS_H_ +#ifndef _CUDA_PTX_DOT_VARIANTS_H_ +#define _CUDA_PTX_DOT_VARIANTS_H_ /* * Public integral constant types and values for ".variant"s: @@ -136,4 +136,4 @@ static constexpr scope_sys_t 
scope_sys{}; _LIBCUDACXX_END_NAMESPACE_CUDA_PTX -#endif // PTX_DOT_VARIANTS_H_ +#endif // _CUDA_PTX_DOT_VARIANTS_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h index 02ac1370d3d..6f2f8aa3060 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h @@ -9,8 +9,8 @@ // //===----------------------------------------------------------------------===// -#ifndef PTX_HELPER_FUNCTIONS_H_ -#define PTX_HELPER_FUNCTIONS_H_ +#ifndef _CUDA_PTX_HELPER_FUNCTIONS_H_ +#define _CUDA_PTX_HELPER_FUNCTIONS_H_ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX @@ -44,4 +44,4 @@ inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val) _LIBCUDACXX_END_NAMESPACE_CUDA_PTX -#endif // PTX_HELPER_FUNCTIONS_H_ +#endif // _CUDA_PTX_HELPER_FUNCTIONS_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h index e0306dbf627..a1f88e3423f 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h @@ -10,8 +10,8 @@ //===----------------------------------------------------------------------===// -#ifndef PTX_ISA_TARGET_MACROS_H_ -#define PTX_ISA_TARGET_MACROS_H_ +#ifndef _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ +#define _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ /* @@ -59,4 +59,4 @@ # define __cccl_ptx_isa 0ULL #endif -#endif // PTX_ISA_TARGET_MACROS_H_ +#endif // _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ From 4144d43f22c4bd875bbc3093cb8a29f87f2a8fb1 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 25 Oct 2023 15:26:27 +0200 Subject: [PATCH 19/49] Fix missing includes --- .../include/cuda/std/detail/libcxx/include/__cuda/ptx.h | 2 -- ...synchronization_and_communication_instructions_mbarrier.h | 5 +++++ .../std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h | 2 ++ .../detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h | 4 ++++ .../detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h | 1 + 5 files changed, 12 insertions(+), 2 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index efa73e15f09..9c8a33c18dd 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -13,10 +13,8 @@ #define _LIBCUDACXX___CUDA_PTX_H #include "../cstdint" // uint32_t -#include "../type_traits" // std::integral_constant #include "../../../../../../nv/target" // __CUDA_MINIMUM_ARCH__ and friends -// The following includes depend on the includes above: #include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h" #include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h" #include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h" diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index ca5f9f09f42..47ad09f5d66 100644 --- 
a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
@@ -13,6 +13,11 @@
 #ifndef _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_
 #define _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_
 
+#include "ptx_dot_variants.h"
+#include "ptx_helper_functions.h"
+#include "ptx_isa_target_macros.h"
+#include "../../cstdint"
+
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 
 /*
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
index 13f48692e4f..3ed1fca8bd2 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
@@ -13,6 +13,8 @@
 #ifndef _CUDA_PTX_DOT_VARIANTS_H_
 #define _CUDA_PTX_DOT_VARIANTS_H_
 
+#include "../../type_traits" // std::integral_constant
+
 /*
  * Public integral constant types and values for ".variant"s:
  *
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
index 6f2f8aa3060..f2bf91615e2 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
@@ -12,6 +12,8 @@
 #ifndef _CUDA_PTX_HELPER_FUNCTIONS_H_
 #define _CUDA_PTX_HELPER_FUNCTIONS_H_
 
+#include "../../cstdint" // uint32_t
+
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 
 // Private helper functions
@@ -32,6 +34,7 @@ template <typename _Tp>
 inline _LIBCUDACXX_DEVICE int __as_b32(_Tp __val)
 {
   static_assert(sizeof(_Tp) == 4, "");
+  // Consider using std::bit_cast
   return *reinterpret_cast<int*>(&__val);
 }
 
@@ -39,6 +42,7 @@ template <typename _Tp>
 inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val)
 {
   static_assert(sizeof(_Tp) == 8, "");
+  // Consider using std::bit_cast
   return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val);
 }
 
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
index a1f88e3423f..4e996a02eb4 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
@@ -13,6 +13,7 @@
 #ifndef _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
 #define _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
 
+#include "../../../../../../../nv/target" // __CUDA_MINIMUM_ARCH__ and friends
 
 /*
  * Targeting macros

From 8a609cd8f1783eede2c12fe2f841b0338b5dc508 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 15:28:00 +0200
Subject: [PATCH 20/49] Remove redundant comment

---
 .../std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
index f2bf91615e2..731c07aee12 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
@@ -16,7 +16,6 @@
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX -// Private helper functions inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_smem_ptr(const void* __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); From 6953ea0027e3899df78b301387eb2443171a69cd Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 25 Oct 2023 15:32:35 +0200 Subject: [PATCH 21/49] Rename __as_smem_ptr -> __as_ptr_smem for disambiguation --- ...ization_and_communication_instructions_mbarrier.h | 8 ++++---- .../libcxx/include/__cuda/ptx/ptx_helper_functions.h | 12 +++++++++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 47ad09f5d66..66c10cfb762 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -58,14 +58,14 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( asm ( "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;" : "=l"(__token) - : "r"(__as_smem_ptr(__addr)), + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) : "memory"); } else { asm ( "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;" : "=l"(__token) - : "r"(__as_smem_ptr(__addr)), + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) : "memory"); } @@ -88,14 +88,14 @@ _LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( asm ( "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;" : - : "r"(__as_smem_ptr(__addr)), + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__tx_count) : "memory"); } else { asm ( "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;" : - : "r"(__as_smem_ptr(__addr)), + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__tx_count) : "memory"); } diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h index 731c07aee12..2d80f18b746 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h @@ -16,16 +16,22 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX -inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_smem_ptr(const void* __ptr) +inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_ptr_smem(const void* __ptr) { + // Consider adding debug asserts here. return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); } -inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void* __ptr) + +inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_ptr_remote_dsmem(const void* __ptr) { + // No difference in implementation to __as_ptr_smem. + // Consider adding debug asserts here. return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); } -inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_gmem_ptr(const void* __ptr) + +inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_ptr_gmem(const void* __ptr) { + // Consider adding debug asserts here. 
 return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr));
 }
 
From eae5df6f754f712d262f14851c86535352ff6571 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 15:36:07 +0200
Subject: [PATCH 22/49] Use uint32_t

---
 .../detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
index 2d80f18b746..41826081a54 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
@@ -36,11 +36,11 @@ inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_ptr_gmem(const void* __ptr)
 }
 
 template <typename _Tp>
-inline _LIBCUDACXX_DEVICE int __as_b32(_Tp __val)
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_b32(_Tp __val)
 {
   static_assert(sizeof(_Tp) == 4, "");
   // Consider using std::bit_cast
-  return *reinterpret_cast<int*>(&__val);
+  return *reinterpret_cast<_CUDA_VSTD::uint32_t*>(&__val);
 }
 
 template <typename _Tp>
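(The "Consider using std::bit_cast" comments refer to the C++20 facility, spelled bit_cast. A minimal sketch of what such a helper could look like, assuming cuda::std::bit_cast from <cuda/std/bit> is available; the name as_b32 is a stand-in for the library's reserved __as_b32, not part of the patch:)

#include <cuda/std/bit>     // cuda::std::bit_cast
#include <cuda/std/cstdint> // cuda::std::uint32_t

// Hypothetical bit_cast-based variant: same size check as __as_b32, but
// without type punning through reinterpret_cast.
template <typename T>
__device__ inline cuda::std::uint32_t as_b32(T val)
{
  static_assert(sizeof(T) == 4, "as_b32 requires a 4-byte type");
  return cuda::std::bit_cast<cuda::std::uint32_t>(val);
}
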
From eda6d9335e225377233307d8a218d1c64044a93a Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 15:59:12 +0200
Subject: [PATCH 23/49] Update
 libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h

Co-authored-by: Michael Schellenberger Costa
---
 .../include/cuda/std/detail/libcxx/include/__cuda/barrier.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
index f5a65400d1e..c4bba0222dd 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
@@ -27,7 +27,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER
 #include "../cstdlib"        // _LIBCUDACXX_UNREACHABLE
 #include "../__type_traits/void_t.h" // _CUDA_VSTD::__void_t
 
-#include <cuda/ptx> // cuda::ptx::*
+#include "../__cuda/ptx.h" // cuda::ptx::*
 
 #if defined(_LIBCUDACXX_COMPILER_NVRTC)
 #define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type *)0)->member))

From f262f6cc9f07dc1108b1ac23adb7b47cbc92fefb Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 17:43:01 +0200
Subject: [PATCH 24/49] Apply suggestions from code review

Co-authored-by: Michael Schellenberger Costa
---
 libcudacxx/include/cuda/std/detail/libcxx/include/__config | 1 +
 .../std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config
index f5f24aa95fb..bc0398eddf6 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config
@@ -1507,6 +1507,7 @@ typedef __char32_t char32_t;
 #define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE } } }
 #define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX namespace cuda { namespace ptx { inline namespace _LIBCUDACXX_ABI_NAMESPACE {
 #define _LIBCUDACXX_END_NAMESPACE_CUDA_PTX } } }
+#define _CUDA_VPTX ::cuda::ptx::_LIBCUDACXX_ABI_NAMESPACE
 #define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL namespace cuda { namespace device { namespace experimental { inline namespace _LIBCUDACXX_ABI_NAMESPACE {
 #define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL } } } }
 #endif
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
index 3ed1fca8bd2..433254e1e1b 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
@@ -13,7 +13,7 @@
 #ifndef _CUDA_PTX_DOT_VARIANTS_H_
 #define _CUDA_PTX_DOT_VARIANTS_H_
 
-#include "../../type_traits" // std::integral_constant
+#include "../../__type_traits/integral_constant.h" // std::integral_constant
 
 /*
  * Public integral constant types and values for ".variant"s:

From 5701b9f78c0fc586ce4bb023474e6d06d4b960bd Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 17:35:11 +0200
Subject: [PATCH 25/49] Use <nv/target>

---
 .../detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
index 4e996a02eb4..c53ee6a9679 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
@@ -13,7 +13,7 @@
 #ifndef _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
 #define _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
 
-#include "../../../../../../../nv/target" // __CUDA_MINIMUM_ARCH__ and friends
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
 
 /*
  * Targeting macros

From 7a54b1996ae82830b642154246cbf25900627e90 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 17:41:09 +0200
Subject: [PATCH 26/49] Reorder PTX ISA target macros

---
 .../libcxx/include/__cuda/ptx/ptx_isa_target_macros.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
index c53ee6a9679..592cc5f96e6 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
@@ -40,18 +40,15 @@
 // PTX ISA 8.3 is available from CTK 12.3, driver r545
 #if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 830ULL
-// PTX ISA 8.1 is available from CTK 12.1, driver r530
+// PTX ISA 8.2 is available from CTK 12.2, driver r535
 #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 820ULL
-// PTX ISA 8.2 is available from CTK 12.2, driver r535
+// PTX ISA 8.1 is available from CTK 12.1, driver r530
 #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 810ULL
 // PTX ISA 8.0 is available from CTK 12.0, driver r525
 #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 800ULL
-// PTX ISA 7.8 is available from CTK 11.8, driver r520 (so also from CTK 12.0 onwards)
-#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
-# define 
__cccl_ptx_isa 780ULL // PTX ISA 7.8 is available from CTK 11.8, driver r520 #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 780ULL From d4ec10f1cd729b4d5f3eefcca35df388ee0e5b13 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 25 Oct 2023 20:06:51 +0200 Subject: [PATCH 27/49] Add .op --- .../include/__cuda/ptx/ptx_dot_variants.h | 57 ++++++++++++++++--- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h index 433254e1e1b..aca4eac097e 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h @@ -18,9 +18,10 @@ /* * Public integral constant types and values for ".variant"s: * - * - .sem - * - .space - * - .scope + * - .sem: acquire, release, .. + * - .space: global, shared, constant, .. + * - .scope: cta, cluster, gpu, .. + * - .op: add, min, cas, .. * * For each .variant, the code below defines: * - An enum `dot_variant` with each possible value @@ -86,8 +87,22 @@ enum class dot_scope sys }; -template -using sem_t = _CUDA_VSTD::integral_constant; +enum class dot_op +{ + add, + dec, + inc, + max, + min, + and_op, // Using and_op, as `and, or, xor` are reserved in C++. + or_op, + xor_op, + cas, + exch +}; + +template +using sem_t = _CUDA_VSTD::integral_constant; using sem_acq_rel_t = sem_t; using sem_acquire_t = sem_t; using sem_relaxed_t = sem_t; @@ -102,8 +117,8 @@ static constexpr sem_release_t sem_release{}; static constexpr sem_sc_t sem_sc{}; static constexpr sem_weak_t sem_weak{}; -template -using space_t = _CUDA_VSTD::integral_constant; +template +using space_t = _CUDA_VSTD::integral_constant; using space_const_mem_t = space_t; using space_global_t = space_t; using space_local_t = space_t; @@ -124,8 +139,8 @@ static constexpr space_shared_cluster_t space_shared_cluster{}; static constexpr space_sreg_t space_sreg{}; static constexpr space_tex_t space_tex{}; -template -using scope_t = _CUDA_VSTD::integral_constant; +template +using scope_t = _CUDA_VSTD::integral_constant; using scope_cluster_t = scope_t; using scope_cta_t = scope_t; using scope_gpu_t = scope_t; @@ -136,6 +151,30 @@ static constexpr scope_cta_t scope_cta{}; static constexpr scope_gpu_t scope_gpu{}; static constexpr scope_sys_t scope_sys{}; +template +using op_t = _CUDA_VSTD::integral_constant; +using op_add_t = op_t; +using op_dec_t = op_t; +using op_inc_t = op_t; +using op_max_t = op_t; +using op_min_t = op_t; +using op_and_op_t = op_t; +using op_or_op_t = op_t; +using op_xor_op_t = op_t; +using op_cas_t = op_t; +using op_exch_t = op_t; + +static constexpr op_add_t op_add{}; +static constexpr op_dec_t op_dec{}; +static constexpr op_inc_t op_inc{}; +static constexpr op_max_t op_max{}; +static constexpr op_min_t op_min{}; +static constexpr op_and_op_t op_and_op{}; +static constexpr op_or_op_t op_or_op{}; +static constexpr op_xor_op_t op_xor_op{}; +static constexpr op_cas_t op_cas{}; +static constexpr op_exch_t op_exch{}; + _LIBCUDACXX_END_NAMESPACE_CUDA_PTX #endif // _CUDA_PTX_DOT_VARIANTS_H_ From dd5764873f09151117d83987f51d6f1f31956aca Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 26 Oct 2023 19:34:02 +0200 Subject: [PATCH 28/49] Improve backward compatibility and docs Previous versions were using 
PTX that did not compile on CTK 11. --- .../test/cuda/ptx/sm90.ptx.compile.pass.cpp | 29 +- libcudacxx/docs/extended_api/ptx.md | 153 ++++++++-- ..._and_communication_instructions_mbarrier.h | 287 +++++++++++++++--- 3 files changed, 399 insertions(+), 70 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp index 787a9c1f327..83335510a52 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp @@ -8,7 +8,6 @@ // //===----------------------------------------------------------------------===// // UNSUPPORTED: libcpp-has-no-threads -// UNSUPPORTED: pre-sm-90 // @@ -37,11 +36,31 @@ int main(int, char**) using cuda::ptx::scope_cta; __shared__ uint64_t bar; - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); + uint64_t state; - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1); - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1); + // TODO: check PTX ISA version. + + NV_IF_TARGET(NV_PROVIDES_SM_80, ( + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 1. + state = cuda::ptx::mbarrier_arrive_no_complete(sem_release, scope_cta, space_shared, &bar, 1); // 2. + )); + + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3. + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 4. + + cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 5. + cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 5. + + state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 6. + state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 6. + + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 7. + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 7. + )); + + state += 1; // "Use" state to prevent compiler warnings + (void) state; } )); diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index feb4040d724..b90d8900a7e 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -4,6 +4,70 @@ The `cuda::ptx` namespace contains functions that map one-to-one to [PTX instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html). These can be used for maximal control of the generated code, or to experiment with new hardware features before a high-level C++ API is available. +### Variants + +### Versions and compatibility + +The `cuda/ptx` header is intended to present a stable API (not ABI) within one +major version of the CTK on a best effort basis. This means that: + +- All functions are marked inline. + +- The type of a function parameter can be changed to be more generic if + that means that code that called the original version can still be + compiled. + +- Good exposure of the PTX should be high priority. 
If, at a new major
+  version, we face a difficult choice between breaking backward-compatibility
+  and an improvement of the PTX exposure, we will lean toward the latter option
+  more readily than in other parts of libcu++.
+
+API stability is not taken to the extreme. Call functions like below to ensure
+forward-compatibility:
+
+```
+// Use arguments to drive overload resolution:
+cuda::ptx::mbarrier_arrive_expect_tx(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1);
+
+// Specifying templates directly is not forward-compatible, as order and number
+// of template parameters may change in a minor release:
+cuda::ptx::mbarrier_arrive_expect_tx<cuda::ptx::dot_scope::cta>(
+    cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1
+);
+```
+
+**PTX ISA version and compute capability.** Each binding notes under which PTX
+ISA version and SM version it may be used. Example:
+
+```cuda
+// mbarrier.arrive.shared::cta.b64 state, [addr]; // 1. PTX ISA 70, SM_80
+__device__ inline uint64_t mbarrier_arrive(
+    cuda::ptx::sem_release_t sem,
+    cuda::ptx::scope_cta_t scope,
+    cuda::ptx::space_shared_t space,
+    uint64_t* addr);
+```
+
+To check if the current compiler is recent enough, use:
+```cuda
+#if __cccl_ptx_isa >= 700
+cuda::ptx::mbarrier_arrive(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1);
+#endif
+```
+
+Ensure that you only call the function when compiling for a recent enough
+compute capability (SM version), like this:
+```
+NV_IF_TARGET(NV_PROVIDES_SM_80,(
+  cuda::ptx::mbarrier_arrive(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1);
+));
+```
+
+For more information on which compilers correspond to which PTX ISA, see the
+[PTX ISA release
+notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes).
+
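Putting the two checks together: a minimal sketch of a kernel that guards one call on both the PTX ISA version and the target architecture (an illustration only, not part of the generated listing; it assumes the four-argument `mbarrier_arrive` overload shown above and omits mbarrier initialization for brevity):

```cuda
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void kernel() {
#if __cccl_ptx_isa >= 700           // the compiler can assemble the instruction
  NV_IF_TARGET(NV_PROVIDES_SM_80,(  // the target architecture can execute it
    __shared__ cuda::std::uint64_t bar;
    // Proper mbarrier initialization is omitted to keep the sketch short.
    cuda::ptx::mbarrier_arrive(
      cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar);
  ));
#endif
}
```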
+
 
 ### [9.7.1. Integer Arithmetic Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions)
 
 | Instruction | Available in libcu++ |
@@ -380,17 +444,69 @@ experiment with new hardware features before a high-level C++ API is available.
 
 
 ```cuda
-template <dot_scope _Sco>
-__device__ inline
-uint64_t mbarrier_arrive_expect_tx(sem_release_t sem, scope_t<_Sco> scope, space_shared_t spc, uint64_t* addr, uint32_t tx_count);
-
-template <dot_scope _Sco>
-__device__ inline
-void mbarrier_arrive_expect_tx(sem_release_t sem, scope_t<_Sco> scope, space_shared_cluster_t spc, uint64_t* addr, uint32_t tx_count);
+// mbarrier.arrive.shared::cta.b64 state, [addr]; // 1. PTX ISA 70, SM_80
+__device__ inline uint64_t mbarrier_arrive(
+    cuda::ptx::sem_release_t sem,
+    cuda::ptx::scope_cta_t scope,
+    cuda::ptx::space_shared_t space,
+    uint64_t* addr);
+
+// mbarrier.arrive.noComplete.shared::cta.b64 state, [addr], count; // 2. PTX ISA 70, SM_80
+__device__ inline uint64_t mbarrier_arrive_no_complete(
+    cuda::ptx::sem_release_t sem,
+    cuda::ptx::scope_cta_t scope,
+    cuda::ptx::space_shared_t space,
+    uint64_t* addr,
+    uint32_t count);
+
+// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 3. PTX ISA 78, SM_90
+__device__ inline uint64_t mbarrier_arrive(
+    cuda::ptx::sem_release_t sem,
+    cuda::ptx::scope_cta_t scope,
+    cuda::ptx::space_shared_t space,
+    uint64_t* addr,
+    uint32_t count);
+
+// mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr], count; // 4. PTX ISA 80, SM_90
+__device__ inline uint64_t mbarrier_arrive(
+    cuda::ptx::sem_release_t sem,
+    cuda::ptx::scope_cluster_t scope,
+    cuda::ptx::space_shared_t space,
+    uint64_t* addr,
+    uint32_t count);
+
+// mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90
+// .scope = { .cta, .cluster }
+template <cuda::ptx::dot_scope Scope>
+__device__ inline void mbarrier_arrive(
+    cuda::ptx::sem_release_t sem,
+    cuda::ptx::scope_t<Scope> scope,
+    cuda::ptx::space_shared_cluster_t space,
+    uint64_t* addr,
+    uint32_t count);
+
+// mbarrier.arrive.expect_tx.release{.scope}.shared::cta.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90
+// .scope = { .cta, .cluster }
+template <cuda::ptx::dot_scope Scope>
+__device__ inline uint64_t mbarrier_arrive_expect_tx(
+    cuda::ptx::sem_release_t sem,
+    cuda::ptx::scope_t<Scope> scope,
+    cuda::ptx::space_shared_t space,
+    uint64_t* addr,
+    uint32_t tx_count);
+
+// mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90
+// .scope = { .cta, .cluster }
+template <cuda::ptx::dot_scope Scope>
+__device__ inline void mbarrier_arrive_expect_tx(
+    cuda::ptx::sem_release_t sem,
+    cuda::ptx::scope_t<Scope> scope,
+    cuda::ptx::space_shared_cluster_t space,
+    uint64_t* addr,
+    uint32_t tx_count);
 ```
 
 Usage:
-
 ```cuda
 #include <cuda/ptx>
 #include <cuda/std/cstdint>
 
@@ -522,24 +638,3 @@ __global__ void kernel() {
 [`pmevent`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent
 [`trap`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap
 [`setmaxnreg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg
-
-
-
-
-
-
-
-
-
-
-
-### Shared memory barrier (mbarrier)
-
-| Instruction                             | Compute capability | CUDA Toolkit |
-|-----------------------------------------|--------------------|--------------|
-| `cuda::ptx::mbarrier_arrive_expect_tx`  | 9.0                | CTK 12.4     |
-
-
-
-
-
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
index 66c10cfb762..c752abcf65f 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
@@ -41,67 +41,282 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 // 9.7.12.15.13. Parallel Synchronization and Communication Instructions: mbarrier.arrive
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
 
-#if __cccl_ptx_sm >= 900 && __cccl_ptx_isa >= 780
-template <dot_scope _Sco>
+// mbarrier.arrive{.sem}{.scope}{.shared{::cta}}.b64 state, [addr]{, count};
+// mbarrier.arrive{.sem}{.scope}{.shared::cluster}.b64 _, [addr] {,count}
+// mbarrier.arrive.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], txCount;
+// mbarrier.arrive.expect_tx{.sem}{.scope}{.shared::cluster}.b64 _, [addr], txCount;
+// mbarrier.arrive.noComplete{.sem}{.cta}{.shared{::cta}}.b64 state, [addr], count;
+//
+// .sem = { .release }
+// .scope = { .cta, .cluster }
+
+/*
+// mbarrier.arrive.shared::cta.b64 state, [addr]; // 1. 
PTX ISA 70, SM_80 +__device__ inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t sem, + cuda::ptx::scope_cta_t scope, + cuda::ptx::space_shared_t space, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 +_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( + sem_release_t __sem, + scope_cta_t __scope, + space_shared_t __space, + _CUDA_VSTD::uint64_t* __addr) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cta (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + + _CUDA_VSTD::uint64_t __state; + + asm ( + "mbarrier.arrive.shared::cta.b64 %0, [%1]; // 1. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); + return __state; +} +#endif // __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 +/* +// mbarrier.arrive.noComplete.shared::cta.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 +__device__ inline uint64_t mbarrier_arrive_no_complete( + cuda::ptx::sem_release_t sem, + cuda::ptx::scope_cta_t scope, + cuda::ptx::space_shared_t space, + uint64_t* addr, + uint32_t count); +*/ +#if __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 +_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( + sem_release_t __sem, + scope_cta_t __scope, + space_shared_t __space, + _CUDA_VSTD::uint64_t* __addr, + _CUDA_VSTD::uint32_t __count) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cta (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + + _CUDA_VSTD::uint64_t __state; + + asm ( + "mbarrier.arrive.noComplete.shared::cta.b64 %0, [%1], %2; // 2. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + return __state; +} +#endif // __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 +/* +// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 +__device__ inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t sem, + cuda::ptx::scope_cta_t scope, + cuda::ptx::space_shared_t space, + uint64_t* addr, + uint32_t count); +*/ +#if __cccl_ptx_isa >= 780 && __cccl_ptx_sm >= 900 +_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( + sem_release_t __sem, + scope_cta_t __scope, + space_shared_t __space, + _CUDA_VSTD::uint64_t* __addr, + _CUDA_VSTD::uint32_t __count) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cta (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + + _CUDA_VSTD::uint64_t __state; + + asm ( + "mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 3." + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + return __state; +} +#endif // __cccl_ptx_isa >= 780 && __cccl_ptx_sm >= 900 +/* +// mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr], count; // 4. 
PTX ISA 80, SM_90 +__device__ inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t sem, + cuda::ptx::scope_cluster_t scope, + cuda::ptx::space_shared_t space, + uint64_t* addr, + uint32_t count); +*/ +#if __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( + sem_release_t __sem, + scope_cluster_t __scope, + space_shared_t __space, + _CUDA_VSTD::uint64_t* __addr, + _CUDA_VSTD::uint32_t __count) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + + _CUDA_VSTD::uint64_t __state; + + asm ( + "mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 4." + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + return __state; +} +#endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +/* +// mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90 +// .scope = { .cta, .cluster } +template +__device__ inline void mbarrier_arrive( + cuda::ptx::sem_release_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_cluster_t space, + uint64_t* addr, + uint32_t count); +*/ +#if __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +template +_LIBCUDACXX_DEVICE inline void mbarrier_arrive( + sem_release_t __sem, + scope_t<_Scope> __scope, + space_shared_cluster_t __space, + _CUDA_VSTD::uint64_t* __addr, + _CUDA_VSTD::uint32_t __count) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared_cluster (due to parameter type constraint) + + + + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.release.cta.shared::cluster.b64 _, [%0], %1; // 5. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 5. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + } + +} +#endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +/* +// mbarrier.arrive.expect_tx.release{.scope}.shared::cta.b64 state, [addr], tx_count; // 6. 
PTX ISA 80, SM_90 +// .scope = { .cta, .cluster } +template +__device__ inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t space, + uint64_t* addr, + uint32_t tx_count); +*/ +#if __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +template _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( sem_release_t __sem, - scope_t<_Sco> __scope, - space_shared_t __spc, + scope_t<_Scope> __scope, + space_shared_t __space, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count) { - // Arrive on local shared memory barrier + // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - _CUDA_VSTD::uint64_t __token; + // __space == space_shared (due to parameter type constraint) + + _CUDA_VSTD::uint64_t __state; if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(__as_ptr_smem(__addr)), - "r"(__tx_count) - : "memory"); - } else { asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) + "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 6. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__tx_count) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 6. " + : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); + : "memory" + ); } - return __token; + return __state; } -#endif // __cccl_ptx_isa - -#if __cccl_ptx_sm >= 900 && __cccl_ptx_isa >= 780 -template +#endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +/* +// mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90 +// .scope = { .cta, .cluster } +template +__device__ inline void mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_cluster_t space, + uint64_t* addr, + uint32_t tx_count); +*/ +#if __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +template _LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( sem_release_t __sem, - scope_t<_Sco> __scope, - space_shared_cluster_t __spc, + scope_t<_Scope> __scope, + space_shared_cluster_t __space, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count) { - // Arrive on remote cluster barrier + // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared_cluster (due to parameter type constraint) + + + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;" - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__tx_count) - : "memory"); - } else { asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;" + "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1; // 7. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__tx_count) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 7. 
" : - : "r"(__as_ptr_remote_dsmem(__addr)), + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); + : "memory" + ); } -} -#endif // __cccl_ptx_isa +} +#endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 // 9.7.12.15.14. Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop From e1688151083a1c3bf5d1ad22dc1db7e2b10ce9ef Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 26 Oct 2023 21:44:57 +0200 Subject: [PATCH 29/49] Use cuda code-blocks for syntax highlighting --- libcudacxx/docs/extended_api/ptx.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index b90d8900a7e..64bcbb5d2e0 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -25,7 +25,7 @@ major version of the CTK on a best effort basis. This means that: API stability is not taken to the extreme. Call functions like below to ensure forward-compatibility: -``` +```cuda // Use arguments to driver overload resolution: cuda::ptx::mbarrier_arrive_expect_tx(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1); @@ -57,7 +57,7 @@ cuda::ptx::mbarrier_arrive(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::p Ensure that you only call the function when compiling for a recent enough compute capability (SM version), like this: -``` +```cuda NV_IF_TARGET(NV_PROVIDES_SM_80,( cuda::ptx::mbarrier_arrive(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1); )); From bd967f0e55eb19308c2eb242497324727ad38635 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 12:16:17 +0200 Subject: [PATCH 30/49] Use backward-compatible PTX spelling --- ..._and_communication_instructions_mbarrier.h | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index c752abcf65f..002b0990b1a 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -51,7 +51,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // .scope = { .cta, .cluster } /* -// mbarrier.arrive.shared::cta.b64 state, [addr]; // 1. PTX ISA 70, SM_80 +// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 __device__ inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, @@ -72,7 +72,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( _CUDA_VSTD::uint64_t __state; asm ( - "mbarrier.arrive.shared::cta.b64 %0, [%1]; // 1. " + "mbarrier.arrive.shared.b64 %0, [%1]; // 1. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)) : "memory" @@ -81,7 +81,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( } #endif // __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 /* -// mbarrier.arrive.noComplete.shared::cta.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 +// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 2. 
PTX ISA 70, SM_80 __device__ inline uint64_t mbarrier_arrive_no_complete( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, @@ -104,7 +104,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( _CUDA_VSTD::uint64_t __state; asm ( - "mbarrier.arrive.noComplete.shared::cta.b64 %0, [%1], %2; // 2. " + "mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 2. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -114,7 +114,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( } #endif // __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 /* -// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 +// mbarrier.arrive.shared.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 __device__ inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, @@ -137,7 +137,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( _CUDA_VSTD::uint64_t __state; asm ( - "mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 3." + "mbarrier.arrive.shared.b64 %0, [%1], %2; // 3." : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -147,7 +147,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( } #endif // __cccl_ptx_isa >= 780 && __cccl_ptx_sm >= 900 /* -// mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 +// mbarrier.arrive.release.cluster.shared.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 __device__ inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cluster_t scope, @@ -170,7 +170,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( _CUDA_VSTD::uint64_t __state; asm ( - "mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 4." + "mbarrier.arrive.release.cluster.shared.b64 %0, [%1], %2; // 4." : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -226,7 +226,7 @@ _LIBCUDACXX_DEVICE inline void mbarrier_arrive( } #endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 /* -// mbarrier.arrive.expect_tx.release{.scope}.shared::cta.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx.release{.scope}.shared.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template __device__ inline uint64_t mbarrier_arrive_expect_tx( @@ -253,7 +253,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 6. " + "mbarrier.arrive.expect_tx.release.cta.shared.b64 %0, [%1], %2; // 6. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) @@ -261,7 +261,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( ); } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 6. " + "mbarrier.arrive.expect_tx.release.cluster.shared.b64 %0, [%1], %2; // 6. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) From db33678f36502eaef4121c5812ad3a256781c021 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 12:17:29 +0200 Subject: [PATCH 31/49] Use linker-error trick to enable architecture selection All functions are now function templates. If they are instantiated on an unsupported architecture, they call a declared, but undefined function. This leads to a linker error. 
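(A minimal, self-contained sketch of the trick, with made-up names, simplified from what the patch below actually generates:)

#include <nv/target>

// Declared but intentionally never defined. If the template below is
// instantiated for an unsupported architecture, the call survives into the
// final object file and linking fails, with this descriptive name showing
// up in the error message.
__device__ void my_instruction_is_not_supported_before_SM_90__();

template <typename = void>
__device__ static inline void my_instruction()
{
  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
    asm volatile("// real PTX would go here" ::: "memory");
  ),(
    // Unsupported architecture: provoke a linker error with a readable name.
    my_instruction_is_not_supported_before_SM_90__();
  ));
}
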
In addition, functions are marked static to avoid them being linked. --- ..._and_communication_instructions_mbarrier.h | 283 +++++++++++------- 1 file changed, 170 insertions(+), 113 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 002b0990b1a..92dce26cbee 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -52,14 +52,16 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX /* // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 -__device__ inline uint64_t mbarrier_arrive( +template +__device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, cuda::ptx::space_shared_t space, uint64_t* addr); */ -#if __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 -_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( +#if __cccl_ptx_isa >= 700 +template +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( sem_release_t __sem, scope_cta_t __scope, space_shared_t __space, @@ -71,26 +73,35 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( _CUDA_VSTD::uint64_t __state; - asm ( - "mbarrier.arrive.shared.b64 %0, [%1]; // 1. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory" - ); - return __state; + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( + asm ( + "mbarrier.arrive.shared.b64 %0, [%1]; // 1. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); + )); } -#endif // __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 +#endif // __cccl_ptx_isa >= 700 + /* // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 -__device__ inline uint64_t mbarrier_arrive_no_complete( +template +__device__ static inline uint64_t mbarrier_arrive_no_complete( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, uint32_t count); */ -#if __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 -_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( +#if __cccl_ptx_isa >= 700 +template +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( sem_release_t __sem, scope_cta_t __scope, space_shared_t __space, @@ -103,27 +114,36 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( _CUDA_VSTD::uint64_t __state; - asm ( - "mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 2. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); - return __state; + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( + asm ( + "mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 2. 
" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); + )); } -#endif // __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 +#endif // __cccl_ptx_isa >= 700 + /* // mbarrier.arrive.shared.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 -__device__ inline uint64_t mbarrier_arrive( +template +__device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, uint32_t count); */ -#if __cccl_ptx_isa >= 780 && __cccl_ptx_sm >= 900 -_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( +#if __cccl_ptx_isa >= 780 +template +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( sem_release_t __sem, scope_cta_t __scope, space_shared_t __space, @@ -136,27 +156,36 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( _CUDA_VSTD::uint64_t __state; - asm ( - "mbarrier.arrive.shared.b64 %0, [%1], %2; // 3." - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); - return __state; + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "mbarrier.arrive.shared.b64 %0, [%1], %2; // 3." + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + )); } -#endif // __cccl_ptx_isa >= 780 && __cccl_ptx_sm >= 900 +#endif // __cccl_ptx_isa >= 780 + /* // mbarrier.arrive.release.cluster.shared.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 -__device__ inline uint64_t mbarrier_arrive( +template +__device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cluster_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, uint32_t count); */ -#if __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 -_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( +#if __cccl_ptx_isa >= 800 +template +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( sem_release_t __sem, scope_cluster_t __scope, space_shared_t __space, @@ -169,30 +198,37 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( _CUDA_VSTD::uint64_t __state; - asm ( - "mbarrier.arrive.release.cluster.shared.b64 %0, [%1], %2; // 4." - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); - return __state; + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "mbarrier.arrive.release.cluster.shared.b64 %0, [%1], %2; // 4." 
+ : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + )); } -#endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +#endif // __cccl_ptx_isa >= 800 + /* // mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template -__device__ inline void mbarrier_arrive( +__device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_t scope, cuda::ptx::space_shared_cluster_t space, uint64_t* addr, uint32_t count); */ -#if __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +#if __cccl_ptx_isa >= 800 template -_LIBCUDACXX_DEVICE inline void mbarrier_arrive( +_LIBCUDACXX_DEVICE static inline void mbarrier_arrive( sem_release_t __sem, scope_t<_Scope> __scope, space_shared_cluster_t __space, @@ -205,40 +241,47 @@ _LIBCUDACXX_DEVICE inline void mbarrier_arrive( - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.release.cta.shared::cluster.b64 _, [%0], %1; // 5. " - : - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); - } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { - asm ( - "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 5. " - : - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); - } - + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.release.cta.shared::cluster.b64 _, [%0], %1; // 5. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 5. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + } + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __device__ void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + )); } -#endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +#endif // __cccl_ptx_isa >= 800 + /* // mbarrier.arrive.expect_tx.release{.scope}.shared.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template -__device__ inline uint64_t mbarrier_arrive_expect_tx( +__device__ static inline uint64_t mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t sem, cuda::ptx::scope_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, uint32_t tx_count); */ -#if __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +#if __cccl_ptx_isa >= 800 template -_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( sem_release_t __sem, scope_t<_Scope> __scope, space_shared_t __space, @@ -251,40 +294,47 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( _CUDA_VSTD::uint64_t __state; - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.expect_tx.release.cta.shared.b64 %0, [%1], %2; // 6. 
" - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), - "r"(__tx_count) - : "memory" - ); - } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { - asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared.b64 %0, [%1], %2; // 6. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), - "r"(__tx_count) - : "memory" - ); - } - return __state; + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.expect_tx.release.cta.shared.b64 %0, [%1], %2; // 6. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__tx_count) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.expect_tx.release.cluster.shared.b64 %0, [%1], %2; // 6. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__tx_count) + : "memory" + ); + } + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); + )); } -#endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +#endif // __cccl_ptx_isa >= 800 + /* // mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template -__device__ inline void mbarrier_arrive_expect_tx( +__device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t sem, cuda::ptx::scope_t scope, cuda::ptx::space_shared_cluster_t space, uint64_t* addr, uint32_t tx_count); */ -#if __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +#if __cccl_ptx_isa >= 800 template -_LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( +_LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( sem_release_t __sem, scope_t<_Scope> __scope, space_shared_cluster_t __space, @@ -297,26 +347,33 @@ _LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1; // 7. " - : - : "r"(__as_ptr_smem(__addr)), - "r"(__tx_count) - : "memory" - ); - } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { - asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 7. " - : - : "r"(__as_ptr_smem(__addr)), - "r"(__tx_count) - : "memory" - ); - } - + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1; // 7. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__tx_count) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 7. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__tx_count) + : "memory" + ); + } + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __device__ void __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); + return __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); + )); } -#endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +#endif // __cccl_ptx_isa >= 800 + // 9.7.12.15.14. 
Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop From 6a1b36e3dc73a5cc1dbdc7eff66c2407c1ad2719 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 12:32:53 +0200 Subject: [PATCH 32/49] Use const references --- libcudacxx/docs/extended_api/ptx.md | 46 ++++++++++--------- ..._and_communication_instructions_mbarrier.h | 24 +++++----- 2 files changed, 37 insertions(+), 33 deletions(-) diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index 64bcbb5d2e0..349149eaf42 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -444,66 +444,70 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release ```cuda -// mbarrier.arrive.shared::cta.b64 state, [addr]; // 1. PTX ISA 70, SM_80 -__device__ inline uint64_t mbarrier_arrive( +// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, cuda::ptx::space_shared_t space, uint64_t* addr); -// mbarrier.arrive.noComplete.shared::cta.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 -__device__ inline uint64_t mbarrier_arrive_no_complete( +// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive_no_complete( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, - uint32_t count); + const uint32_t& count); -// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 -__device__ inline uint64_t mbarrier_arrive( +// mbarrier.arrive.shared.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 +template +__device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, - uint32_t count); - -// mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 -__device__ inline uint64_t mbarrier_arrive( + const uint32_t& count); + +// mbarrier.arrive.release.cluster.shared.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 +template +__device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cluster_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, - uint32_t count); + const uint32_t& count); -// mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90 +// mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template -__device__ inline void mbarrier_arrive( +__device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_t scope, cuda::ptx::space_shared_cluster_t space, uint64_t* addr, - uint32_t count); + const uint32_t& count); -// mbarrier.arrive.expect_tx.release{.scope}.shared::cta.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx.release{.scope}.shared.b64 state, [addr], tx_count; // 6. 
PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template -__device__ inline uint64_t mbarrier_arrive_expect_tx( +__device__ static inline uint64_t mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t sem, cuda::ptx::scope_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, - uint32_t tx_count); + const uint32_t& tx_count); -// mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template -__device__ inline void mbarrier_arrive_expect_tx( +__device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t sem, cuda::ptx::scope_t scope, cuda::ptx::space_shared_cluster_t space, uint64_t* addr, - uint32_t tx_count); + const uint32_t& tx_count); ``` Usage: diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 92dce26cbee..7313bff28ce 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -97,7 +97,7 @@ __device__ static inline uint64_t mbarrier_arrive_no_complete( cuda::ptx::scope_cta_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, - uint32_t count); + const uint32_t& count); */ #if __cccl_ptx_isa >= 700 template @@ -106,7 +106,7 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complet scope_cta_t __scope, space_shared_t __space, _CUDA_VSTD::uint64_t* __addr, - _CUDA_VSTD::uint32_t __count) + const _CUDA_VSTD::uint32_t& __count) { // __sem == sem_release (due to parameter type constraint) // __scope == scope_cta (due to parameter type constraint) @@ -139,7 +139,7 @@ __device__ static inline uint64_t mbarrier_arrive( cuda::ptx::scope_cta_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, - uint32_t count); + const uint32_t& count); */ #if __cccl_ptx_isa >= 780 template @@ -148,7 +148,7 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( scope_cta_t __scope, space_shared_t __space, _CUDA_VSTD::uint64_t* __addr, - _CUDA_VSTD::uint32_t __count) + const _CUDA_VSTD::uint32_t& __count) { // __sem == sem_release (due to parameter type constraint) // __scope == scope_cta (due to parameter type constraint) @@ -181,7 +181,7 @@ __device__ static inline uint64_t mbarrier_arrive( cuda::ptx::scope_cluster_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, - uint32_t count); + const uint32_t& count); */ #if __cccl_ptx_isa >= 800 template @@ -190,7 +190,7 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( scope_cluster_t __scope, space_shared_t __space, _CUDA_VSTD::uint64_t* __addr, - _CUDA_VSTD::uint32_t __count) + const _CUDA_VSTD::uint32_t& __count) { // __sem == sem_release (due to parameter type constraint) // __scope == scope_cluster (due to parameter type constraint) @@ -224,7 +224,7 @@ __device__ static inline void mbarrier_arrive( cuda::ptx::scope_t scope, cuda::ptx::space_shared_cluster_t space, uint64_t* addr, - uint32_t count); + const uint32_t& count); */ #if __cccl_ptx_isa >= 800 template @@ -233,7 +233,7 @@ _LIBCUDACXX_DEVICE static inline void 
mbarrier_arrive( scope_t<_Scope> __scope, space_shared_cluster_t __space, _CUDA_VSTD::uint64_t* __addr, - _CUDA_VSTD::uint32_t __count) + const _CUDA_VSTD::uint32_t& __count) { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); @@ -277,7 +277,7 @@ __device__ static inline uint64_t mbarrier_arrive_expect_tx( cuda::ptx::scope_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, - uint32_t tx_count); + const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 template @@ -286,7 +286,7 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( scope_t<_Scope> __scope, space_shared_t __space, _CUDA_VSTD::uint64_t* __addr, - _CUDA_VSTD::uint32_t __tx_count) + const _CUDA_VSTD::uint32_t& __tx_count) { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); @@ -330,7 +330,7 @@ __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::scope_t scope, cuda::ptx::space_shared_cluster_t space, uint64_t* addr, - uint32_t tx_count); + const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 template @@ -339,7 +339,7 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( scope_t<_Scope> __scope, space_shared_cluster_t __space, _CUDA_VSTD::uint64_t* __addr, - _CUDA_VSTD::uint32_t __tx_count) + const _CUDA_VSTD::uint32_t& __tx_count) { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); From 594c82fb472af101b714f6b4b68b6aafd7a09478 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 12:39:19 +0200 Subject: [PATCH 33/49] Do not name unused parameters --- libcudacxx/docs/extended_api/ptx.md | 52 +++++++------- ..._and_communication_instructions_mbarrier.h | 72 +++++++++---------- 2 files changed, 62 insertions(+), 62 deletions(-) diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index 349149eaf42..baf859f044a 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -444,68 +444,68 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release ```cuda -// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 +// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 template __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t sem, - cuda::ptx::scope_cta_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::sem_release_t, + cuda::ptx::scope_cta_t, + cuda::ptx::space_shared_t, uint64_t* addr); -// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 +// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 template __device__ static inline uint64_t mbarrier_arrive_no_complete( - cuda::ptx::sem_release_t sem, - cuda::ptx::scope_cta_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::sem_release_t, + cuda::ptx::scope_cta_t, + cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& count); -// mbarrier.arrive.shared.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 +// mbarrier.arrive.shared.b64 state, [addr], count; // 3. 
PTX ISA 78, SM_90 template __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t sem, - cuda::ptx::scope_cta_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::sem_release_t, + cuda::ptx::scope_cta_t, + cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& count); - -// mbarrier.arrive.release.cluster.shared.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 + +// mbarrier.arrive.release.cluster.shared.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 template __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t sem, - cuda::ptx::scope_cluster_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& count); -// mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90 +// mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template __device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t sem, + cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_cluster_t space, + cuda::ptx::space_shared_cluster_t, uint64_t* addr, const uint32_t& count); -// mbarrier.arrive.expect_tx.release{.scope}.shared.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx.release{.scope}.shared.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template __device__ static inline uint64_t mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t sem, + cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& tx_count); -// mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template __device__ static inline void mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t sem, + cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_cluster_t space, + cuda::ptx::space_shared_cluster_t, uint64_t* addr, const uint32_t& tx_count); ``` diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 7313bff28ce..1a1418dfeae 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -54,17 +54,17 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // mbarrier.arrive.shared.b64 state, [addr]; // 1. 
PTX ISA 70, SM_80 template __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t sem, - cuda::ptx::scope_cta_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::sem_release_t, + cuda::ptx::scope_cta_t, + cuda::ptx::space_shared_t, uint64_t* addr); */ #if __cccl_ptx_isa >= 700 template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( - sem_release_t __sem, - scope_cta_t __scope, - space_shared_t __space, + sem_release_t, + scope_cta_t, + space_shared_t, _CUDA_VSTD::uint64_t* __addr) { // __sem == sem_release (due to parameter type constraint) @@ -93,18 +93,18 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 template __device__ static inline uint64_t mbarrier_arrive_no_complete( - cuda::ptx::sem_release_t sem, - cuda::ptx::scope_cta_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::sem_release_t, + cuda::ptx::scope_cta_t, + cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& count); */ #if __cccl_ptx_isa >= 700 template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( - sem_release_t __sem, - scope_cta_t __scope, - space_shared_t __space, + sem_release_t, + scope_cta_t, + space_shared_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { @@ -135,18 +135,18 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complet // mbarrier.arrive.shared.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 template __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t sem, - cuda::ptx::scope_cta_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::sem_release_t, + cuda::ptx::scope_cta_t, + cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& count); */ #if __cccl_ptx_isa >= 780 template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( - sem_release_t __sem, - scope_cta_t __scope, - space_shared_t __space, + sem_release_t, + scope_cta_t, + space_shared_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { @@ -177,18 +177,18 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // mbarrier.arrive.release.cluster.shared.b64 state, [addr], count; // 4. 
PTX ISA 80, SM_90 template __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t sem, - cuda::ptx::scope_cluster_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& count); */ #if __cccl_ptx_isa >= 800 template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( - sem_release_t __sem, - scope_cluster_t __scope, - space_shared_t __space, + sem_release_t, + scope_cluster_t, + space_shared_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { @@ -220,18 +220,18 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // .scope = { .cta, .cluster } template __device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t sem, + cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_cluster_t space, + cuda::ptx::space_shared_cluster_t, uint64_t* addr, const uint32_t& count); */ #if __cccl_ptx_isa >= 800 template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( - sem_release_t __sem, + sem_release_t, scope_t<_Scope> __scope, - space_shared_cluster_t __space, + space_shared_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { @@ -273,18 +273,18 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( // .scope = { .cta, .cluster } template __device__ static inline uint64_t mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t sem, + cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( - sem_release_t __sem, + sem_release_t, scope_t<_Scope> __scope, - space_shared_t __space, + space_shared_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) { @@ -326,18 +326,18 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( // .scope = { .cta, .cluster } template __device__ static inline void mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t sem, + cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_cluster_t space, + cuda::ptx::space_shared_cluster_t, uint64_t* addr, const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( - sem_release_t __sem, + sem_release_t, scope_t<_Scope> __scope, - space_shared_cluster_t __space, + space_shared_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) { From 6b4d380f2decd12c0c98bd5e294b7f15e11e9e92 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 14:23:27 +0200 Subject: [PATCH 34/49] Add PTX ISA target macros for CUDA 11.X --- .../__cuda/ptx/ptx_isa_target_macros.h | 34 ++++++++++++++++--- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h index 592cc5f96e6..ab59d8d33f7 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h @@ -37,21 +37,45 @@ // PTX ISA version -// PTX ISA 8.3 is available from CTK 12.3, driver r545 +// PTX ISA 8.3 is available from CUDA 12.3, driver r545 #if 
(defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 830ULL -// PTX ISA 8.2 is available from CTK 12.2, driver r535 +// PTX ISA 8.2 is available from CUDA 12.2, driver r535 #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 820ULL -// PTX ISA 8.1 is available from CTK 12.1, driver r530 +// PTX ISA 8.1 is available from CUDA 12.1, driver r530 #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 810ULL -// PTX ISA 8.0 is available from CTK 12.0, driver r525 +// PTX ISA 8.0 is available from CUDA 12.0, driver r525 #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 800ULL -// PTX ISA 7.8 is available from CTK 11.8, driver r520 +// PTX ISA 7.8 is available from CUDA 11.8, driver r520 #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 780ULL +// PTX ISA 7.7 is available from CUDA 11.7, driver r515 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 770ULL +// PTX ISA 7.6 is available from CUDA 11.6, driver r510 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 760ULL +// PTX ISA 7.5 is available from CUDA 11.5, driver r495 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 750ULL +// PTX ISA 7.4 is available from CUDA 11.4, driver r470 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 740ULL +// PTX ISA 7.3 is available from CUDA 11.3, driver r465 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 730ULL +// PTX ISA 7.2 is available from CUDA 11.2, driver r460 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 720ULL +// PTX ISA 7.1 is available from CUDA 11.1, driver r455 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 710ULL +// PTX ISA 7.0 is available from CUDA 11.0, driver r445 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 700ULL // Fallback case. Define the ISA version to be zero. This ensures that the macro is always defined. 
#else # define __cccl_ptx_isa 0ULL From 87f300c595a3461bc6ddd5cba710f2cc7cd13e71 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 14:23:44 +0200 Subject: [PATCH 35/49] Use _CUDA_VPTX in barrier.h --- .../include/cuda/std/detail/libcxx/include/__cuda/barrier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index c4bba0222dd..3e4052a7feb 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -607,8 +607,8 @@ barrier::arrival_token barrier_arrive_tx( auto __native_handle = barrier_native_handle(__b); auto __bh = __cvta_generic_to_shared(__native_handle); if (__arrive_count_update == 1) { - __token = cuda::ptx::mbarrier_arrive_expect_tx( - cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, __native_handle, __transaction_count_update + __token = _CUDA_VPTX::mbarrier_arrive_expect_tx( + _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __transaction_count_update ); } else { asm ( From 3535036444c1cebd81f0091a760f30d65d08b55a Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 15:00:04 +0200 Subject: [PATCH 36/49] Replace internal use of mbarrier.arrive with cuda::ptx::mbarrier_arrive --- .../detail/libcxx/include/__cuda/barrier.h | 39 ++++++++----------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index 3e4052a7feb..c388d802b30 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -207,29 +207,27 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity; else if (!__isShared(&__barrier)) { __trap(); } - - asm volatile ("mbarrier.arrive.shared.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__update)) - : "memory"); + // Cannot use cuda::device::barrier_native_handle here, as it is + // only defined for block-scope barriers. This barrier may be a + // non-block scoped barrier. 
+ auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); + __token = _CUDA_VPTX::mbarrier_arrive( + _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __bh, __update + ); ), NV_PROVIDES_SM_80, ( if (!__isShared(&__barrier)) { return __barrier.arrive(__update); } - + auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); // Need 2 instructions, can't finish barrier with arrive > 1 if (__update > 1) { - asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__update - 1)) - : "memory"); + ___CUDA_VPTX::mbarrier_arrive_no_complete( + _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, + __bh, __update - 1); } - asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))) - : "memory"); + __token = _CUDA_VPTX::mbarrier_arrive( + _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __bh + ); ), NV_IS_DEVICE, ( if (!__isShared(&__barrier)) { return __barrier.arrive(__update); @@ -617,12 +615,9 @@ barrier::arrival_token barrier_arrive_tx( : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) : "memory"); - asm ( - "mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__arrive_count_update)) - : "memory"); + __token = _CUDA_VPTX::mbarrier_arrive( + _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __arrive_count_update + ); } ) ); From 82db00d6eadb4a711875f17b3c0ed707ddb2aac8 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 15:16:30 +0200 Subject: [PATCH 37/49] Guard for PTX ISA version in test --- .../test/cuda/ptx/sm90.ptx.compile.pass.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp index 83335510a52..a643ce78222 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp @@ -38,15 +38,18 @@ int main(int, char**) __shared__ uint64_t bar; uint64_t state; - // TODO: check PTX ISA version. - NV_IF_TARGET(NV_PROVIDES_SM_80, ( +#if __cccl_ptx_isa >= 700 state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 1. state = cuda::ptx::mbarrier_arrive_no_complete(sem_release, scope_cta, space_shared, &bar, 1); // 2. +#endif )); NV_IF_TARGET(NV_PROVIDES_SM_90, ( +#if __cccl_ptx_isa >= 780 // This guard is redundant: before PTX ISA 7.8, there was no support for SM_90 state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3. +#endif +#if __cccl_ptx_isa >= 800 state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 4. cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 5. @@ -57,6 +60,7 @@ int main(int, char**) cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 7. cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 7. 
+#endif )); state += 1; // "Use" state to prevent compiler warnings From e9abe97ab0d580bfa1c21fb3fe366762b686b435 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 15:20:54 +0200 Subject: [PATCH 38/49] Remove __cccl_ptx_sm targeting macros They are not used anymore --- .../include/__cuda/ptx/ptx_isa_target_macros.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h index ab59d8d33f7..f3b412bb6b6 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h @@ -22,21 +22,6 @@ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes */ - -// SM version - -#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) -# define __cccl_ptx_sm 900ULL -#elif (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) -# define __cccl_ptx_sm 800ULL -// Fallback case. Define the SM version to be zero. This ensures that the macro is always defined. -#else -# define __cccl_ptx_sm 0ULL -#endif - - -// PTX ISA version - // PTX ISA 8.3 is available from CUDA 12.3, driver r545 #if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 830ULL From f806ca0254536bcdb0c328454b7319ec52c77de4 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 17:38:43 +0200 Subject: [PATCH 39/49] Prevent unused compiler warnings in test --- .../test/cuda/ptx/sm90.ptx.compile.pass.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp index a643ce78222..ffc4d671dad 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp @@ -19,6 +19,9 @@ #include "cuda_space_selector.h" #include "test_macros.h" +template +__device__ inline bool __unused(_Ty...) 
{ return true; } + int main(int, char**) { NV_IF_TARGET(NV_IS_DEVICE, ( @@ -36,7 +39,8 @@ int main(int, char**) using cuda::ptx::scope_cta; __shared__ uint64_t bar; - uint64_t state; + bar = 1; + uint64_t state = 1; NV_IF_TARGET(NV_PROVIDES_SM_80, ( #if __cccl_ptx_isa >= 700 @@ -63,8 +67,7 @@ int main(int, char**) #endif )); - state += 1; // "Use" state to prevent compiler warnings - (void) state; + __unused(bar, state); } )); From 6917e60e6cac82909169ed2de15dca2aa5a99a1f Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 20:16:42 +0200 Subject: [PATCH 40/49] Use extern "C" error function declaration --- ...ation_and_communication_instructions_mbarrier.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 1a1418dfeae..9fda72e1980 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -60,6 +60,7 @@ __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr); */ #if __cccl_ptx_isa >= 700 +extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( sem_release_t, @@ -83,7 +84,6 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( return __state; ),( // Unsupported architectures will have a linker error with a semi-decent error message - __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); )); } @@ -100,6 +100,7 @@ __device__ static inline uint64_t mbarrier_arrive_no_complete( const uint32_t& count); */ #if __cccl_ptx_isa >= 700 +extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( sem_release_t, @@ -125,7 +126,6 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complet return __state; ),( // Unsupported architectures will have a linker error with a semi-decent error message - __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); )); } @@ -142,6 +142,7 @@ __device__ static inline uint64_t mbarrier_arrive( const uint32_t& count); */ #if __cccl_ptx_isa >= 780 +extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( sem_release_t, @@ -167,7 +168,6 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( return __state; ),( // Unsupported architectures will have a linker error with a semi-decent error message - __device__ _CUDA_VSTD::uint64_t 
___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); )); } @@ -184,6 +184,7 @@ __device__ static inline uint64_t mbarrier_arrive( const uint32_t& count); */ #if __cccl_ptx_isa >= 800 +extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( sem_release_t, @@ -209,7 +210,6 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( return __state; ),( // Unsupported architectures will have a linker error with a semi-decent error message - __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); )); } @@ -227,6 +227,7 @@ __device__ static inline void mbarrier_arrive( const uint32_t& count); */ #if __cccl_ptx_isa >= 800 +extern "C" __device__ void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( sem_release_t, @@ -262,7 +263,6 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( ),( // Unsupported architectures will have a linker error with a semi-decent error message - __device__ void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); )); } @@ -280,6 +280,7 @@ __device__ static inline uint64_t mbarrier_arrive_expect_tx( const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 +extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( sem_release_t, @@ -315,7 +316,6 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( return __state; ),( // Unsupported architectures will have a linker error with a semi-decent error message - __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); )); } @@ -333,6 +333,7 @@ __device__ static inline void mbarrier_arrive_expect_tx( const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 +extern "C" __device__ void __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( sem_release_t, @@ -368,7 +369,6 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( ),( // Unsupported architectures will have a linker error with a semi-decent error message - __device__ void __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); return __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); )); } From 6a5b42304c559b4209ae0ed303aa3f80b0c20935 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 20:31:38 +0200 Subject: [PATCH 41/49] Fix wrapping of ifdef and NV_IF_TARGET for Windows The MS C++ compiler apparently chokes on an ifdef within NV_IF_TARGET --- .../test/cuda/ptx/sm90.ptx.compile.pass.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp 
b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp index ffc4d671dad..bfeea02a359 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp @@ -42,18 +42,21 @@ int main(int, char**) bar = 1; uint64_t state = 1; - NV_IF_TARGET(NV_PROVIDES_SM_80, ( #if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, ( state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 1. state = cuda::ptx::mbarrier_arrive_no_complete(sem_release, scope_cta, space_shared, &bar, 1); // 2. -#endif )); +#endif - NV_IF_TARGET(NV_PROVIDES_SM_90, ( #if __cccl_ptx_isa >= 780 // This guard is redundant: before PTX ISA 7.8, there was no support for SM_90 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3. + )); #endif + #if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 4. cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 5. @@ -64,8 +67,8 @@ int main(int, char**) cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 7. cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 7. -#endif )); +#endif __unused(bar, state); } From 7d6d4d5e99914eca6322cec8ee68391af7638faa Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 1 Nov 2023 16:05:18 +0100 Subject: [PATCH 42/49] Try and fix CI issues --- .../test/cuda/ptx/sm90.ptx.compile.pass.cpp | 73 ++++++++----------- .../test/support/concurrent_agents.h | 2 + ..._and_communication_instructions_mbarrier.h | 24 ++---- 3 files changed, 39 insertions(+), 60 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp index bfeea02a359..da086fe3811 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp @@ -22,57 +22,48 @@ template __device__ inline bool __unused(_Ty...) { return true; } -int main(int, char**) -{ - NV_IF_TARGET(NV_IS_DEVICE, ( - // Do not execute. Just check if below PTX compiles (that is: assembles) without error. - - // This condition always evaluates to false, but the compiler does not - // reason through it. This avoids dead code elimination. - const bool non_eliminated_false = threadIdx.x > 1024; - - if (non_eliminated_false) { - using cuda::ptx::sem_release; - using cuda::ptx::space_shared_cluster; - using cuda::ptx::space_shared; - using cuda::ptx::scope_cluster; - using cuda::ptx::scope_cta; +__device__ void test_compilation() { + using cuda::ptx::sem_release; + using cuda::ptx::space_shared_cluster; + using cuda::ptx::space_shared; + using cuda::ptx::scope_cluster; + using cuda::ptx::scope_cta; - __shared__ uint64_t bar; - bar = 1; - uint64_t state = 1; + __shared__ uint64_t bar; + bar = 1; + uint64_t state = 1; #if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, ( - state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 1. - state = cuda::ptx::mbarrier_arrive_no_complete(sem_release, scope_cta, space_shared, &bar, 1); // 2. - )); -#endif + NV_IF_TARGET(NV_PROVIDES_SM_80, ( + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 1. 
+ state = cuda::ptx::mbarrier_arrive_no_complete(sem_release, scope_cta, space_shared, &bar, 1); // 2. + )); +#endif // __cccl_ptx_isa >= 700 #if __cccl_ptx_isa >= 780 // This guard is redundant: before PTX ISA 7.8, there was no support for SM_90 - NV_IF_TARGET(NV_PROVIDES_SM_90, ( - state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3. - )); -#endif + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3. + )); +#endif // __cccl_ptx_isa >= 780 #if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, ( - state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 4. + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 4. - cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 5. - cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 5. + cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 5. + cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 5. - state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 6. - state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 6. + state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 6. + state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 6. - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 7. - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 7. - )); -#endif - - __unused(bar, state); - } - )); + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 7. + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 7. 
+ )); +#endif // __cccl_ptx_isa >= 800 + __unused(bar, state); +} +int main(int, char**) +{ return 0; } diff --git a/libcudacxx/.upstream-tests/test/support/concurrent_agents.h b/libcudacxx/.upstream-tests/test/support/concurrent_agents.h index d0d3163c88f..33b338ff712 100644 --- a/libcudacxx/.upstream-tests/test/support/concurrent_agents.h +++ b/libcudacxx/.upstream-tests/test/support/concurrent_agents.h @@ -19,6 +19,8 @@ #endif #endif +#include + #include "test_macros.h" TEST_EXEC_CHECK_DISABLE diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 9fda72e1980..fed0f732555 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -71,10 +71,8 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // __sem == sem_release (due to parameter type constraint) // __scope == scope_cta (due to parameter type constraint) // __space == space_shared (due to parameter type constraint) - - _CUDA_VSTD::uint64_t __state; - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( + _CUDA_VSTD::uint64_t __state; asm ( "mbarrier.arrive.shared.b64 %0, [%1]; // 1. " : "=l"(__state) @@ -112,10 +110,8 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complet // __sem == sem_release (due to parameter type constraint) // __scope == scope_cta (due to parameter type constraint) // __space == space_shared (due to parameter type constraint) - - _CUDA_VSTD::uint64_t __state; - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( + _CUDA_VSTD::uint64_t __state; asm ( "mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 2. " : "=l"(__state) @@ -154,10 +150,8 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // __sem == sem_release (due to parameter type constraint) // __scope == scope_cta (due to parameter type constraint) // __space == space_shared (due to parameter type constraint) - - _CUDA_VSTD::uint64_t __state; - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint64_t __state; asm ( "mbarrier.arrive.shared.b64 %0, [%1], %2; // 3." : "=l"(__state) @@ -196,10 +190,8 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // __sem == sem_release (due to parameter type constraint) // __scope == scope_cluster (due to parameter type constraint) // __space == space_shared (due to parameter type constraint) - - _CUDA_VSTD::uint64_t __state; - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint64_t __state; asm ( "mbarrier.arrive.release.cluster.shared.b64 %0, [%1], %2; // 4." 
: "=l"(__state) @@ -240,8 +232,6 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( static_assert(__scope == scope_cta || __scope == scope_cluster, ""); // __space == space_shared_cluster (due to parameter type constraint) - - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { asm ( @@ -292,10 +282,8 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); // __space == space_shared (due to parameter type constraint) - - _CUDA_VSTD::uint64_t __state; - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint64_t __state; if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { asm ( "mbarrier.arrive.expect_tx.release.cta.shared.b64 %0, [%1], %2; // 6. " @@ -346,8 +334,6 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( static_assert(__scope == scope_cta || __scope == scope_cluster, ""); // __space == space_shared_cluster (due to parameter type constraint) - - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { asm ( From bd242653bae356c633b001e63c032e7d166280a3 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 2 Nov 2023 11:40:36 +0100 Subject: [PATCH 43/49] Rename space_shared_cluster -> space_cluster --- .../test/cuda/ptx/sm90.ptx.compile.pass.cpp | 13 ++--- libcudacxx/docs/extended_api/ptx.md | 10 ++-- ..._and_communication_instructions_mbarrier.h | 12 ++--- .../include/__cuda/ptx/ptx_dot_variants.h | 47 +++++++------------ 4 files changed, 36 insertions(+), 46 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp index da086fe3811..2fc4e346507 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp @@ -24,7 +24,7 @@ __device__ inline bool __unused(_Ty...) { return true; } __device__ void test_compilation() { using cuda::ptx::sem_release; - using cuda::ptx::space_shared_cluster; + using cuda::ptx::space_cluster; using cuda::ptx::space_shared; using cuda::ptx::scope_cluster; using cuda::ptx::scope_cta; @@ -40,7 +40,8 @@ __device__ void test_compilation() { )); #endif // __cccl_ptx_isa >= 700 -#if __cccl_ptx_isa >= 780 // This guard is redundant: before PTX ISA 7.8, there was no support for SM_90 + // This guard is redundant: before PTX ISA 7.8, there was no support for SM_90 +#if __cccl_ptx_isa >= 780 NV_IF_TARGET(NV_PROVIDES_SM_90, ( state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3. )); @@ -50,14 +51,14 @@ __device__ void test_compilation() { NV_IF_TARGET(NV_PROVIDES_SM_90, ( state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 4. - cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 5. - cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 5. + cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_cluster, &bar, 1); // 5. + cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar, 1); // 5. state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 6. state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 6. 
- cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 7. - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 7. + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_cluster, &bar, 1); // 7. + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, &bar, 1); // 7. )); #endif // __cccl_ptx_isa >= 800 __unused(bar, state); diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index baf859f044a..c5529684330 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -485,7 +485,7 @@ template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_cluster_t, + cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& count); @@ -505,7 +505,7 @@ template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_cluster_t, + cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& tx_count); ``` @@ -518,7 +518,7 @@ Usage: __global__ void kernel() { using cuda::ptx::sem_release; - using cuda::ptx::space_shared_cluster; + using cuda::ptx::space_cluster; using cuda::ptx::space_shared; using cuda::ptx::scope_cluster; using cuda::ptx::scope_cta; @@ -544,8 +544,8 @@ __global__ void kernel() { cluster.sync(); // Arrive on remote cluster barrier: - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, remote_bar, 1); - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, remote_bar, 1); + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_cluster, remote_bar, 1); + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, remote_bar, 1); ) } ``` diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index fed0f732555..5ff96d974dd 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -214,7 +214,7 @@ template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_cluster_t, + cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& count); */ @@ -224,13 +224,13 @@ template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( sem_release_t, scope_t<_Scope> __scope, - space_shared_cluster_t, + space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { @@ -316,7 +316,7 @@ template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_cluster_t, + cuda::ptx::space_cluster_t, uint64_t* addr, const 
uint32_t& tx_count); */ @@ -326,13 +326,13 @@ template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( sem_release_t, scope_t<_Scope> __scope, - space_shared_cluster_t, + space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h index aca4eac097e..18f67c37479 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h @@ -59,23 +59,24 @@ enum class dot_sem release, sc, weak - // mmio? - // volatile? }; // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#state-spaces enum class dot_space { - reg, - sreg, - const_mem, // Using const_mem as `const` is reserved in C++. global, - local, - param, - shared, // The PTX spelling is shared::cta - shared_cluster, // The PTX spelling is shared::cluster, but we might want to go for cluster here. - tex // deprecated - // generic? + cluster, // The PTX spelling is shared::cluster + shared, // The PTX spelling is shared::cta + + // The following state spaces are unlikely to be used in cuda::ptx in the near + // future, so they are not exposed: + + // reg, + // sreg, + // const_mem, // Using const_mem as `const` is reserved in C++. 
+ // local, + // param, + // tex // deprecated }; // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scope @@ -118,26 +119,14 @@ static constexpr sem_sc_t sem_sc{}; static constexpr sem_weak_t sem_weak{}; template -using space_t = _CUDA_VSTD::integral_constant; -using space_const_mem_t = space_t; -using space_global_t = space_t; -using space_local_t = space_t; -using space_param_t = space_t; -using space_reg_t = space_t; -using space_shared_t = space_t; -using space_shared_cluster_t = space_t; -using space_sreg_t = space_t; -using space_tex_t = space_t; - -static constexpr space_const_mem_t space_const_mem{}; +using space_t = _CUDA_VSTD::integral_constant; +using space_global_t = space_t; +using space_shared_t = space_t; +using space_cluster_t = space_t; + static constexpr space_global_t space_global{}; -static constexpr space_local_t space_local{}; -static constexpr space_param_t space_param{}; -static constexpr space_reg_t space_reg{}; static constexpr space_shared_t space_shared{}; -static constexpr space_shared_cluster_t space_shared_cluster{}; -static constexpr space_sreg_t space_sreg{}; -static constexpr space_tex_t space_tex{}; +static constexpr space_cluster_t space_cluster{}; template using scope_t = _CUDA_VSTD::integral_constant; From 4f26aa2a6f905b6a36ec3106842f397bfd4b2c14 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 2 Nov 2023 11:51:45 +0100 Subject: [PATCH 44/49] Ensure PTX test is actually assembled --- .../.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp index 2fc4e346507..b7a1a5d9b52 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp @@ -22,7 +22,7 @@ template __device__ inline bool __unused(_Ty...) { return true; } -__device__ void test_compilation() { +__global__ void test_compilation() { using cuda::ptx::sem_release; using cuda::ptx::space_cluster; using cuda::ptx::space_shared; From 9555532b45d4db07f34dc0498f276890e4d7a580 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 2 Nov 2023 11:54:21 +0100 Subject: [PATCH 45/49] Rename test --- ....ptx.compile.pass.cpp => ptx.mbarrier.arrive.compile.pass.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename libcudacxx/.upstream-tests/test/cuda/ptx/{sm90.ptx.compile.pass.cpp => ptx.mbarrier.arrive.compile.pass.cpp} (100%) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp similarity index 100% rename from libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp rename to libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp From ffa1f304543fb802f5dc98e8ddb5995617b92d8f Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 2 Nov 2023 15:43:32 +0100 Subject: [PATCH 46/49] Stay closer to original PTX exposure Use the original spellings as in PTX ISA 70 and 78 and also expose in C++ as such. 
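For orientation, a minimal caller-side sketch of the overload set this commit settles on, mirroring the calls in the compile test below. The kernel name is invented, the barrier is assumed to be initialized elsewhere (e.g. via mbarrier.init), and the calls assume compilation for sm_90 with a PTX ISA 8.0 capable toolkit:

```cuda
// Sketch, not part of the patch: the three mbarrier_arrive spellings side by side.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void arrive_overload_sketch() {
  if (false) { // compile/assemble only, as in the test; never executed
    __shared__ cuda::std::uint64_t bar; // assumed initialized elsewhere
    cuda::std::uint64_t state;
    state = cuda::ptx::mbarrier_arrive(&bar);    // 1. PTX ISA 70 spelling, no count
    state = cuda::ptx::mbarrier_arrive(&bar, 1); // 2. PTX ISA 78 spelling, adds count
    state = cuda::ptx::mbarrier_arrive(          // 3b. PTX ISA 80: tags select .sem/.scope/.space
      cuda::ptx::sem_release, cuda::ptx::scope_cluster, cuda::ptx::space_shared, &bar, 1);
    (void)state;
  }
}
```

Passing sem/scope/space as tag arguments rather than as explicit template parameters is what keeps such call sites forward-compatible when further variants are added.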
--- .../ptx/ptx.mbarrier.arrive.compile.pass.cpp | 28 +- libcudacxx/docs/extended_api/ptx.md | 67 ++-- .../detail/libcxx/include/__cuda/barrier.h | 14 +- ..._and_communication_instructions_mbarrier.h | 324 +++++++++++++----- 4 files changed, 307 insertions(+), 126 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp index b7a1a5d9b52..4666467cad5 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp @@ -12,7 +12,6 @@ // #include - #include #include "concurrent_agents.h" @@ -35,30 +34,37 @@ __global__ void test_compilation() { #if __cccl_ptx_isa >= 700 NV_IF_TARGET(NV_PROVIDES_SM_80, ( - state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 1. - state = cuda::ptx::mbarrier_arrive_no_complete(sem_release, scope_cta, space_shared, &bar, 1); // 2. + state = cuda::ptx::mbarrier_arrive(&bar); // 1. + state = cuda::ptx::mbarrier_arrive_no_complete(&bar, 1); // 5. )); #endif // __cccl_ptx_isa >= 700 // This guard is redundant: before PTX ISA 7.8, there was no support for SM_90 #if __cccl_ptx_isa >= 780 NV_IF_TARGET(NV_PROVIDES_SM_90, ( - state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3. + state = cuda::ptx::mbarrier_arrive(&bar, 1); // 2. )); #endif // __cccl_ptx_isa >= 780 #if __cccl_ptx_isa >= 800 NV_IF_TARGET(NV_PROVIDES_SM_90, ( - state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 4. + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 3a. + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar); // 3a. + + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3b. + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 3b. + + cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_cluster, &bar); // 4a. + cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar); // 4a. - cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_cluster, &bar, 1); // 5. - cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar, 1); // 5. + cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_cluster, &bar, 1); // 4b. + cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar, 1); // 4b. - state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 6. - state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 6. + state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 8. + state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 8. - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_cluster, &bar, 1); // 7. - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, &bar, 1); // 7. + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_cluster, &bar, 1); // 9. + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, &bar, 1); // 9. 
)); #endif // __cccl_ptx_isa >= 800 __unused(bar, state); diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index c5529684330..8b9efe694f0 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -442,45 +442,56 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release - PTX ISA: [mbarrier.arrive](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) - ```cuda -// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 +// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 template __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cta_t, - cuda::ptx::space_shared_t, uint64_t* addr); -// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 +// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 template -__device__ static inline uint64_t mbarrier_arrive_no_complete( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cta_t, - cuda::ptx::space_shared_t, +__device__ static inline uint64_t mbarrier_arrive( uint64_t* addr, const uint32_t& count); -// mbarrier.arrive.shared.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 -template +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template __device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t, - cuda::ptx::scope_cta_t, + cuda::ptx::scope_t scope, cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& count); + uint64_t* addr); -// mbarrier.arrive.release.cluster.shared.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 -template +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template __device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, + cuda::ptx::scope_t scope, cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& count); -// mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90 +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 +// .sem = { .release } // .scope = { .cta, .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_cluster_t, + uint64_t* addr); + +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cluster } template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, @@ -488,9 +499,21 @@ __device__ static inline void mbarrier_arrive( cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& count); +``` -// mbarrier.arrive.expect_tx.release{.scope}.shared.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90 +```cuda +// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive_no_complete( + uint64_t* addr, + const uint32_t& count); +``` + +```cuda +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. 
PTX ISA 80, SM_90 +// .sem = { .release } // .scope = { .cta, .cluster } +// .space = { .shared::cta } template __device__ static inline uint64_t mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, @@ -499,8 +522,10 @@ __device__ static inline uint64_t mbarrier_arrive_expect_tx( uint64_t* addr, const uint32_t& tx_count); -// mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 +// .sem = { .release } // .scope = { .cta, .cluster } +// .space = { .shared::cluster } template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index c388d802b30..c4f8cedba71 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -27,7 +27,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER #include "../cstdlib" // _LIBCUDACXX_UNREACHABLE #include "../__type_traits/void_t.h" // _CUDA_VSTD::__void_t -#include "../__cuda/ptx.h" // cuda::ptx::* +#include "../__cuda/ptx.h" // cuda::ptx::* #if defined(_LIBCUDACXX_COMPILER_NVRTC) #define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type *)0)->member)) @@ -211,9 +211,7 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity; // only defined for block-scope barriers. This barrier may be a // non-block scoped barrier. auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); - __token = _CUDA_VPTX::mbarrier_arrive( - _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __bh, __update - ); + __token = _CUDA_VPTX::mbarrier_arrive(__bh, __update); ), NV_PROVIDES_SM_80, ( if (!__isShared(&__barrier)) { return __barrier.arrive(__update); @@ -221,13 +219,9 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity; auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); // Need 2 instructions, can't finish barrier with arrive > 1 if (__update > 1) { - _CUDA_VPTX::mbarrier_arrive_no_complete( - _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, - __bh, __update - 1); + _CUDA_VPTX::mbarrier_arrive_no_complete(__bh, __update - 1); } - __token = _CUDA_VPTX::mbarrier_arrive( - _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __bh - ); + __token = _CUDA_VPTX::mbarrier_arrive(__bh); ), NV_IS_DEVICE, ( if (!__isShared(&__barrier)) { return __barrier.arrive(__update); diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 5ff96d974dd..d6b5c72e38d 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -41,40 +41,93 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.13.
Parallel Synchronization and Communication Instructions: mbarrier.arrive // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive -// mbarrier.arrive{.sem}{.scope}{.shared{::cta}}.b64 state, [addr]{, count}; -// mbarrier.arrive{.sem}{.scope}{.shared::cluster}.b64 _, [addr] {,count} -// mbarrier.arrive.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], txCount; -// mbarrier.arrive.expect_tx{.sem}{.scope}{.shared::cluster}.b64 _, [addr], txCount; -// mbarrier.arrive.noComplete{.sem}{.cta}{.shared{::cta}}.b64 state, [addr], count; -// -// .sem = { .release } -// .scope = { .cta, .cluster } +/* +PTX ISA docs: + +// mbarrier.arrive: +mbarrier.arrive{.shared}.b64 state, [addr]; // 1. PTX ISA 70, SM_80 +mbarrier.arrive{.shared{::cta}}.b64 state, [addr]{, count}; // 2. PTX ISA 78, SM_90 (due to count) + +mbarrier.arrive{.sem}{.scope}{.shared{::cta}}.b64 state, [addr]{, count}; // 3. PTX ISA 80, SM_90 (some variants are SM_80, but are covered by 1) +mbarrier.arrive{.sem}{.scope}{.shared::cluster}.b64 _, [addr] {,count} // 4. PTX ISA 80, SM_90 + +.sem = { .release } +.scope = { .cta, .cluster } + + +// mbarrier.arrive.noComplete: +mbarrier.arrive.noComplete{.shared}.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 +mbarrier.arrive.noComplete{.shared{::cta}}.b64 state, [addr], count; // 6. PTX ISA 78, Not exposed. Just a spelling change (shared -> shared::cta) +mbarrier.arrive.noComplete{.sem}{.cta}{.shared{::cta}}.b64 state, [addr], count; // 7. PTX ISA 80, Not exposed. Adds .release, and .cta scope. + + +// mbarrier.arrive.expect_tx: +mbarrier.arrive.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 +mbarrier.arrive.expect_tx{.sem}{.scope}{.shared::cluster}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 + +.sem = { .release } +.scope = { .cta, .cluster } + + +Corresponding Exposure: + +// mbarrier_arrive: +mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80, !memory +// count is non-optional, otherwise 3 would not be distinguishable from 1 +mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90, !memory +mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90, !memory +.space = { .shared::cta} +.sem = { .release } +.scope = { .cta, .cluster } + +mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90, !memory +.space = { .shared::cta} +.sem = { .release } +.scope = { .cta, .cluster } + +mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90, !memory +.space = { .shared::cluster} +.sem = { .release } +.scope = { .cta, .cluster } + +mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90, !memory +.space = { .shared::cluster} +.sem = { .release } +.scope = { .cta, .cluster } + + +// mbarrier_arrive_no_complete: +mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80, !memory + + +mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90, !memory +.space = { .shared::cta } +.sem = { .release } +.scope = { .cta, .cluster } + +mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90, !memory +.space = { .shared::cluster } +.sem = { .release } +.scope = { .cta, .cluster } + +*/ /* -// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 +// mbarrier.arrive.shared.b64 state, [addr]; // 1. 
PTX ISA 70, SM_80 template __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cta_t, - cuda::ptx::space_shared_t, uint64_t* addr); */ #if __cccl_ptx_isa >= 700 -extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( - sem_release_t, - scope_cta_t, - space_shared_t, _CUDA_VSTD::uint64_t* __addr) { - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cta (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( _CUDA_VSTD::uint64_t __state; asm ( - "mbarrier.arrive.shared.b64 %0, [%1]; // 1. " + "mbarrier.arrive.shared.b64 %0, [%1]; // 1. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)) : "memory" @@ -88,32 +141,23 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( #endif // __cccl_ptx_isa >= 700 /* -// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 +// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 template -__device__ static inline uint64_t mbarrier_arrive_no_complete( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cta_t, - cuda::ptx::space_shared_t, +__device__ static inline uint64_t mbarrier_arrive( uint64_t* addr, const uint32_t& count); */ -#if __cccl_ptx_isa >= 700 -extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); +#if __cccl_ptx_isa >= 780 +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); template -_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( - sem_release_t, - scope_cta_t, - space_shared_t, +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cta (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( _CUDA_VSTD::uint64_t __state; asm ( - "mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 2. " + "mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 2. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -122,83 +166,107 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complet return __state; ),( // Unsupported architectures will have a linker error with a semi-decent error message - return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); )); } -#endif // __cccl_ptx_isa >= 700 +#endif // __cccl_ptx_isa >= 780 /* -// mbarrier.arrive.shared.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 -template +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. 
PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template __device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t, - cuda::ptx::scope_cta_t, + cuda::ptx::scope_t scope, cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& count); + uint64_t* addr); */ -#if __cccl_ptx_isa >= 780 -extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( sem_release_t, - scope_cta_t, + scope_t<_Scope> __scope, space_shared_t, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint32_t& __count) + _CUDA_VSTD::uint64_t* __addr) { // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cta (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( _CUDA_VSTD::uint64_t __state; - asm ( - "mbarrier.arrive.shared.b64 %0, [%1], %2; // 3." - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.release.cta.shared.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.release.cluster.shared.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); + } return __state; ),( // Unsupported architectures will have a linker error with a semi-decent error message return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); )); } -#endif // __cccl_ptx_isa >= 780 +#endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive.release.cluster.shared.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 -template +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template __device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, + cuda::ptx::scope_t scope, cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& count); */ #if __cccl_ptx_isa >= 800 -extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( sem_release_t, - scope_cluster_t, + scope_t<_Scope> __scope, space_shared_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( _CUDA_VSTD::uint64_t __state; - asm ( - "mbarrier.arrive.release.cluster.shared.b64 %0, [%1], %2; // 4." 
- : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.release.cta.shared.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.release.cluster.shared.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + } return __state; ),( // Unsupported architectures will have a linker error with a semi-decent error message @@ -208,8 +276,59 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90 +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_cluster_t, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void mbarrier_arrive( + sem_release_t, + scope_t<_Scope> __scope, + space_cluster_t, + _CUDA_VSTD::uint64_t* __addr) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_cluster (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.release.cta.shared::cluster.b64 _, [%0]; // 4a. " + : + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. " + : + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); + } + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 +// .sem = { .release } // .scope = { .cta, .cluster } +// .space = { .shared::cluster } template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, @@ -219,7 +338,7 @@ __device__ static inline void mbarrier_arrive( const uint32_t& count); */ #if __cccl_ptx_isa >= 800 -extern "C" __device__ void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( sem_release_t, @@ -235,7 +354,7 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { asm ( - "mbarrier.arrive.release.cta.shared::cluster.b64 _, [%0], %1; // 5. " + "mbarrier.arrive.release.cta.shared::cluster.b64 _, [%0], %1; // 4b. 
" : : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -243,7 +362,7 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( ); } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { asm ( - "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 5. " + "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " : : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -259,8 +378,41 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive.expect_tx.release{.scope}.shared.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90 +// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive_no_complete( + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); +template +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __count) +{ + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( + _CUDA_VSTD::uint64_t __state; + asm ( + "mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 5. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); + )); +} +#endif // __cccl_ptx_isa >= 700 + +/* +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 +// .sem = { .release } // .scope = { .cta, .cluster } +// .space = { .shared::cta } template __device__ static inline uint64_t mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, @@ -270,7 +422,7 @@ __device__ static inline uint64_t mbarrier_arrive_expect_tx( const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 -extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( sem_release_t, @@ -282,11 +434,12 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( _CUDA_VSTD::uint64_t __state; if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { asm ( - "mbarrier.arrive.expect_tx.release.cta.shared.b64 %0, [%1], %2; // 6. " + "mbarrier.arrive.expect_tx.release.cta.shared.b64 %0, [%1], %2; // 8. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) @@ -294,7 +447,7 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( ); } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared.b64 %0, [%1], %2; // 6. " + "mbarrier.arrive.expect_tx.release.cluster.shared.b64 %0, [%1], %2; // 8. 
" : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) @@ -310,8 +463,10 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 +// .sem = { .release } // .scope = { .cta, .cluster } +// .space = { .shared::cluster } template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, @@ -321,7 +476,7 @@ __device__ static inline void mbarrier_arrive_expect_tx( const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 -extern "C" __device__ void __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( sem_release_t, @@ -337,7 +492,7 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1; // 7. " + "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1; // 9. " : : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) @@ -345,7 +500,7 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( ); } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 7. " + "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " : : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) @@ -361,6 +516,7 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( #endif // __cccl_ptx_isa >= 800 + // 9.7.12.15.14. Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop From 8b03da35467ab89965d78e610524155f5ae441fd Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 3 Nov 2023 08:30:32 +0100 Subject: [PATCH 47/49] Address review feedback --- libcudacxx/docs/extended_api/ptx.md | 4 +-- .../std/detail/libcxx/include/__cuda/ptx.h | 26 ++++++++++++++----- ..._and_communication_instructions_mbarrier.h | 7 ++++- .../include/__cuda/ptx/ptx_dot_variants.h | 7 ++++- .../include/__cuda/ptx/ptx_helper_functions.h | 6 +++++ .../__cuda/ptx/ptx_isa_target_macros.h | 6 +++++ 6 files changed, 46 insertions(+), 10 deletions(-) diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index 8b9efe694f0..1201b09748f 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -11,7 +11,7 @@ experiment with new hardware features before a high-level C++ API is available. The `cuda/ptx` header is intended to present a stable API (not ABI) within one major version of the CTK on a best effort basis. This means that: -- All functions are marked inline. +- All functions are marked static inline. - The type of a function parameter can be changed to be more generic if that means that code that called the original version can still be @@ -26,7 +26,7 @@ API stability is not taken to the extreme. 
Call functions like below to ensure forward-compatibility: ```cuda -// Use arguments to driver overload resolution: +// Use arguments to drive overload resolution: cuda::ptx::mbarrier_arrive_expect_tx(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1); // Specifying templates directly is not forward-compatible, as order and number diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index 9c8a33c18dd..4ad22be7419 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -12,13 +12,27 @@ #ifndef _LIBCUDACXX___CUDA_PTX_H #define _LIBCUDACXX___CUDA_PTX_H +#ifndef __cuda_std__ +#error "<__cuda/ptx.h> should only be included from <cuda/ptx>" +#endif // __cuda_std__ + +#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 +# error "CUDA synchronization primitives are only supported for sm_70 and up." +#endif + +#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) +#pragma GCC system_header +#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv +_CCCL_IMPLICIT_SYSTEM_HEADER +#endif // !_CCCL_COMPILER_NVHPC + +#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends + +#include "../__cuda/ptx/ptx_isa_target_macros.h" +#include "../__cuda/ptx/ptx_dot_variants.h" +#include "../__cuda/ptx/ptx_helper_functions.h" +#include "../__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h" #include "../cstdint" // uint32_t -#include "../../../../../../nv/target" // __CUDA_MINIMUM_ARCH__ and friends - -#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h" -#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h" -#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h" -#include "cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h" /* * The cuda::ptx namespace intends to provide PTX wrappers for new hardware diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index d6b5c72e38d..01cba7cd0b1 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -9,7 +9,6 @@ // //===----------------------------------------------------------------------===// - #ifndef _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ #define _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ @@ -18,6 +17,12 @@ #include "ptx_isa_target_macros.h" #include "../../cstdint" +#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) +#pragma GCC system_header +#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv +_CCCL_IMPLICIT_SYSTEM_HEADER +#endif // !_CCCL_COMPILER_NVHPC + _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX /* diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h index 18f67c37479..442c484e8eb 100644 --- 
a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h @@ -9,12 +9,17 @@ // //===----------------------------------------------------------------------===// - #ifndef _CUDA_PTX_DOT_VARIANTS_H_ #define _CUDA_PTX_DOT_VARIANTS_H_ #include "../../__type_traits/integral_constant.h" // std::integral_constant +#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) +#pragma GCC system_header +#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv +_CCCL_IMPLICIT_SYSTEM_HEADER +#endif // !_CCCL_COMPILER_NVHPC + /* * Public integral constant types and values for ".variant"s: * diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h index 41826081a54..f6ec0b3959e 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h @@ -14,6 +14,12 @@ #include "../../cstdint" // uint32_t +#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) +#pragma GCC system_header +#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv +_CCCL_IMPLICIT_SYSTEM_HEADER +#endif // !_CCCL_COMPILER_NVHPC + _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_ptr_smem(const void* __ptr) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h index f3b412bb6b6..ca5297e4de4 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h @@ -15,6 +15,12 @@ #include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends +#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) +#pragma GCC system_header +#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv +_CCCL_IMPLICIT_SYSTEM_HEADER +#endif // !_CCCL_COMPILER_NVHPC + /* * Targeting macros * From 614326b9260f5c4022c6adc69e0125d52d3978fd Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 3 Nov 2023 09:06:34 +0100 Subject: [PATCH 48/49] Do not require set arch --- .../include/cuda/std/detail/libcxx/include/__cuda/ptx.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index 4ad22be7419..384f3ba14b3 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -16,10 +16,6 @@ #error "<__cuda/ptx.h> should only be included from <cuda/ptx>" #endif // __cuda_std__ -#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 -# error "CUDA synchronization primitives are only supported for sm_70 and up."
-#endif - #if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) #pragma GCC system_header #else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv From 9e9fb70b712a6799791997d3c70db6a28a71af72 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 3 Nov 2023 12:22:23 +0100 Subject: [PATCH 49/49] Do not expose remote mbarrier arrive with .cta scope --- .../ptx/ptx.mbarrier.arrive.compile.pass.cpp | 3 - libcudacxx/docs/extended_api/ptx.md | 18 +-- ..._and_communication_instructions_mbarrier.h | 115 +++++++----------- 3 files changed, 54 insertions(+), 82 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp index 4666467cad5..4316b3604fa 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp @@ -54,16 +54,13 @@ __global__ void test_compilation() { state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3b. state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 3b. - cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_cluster, &bar); // 4a. cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar); // 4a. - cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_cluster, &bar, 1); // 4b. cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar, 1); // 4b. state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 8. state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 8. - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_cluster, &bar, 1); // 9. cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, &bar, 1); // 9. )); #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index 1201b09748f..e45eed54a42 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -479,23 +479,23 @@ __device__ static inline uint64_t mbarrier_arrive( // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 // .sem = { .release } -// .scope = { .cta, .cluster } +// .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, + cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t* addr); // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 // .sem = { .release } -// .scope = { .cta, .cluster } +// .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, + cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& count); @@ -524,12 +524,12 @@ __device__ static inline uint64_t mbarrier_arrive_expect_tx( // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. 
PTX ISA 80, SM_90 // .sem = { .release } -// .scope = { .cta, .cluster } +// .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, + cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& tx_count); diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 01cba7cd0b1..39bab140414 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -90,15 +90,17 @@ mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], coun .sem = { .release } .scope = { .cta, .cluster } +// NOTE: .scope=.cta is dropped on purpose mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90, !memory .space = { .shared::cluster} .sem = { .release } -.scope = { .cta, .cluster } +.scope = { .cluster } +// NOTE: .scope=.cta is dropped on purpose mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90, !memory .space = { .shared::cluster} .sem = { .release } -.scope = { .cta, .cluster } +.scope = { .cluster } // mbarrier_arrive_no_complete: @@ -110,10 +112,11 @@ mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_cou .sem = { .release } .scope = { .cta, .cluster } +// NOTE: .scope=.cta is dropped on purpose mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90, !memory .space = { .shared::cluster } .sem = { .release } -.scope = { .cta, .cluster } +.scope = { .cluster } */ @@ -283,44 +286,35 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( /* // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 // .sem = { .release } -// .scope = { .cta, .cluster } +// .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, + cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t* addr); */ #if __cccl_ptx_isa >= 800 extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template +template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( sem_release_t, - scope_t<_Scope> __scope, + scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr) { // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __scope == scope_cluster (due to parameter type constraint) // __space == space_cluster (due to parameter type constraint) NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.release.cta.shared::cluster.b64 _, [%0]; // 4a. " - : - : "r"(__as_ptr_smem(__addr)) - : "memory" - ); - } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { - asm ( - "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. 
" - : - : "r"(__as_ptr_smem(__addr)) - : "memory" - ); - } + asm ( + "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. " + : + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); ),( // Unsupported architectures will have a linker error with a semi-decent error message @@ -332,48 +326,38 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( /* // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 // .sem = { .release } -// .scope = { .cta, .cluster } +// .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, + cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& count); */ #if __cccl_ptx_isa >= 800 extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template +template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( sem_release_t, - scope_t<_Scope> __scope, + scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __scope == scope_cluster (due to parameter type constraint) // __space == space_cluster (due to parameter type constraint) NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.release.cta.shared::cluster.b64 _, [%0], %1; // 4b. " - : - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); - } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { - asm ( - "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " - : - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); - } + asm ( + "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); ),( // Unsupported architectures will have a linker error with a semi-decent error message @@ -470,48 +454,38 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( /* // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 // .sem = { .release } -// .scope = { .cta, .cluster } +// .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, + cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); -template +template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( sem_release_t, - scope_t<_Scope> __scope, + scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) { // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __scope == scope_cluster (due to parameter type constraint) // __space == space_cluster (due to parameter type constraint) NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1; // 9. 
" - : - : "r"(__as_ptr_smem(__addr)), - "r"(__tx_count) - : "memory" - ); - } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { - asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " - : - : "r"(__as_ptr_smem(__addr)), - "r"(__tx_count) - : "memory" - ); - } + asm ( + "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__tx_count) + : "memory" + ); ),( // Unsupported architectures will have a linker error with a semi-decent error message @@ -522,6 +496,7 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( + // 9.7.12.15.14. Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop