From 16ad54ae7e99d0d4dc2338acc63a4d7c507bcb37 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Tue, 17 Oct 2023 16:27:24 +0200
Subject: [PATCH 01/49] Initial proof-of-concept for PTX header

---
 .../test/cuda/ptx/mbarrier_arrive_tx.pass.cpp |  52 +++
 libcudacxx/include/cuda/ptx                   | 352 ++++++++++++++++++
 .../cuda/std/detail/libcxx/include/__config   |   2 +
 3 files changed, 406 insertions(+)
 create mode 100644 libcudacxx/.upstream-tests/test/cuda/ptx/mbarrier_arrive_tx.pass.cpp
 create mode 100644 libcudacxx/include/cuda/ptx

diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/mbarrier_arrive_tx.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/mbarrier_arrive_tx.pass.cpp
new file mode 100644
index 00000000000..f72406bdeb2
--- /dev/null
+++ b/libcudacxx/.upstream-tests/test/cuda/ptx/mbarrier_arrive_tx.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+// UNSUPPORTED: pre-sm-90
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+
+#include <cuda/std/cstdint>
+
+#include "concurrent_agents.h"
+#include "cuda_space_selector.h"
+#include "test_macros.h"
+
+int main(int, char**)
+{
+  NV_DISPATCH_TARGET(
+    NV_IS_HOST, (
+      // Required by concurrent_agents_launch to know how many we're
+      // launching. This can only be an int, because the nvrtc tests use grep
+      // to figure out how many threads to launch.
+      cuda_thread_count = 1;
+    ),
+    NV_IS_DEVICE, (
+      // Do not execute. Just check if this compiles (that is: assembles) without error.
+      if (false) {
+        using cuda::ptx::sem_release;
+        using cuda::ptx::space_shared_cluster;
+        using cuda::ptx::space_shared;
+        using cuda::ptx::scope_cluster;
+        using cuda::ptx::scope_cta;
+
+        __shared__ uint64_t bar;
+        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1);
+        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1);
+
+        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1);
+        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1);
+      }
+    )
+  );
+
+  return 0;
+}
diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
new file mode 100644
index 00000000000..31d22a40231
--- /dev/null
+++ b/libcudacxx/include/cuda/ptx
@@ -0,0 +1,352 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX +#define _CUDA_PTX + +#include "std/detail/__config" // Macros +#include "std/type_traits" // std::integral_constant +#include "../nv/target" // __CUDA_MINIMUM_ARCH__ and friends + +/* + * The cuda::ptx namespace intends to provide PTX wrappers for new hardware + * features and new PTX instructions so that they can be experimented with + * before higher-level C++ APIs are designed and developed. + * + * The wrappers have the following responsibilities: + * + * - They must prevent any PTX assembler errors, that is: + * - They are defined only for versions of the CUDA Toolkit in which nvcc/ptxas + * actually recognizes the instruction. + * - Sizes and types of parameters are correct. + * - They must convert state spaces correctly. + * - They adhere to the libcu++ coding standards of using: + * - Double underscores for all parameters, variables + * - _CUDA_VSTD:: namespace for types + * + * The wrappers should not do the following: + * + * - Use any non-native types. For example, an mbarrier instruction wrapper + * takes the barrier address as a uint64_t pointer. + * + * This header is intended for: + * + * - internal consumption by higher-level APIs such as cuda::barrier, + * - outside developers who want to experiment with the latest features of the + * hardware. + * + */ + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +/* + * Integral constant types and values for + * + * - .sem + * - .space + * - .scope + * + * Skipping some steps in my reasoning: If we want to keep the PTX bindings + * relatively stable, and also be able to adapt to additions of semantics, + * space, and scope variants of a PTX instruction, then we must be able to add + * new overloads of an instruction with .sem, .space, or .scope as type-level + * parameters. + * + */ + +enum class dot_sem { + acq_rel, + acquire, + relaxed, + release, + sc, + weak + // mmio? + // volatile? 
+}; + +enum class dot_space { + reg, + sreg, + const_mem, // can't use const + global, + local, + param, + shared, + shared_cluster, + tex // deprecated +}; + +enum class dot_scope { + cta, + cluster, + gpu, + sys +}; + +template +using sem_t = std::integral_constant; +using sem_acq_rel_t = sem_t; +using sem_acquire_t = sem_t; +using sem_relaxed_t = sem_t; +using sem_release_t = sem_t; +using sem_sc_t = sem_t; +using sem_weak_t = sem_t; + +static constexpr sem_acq_rel_t sem_acq_rel{}; +static constexpr sem_acquire_t sem_acquire{}; +static constexpr sem_relaxed_t sem_relaxed{}; +static constexpr sem_release_t sem_release{}; +static constexpr sem_sc_t sem_sc{}; +static constexpr sem_weak_t sem_weak{}; + +template +using space_t = std::integral_constant; +using space_const_mem_t = std::integral_constant; +using space_global_t = std::integral_constant; +using space_local_t = std::integral_constant; +using space_param_t = std::integral_constant; +using space_reg_t = std::integral_constant; +using space_shared_t = std::integral_constant; +using space_shared_cluster_t = std::integral_constant; +using space_sreg_t = std::integral_constant; +using space_tex_t = std::integral_constant; + +static constexpr space_const_mem_t space_const_mem{}; +static constexpr space_global_t space_global{}; +static constexpr space_local_t space_local{}; +static constexpr space_param_t space_param{}; +static constexpr space_reg_t space_reg{}; +static constexpr space_shared_t space_shared{}; +static constexpr space_shared_cluster_t space_shared_cluster{}; +static constexpr space_sreg_t space_sreg{}; +static constexpr space_tex_t space_tex{}; + +template +using scope_t = std::integral_constant; +using scope_cluster_t = std::integral_constant; +using scope_cta_t = std::integral_constant; +using scope_gpu_t = std::integral_constant; +using scope_sys_t = std::integral_constant; + +static constexpr scope_cluster_t scope_cluster{}; +static constexpr scope_cta_t scope_cta{}; +static constexpr scope_gpu_t scope_gpu{}; +static constexpr scope_sys_t scope_sys{}; + + +inline __device__ _CUDA_VSTD::uint32_t __as_smem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); } +inline __device__ _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); } +inline __device__ _CUDA_VSTD::uint64_t __as_gmem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr)); } + + + +// SM 90 features +// -------------- + +/* + * TMA / cp.async.bulk + * + */ + +// cp.async.bulk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk + +// cp.reduce.async.bulk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk + +// cp.async.bulk.tensor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor + +// cp.reduce.async.bulk.tensor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor + +// cp.async.bulk.commit_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group + +// cp.async.bulk.wait_group +// 
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
+
+
+// Lower priority:
+
+// prefetch{.tensormap_space}.tensormap [a]; // prefetch the tensormap
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu
+
+// cp.async.bulk.prefetch
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
+
+// cp.async.bulk.prefetch.tensor
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor
+
+/*
+ * Shared memory barrier
+ *
+ */
+
+// mbarrier.expect_tx
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx
+
+// mbarrier.complete_tx
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx
+
+// mbarrier.arrive.expect_tx
+// Support for count argument without the modifier .noComplete requires sm_90 or higher.
+// Qualifier .expect_tx requires sm_90 or higher.
+// Sub-qualifier ::cluster requires sm_90 or higher.
+// Support for .cluster scope requires sm_90 or higher.
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
+
+
+
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
+#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
+template <dot_scope _Sco>
+_LIBCUDACXX_DEVICE inline
+_CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_t __spc, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count)
+{
+  // Arrive on local shared memory barrier
+  static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+  _CUDA_VSTD::uint64_t __token;
+
+  if constexpr (__scope == scope_cta) {
+    asm (
+      "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
+      : "=l"(__token)
+      : "r"(__as_smem_ptr(__addr)),
+        "r"(__tx_count)
+      : "memory");
+  } else {
+    asm (
+      "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;"
+      : "=l"(__token)
+      : "r"(__as_smem_ptr(__addr)),
+        "r"(__tx_count)
+      : "memory");
+  }
+  return __token;
+}
+
+template <dot_scope _Sco>
+_LIBCUDACXX_DEVICE inline
+void mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_cluster_t __spc, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count)
+{
+  // Arrive on remote cluster barrier
+  static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+  if constexpr (__scope == scope_cta) {
+    asm (
+      "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;"
+      :
+      : "r"(__as_smem_ptr(__addr)),
+        "r"(__tx_count)
+      : "memory");
+  } else {
+    asm (
+      "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;"
+      :
+      : "r"(__as_smem_ptr(__addr)),
+        "r"(__tx_count)
+      : "memory");
+  }
+}
+#endif // __CUDA_MINIMUM_ARCH__
+
+
+
+
+// mbarrier.test_wait/mbarrier.try_wait
+// mbarrier.try_wait requires sm_90 or higher.
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait + + +/* + * Cluster Basics: + * + * These instructions are already exposed at a higher level, so may not be necessary. + */ + +// mapa{.space}.type d, a, b; +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa + +// getctarank{.space}.type d, a; +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank + +// barrier.cluster +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster + +// atom .cluster +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom + +// red .cluster +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red + +/* + * Cluster async + * + */ + +// st.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async + +// red.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async + +/* + * + * Other instructions + */ + +// fence.proxy.async.{global, shared::{cta, cluster}} +// fence.mbarrier_init.release.cluster (may be a bit overkill??) +// fence.{sc, acq_rel}.cluster +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence + +// multimem.ld_reduce, multimem.st, multimem.red +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red + +// griddepcontrol +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol + +// elect.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync + +// stmatrix +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix + +/* + * Special registers (cluster-related) + * + */ + +// 10.12. Special Registers: %clusterid +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clusterid + +// 10.13. Special Registers: %nclusterid +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nclusterid + +// 10.14. Special Registers: %cluster_ctaid +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctaid + +// 10.15. Special Registers: %cluster_nctaid +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctaid + +// 10.16. Special Registers: %cluster_ctarank +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctarank + +// 10.17. Special Registers: %cluster_nctarank +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctarank + +// 10.31. 
Special Registers: %aggr_smem_size +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-aggr-smem-size + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config index 79fe46c5a05..65db3322031 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config @@ -1493,6 +1493,8 @@ typedef __char32_t char32_t; #define _LIBCUDACXX_END_NAMESPACE_CUDA } } #define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE namespace cuda { namespace device { inline namespace _LIBCUDACXX_ABI_NAMESPACE { #define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE } } } +#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX namespace cuda { namespace ptx { inline namespace _LIBCUDACXX_ABI_NAMESPACE { +#define _LIBCUDACXX_END_NAMESPACE_CUDA_PTX } } } #define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL namespace cuda { namespace device { namespace experimental { inline namespace _LIBCUDACXX_ABI_NAMESPACE { #define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL } } } } #endif From 9b31cc8b801b75d6f8a1b82eae95f8a4f98d7f08 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Tue, 17 Oct 2023 16:53:27 +0200 Subject: [PATCH 02/49] Add docs --- libcudacxx/docs/extended_api.md | 2 + libcudacxx/docs/extended_api/ptx.md | 70 +++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 libcudacxx/docs/extended_api/ptx.md diff --git a/libcudacxx/docs/extended_api.md b/libcudacxx/docs/extended_api.md index 952b7c81e51..6f71683edc7 100644 --- a/libcudacxx/docs/extended_api.md +++ b/libcudacxx/docs/extended_api.md @@ -21,6 +21,8 @@ nav_order: 3 {% include_relative extended_api/functional.md %} +{% include_relative extended_api/ptx.md %} + [Thread Scopes]: ./extended_api/memory_model.md#thread-scopes [Thread Groups]: ./extended_api/thread_groups.md diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md new file mode 100644 index 00000000000..56a3c519f5c --- /dev/null +++ b/libcudacxx/docs/extended_api/ptx.md @@ -0,0 +1,70 @@ +## PTX instructions + +The `cuda::ptx` namespace contains functions that map one-to-one to PTX +instructions. These can be used for maximal control of the generated code, or to +experiment with new hardware features before a high-level C++ API is available. 
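+
+As a minimal sketch of the calling convention (a hypothetical kernel, assuming
+an SM 9.0 target; a real kernel must first initialize the mbarrier, and the
+mbarrier section below shows a complete example):
+
+```cuda
+#include <cuda/ptx>
+
+__global__ void sketch_kernel() {
+  __shared__ uint64_t bar;
+  // ... initialize bar (mbarrier.init) and synchronize before arriving ...
+
+  // The .sem, .scope, and .space variants of the instruction are selected
+  // by passing tag values as the leading function arguments:
+  cuda::ptx::mbarrier_arrive_expect_tx(
+    cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared,
+    &bar, 1);
+}
+```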
+
+### Shared memory barrier (mbarrier)
+
+| Instruction                             | Compute capability | CUDA Toolkit |
+|-----------------------------------------|--------------------|--------------|
+| `cuda::ptx::mbarrier_arrive_expect_tx`  | 9.0                | CTK 12.4     |
+
+
+#### [`cuda::ptx::mbarrier_arrive_expect_tx`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
+
+```cuda
+template <dot_scope _Sco>
+__device__ inline
+uint64_t mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_t __spc, uint64_t* __addr, uint32_t __tx_count);
+
+template <dot_scope _Sco>
+__device__ inline
+void mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_cluster_t __spc, uint64_t* __addr, uint32_t __tx_count);
+```
+
+Usage:
+
+```cuda
+#include <cuda/ptx>
+#include <cuda/barrier>
+#include <cooperative_groups.h>
+
+__global__ void kernel() {
+  using cuda::ptx::sem_release;
+  using cuda::ptx::space_shared_cluster;
+  using cuda::ptx::space_shared;
+  using cuda::ptx::scope_cluster;
+  using cuda::ptx::scope_cta;
+
+  using barrier_t = cuda::barrier<cuda::thread_scope_block>;
+  __shared__ barrier_t bar;
+  init(&bar, blockDim.x);
+  __syncthreads();
+
+
+
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // Arrive on local shared memory barrier:
+    uint64_t token;
+    uint64_t * addr = cuda::device::barrier_native_handle(bar);
+    token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, addr, 1);
+    token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, addr, 1);
+
+    // Get address of remote cluster barrier:
+    namespace cg = cooperative_groups;
+    cg::cluster_group cluster = cg::this_cluster();
+    unsigned int other_block_rank = cluster.block_rank() ^ 1;
+    uint64_t * remote_bar = cluster.map_shared_rank(addr, other_block_rank);
+
+    // Sync cluster to ensure remote barrier is initialized.
+    cluster.sync();
+
+    // Arrive on remote cluster barrier:
+    cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, remote_bar, 1);
+    cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, remote_bar, 1);
+  ));
+}
+```
+
+
+
From 229704aa0a0f3d2f71977d60f3044b98a1fd3bee Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Tue, 17 Oct 2023 16:55:11 +0200
Subject: [PATCH 03/49] Reformat docs

---
 libcudacxx/docs/extended_api/ptx.md | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md
index 56a3c519f5c..26d3236b9bd 100644
--- a/libcudacxx/docs/extended_api/ptx.md
+++ b/libcudacxx/docs/extended_api/ptx.md
@@ -16,11 +16,11 @@ experiment with new hardware features before a high-level C++ API is available.
 ```cuda
 template <dot_scope _Sco>
 __device__ inline
-uint64_t mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_t __spc, uint64_t* __addr, uint32_t __tx_count);
+uint64_t mbarrier_arrive_expect_tx(sem_release_t sem, scope_t<_Sco> scope, space_shared_t spc, uint64_t* addr, uint32_t tx_count);
 
 template <dot_scope _Sco>
 __device__ inline
-void mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_cluster_t __spc, uint64_t* __addr, uint32_t __tx_count);
+void mbarrier_arrive_expect_tx(sem_release_t sem, scope_t<_Sco> scope, space_shared_cluster_t spc, uint64_t* addr, uint32_t tx_count);
 ```
 
 Usage:
 
 ```cuda
@@ -42,8 +42,6 @@ __global__ void kernel() {
   init(&bar, blockDim.x);
   __syncthreads();
 
-
-
   NV_IF_TARGET(NV_PROVIDES_SM_90, (
From dad93de59af47bf8003cea1c00d3c228255e6de2 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Tue, 17 Oct 2023 18:26:29 +0200
Subject: [PATCH 04/49] Use PTX wrapper in internal code

---
 .../cuda/std/detail/libcxx/include/__cuda/barrier.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
index da6b09b3e3d..6b3919f29d4 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
@@ -24,6 +24,7 @@
 #endif
 
 #include "../cstdlib" // _LIBCUDACXX_UNREACHABLE
+#include <cuda/ptx> // cuda::ptx::*
 
 #if defined(_LIBCUDACXX_COMPILER_NVRTC)
 #define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type *)0)->member))
@@ -586,14 +587,12 @@ barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
     // us in release builds. In debug builds, the error would be caught
     // by the asserts at the top of this function.
 
-    auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
+    auto __native_handle = barrier_native_handle(__b);
+    auto __bh = __cvta_generic_to_shared(__native_handle);
     if (__arrive_count_update == 1) {
-        asm (
-            "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
-            : "=l"(__token)
-            : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
-              "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
-            : "memory");
+        __token = cuda::ptx::mbarrier_arrive_expect_tx(
+            cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, __native_handle, __transaction_count_update
+        );
     } else {
         asm (
             "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
From 220d4758eebb9504084fd80c9075fff89a24a570 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 18 Oct 2023 10:38:12 +0200
Subject: [PATCH 05/49] Apply suggestions from code review

Co-authored-by: Michael Schellenberger Costa
---
 libcudacxx/docs/extended_api/ptx.md | 2 +-
 libcudacxx/include/cuda/ptx         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md
index 26d3236b9bd..e8ab487d16a 100644
--- a/libcudacxx/docs/extended_api/ptx.md
+++ b/libcudacxx/docs/extended_api/ptx.md
@@ -1,7 +1,7 @@
 ## PTX instructions
 
 The `cuda::ptx` namespace contains functions that map one-to-one to PTX
-instructions. These can be used for maximal control of the generated code, or to
+[instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html). These can be used for maximal control of the generated code, or to
 experiment with new hardware features before a high-level C++ API is available.
### Shared memory barrier (mbarrier) diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index 31d22a40231..ff2c26bb05b 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -29,7 +29,7 @@ * - Sizes and types of parameters are correct. * - They must convert state spaces correctly. * - They adhere to the libcu++ coding standards of using: - * - Double underscores for all parameters, variables + * - Reserved identifiers for all parameters, variables. E.g. `__meow` or `_Woof` * - _CUDA_VSTD:: namespace for types * * The wrappers should not do the following: From ae1a0846fe6fe652fe9a7b47a57000d89267370d Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 18 Oct 2023 10:38:50 +0200 Subject: [PATCH 06/49] Address review comments --- libcudacxx/include/cuda/ptx | 59 +++++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 22 deletions(-) diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index ff2c26bb05b..c64b20fbfbf 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -43,12 +43,25 @@ * - outside developers who want to experiment with the latest features of the * hardware. * + * Stability: + * + * - These headers are intended to present a stable API (not ABI) within one + * major version of the CTK. This means that: + * - All functions are marked inline + * - The type of a function parameter can be changed to be more generic if + * that means that code that called the original version can still be + * compiled. + * + * - Good exposure of the PTX should be high priority. If, at a new major + * version, we face a difficult choice between breaking backward-compatibility + * and an improvement of the PTX exposure, we will tend to the latter option + * more easily than in other parts of libcu++. */ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX /* - * Integral constant types and values for + * Public integral constant types and values for * * - .sem * - .space @@ -76,13 +89,14 @@ enum class dot_sem { enum class dot_space { reg, sreg, - const_mem, // can't use const + const_mem, // Using const_mem as `const` is reserved in C++. global, local, param, - shared, - shared_cluster, + shared, // The PTX spelling is shared::cta + shared_cluster, // The PTX spelling is shared::cluster, but we might want to go for cluster here. tex // deprecated + // generic? 
 };
 
 enum class dot_scope {
   cta,
   cluster,
   gpu,
   sys
 };
 
 template <dot_sem _Sem>
-using sem_t = std::integral_constant<dot_sem, _Sem>;
+using sem_t = _CUDA_VSTD::integral_constant<dot_sem, _Sem>;
 using sem_acq_rel_t = sem_t<dot_sem::acq_rel>;
 using sem_acquire_t = sem_t<dot_sem::acquire>;
 using sem_relaxed_t = sem_t<dot_sem::relaxed>;
 using sem_release_t = sem_t<dot_sem::release>;
-using sem_sc_t = sem_t<dot_sem::sc>;
-using sem_weak_t = sem_t<dot_sem::weak>;
+using sem_sc_t = sem_t<dot_sem::sc>;
+using sem_weak_t = sem_t<dot_sem::weak>;
 
 static constexpr sem_acq_rel_t sem_acq_rel{};
 static constexpr sem_acquire_t sem_acquire{};
@@ -123,16 +126,16 @@ static constexpr sem_sc_t sem_sc{};
 static constexpr sem_weak_t sem_weak{};
 
 template <dot_space _Spc>
-using space_t = std::integral_constant<dot_space, _Spc>;
-using space_const_mem_t = std::integral_constant<dot_space, dot_space::const_mem>;
-using space_global_t = std::integral_constant<dot_space, dot_space::global>;
-using space_local_t = std::integral_constant<dot_space, dot_space::local>;
-using space_param_t = std::integral_constant<dot_space, dot_space::param>;
-using space_reg_t = std::integral_constant<dot_space, dot_space::reg>;
-using space_shared_t = std::integral_constant<dot_space, dot_space::shared>;
+using space_t = _CUDA_VSTD::integral_constant<dot_space, _Spc>;
+using space_const_mem_t = space_t<dot_space::const_mem>;
+using space_global_t = space_t<dot_space::global>;
+using space_local_t = space_t<dot_space::local>;
+using space_param_t = space_t<dot_space::param>;
+using space_reg_t = space_t<dot_space::reg>;
+using space_shared_t = space_t<dot_space::shared>;
 using space_shared_cluster_t = space_t<dot_space::shared_cluster>;
-using space_sreg_t = space_t<dot_space::sreg>;
-using space_tex_t = space_t<dot_space::tex>;
+using space_sreg_t = space_t<dot_space::sreg>;
+using space_tex_t = space_t<dot_space::tex>;
 
 static constexpr space_const_mem_t space_const_mem{};
 static constexpr space_global_t space_global{};
@@ -145,23 +148,30 @@ static constexpr space_sreg_t space_sreg{};
 static constexpr space_tex_t space_tex{};
 
 template <dot_scope _Sco>
-using scope_t = std::integral_constant<dot_scope, _Sco>;
-using scope_cluster_t = std::integral_constant<dot_scope, dot_scope::cluster>;
-using scope_cta_t = std::integral_constant<dot_scope, dot_scope::cta>;
-using scope_gpu_t = std::integral_constant<dot_scope, dot_scope::gpu>;
-using scope_sys_t = std::integral_constant<dot_scope, dot_scope::sys>;
+using scope_t = _CUDA_VSTD::integral_constant<dot_scope, _Sco>;
+using scope_cluster_t = scope_t<dot_scope::cluster>;
+using scope_cta_t = scope_t<dot_scope::cta>;
+using scope_gpu_t = scope_t<dot_scope::gpu>;
+using scope_sys_t = scope_t<dot_scope::sys>;
 
 static constexpr scope_cluster_t scope_cluster{};
 static constexpr scope_cta_t scope_cta{};
 static constexpr scope_gpu_t scope_gpu{};
 static constexpr scope_sys_t scope_sys{};
 
-
+// Private helper functions
 inline __device__ _CUDA_VSTD::uint32_t __as_smem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); }
 inline __device__ _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); }
 inline __device__ _CUDA_VSTD::uint64_t __as_gmem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr)); }
@@ -217,7 +232,7 @@ _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco
   static_assert(__scope == scope_cta || __scope == scope_cluster, "");
   _CUDA_VSTD::uint64_t __token;
 
-  if constexpr (__scope == scope_cta) {
+  if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) {
     asm (
       "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
       : "=l"(__token)
@@ -241,7 +256,7 @@ void mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space
 {
   // Arrive on remote cluster barrier
   static_assert(__scope == scope_cta || __scope == scope_cluster, "");
-  if constexpr (__scope == scope_cta) {
+  if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) {
     asm (
       "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;"
       :
From ecbb6fea762f49ed9793fff64abe1a0ffc39de08 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 18 Oct 2023 11:51:36 +0200
Subject: [PATCH 07/49] Apply suggestions from code review

Co-authored-by: Michael Schellenberger Costa
---
 libcudacxx/docs/extended_api/ptx.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md
index e8ab487d16a..d5092dc030f 100644
--- a/libcudacxx/docs/extended_api/ptx.md
+++ b/libcudacxx/docs/extended_api/ptx.md
@@ -1,7 +1,7 @@
 ## PTX instructions
 
-The `cuda::ptx` namespace contains functions that map one-to-one to PTX
-[instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html). These can be used for maximal control of the generated code, or to
+The `cuda::ptx` namespace contains functions that map one-to-one to
+[PTX instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html). These can be used for maximal control of the generated code, or to
 experiment with new hardware features before a high-level C++ API is available.
 
 ### Shared memory barrier (mbarrier)
From cf19e539c8ddb344d1c6dcf3aa9b24d0f44f0e00 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 18 Oct 2023 11:52:18 +0200
Subject: [PATCH 08/49] Address review comments

---
 .../test/cuda/ptx/mbarrier_arrive_tx.pass.cpp | 52 ------------
 .../test/cuda/ptx/sm90.ptx.compile.pass.cpp   | 44 ++++++++++
 libcudacxx/include/cuda/ptx                   |  7 ++-
 3 files changed, 47 insertions(+), 56 deletions(-)
 delete mode 100644 libcudacxx/.upstream-tests/test/cuda/ptx/mbarrier_arrive_tx.pass.cpp
 create mode 100644 libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp

diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/mbarrier_arrive_tx.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/mbarrier_arrive_tx.pass.cpp
deleted file mode 100644
index f72406bdeb2..00000000000
--- a/libcudacxx/.upstream-tests/test/cuda/ptx/mbarrier_arrive_tx.pass.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of libcu++, the C++ Standard Library for your entire system,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-// UNSUPPORTED: libcpp-has-no-threads
-// UNSUPPORTED: pre-sm-90
-
-// <cuda/ptx>
-
-#include <cuda/ptx>
-
-#include <cuda/std/cstdint>
-
-#include "concurrent_agents.h"
-#include "cuda_space_selector.h"
-#include "test_macros.h"
-
-int main(int, char**)
-{
-  NV_DISPATCH_TARGET(
-    NV_IS_HOST, (
-      // Required by concurrent_agents_launch to know how many we're
-      // launching. This can only be an int, because the nvrtc tests use grep
-      // to figure out how many threads to launch.
-      cuda_thread_count = 1;
-    ),
-    NV_IS_DEVICE, (
-      // Do not execute. Just check if this compiles (that is: assembles) without error.
-      if (false) {
-        using cuda::ptx::sem_release;
-        using cuda::ptx::space_shared_cluster;
-        using cuda::ptx::space_shared;
-        using cuda::ptx::scope_cluster;
-        using cuda::ptx::scope_cta;
-
-        __shared__ uint64_t bar;
-        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1);
-        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1);
-
-        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1);
-        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1);
-      }
-    )
-  );
-
-  return 0;
-}
diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp
new file mode 100644
index 00000000000..27b5af8e6f2
--- /dev/null
+++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp
@@ -0,0 +1,44 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+// UNSUPPORTED: pre-sm-90

+// <cuda/ptx>
+
+#include <cuda/ptx>
+
+#include <cuda/std/cstdint>
+
+#include "concurrent_agents.h"
+#include "cuda_space_selector.h"
+#include "test_macros.h"
+
+int main(int, char**)
+{
+  NV_IF_TARGET(NV_IS_DEVICE, (
+    // Do not execute. Just check that the PTX below compiles (that is: assembles) without error.
+    if (false) {
+      using cuda::ptx::sem_release;
+      using cuda::ptx::space_shared_cluster;
+      using cuda::ptx::space_shared;
+      using cuda::ptx::scope_cluster;
+      using cuda::ptx::scope_cta;
+
+      __shared__ uint64_t bar;
+      cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1);
+      cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1);
+
+      cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1);
+      cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1);
+    }
+  ));
+
+  return 0;
+}
diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
index c64b20fbfbf..c00bc8e9a6c 100644
--- a/libcudacxx/include/cuda/ptx
+++ b/libcudacxx/include/cuda/ptx
@@ -158,10 +158,9 @@ static constexpr scope_sys_t scope_sys{};
 
 // Private helper functions
-inline __device__ _CUDA_VSTD::uint32_t __as_smem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); }
-inline __device__ _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); }
-inline __device__ _CUDA_VSTD::uint64_t __as_gmem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr)); }
+inline _LIBCUDACXX_DEVICE CUDA_VSTD::uint32_t __as_smem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); }
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void * __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); }
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_gmem_ptr(const void * __ptr) { return
static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr)); } // SM 90 features From 1d57b022a4057df3cbf0bcf2bc115124762a38b4 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 25 Oct 2023 12:02:40 +0200 Subject: [PATCH 09/49] Fix typo --- libcudacxx/include/cuda/ptx | 203 +++++++++++++++++++----------------- 1 file changed, 106 insertions(+), 97 deletions(-) diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index c00bc8e9a6c..cf14026c01f 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -12,9 +12,9 @@ #ifndef _CUDA_PTX #define _CUDA_PTX -#include "std/detail/__config" // Macros -#include "std/type_traits" // std::integral_constant -#include "../nv/target" // __CUDA_MINIMUM_ARCH__ and friends +#include "std/cstdint" // uint32_t +#include "std/type_traits" // std::integral_constant +#include "../nv/target" // __CUDA_MINIMUM_ARCH__ and friends /* * The cuda::ptx namespace intends to provide PTX wrappers for new hardware @@ -75,45 +75,48 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX * */ -enum class dot_sem { - acq_rel, - acquire, - relaxed, - release, - sc, - weak - // mmio? - // volatile? +enum class dot_sem +{ + acq_rel, + acquire, + relaxed, + release, + sc, + weak + // mmio? + // volatile? }; -enum class dot_space { - reg, - sreg, - const_mem, // Using const_mem as `const` is reserved in C++. - global, - local, - param, - shared, // The PTX spelling is shared::cta - shared_cluster, // The PTX spelling is shared::cluster, but we might want to go for cluster here. - tex // deprecated - // generic? +enum class dot_space +{ + reg, + sreg, + const_mem, // Using const_mem as `const` is reserved in C++. + global, + local, + param, + shared, // The PTX spelling is shared::cta + shared_cluster, // The PTX spelling is shared::cluster, but we might want to go for cluster here. + tex // deprecated + // generic? 
 };
 
 template <dot_sem _Sem>
-using sem_t = _CUDA_VSTD::integral_constant<dot_sem, _Sem>;
+using sem_t = _CUDA_VSTD::integral_constant<dot_sem, _Sem>;
 using sem_acq_rel_t = sem_t<dot_sem::acq_rel>;
 using sem_acquire_t = sem_t<dot_sem::acquire>;
 using sem_relaxed_t = sem_t<dot_sem::relaxed>;
 using sem_release_t = sem_t<dot_sem::release>;
-using sem_sc_t = sem_t<dot_sem::sc>;
-using sem_weak_t = sem_t<dot_sem::weak>;
+using sem_sc_t = sem_t<dot_sem::sc>;
+using sem_weak_t = sem_t<dot_sem::weak>;
 
 static constexpr sem_acq_rel_t sem_acq_rel{};
 static constexpr sem_acquire_t sem_acquire{};
@@ -123,16 +126,16 @@ static constexpr sem_sc_t sem_sc{};
 static constexpr sem_weak_t sem_weak{};
 
 template <dot_space _Spc>
-using space_t = _CUDA_VSTD::integral_constant<dot_space, _Spc>;
-using space_const_mem_t = space_t<dot_space::const_mem>;
-using space_global_t = space_t<dot_space::global>;
-using space_local_t = space_t<dot_space::local>;
-using space_param_t = space_t<dot_space::param>;
-using space_reg_t = space_t<dot_space::reg>;
-using space_shared_t = space_t<dot_space::shared>;
+using space_t = _CUDA_VSTD::integral_constant<dot_space, _Spc>;
+using space_const_mem_t = space_t<dot_space::const_mem>;
+using space_global_t = space_t<dot_space::global>;
+using space_local_t = space_t<dot_space::local>;
+using space_param_t = space_t<dot_space::param>;
+using space_reg_t = space_t<dot_space::reg>;
+using space_shared_t = space_t<dot_space::shared>;
 using space_shared_cluster_t = space_t<dot_space::shared_cluster>;
-using space_sreg_t = space_t<dot_space::sreg>;
-using space_tex_t = space_t<dot_space::tex>;
+using space_sreg_t = space_t<dot_space::sreg>;
+using space_tex_t = space_t<dot_space::tex>;
 
 static constexpr space_const_mem_t space_const_mem{};
 static constexpr space_global_t space_global{};
@@ -145,23 +148,30 @@ static constexpr space_sreg_t space_sreg{};
 static constexpr space_tex_t space_tex{};
 
 template <dot_scope _Sco>
-using scope_t = _CUDA_VSTD::integral_constant<dot_scope, _Sco>;
+using scope_t = _CUDA_VSTD::integral_constant<dot_scope, _Sco>;
 using scope_cluster_t = scope_t<dot_scope::cluster>;
-using scope_cta_t = scope_t<dot_scope::cta>;
-using scope_gpu_t = scope_t<dot_scope::gpu>;
-using scope_sys_t = scope_t<dot_scope::sys>;
+using scope_cta_t = scope_t<dot_scope::cta>;
+using scope_gpu_t = scope_t<dot_scope::gpu>;
+using scope_sys_t = scope_t<dot_scope::sys>;
 
 static constexpr scope_cluster_t scope_cluster{};
 static constexpr scope_cta_t scope_cta{};
 static constexpr scope_gpu_t scope_gpu{};
 static constexpr scope_sys_t scope_sys{};
 
 // Private helper functions
-inline _LIBCUDACXX_DEVICE CUDA_VSTD::uint32_t __as_smem_ptr(const void* __ptr)
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_smem_ptr(const void* __ptr)
+{
+  return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
+}
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void* __ptr)
+{
+  return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
+}
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_gmem_ptr(const void* __ptr)
+{
+  return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr));
+}
 
 // SM 90 features
 // --------------
@@ -189,7 +199,6 @@
 // cp.async.bulk.wait_group
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
 
-
 // Lower priority:
 
 // prefetch{.tensormap_space}.tensormap [a]; // prefetch the tensormap
@@ -219,68 +228,68 @@
 // mbarrier.arrive.expect_tx
 // Support for count argument without the modifier .noComplete requires sm_90 or higher.
 // Qualifier .expect_tx requires sm_90 or higher.
 // Sub-qualifier ::cluster requires sm_90 or higher.
 // Support for .cluster scope requires sm_90 or higher.
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
 
-
-
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
 #if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
 template <dot_scope _Sco>
-_LIBCUDACXX_DEVICE inline
-_CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_t __spc, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count)
+_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(
+  sem_release_t __sem,
+  scope_t<_Sco> __scope,
+  space_shared_t __spc,
+  _CUDA_VSTD::uint64_t* __addr,
+  _CUDA_VSTD::uint32_t __tx_count)
 {
-  // Arrive on local shared memory barrier
-  static_assert(__scope == scope_cta || __scope == scope_cluster, "");
-  _CUDA_VSTD::uint64_t __token;
-
-  if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) {
-    asm (
-      "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
-      : "=l"(__token)
-      : "r"(__as_smem_ptr(__addr)),
-      "r"(__tx_count)
-      : "memory");
-  } else {
-    asm (
-      "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;"
-      : "=l"(__token)
-      : "r"(__as_smem_ptr(__addr)),
-      "r"(__tx_count)
-      : "memory");
-  }
-  return __token;
+  // Arrive on local shared memory barrier
+  static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+  _CUDA_VSTD::uint64_t __token;
+
+  if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta)
+  {
+    asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
+        : "=l"(__token)
+        : "r"(__as_smem_ptr(__addr)), "r"(__tx_count)
+        : "memory");
+  }
+  else
+  {
+    asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;"
+        : "=l"(__token)
+        : "r"(__as_smem_ptr(__addr)), "r"(__tx_count)
+        : "memory");
+  }
+  return __token;
 }
 
 template <dot_scope _Sco>
-_LIBCUDACXX_DEVICE inline
-void mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_cluster_t __spc, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count)
+_LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx(
+  sem_release_t __sem,
+  scope_t<_Sco> __scope,
+  space_shared_cluster_t __spc,
+  _CUDA_VSTD::uint64_t* __addr,
+  _CUDA_VSTD::uint32_t __tx_count)
 {
-  // Arrive on remote cluster barrier
-  static_assert(__scope == scope_cta || __scope == scope_cluster, "");
-  if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) {
-    asm (
-      "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;"
-      :
-      : "r"(__as_smem_ptr(__addr)),
-      "r"(__tx_count)
-      : "memory");
-  } else {
-    asm (
-      "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;"
-      :
-      : "r"(__as_smem_ptr(__addr)),
-      "r"(__tx_count)
-      : "memory");
-  }
+  // Arrive on remote cluster barrier
+  static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+  if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta)
+  {
+    asm("mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;"
+        :
+        : "r"(__as_smem_ptr(__addr)), "r"(__tx_count)
+        : "memory");
+  }
+  else
+  {
+    asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;"
+        :
+        : "r"(__as_smem_ptr(__addr)), "r"(__tx_count)
+        : "memory");
+  }
 }
 #endif // __CUDA_MINIMUM_ARCH__
 
-
-
-
 // mbarrier.test_wait/mbarrier.try_wait
 // mbarrier.try_wait requires sm_90 or higher.
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait
 
 /*
  * Cluster Basics:
  *
From 21050e82e2a89dea7a034ab1f61781b9ac094be3 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 18 Oct 2023 17:08:29 +0200
Subject: [PATCH 10/49] Add targeting macros and a few more helper functions

---
 libcudacxx/include/cuda/ptx | 51 +++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
index cf14026c01f..c499fd57aed 100644
--- a/libcudacxx/include/cuda/ptx
+++ b/libcudacxx/include/cuda/ptx
@@ -58,6 +58,45 @@
  * more easily than in other parts of libcu++.
  */
 
+
+/*
+ * Targeting macros
+ *
+ */
+
+#if (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
+#  define _LIBCUDACXX_PTX_SM_80_AVAILABLE
+#endif
+
+#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
+#  define _LIBCUDACXX_PTX_SM_90_AVAILABLE
+#endif
+
+// PTX ISA 7.8 is available from CTK 11.8, driver r520
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__))
+#  define _LIBCUDACXX_PTX_ISA_78_AVAILABLE
+#endif
+
+// PTX ISA 7.8 is available from CTK 11.8, driver r520 (so also from CTK 12.0 onwards)
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
+#  define _LIBCUDACXX_PTX_ISA_78_AVAILABLE
+#endif
+
+// PTX ISA 8.0 is available from CTK 12.0, driver r525
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
+#  define _LIBCUDACXX_PTX_ISA_80_AVAILABLE
+#endif
+
+// PTX ISA 8.1 is available from CTK 12.1, driver r530
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__))
+#  define _LIBCUDACXX_PTX_ISA_81_AVAILABLE
+#endif
+
+// PTX ISA 8.2 is available from CTK 12.2, driver r535
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__))
+#  define _LIBCUDACXX_PTX_ISA_82_AVAILABLE
+#endif
+
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
@@ -173,6 +212,18 @@ inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_gmem_ptr(const void* __ptr)
   return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr));
 }
 
+template <typename _Tp>
+inline _LIBCUDACXX_DEVICE int __as_b32(_Tp __val) {
+  static_assert(sizeof(_Tp) == 4, "");
+  return *reinterpret_cast<int*>(&__val);
+}
+
+template <typename _Tp>
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val) {
+  static_assert(sizeof(_Tp) == 8, "");
+  return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val);
+}
+
 // SM 90 features
 // --------------
From 986d990073c06def2174d46bc396bd62ebdbc18e Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 12:32:22 +0200
Subject: [PATCH 11/49] Add PTX ISA 8.3 macro

---
 libcudacxx/include/cuda/ptx | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
index c499fd57aed..c5543c4bf59 100644
--- a/libcudacxx/include/cuda/ptx
+++ b/libcudacxx/include/cuda/ptx
@@ -97,6 +97,11 @@
 #  define _LIBCUDACXX_PTX_ISA_82_AVAILABLE
 #endif
 
+// PTX ISA 8.3 is available from CTK 12.3, driver r545
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 &&
__CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +# define _LIBCUDACXX_PTX_ISA_83_AVAILABLE +#endif + _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX /* From 82d1b859d0d99df919c54e45320f11f35808f891 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 25 Oct 2023 13:05:43 +0200 Subject: [PATCH 12/49] Improve code organization --- libcudacxx/include/cuda/ptx | 845 ++++++++++++------ ..._and_communication_instructions_mbarrier.h | 105 +++ .../include/__cuda/ptx/ptx_dot_variants.h | 136 +++ .../include/__cuda/ptx/ptx_helper_functions.h | 43 + .../__cuda/ptx/ptx_isa_target_macros.h | 63 ++ 5 files changed, 909 insertions(+), 283 deletions(-) create mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h create mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h create mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h create mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index c5543c4bf59..ea319195134 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -16,6 +16,11 @@ #include "std/type_traits" // std::integral_constant #include "../nv/target" // __CUDA_MINIMUM_ARCH__ and friends +#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h" +#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h" +#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h" +#include "cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h" + /* * The cuda::ptx namespace intends to provide PTX wrappers for new hardware * features and new PTX instructions so that they can be experimented with @@ -58,373 +63,647 @@ * more easily than in other parts of libcu++. */ +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +/* + * Instructions + * + * The organization of the instructions below follows that of the PTX ISA documentation: + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#instructions + * + * To improve code organization, some sections are separated into their own + * header. For instance, the mbarrier instructions are found in: + * __cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h + * + */ /* - * Targeting macros + * 9.7.1. Integer Arithmetic Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions * */ -#if (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) -# define _LIBCUDACXX_PTX_SM_80_AVAILABLE -#endif +// 9.7.1.7. Integer Arithmetic Instructions: sad +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sad -#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) -# define _LIBCUDACXX_PTX_SM_90_AVAILABLE -#endif +// 9.7.1.8. Integer Arithmetic Instructions: div +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-div -// PTX ISA 7.8 is available from CTK 11.8, driver r520 -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__)) -# define _LIBCUDACXX_PTX_ISA_78_AVAILABLE -#endif +// 9.7.1.9. 
Integer Arithmetic Instructions: rem +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-rem -// PTX ISA 7.8 is available from CTK 11.8, driver r520 (so also from CTK 12.0 onwards) -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) -# define _LIBCUDACXX_PTX_ISA_78_AVAILABLE -#endif +// 9.7.1.10. Integer Arithmetic Instructions: abs +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-abs -// PTX ISA 8.0 is available from CTK 12.0, driver r525 -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) -# define _LIBCUDACXX_PTX_ISA_80_AVAILABLE -#endif +// 9.7.1.11. Integer Arithmetic Instructions: neg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-neg -// PTX ISA 8.1 is available from CTK 12.1, driver r530 -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) -# define _LIBCUDACXX_PTX_ISA_81_AVAILABLE -#endif +// 9.7.1.12. Integer Arithmetic Instructions: min +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-min -// PTX ISA 8.2 is available from CTK 12.2, driver r535 -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) -# define _LIBCUDACXX_PTX_ISA_82_AVAILABLE -#endif +// 9.7.1.13. Integer Arithmetic Instructions: max +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-max -// PTX ISA 8.3 is available from CTK 12.3, driver r545 -#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) -# define _LIBCUDACXX_PTX_ISA_83_AVAILABLE -#endif +// 9.7.1.14. Integer Arithmetic Instructions: popc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc + +// 9.7.1.15. Integer Arithmetic Instructions: clz +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz + +// 9.7.1.16. Integer Arithmetic Instructions: bfind +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind + +// 9.7.1.17. Integer Arithmetic Instructions: fns +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns + +// 9.7.1.18. Integer Arithmetic Instructions: brev +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev + +// 9.7.1.19. Integer Arithmetic Instructions: bfe +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe + +// 9.7.1.20. Integer Arithmetic Instructions: bfi +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi + +// 9.7.1.21. Integer Arithmetic Instructions: szext +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-szext + +// 9.7.1.22. Integer Arithmetic Instructions: bmsk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bmsk + +// 9.7.1.23. 
Integer Arithmetic Instructions: dp4a +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp4a + +// 9.7.1.24. Integer Arithmetic Instructions: dp2a +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp2a -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX /* - * Public integral constant types and values for + * 9.7.2. Extended-Precision Integer Arithmetic Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-integer-arithmetic-instructions * - * - .sem - * - .space - * - .scope + */ + +// 9.7.2.1. Extended-Precision Arithmetic Instructions: add.cc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-add-cc + +// 9.7.2.2. Extended-Precision Arithmetic Instructions: addc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-addc + +// 9.7.2.3. Extended-Precision Arithmetic Instructions: sub.cc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-sub-cc + +// 9.7.2.4. Extended-Precision Arithmetic Instructions: subc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-subc + +// 9.7.2.5. Extended-Precision Arithmetic Instructions: mad.cc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-mad-cc + +// 9.7.2.6. Extended-Precision Arithmetic Instructions: madc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc + + +/* + * 9.7.3. Floating-Point Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions * - * Skipping some steps in my reasoning: If we want to keep the PTX bindings - * relatively stable, and also be able to adapt to additions of semantics, - * space, and scope variants of a PTX instruction, then we must be able to add - * new overloads of an instruction with .sem, .space, or .scope as type-level - * parameters. + */ + +// 9.7.3.1. Floating Point Instructions: testp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-testp + +// 9.7.3.2. Floating Point Instructions: copysign +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign + +// 9.7.3.3. Floating Point Instructions: add +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add + +// 9.7.3.4. Floating Point Instructions: sub +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sub + +// 9.7.3.5. Floating Point Instructions: mul +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul + +// 9.7.3.6. Floating Point Instructions: fma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma + +// 9.7.3.7. Floating Point Instructions: mad +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad + +// 9.7.3.8. Floating Point Instructions: div +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div + +// 9.7.3.9. 
Floating Point Instructions: abs +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-abs + +// 9.7.3.10. Floating Point Instructions: neg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg + +// 9.7.3.11. Floating Point Instructions: min +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min + +// 9.7.3.12. Floating Point Instructions: max +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max + +// 9.7.3.13. Floating Point Instructions: rcp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp + +// 9.7.3.14. Floating Point Instructions: rcp.approx.ftz.f64 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp-approx-ftz-f64 + +// 9.7.3.15. Floating Point Instructions: sqrt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sqrt + +// 9.7.3.16. Floating Point Instructions: rsqrt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt + +// 9.7.3.17. Floating Point Instructions: rsqrt.approx.ftz.f64 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt-approx-ftz-f64 + +// 9.7.3.18. Floating Point Instructions: sin +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sin + +// 9.7.3.19. Floating Point Instructions: cos +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos + +// 9.7.3.20. Floating Point Instructions: lg2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2 + +// 9.7.3.21. Floating Point Instructions: ex2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2 + +// 9.7.3.22. Floating Point Instructions: tanh +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-tanh + + +/* + * 9.7.4. Half Precision Floating-Point Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions * */ -enum class dot_sem -{ - acq_rel, - acquire, - relaxed, - release, - sc, - weak - // mmio? - // volatile? -}; - -enum class dot_space -{ - reg, - sreg, - const_mem, // Using const_mem as `const` is reserved in C++. - global, - local, - param, - shared, // The PTX spelling is shared::cta - shared_cluster, // The PTX spelling is shared::cluster, but we might want to go for cluster here. - tex // deprecated - // generic? 
-}; - -enum class dot_scope -{ - cta, - cluster, - gpu, - sys -}; - -template -using sem_t = _CUDA_VSTD::integral_constant; -using sem_acq_rel_t = sem_t; -using sem_acquire_t = sem_t; -using sem_relaxed_t = sem_t; -using sem_release_t = sem_t; -using sem_sc_t = sem_t; -using sem_weak_t = sem_t; - -static constexpr sem_acq_rel_t sem_acq_rel{}; -static constexpr sem_acquire_t sem_acquire{}; -static constexpr sem_relaxed_t sem_relaxed{}; -static constexpr sem_release_t sem_release{}; -static constexpr sem_sc_t sem_sc{}; -static constexpr sem_weak_t sem_weak{}; - -template -using space_t = _CUDA_VSTD::integral_constant; -using space_const_mem_t = space_t; -using space_global_t = space_t; -using space_local_t = space_t; -using space_param_t = space_t; -using space_reg_t = space_t; -using space_shared_t = space_t; -using space_shared_cluster_t = space_t; -using space_sreg_t = space_t; -using space_tex_t = space_t; - -static constexpr space_const_mem_t space_const_mem{}; -static constexpr space_global_t space_global{}; -static constexpr space_local_t space_local{}; -static constexpr space_param_t space_param{}; -static constexpr space_reg_t space_reg{}; -static constexpr space_shared_t space_shared{}; -static constexpr space_shared_cluster_t space_shared_cluster{}; -static constexpr space_sreg_t space_sreg{}; -static constexpr space_tex_t space_tex{}; - -template -using scope_t = _CUDA_VSTD::integral_constant; -using scope_cluster_t = scope_t; -using scope_cta_t = scope_t; -using scope_gpu_t = scope_t; -using scope_sys_t = scope_t; - -static constexpr scope_cluster_t scope_cluster{}; -static constexpr scope_cta_t scope_cta{}; -static constexpr scope_gpu_t scope_gpu{}; -static constexpr scope_sys_t scope_sys{}; - -// Private helper functions -inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_smem_ptr(const void* __ptr) -{ - return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); -} -inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void* __ptr) -{ - return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); -} -inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_gmem_ptr(const void* __ptr) -{ - return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr)); -} - -template -inline _LIBCUDACXX_DEVICE int __as_b32(_Tp __val) { - static_assert(sizeof(_Tp) == 4, ""); - return *reinterpret_cast(&__val); -} - -template -inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val) { - static_assert(sizeof(_Tp) == 8, ""); - return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val); -} - -// SM 90 features -// -------------- +// 9.7.4.1. Half Precision Floating Point Instructions: add +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add + +// 9.7.4.2. Half Precision Floating Point Instructions: sub +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-sub + +// 9.7.4.3. Half Precision Floating Point Instructions: mul +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-mul + +// 9.7.4.4. Half Precision Floating Point Instructions: fma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-fma + +// 9.7.4.5. Half Precision Floating Point Instructions: neg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-neg + +// 9.7.4.6. 
Half Precision Floating Point Instructions: abs +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs + +// 9.7.4.7. Half Precision Floating Point Instructions: min +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-min + +// 9.7.4.8. Half Precision Floating Point Instructions: max +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-max + +// 9.7.4.9. Half Precision Floating Point Instructions: tanh +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-tanh + +// 9.7.4.10. Half Precision Floating Point Instructions: ex2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-ex2 + + +/* + * 9.7.5. Comparison and Selection Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions + * + */ + +// 9.7.5.1. Comparison and Selection Instructions: set +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-set + +// 9.7.5.2. Comparison and Selection Instructions: setp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-setp + +// 9.7.5.3. Comparison and Selection Instructions: selp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp + +// 9.7.5.4. Comparison and Selection Instructions: slct +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-slct + + +/* + * 9.7.6. Half Precision Comparison Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions + * + */ + +// 9.7.6.1. Half Precision Comparison Instructions: set +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-set + +// 9.7.6.2. Half Precision Comparison Instructions: setp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-setp + + +/* + * 9.7.7. Logic and Shift Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions + * + */ + +// 9.7.7.1. Logic and Shift Instructions: and +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and + +// 9.7.7.2. Logic and Shift Instructions: or +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or + +// 9.7.7.3. Logic and Shift Instructions: xor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-xor + +// 9.7.7.4. Logic and Shift Instructions: not +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not + +// 9.7.7.5. Logic and Shift Instructions: cnot +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-cnot + +// 9.7.7.6. Logic and Shift Instructions: lop3 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3 + +// 9.7.7.7. Logic and Shift Instructions: shf +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shf + +// 9.7.7.8. 
Logic and Shift Instructions: shl +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shl + +// 9.7.7.9. Logic and Shift Instructions: shr +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shr + /* - * TMA / cp.async.bulk + * 9.7.8. Data Movement and Conversion Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions * */ -// cp.async.bulk +// 9.7.8.3. Data Movement and Conversion Instructions: mov +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov + +// 9.7.8.4. Data Movement and Conversion Instructions: mov +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2 + +// 9.7.8.5. Data Movement and Conversion Instructions: shfl (deprecated) +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated + +// 9.7.8.6. Data Movement and Conversion Instructions: shfl.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync + +// 9.7.8.7. Data Movement and Conversion Instructions: prmt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt + +// 9.7.8.8. Data Movement and Conversion Instructions: ld +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld + +// 9.7.8.9. Data Movement and Conversion Instructions: ld.global.nc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc + +// 9.7.8.10. Data Movement and Conversion Instructions: ldu +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu + +// 9.7.8.11. Data Movement and Conversion Instructions: st +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st + +// 9.7.8.12. Data Movement and Conversion Instructions: st.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async + +// 9.7.8.13. Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red + +// 9.7.8.14. Data Movement and Conversion Instructions: prefetch, prefetchu +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu + +// 9.7.8.15. Data Movement and Conversion Instructions: applypriority +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority + +// 9.7.8.16. Data Movement and Conversion Instructions: discard +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-discard + +// 9.7.8.17. Data Movement and Conversion Instructions: createpolicy +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-createpolicy + +// 9.7.8.18. 
Data Movement and Conversion Instructions: isspacep +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep + +// 9.7.8.19. Data Movement and Conversion Instructions: cvta +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta + +// 9.7.8.20. Data Movement and Conversion Instructions: cvt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt + +// 9.7.8.21. Data Movement and Conversion Instructions: cvt.pack +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack + +// 9.7.8.22. Data Movement and Conversion Instructions: mapa +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa + +// 9.7.8.23. Data Movement and Conversion Instructions: getctarank +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank + + +/* + * 9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy + * + */ + +// 9.7.8.24.3. Data Movement and Conversion Instructions: cp.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async + +// 9.7.8.24.4. Data Movement and Conversion Instructions: cp.async.commit_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group + +// 9.7.8.24.5. Data Movement and Conversion Instructions: cp.async.wait_group / cp.async.wait_all +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all + +// 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk -// cp.reduce.async.bulk +// 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk -// cp.async.bulk.tensor +// 9.7.8.24.8. Data Movement and Conversion Instructions: cp.async.bulk.prefetch +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch + +// 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -// cp.reduce.async.bulk.tensor +// 9.7.8.24.10. Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor -// cp.async.bulk.commit_group +// 9.7.8.24.11. Data Movement and Conversion Instructions: cp.async.bulk.prefetch.tensor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor + +// 9.7.8.24.12. 
Data Movement and Conversion Instructions: cp.async.bulk.commit_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group -// cp.async.bulk.wait_group +// 9.7.8.24.13. Data Movement and Conversion Instructions: cp.async.bulk.wait_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group -// Lower priority: +// 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace -// prefetch{.tensormap_space}.tensormap [a]; // prefetch the tensormap -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu -// cp.async.bulk.prefetch -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk +/* + * 9.7.9. Texture Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions + * + */ + +// 9.7.9.3. Texture Instructions: tex +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex + +// 9.7.9.4. Texture Instructions: tld4 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tld4 + +// 9.7.9.5. Texture Instructions: txq +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-txq + +// 9.7.9.6. Texture Instructions: istypep +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-istypep -// cp.async.bulk.prefetch.tensor -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor /* - * Shared memory barrier + * 9.7.10. Surface Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions * */ -// mbarrier.expect_tx -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx - -// mbarrier.complete_tx -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx - -// mbarrier.arrive.expect_tx -// Support for count argument without the modifier .noComplete requires sm_90 or higher. -// Qualifier .expect_tx requires sm_90 or higher. -// Sub-qualifier ::cluster requires sm_90 or higher. -// Support for .cluster scope requires sm_90 or higher. 
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive - -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive -#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) -template -_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( - sem_release_t __sem, - scope_t<_Sco> __scope, - space_shared_t __spc, - _CUDA_VSTD::uint64_t* __addr, - _CUDA_VSTD::uint32_t __tx_count) -{ - // Arrive on local shared memory barrier - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - _CUDA_VSTD::uint64_t __token; - - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) - { - asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(__as_smem_ptr(__addr)), "r"(__tx_count) - : "memory"); - } - else - { - asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(__as_smem_ptr(__addr)), "r"(__tx_count) - : "memory"); - } - return __token; -} - -template -_LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( - sem_release_t __sem, - scope_t<_Sco> __scope, - space_shared_cluster_t __spc, - _CUDA_VSTD::uint64_t* __addr, - _CUDA_VSTD::uint32_t __tx_count) -{ - // Arrive on remote cluster barrier - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) - { - asm("mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;" - : - : "r"(__as_smem_ptr(__addr)), "r"(__tx_count) - : "memory"); - } - else - { - asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;" - : - : "r"(__as_smem_ptr(__addr)), "r"(__tx_count) - : "memory"); - } -} -#endif // __CUDA_MINIMUM_ARCH__ - -// mbarrier.test_wait/mbarrier.try_wait -// mbarrier.try_wait requires sm_90 or higher. -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait +// 9.7.10.1. Surface Instructions: suld +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suld + +// 9.7.10.2. Surface Instructions: sust +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust + +// 9.7.10.3. Surface Instructions: sured +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sured + +// 9.7.10.4. Surface Instructions: suq +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suq + /* - * Cluster Basics: + * 9.7.11. Control Flow Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions * - * These instructions are already exposed at a higher level, so may not be necessary. */ -// mapa{.space}.type d, a, b; -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa +// 9.7.11.1. Control Flow Instructions: {} +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-curly-braces -// getctarank{.space}.type d, a; -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank +// 9.7.11.2. 
Control Flow Instructions: @ +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-at + +// 9.7.11.3. Control Flow Instructions: bra +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra + +// 9.7.11.4. Control Flow Instructions: brx.idx +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-brx-idx -// barrier.cluster +// 9.7.11.5. Control Flow Instructions: call +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call + +// 9.7.11.6. Control Flow Instructions: ret +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret + +// 9.7.11.7. Control Flow Instructions: exit +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit + + +/* + * 9.7.12. Parallel Synchronization and Communication Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions + * + */ + +// 9.7.12.1. Parallel Synchronization and Communication Instructions: bar, barrier +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier + +// 9.7.12.2. Parallel Synchronization and Communication Instructions: bar.warp.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync + +// 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster -// atom .cluster +// 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence + +// 9.7.12.5. Parallel Synchronization and Communication Instructions: atom // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom -// red .cluster +// 9.7.12.6. Parallel Synchronization and Communication Instructions: red // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red +// 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async + +// 9.7.12.8. Parallel Synchronization and Communication Instructions: vote (deprecated) +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated + +// 9.7.12.9. Parallel Synchronization and Communication Instructions: vote.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync + +// 9.7.12.10. Parallel Synchronization and Communication Instructions: match.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync + +// 9.7.12.11. 
Parallel Synchronization and Communication Instructions: activemask +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask + +// 9.7.12.12. Parallel Synchronization and Communication Instructions: redux.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync + +// 9.7.12.13. Parallel Synchronization and Communication Instructions: griddepcontrol +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol + +// 9.7.12.14. Parallel Synchronization and Communication Instructions: elect.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync + /* - * Cluster async + * 9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier * + * Contained in: __cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h */ -// st.async -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async +// 9.7.12.15.18. Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy -// red.async -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async /* + * 9.7.13. Warp Level Matrix Multiply-Accumulate Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-accumulate-instructions * - * Other instructions */ -// fence.proxy.async.{global, shared::{cta, cluster}} -// fence.mbarrier_init.release.cluster (may be a bit overkill??) -// fence.{sc, acq_rel}.cluster -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence +// 9.7.13.3.3. Warp-level Matrix Load Instruction: wmma.load +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-wmma-load -// multimem.ld_reduce, multimem.st, multimem.red -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red +// 9.7.13.3.4. Warp-level Matrix Store Instruction: wmma.store +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-wmma-store -// griddepcontrol -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol +// 9.7.13.3.5. Warp-level Matrix Multiply-and-Accumulate Instruction: wmma.mma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-and-accumulate-instruction-wmma-mma -// elect.sync -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync +// 9.7.13.4.14. Multiply-and-Accumulate Instruction: mma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma -// stmatrix +// 9.7.13.4.15. 
Warp-level matrix load instruction: ldmatrix +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-ldmatrix + +// 9.7.13.4.16. Warp-level matrix store instruction: stmatrix // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix +// 9.7.13.4.17. Warp-level matrix transpose instruction: movmatrix +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-transpose-instruction-movmatrix + +// 9.7.13.5.3. Multiply-and-Accumulate Instruction: mma.sp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma-sp + + +/* + * 9.7.14. Asynchronous Warpgroup Level Matrix Multiply-Accumulate Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-multiply-accumulate-instructions + * + */ + +// 9.7.14.5.2. Asynchronous Multiply-and-Accumulate Instruction: wgmma.mma_async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async + +// 9.7.14.6.4. Asynchronous Multiply-and-Accumulate Instruction: wgmma.mma_async.sp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async-sp + +// 9.7.14.7.1. Asynchronous Multiply-and-Accumulate Instruction: wgmma.fence +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-fence + +// 9.7.14.7.2. Asynchronous Multiply-and-Accumulate Instruction: wgmma.commit_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-commit-group + +// 9.7.14.7.3. Asynchronous Multiply-and-Accumulate Instruction: wgmma.wait_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-wait-group + + +/* + * 9.7.15. Stack Manipulation Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions + * + */ + +// 9.7.15.1. Stack Manipulation Instructions: stacksave +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stacksave + +// 9.7.15.2. Stack Manipulation Instructions: stackrestore +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stackrestore + +// 9.7.15.3. Stack Manipulation Instructions: alloca +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca + + /* - * Special registers (cluster-related) + * 9.7.16. Video Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#video-instructions * */ -// 10.12. Special Registers: %clusterid -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clusterid +// 9.7.16.1.1. Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax + +// 9.7.16.1.2. Scalar Video Instructions: vshl, vshr +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr + +// 9.7.16.1.3. 
Scalar Video Instructions: vmad +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vmad + +// 9.7.16.1.4. Scalar Video Instructions: vset +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vset -// 10.13. Special Registers: %nclusterid -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nclusterid -// 10.14. Special Registers: %cluster_ctaid -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctaid +/* + * 9.7.16.2. SIMD Video Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions + * + */ + +// 9.7.16.2.1. SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2 + +// 9.7.16.2.2. SIMD Video Instructions: vset2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset2 + +// 9.7.16.2.3. SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4 + +// 9.7.16.2.4. SIMD Video Instructions: vset4 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset4 + + +/* + * 9.7.17. Miscellaneous Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions + * + */ + +// 9.7.17.1. Miscellaneous Instructions: brkpt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-brkpt -// 10.15. Special Registers: %cluster_nctaid -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctaid +// 9.7.17.2. Miscellaneous Instructions: nanosleep +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep -// 10.16. Special Registers: %cluster_ctarank -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctarank +// 9.7.17.3. Miscellaneous Instructions: pmevent +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent -// 10.17. Special Registers: %cluster_nctarank -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctarank +// 9.7.17.4. Miscellaneous Instructions: trap +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap -// 10.31. Special Registers: %aggr_smem_size -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-aggr-smem-size +// 9.7.17.5. 
Miscellaneous Instructions: setmaxnreg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h new file mode 100644 index 00000000000..10da1675226 --- /dev/null +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -0,0 +1,105 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + + +#ifndef PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ +#define PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ + +/* + * 9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier + * + */ + +// 9.7.12.15.9. Parallel Synchronization and Communication Instructions: mbarrier.init +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init + +// 9.7.12.15.10. Parallel Synchronization and Communication Instructions: mbarrier.inval +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval + +// 9.7.12.15.11. Parallel Synchronization and Communication Instructions: mbarrier.expect_tx +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx + +// 9.7.12.15.12. Parallel Synchronization and Communication Instructions: mbarrier.complete_tx +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx + +// 9.7.12.15.13. 
Parallel Synchronization and Communication Instructions: mbarrier.arrive
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
+
+#if defined(_LIBCUDACXX_PTX_ISA_78_AVAILABLE) && defined(_LIBCUDACXX_PTX_SM_90_AVAILABLE)
+template <dot_scope _Sco>
+_LIBCUDACXX_DEVICE inline
+_CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_t __spc, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count)
+{
+    // Arrive on local shared memory barrier
+    static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+    _CUDA_VSTD::uint64_t __token;
+
+    if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) {
+        asm (
+            "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
+            : "=l"(__token)
+            : "r"(__as_smem_ptr(__addr)),
+              "r"(__tx_count)
+            : "memory");
+    } else {
+        asm (
+            "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;"
+            : "=l"(__token)
+            : "r"(__as_smem_ptr(__addr)),
+              "r"(__tx_count)
+            : "memory");
+    }
+    return __token;
+}
+#ifdef // _LIBCUDACXX_PTX_ISA_78_AVAILABLE
+
+#if defined(_LIBCUDACXX_PTX_ISA_78_AVAILABLE) && defined(_LIBCUDACXX_PTX_SM_90_AVAILABLE)
+template <dot_scope _Sco>
+_LIBCUDACXX_DEVICE inline
+void mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_cluster_t __spc, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count)
+{
+    // Arrive on remote cluster barrier
+    static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+    if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) {
+        asm (
+            "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;"
+            :
+            : "r"(__as_smem_ptr(__addr)),
+              "r"(__tx_count)
+            : "memory");
+    } else {
+        asm (
+            "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;"
+            :
+            : "r"(__as_smem_ptr(__addr)),
+              "r"(__tx_count)
+            : "memory");
+    }
+}
+#ifdef // _LIBCUDACXX_PTX_ISA_78_AVAILABLE
+
+
+// 9.7.12.15.14. Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop
+
+// 9.7.12.15.15. Parallel Synchronization and Communication Instructions: cp.async.mbarrier.arrive
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive
+
+// 9.7.12.15.16. Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait
+
+// 9.7.12.15.17. Parallel Synchronization and Communication Instructions: mbarrier.pending_count
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-pending-count
+
+
+
+#endif // PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
new file mode 100644
index 00000000000..08a972492e7
--- /dev/null
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
@@ -0,0 +1,136 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef PTX_DOT_VARIANTS_H_
+#define PTX_DOT_VARIANTS_H_
+
+/*
+ * Public integral constant types and values for ".variant"s:
+ *
+ * - .sem
+ * - .space
+ * - .scope
+ *
+ * For each .variant, the code below defines:
+ * - An enum `dot_variant` with each possible value
+ * - A type template `variant_t<dot_variant>`
+ * - Types `variant_A_t`, ..., `variant_Z_t`
+ * - Constexpr values `variant_A` of type `variant_A_t`
+ *
+ * These types enable specifying fine-grained overloads of a PTX binding. If a
+ * binding can handle multiple variants, then it is defined as:
+ *
+ *   template <dot_variant __var>
+ *   [...] void ptx_binding(variant_t<__var> __v) { ... }
+ *
+ * If it only handles a single variant, then it is defined as:
+ *
+ *   [...] void ptx_binding(variant_A_t __v) { ... }
+ *
+ * If two variants have different behaviors or return types (see .space
+ * overloads of mbarrier.arrive.expect_tx for an example), then these can be
+ * provided as separate overloads of the same function:
+ *
+ *   [...] void ptx_binding(variant_A_t __v) { ... }
+ *   [...] int ptx_binding(variant_B_t __v) { ... }
+ *
+ */
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#operation-types
+enum class dot_sem {
+    acq_rel,
+    acquire,
+    relaxed,
+    release,
+    sc,
+    weak
+    // mmio?
+    // volatile?
+};
+
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#state-spaces
+enum class dot_space {
+    reg,
+    sreg,
+    const_mem,      // Using const_mem as `const` is reserved in C++.
+    global,
+    local,
+    param,
+    shared,         // The PTX spelling is shared::cta
+    shared_cluster, // The PTX spelling is shared::cluster, but we might want to go for cluster here.
+    tex // deprecated
+    // generic?
+};
+
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scope
+enum class dot_scope {
+    cta,
+    cluster,
+    gpu,
+    sys
+};
+
+template <dot_sem __sem>
+using sem_t = _CUDA_VSTD::integral_constant<dot_sem, __sem>;
+using sem_acq_rel_t = sem_t<dot_sem::acq_rel>;
+using sem_acquire_t = sem_t<dot_sem::acquire>;
+using sem_relaxed_t = sem_t<dot_sem::relaxed>;
+using sem_release_t = sem_t<dot_sem::release>;
+using sem_sc_t = sem_t<dot_sem::sc>;
+using sem_weak_t = sem_t<dot_sem::weak>;
+
+static constexpr sem_acq_rel_t sem_acq_rel{};
+static constexpr sem_acquire_t sem_acquire{};
+static constexpr sem_relaxed_t sem_relaxed{};
+static constexpr sem_release_t sem_release{};
+static constexpr sem_sc_t sem_sc{};
+static constexpr sem_weak_t sem_weak{};
+
+template <dot_space __space>
+using space_t = _CUDA_VSTD::integral_constant<dot_space, __space>;
+using space_const_mem_t = space_t<dot_space::const_mem>;
+using space_global_t = space_t<dot_space::global>;
+using space_local_t = space_t<dot_space::local>;
+using space_param_t = space_t<dot_space::param>;
+using space_reg_t = space_t<dot_space::reg>;
+using space_shared_t = space_t<dot_space::shared>;
+using space_shared_cluster_t = space_t<dot_space::shared_cluster>;
+using space_sreg_t = space_t<dot_space::sreg>;
+using space_tex_t = space_t<dot_space::tex>;
+
+static constexpr space_const_mem_t space_const_mem{};
+static constexpr space_global_t space_global{};
+static constexpr space_local_t space_local{};
+static constexpr space_param_t space_param{};
+static constexpr space_reg_t space_reg{};
+static constexpr space_shared_t space_shared{};
+static constexpr space_shared_cluster_t space_shared_cluster{};
+static constexpr space_sreg_t space_sreg{};
+static constexpr space_tex_t space_tex{};
+
+template <dot_scope __scope>
+using scope_t = _CUDA_VSTD::integral_constant<dot_scope, __scope>;
+using scope_cluster_t = scope_t<dot_scope::cluster>;
+using scope_cta_t = scope_t<dot_scope::cta>;
+using scope_gpu_t = scope_t<dot_scope::gpu>;
+using scope_sys_t = scope_t<dot_scope::sys>;
+
+static constexpr scope_cluster_t scope_cluster{};
+static constexpr scope_cta_t scope_cta{};
+static constexpr scope_gpu_t scope_gpu{};
+static constexpr scope_sys_t scope_sys{};
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // PTX_DOT_VARIANTS_H_
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
new file mode 100644
index 00000000000..50982232f66
--- /dev/null
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
@@ -0,0 +1,43 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTX_HELPER_FUNCTIONS_H_
+#define PTX_HELPER_FUNCTIONS_H_
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+// Private helper functions
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_smem_ptr(const void * __ptr) {
+    return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
+}
+
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void * __ptr) {
+    return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
+}
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_gmem_ptr(const void * __ptr) {
+    return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr));
+}
+
+template <typename _Tp>
+inline _LIBCUDACXX_DEVICE int __as_b32(_Tp __val) {
+    static_assert(sizeof(_Tp) == 4, "");
+    return *reinterpret_cast<int*>(&__val);
+}
+
+template <typename _Tp>
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val) {
+    static_assert(sizeof(_Tp) == 8, "");
+    return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val);
+}
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // PTX_HELPER_FUNCTIONS_H_
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
new file mode 100644
index 00000000000..89b519513ca
--- /dev/null
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
@@ -0,0 +1,63 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// + + +#ifndef PTX_ISA_TARGET_MACROS_H_ +#define PTX_ISA_TARGET_MACROS_H_ + + +/* + * Targeting macros + * + * Information from: + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes + */ + +#if (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) +# define _LIBCUDACXX_PTX_SM_80_AVAILABLE +#endif + +#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) +# define _LIBCUDACXX_PTX_SM_90_AVAILABLE +#endif + +// PTX ISA 7.8 is available from CTK 11.8, driver r520 +#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__)) +# define _LIBCUDACXX_PTX_ISA_78_AVAILABLE +#endif + +// PTX ISA 7.8 is available from CTK 11.8, driver r520 (so also from CTK 12.0 onwards) +#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +# define _LIBCUDACXX_PTX_ISA_78_AVAILABLE +#endif + +// PTX ISA 8.0 is available from CTK 12.0, driver r525 +#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +# define _LIBCUDACXX_PTX_ISA_80_AVAILABLE +#endif + +// PTX ISA 8.1 is available from CTK 12.1, driver r530 +#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) +# define _LIBCUDACXX_PTX_ISA_81_AVAILABLE +#endif + +// PTX ISA 8.2 is available from CTK 12.2, driver r535 +#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) +# define _LIBCUDACXX_PTX_ISA_82_AVAILABLE +#endif + +// PTX ISA 8.3 is available from CTK 12.3, driver r545 +#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +# define _LIBCUDACXX_PTX_ISA_83_AVAILABLE +#endif + + +#endif // PTX_ISA_TARGET_MACROS_H_ From e356271672c645c1956a5ac209b34ddc275b9341 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 25 Oct 2023 13:17:06 +0200 Subject: [PATCH 13/49] Format code --- libcudacxx/include/cuda/ptx | 1 + .../detail/libcxx/include/__cuda/barrier.h | 3 +- ..._and_communication_instructions_mbarrier.h | 86 ++++++++++--------- .../include/__cuda/ptx/ptx_dot_variants.h | 85 +++++++++--------- .../include/__cuda/ptx/ptx_helper_functions.h | 30 ++++--- 5 files changed, 110 insertions(+), 95 deletions(-) diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index ea319195134..1f3fb868679 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -16,6 +16,7 @@ #include "std/type_traits" // std::integral_constant #include "../nv/target" // __CUDA_MINIMUM_ARCH__ and friends +// The following includes depend on the includes above: #include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h" #include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h" #include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h" diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index d4b8fe45126..f5a65400d1e 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ 
b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -27,8 +27,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER #include "../cstdlib" // _LIBCUDACXX_UNREACHABLE #include "../__type_traits/void_t.h" // _CUDA_VSTD::__void_t - -#include // cuda::ptx::* +#include // cuda::ptx::* #if defined(_LIBCUDACXX_COMPILER_NVRTC) #define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type *)0)->member)) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 10da1675226..e56b6f2586a 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -36,54 +36,62 @@ #if defined(_LIBCUDACXX_PTX_ISA_78_AVAILABLE) && defined(_LIBCUDACXX_PTX_SM_90_AVAILABLE) template -_LIBCUDACXX_DEVICE inline -_CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_t __spc, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count) +_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( + sem_release_t __sem, + scope_t<_Sco> __scope, + space_shared_t __spc, + _CUDA_VSTD::uint64_t* __addr, + _CUDA_VSTD::uint32_t __tx_count) { - // Arrive on local shared memory barrier - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - _CUDA_VSTD::uint64_t __token; - - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(__as_smem_ptr(__addr)), - "r"(__tx_count) - : "memory"); + // Arrive on local shared memory barrier + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + _CUDA_VSTD::uint64_t __token; + + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;" + : "=l"(__token) + : "r"(__as_smem_ptr(__addr)), + "r"(__tx_count) + : "memory"); } else { - asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(__as_smem_ptr(__addr)), - "r"(__tx_count) - : "memory"); - } - return __token; + asm ( + "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;" + : "=l"(__token) + : "r"(__as_smem_ptr(__addr)), + "r"(__tx_count) + : "memory"); + } + return __token; } #ifdef // _LIBCUDACXX_PTX_ISA_78_AVAILABLE #if defined(_LIBCUDACXX_PTX_ISA_78_AVAILABLE) && defined(_LIBCUDACXX_PTX_SM_90_AVAILABLE) template -_LIBCUDACXX_DEVICE inline -void mbarrier_arrive_expect_tx(sem_release_t __sem, scope_t<_Sco> __scope, space_shared_cluster_t __spc, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count) +_LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( + sem_release_t __sem, + scope_t<_Sco> __scope, + space_shared_cluster_t __spc, + _CUDA_VSTD::uint64_t* __addr, + _CUDA_VSTD::uint32_t __tx_count) { - // Arrive on remote cluster barrier - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;" - : - : "r"(__as_smem_ptr(__addr)), - "r"(__tx_count) - : "memory"); + // Arrive on remote cluster 
barrier + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;" + : + : "r"(__as_smem_ptr(__addr)), + "r"(__tx_count) + : "memory"); } else { - asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;" - : - : "r"(__as_smem_ptr(__addr)), - "r"(__tx_count) - : "memory"); - } + asm ( + "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;" + : + : "r"(__as_smem_ptr(__addr)), + "r"(__tx_count) + : "memory"); + } } #ifdef // _LIBCUDACXX_PTX_ISA_78_AVAILABLE diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h index 08a972492e7..c91a2512847 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h @@ -48,47 +48,50 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#operation-types -enum class dot_sem { - acq_rel, - acquire, - relaxed, - release, - sc, - weak - // mmio? - // volatile? +enum class dot_sem +{ + acq_rel, + acquire, + relaxed, + release, + sc, + weak + // mmio? + // volatile? }; // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#state-spaces -enum class dot_space { - reg, - sreg, - const_mem, // Using const_mem as `const` is reserved in C++. - global, - local, - param, - shared, // The PTX spelling is shared::cta - shared_cluster, // The PTX spelling is shared::cluster, but we might want to go for cluster here. - tex // deprecated - // generic? +enum class dot_space +{ + reg, + sreg, + const_mem, // Using const_mem as `const` is reserved in C++. + global, + local, + param, + shared, // The PTX spelling is shared::cta + shared_cluster, // The PTX spelling is shared::cluster, but we might want to go for cluster here. + tex // deprecated + // generic? 
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
index 50982232f66..02ac1370d3d 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
@@ -15,27 +15,31 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 
 // Private helper functions
-inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_smem_ptr(const void * __ptr) {
-  return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_smem_ptr(const void* __ptr)
+{
+  return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
 }
-
-inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void * __ptr) {
-  return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void* __ptr)
+{
+  return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
 }
 
-inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_gmem_ptr(const void * __ptr) {
-  return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr));
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_gmem_ptr(const void* __ptr)
+{
+  return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr));
 }
 
 template <typename _Tp>
-inline _LIBCUDACXX_DEVICE int __as_b32(_Tp __val) {
-  static_assert(sizeof(_Tp) == 4, "");
-  return *reinterpret_cast<int*>(&__val);
+inline _LIBCUDACXX_DEVICE int __as_b32(_Tp __val)
+{
+  static_assert(sizeof(_Tp) == 4, "");
+  return *reinterpret_cast<int*>(&__val);
 }
 
 template <typename _Tp>
-inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val) {
-  static_assert(sizeof(_Tp) == 8, "");
-  return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val);
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val)
+{
+  static_assert(sizeof(_Tp) == 8, "");
+  return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val);
 }
 
 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX
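The `__as_smem_ptr` helper above exists because PTX addresses shared memory through a 32-bit window offset, while a C++ pointer is a 64-bit generic address. A minimal sketch of the same conversion feeding an inline-asm operand outside the library (the `arrive_on` name is illustrative; compiling for sm_90 is assumed):

```cuda
#include <cstdint>

__device__ std::uint64_t arrive_on(std::uint64_t* bar, std::uint32_t tx_count)
{
  // Same conversion __as_smem_ptr performs: generic address -> 32-bit
  // shared-memory offset, handed to the instruction as an "r" register.
  std::uint32_t bar_smem = static_cast<std::uint32_t>(__cvta_generic_to_shared(bar));
  std::uint64_t token;
  asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
      : "=l"(token)
      : "r"(bar_smem), "r"(tx_count)
      : "memory");
  return token;
}
```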
""); - return *reinterpret_cast(&__val); +inline _LIBCUDACXX_DEVICE int __as_b32(_Tp __val) +{ + static_assert(sizeof(_Tp) == 4, ""); + return *reinterpret_cast(&__val); } template -inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val) { - static_assert(sizeof(_Tp) == 8, ""); - return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val); +inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val) +{ + static_assert(sizeof(_Tp) == 8, ""); + return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val); } _LIBCUDACXX_END_NAMESPACE_CUDA_PTX From bb91eb741188e007becefb5e4c352edfa5c541bc Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 25 Oct 2023 13:34:04 +0200 Subject: [PATCH 14/49] Fix test and ifdefs The test would previously not fail when invalid ptx was present. Fixed now. --- .../test/cuda/ptx/sm90.ptx.compile.pass.cpp | 7 ++++++- ...chronization_and_communication_instructions_mbarrier.h | 8 +++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp index 27b5af8e6f2..787a9c1f327 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp @@ -24,7 +24,12 @@ int main(int, char**) { NV_IF_TARGET(NV_IS_DEVICE, ( // Do not execute. Just check if below PTX compiles (that is: assembles) without error. - if (false) { + + // This condition always evaluates to false, but the compiler does not + // reason through it. This avoids dead code elimination. + const bool non_eliminated_false = threadIdx.x > 1024; + + if (non_eliminated_false) { using cuda::ptx::sem_release; using cuda::ptx::space_shared_cluster; using cuda::ptx::space_shared; diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index e56b6f2586a..ade0cf411ca 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -13,6 +13,8 @@ #ifndef PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ #define PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + /* * 9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier @@ -64,7 +66,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( } return __token; } -#ifdef // _LIBCUDACXX_PTX_ISA_78_AVAILABLE +#endif // _LIBCUDACXX_PTX_ISA_78_AVAILABLE #if defined(_LIBCUDACXX_PTX_ISA_78_AVAILABLE) && defined(_LIBCUDACXX_PTX_SM_90_AVAILABLE) template @@ -93,7 +95,7 @@ _LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( : "memory"); } } -#ifdef // _LIBCUDACXX_PTX_ISA_78_AVAILABLE +#endif // _LIBCUDACXX_PTX_ISA_78_AVAILABLE // 9.7.12.15.14. Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop @@ -108,6 +110,6 @@ _LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( // 9.7.12.15.17. 
From b514e2d86ed400a57bb1dc8cee65e290c8c2a34f Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 14:35:59 +0200
Subject: [PATCH 15/49] Update ptx.md

---
 libcudacxx/docs/extended_api/ptx.md | 487 +++++++++++++++++++++++++++-
 1 file changed, 482 insertions(+), 5 deletions(-)

diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md
index d5092dc030f..feb4040d724 100644
--- a/libcudacxx/docs/extended_api/ptx.md
+++ b/libcudacxx/docs/extended_api/ptx.md
@@ -4,15 +4,381 @@
 The `cuda::ptx` namespace contains functions that map one-to-one to
 [PTX instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html). These can be used
 for maximal control of the generated code, or to experiment with new hardware
 features before a high-level C++ API is available.
 
-### Shared memory barrier (mbarrier)
+### [9.7.1. Integer Arithmetic Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions)
 
-| Instruction                             | Compute capability | CUDA Toolkit |
-|-----------------------------------------|--------------------|--------------|
-| `cuda::ptx::mbarrier_arrive_expect_tx`  | 9.0                | CTK 12.4     |
+| Instruction | Available in libcu++ |
+|------------------------------------------|----------------------|
| [`sad`] | No |
| [`div`] | No |
| [`rem`] | No |
| [`abs`] | No |
| [`neg`] | No |
| [`min`] | No |
| [`max`] | No |
| [`popc`] | No |
| [`clz`] | No |
| [`bfind`] | No |
| [`fns`] | No |
| [`brev`] | No |
| [`bfe`] | No |
| [`bfi`] | No |
| [`szext`] | No |
| [`bmsk`] | No |
| [`dp4a`] | No |
| [`dp2a`] | No |

[`sad`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sad
[`div`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-div
[`rem`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-rem
[`abs`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-abs
[`neg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-neg
[`min`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-min
[`max`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-max
[`popc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc
[`clz`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz
[`bfind`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind
[`fns`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns
[`brev`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev
[`bfe`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe
+[`bfi`]: 
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi +[`szext`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-szext +[`bmsk`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bmsk +[`dp4a`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp4a +[`dp2a`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp2a + +### [9.7.2. Extended-Precision Integer Arithmetic Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-integer-arithmetic-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`add.cc`] | No | +| [`addc`] | No | +| [`sub.cc`] | No | +| [`subc`] | No | +| [`mad.cc`] | No | +| [`madc`] | No | + +[`add.cc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-add-cc +[`addc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-addc +[`sub.cc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-sub-cc +[`subc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-subc +[`mad.cc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-mad-cc +[`madc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc + +### [9.7.3. Floating-Point Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`testp`] | No | +| [`copysign`] | No | +| [`add`] | No | +| [`sub`] | No | +| [`mul`] | No | +| [`fma`] | No | +| [`mad`] | No | +| [`div`] | No | +| [`abs`] | No | +| [`neg`] | No | +| [`min`] | No | +| [`max`] | No | +| [`rcp`] | No | +| [`rcp.approx.ftz.f64`] | No | +| [`sqrt`] | No | +| [`rsqrt`] | No | +| [`rsqrt.approx.ftz.f64`] | No | +| [`sin`] | No | +| [`cos`] | No | +| [`lg2`] | No | +| [`ex2`] | No | +| [`tanh`] | No | + +[`testp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-testp +[`copysign`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign +[`add`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add +[`sub`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sub +[`mul`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul +[`fma`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma +[`mad`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad +[`div`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div +[`abs`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-abs +[`neg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg +[`min`]: 
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min +[`max`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max +[`rcp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp +[`rcp.approx.ftz.f64`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp-approx-ftz-f64 +[`sqrt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sqrt +[`rsqrt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt +[`rsqrt.approx.ftz.f64`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt-approx-ftz-f64 +[`sin`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sin +[`cos`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos +[`lg2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2 +[`ex2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2 +[`tanh`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-tanh + +### [9.7.4. Half Precision Floating-Point Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`add`] | No | +| [`sub`] | No | +| [`mul`] | No | +| [`fma`] | No | +| [`neg`] | No | +| [`abs`] | No | +| [`min`] | No | +| [`max`] | No | +| [`tanh`] | No | +| [`ex2`] | No | + +[`add`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add +[`sub`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-sub +[`mul`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-mul +[`fma`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-fma +[`neg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-neg +[`abs`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs +[`min`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-min +[`max`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-max +[`tanh`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-tanh +[`ex2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-ex2 + +### [9.7.5. 
Comparison and Selection Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`set`] | No | +| [`setp`] | No | +| [`selp`] | No | +| [`slct`] | No | + +[`set`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-set +[`setp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-setp +[`selp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp +[`slct`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-slct + +### [9.7.6. Half Precision Comparison Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`set`] | No | +| [`setp`] | No | + +[`set`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-set +[`setp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-setp + +### [9.7.7. Logic and Shift Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`and`] | No | +| [`or`] | No | +| [`xor`] | No | +| [`not`] | No | +| [`cnot`] | No | +| [`lop3`] | No | +| [`shf`] | No | +| [`shl`] | No | +| [`shr`] | No | + +[`and`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and +[`or`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or +[`xor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-xor +[`not`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not +[`cnot`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-cnot +[`lop3`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3 +[`shf`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shf +[`shl`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shl +[`shr`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shr + +### [9.7.8. 
Data Movement and Conversion Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`mov`] | No | +| [`mov`] | No | +| [`shfl (deprecated)`] | No | +| [`shfl.sync`] | No | +| [`prmt`] | No | +| [`ld`] | No | +| [`ld.global.nc`] | No | +| [`ldu`] | No | +| [`st`] | No | +| [`st.async`] | No | +| [`multimem.ld_reduce, multimem.st, multimem.red`] | No | +| [`prefetch, prefetchu`] | No | +| [`applypriority`] | No | +| [`discard`] | No | +| [`createpolicy`] | No | +| [`isspacep`] | No | +| [`cvta`] | No | +| [`cvt`] | No | +| [`cvt.pack`] | No | +| [`mapa`] | No | +| [`getctarank`] | No | + +[`mov`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2 +[`shfl (deprecated)`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated +[`shfl.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync +[`prmt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt +[`ld`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld +[`ld.global.nc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc +[`ldu`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu +[`st`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st +[`st.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async +[`multimem.ld_reduce, multimem.st, multimem.red`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red +[`prefetch, prefetchu`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu +[`applypriority`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority +[`discard`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-discard +[`createpolicy`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-createpolicy +[`isspacep`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep +[`cvta`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta +[`cvt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt +[`cvt.pack`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack +[`mapa`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa +[`getctarank`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank + +### [9.7.8.24. 
Data Movement and Conversion Instructions: Asynchronous copy](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`cp.async`] | No | +| [`cp.async.commit_group`] | No | +| [`cp.async.wait_group / cp.async.wait_all`] | No | +| [`cp.async.bulk`] | No | +| [`cp.reduce.async.bulk`] | No | +| [`cp.async.bulk.prefetch`] | No | +| [`cp.async.bulk.tensor`] | No | +| [`cp.reduce.async.bulk.tensor`] | No | +| [`cp.async.bulk.prefetch.tensor`] | No | +| [`cp.async.bulk.commit_group`] | No | +| [`cp.async.bulk.wait_group`] | No | +| [`tensormap.replace`] | No | + +[`cp.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async +[`cp.async.commit_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group +[`cp.async.wait_group / cp.async.wait_all`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all +[`cp.async.bulk`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk +[`cp.reduce.async.bulk`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk +[`cp.async.bulk.prefetch`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch +[`cp.async.bulk.tensor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor +[`cp.reduce.async.bulk.tensor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor +[`cp.async.bulk.prefetch.tensor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor +[`cp.async.bulk.commit_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group +[`cp.async.bulk.wait_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group +[`tensormap.replace`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace + +### [9.7.9. Texture Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`tex`] | No | +| [`tld4`] | No | +| [`txq`] | No | +| [`istypep`] | No | + +[`tex`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex +[`tld4`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tld4 +[`txq`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-txq +[`istypep`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-istypep + +### [9.7.10. 
Surface Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`suld`] | No | +| [`sust`] | No | +| [`sured`] | No | +| [`suq`] | No | + +[`suld`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suld +[`sust`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust +[`sured`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sured +[`suq`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suq + +### [9.7.11. Control Flow Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`{}`] | No | +| [`@`] | No | +| [`bra`] | No | +| [`brx.idx`] | No | +| [`call`] | No | +| [`ret`] | No | +| [`exit`] | No | + +[`{}`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-curly-braces +[`@`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-at +[`bra`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra +[`brx.idx`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-brx-idx +[`call`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call +[`ret`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret +[`exit`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit + +### [9.7.12. 
Parallel Synchronization and Communication Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions)

| Instruction | Available in libcu++ |
|------------------------------------------|----------------------|
| [`bar, barrier`] | No |
| [`bar.warp.sync`] | No |
| [`barrier.cluster`] | No |
| [`membar/fence`] | No |
| [`atom`] | No |
| [`red`] | No |
| [`red.async`] | No |
| [`vote (deprecated)`] | No |
| [`vote.sync`] | No |
| [`match.sync`] | No |
| [`activemask`] | No |
| [`redux.sync`] | No |
| [`griddepcontrol`] | No |
| [`elect.sync`] | No |

[`bar, barrier`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier
[`bar.warp.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync
[`barrier.cluster`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster
[`membar/fence`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence
[`atom`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
[`red`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red
[`red.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async
[`vote (deprecated)`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated
[`vote.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync
[`match.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync
[`activemask`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask
[`redux.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync
[`griddepcontrol`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol
[`elect.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync

-#### [`cuda::ptx::mbarrier_arrive_expect_tx`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)

+### [9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier)

| Instruction | Available in libcu++ |
|------------------------------------------|----------------------|
| [`mbarrier.init`] | No |
| [`mbarrier.inval`] | No |
| [`mbarrier.expect_tx`] | No |
| [`mbarrier.complete_tx`] | No |
| [`mbarrier.arrive`] | CTK-FUTURE, CCCL v2.3.0 |
| [`mbarrier.arrive_drop`] | No |
| [`cp.async.mbarrier.arrive`] | No |
| [`mbarrier.test_wait/mbarrier.try_wait`] | No |
| [`mbarrier.pending_count`] | No |
| [`tensormap.cp_fenceproxy`] | No |

[`mbarrier.init`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init
[`mbarrier.inval`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval
[`mbarrier.expect_tx`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx
[`mbarrier.complete_tx`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx
[`mbarrier.arrive`]: #mbarrierarrive
[`mbarrier.arrive_drop`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop
[`cp.async.mbarrier.arrive`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive
[`mbarrier.test_wait/mbarrier.try_wait`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait
[`mbarrier.pending_count`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-pending-count
[`tensormap.cp_fenceproxy`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy


#### `mbarrier.arrive`

- PTX ISA: [mbarrier.arrive](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)

 ```cuda
 template <dot_scope _Sco>
 __device__ inline
@@ -63,6 +429,117 @@ __global__ void kernel() {
   )
 }
 ```
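A usage example (a hedged sketch, not from the patch: it assumes `bar` has already been initialized, e.g. via `mbarrier.init`, and that the kernel is compiled for sm_90 with a toolkit whose ptxas understands PTX ISA 7.8; the kernel name is illustrative):

```cuda
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void expect_tx_kernel()
{
  using cuda::ptx::sem_release;
  using cuda::ptx::scope_cta;
  using cuda::ptx::space_shared;

  // Assumed to be initialized elsewhere before use.
  __shared__ cuda::std::uint64_t bar;

  if (threadIdx.x == 0) {
    // Arrive on the CTA-scope barrier and raise its expected transaction
    // count by 64 bytes of asynchronous transfers for the current phase.
    cuda::std::uint64_t token =
      cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 64);
    (void) token;
  }
}
```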
### [9.7.13. Warp Level Matrix Multiply-Accumulate Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-accumulate-instructions)

| Instruction | Available in libcu++ |
|------------------------------------------|----------------------|
| [`wmma.load`] | No |
| [`wmma.store`] | No |
| [`wmma.mma`] | No |
| [`mma`] | No |
| [`ldmatrix`] | No |
| [`stmatrix`] | No |
| [`movmatrix`] | No |
| [`mma.sp`] | No |

[`wmma.load`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-wmma-load
[`wmma.store`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-wmma-store
[`wmma.mma`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-and-accumulate-instruction-wmma-mma
[`mma`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma
[`ldmatrix`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-ldmatrix
[`stmatrix`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix
[`movmatrix`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-transpose-instruction-movmatrix
[`mma.sp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma-sp

### [9.7.14. Asynchronous Warpgroup Level Matrix Multiply-Accumulate Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-multiply-accumulate-instructions)

| Instruction | Available in libcu++ |
|------------------------------------------|----------------------|
| [`wgmma.mma_async`] | No |
| [`wgmma.mma_async.sp`] | No |
| [`wgmma.fence`] | No |
| [`wgmma.commit_group`] | No |
| [`wgmma.wait_group`] | No |

[`wgmma.mma_async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async
[`wgmma.mma_async.sp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async-sp
[`wgmma.fence`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-fence
[`wgmma.commit_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-commit-group
[`wgmma.wait_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-wait-group

### [9.7.15. Stack Manipulation Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions)

| Instruction | Available in libcu++ |
|------------------------------------------|----------------------|
| [`stacksave`] | No |
| [`stackrestore`] | No |
| [`alloca`] | No |

[`stacksave`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stacksave
[`stackrestore`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stackrestore
[`alloca`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca

### [9.7.16. 
Video Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#video-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`vadd, vsub, vabsdiff, vmin, vmax`] | No | +| [`vshl, vshr`] | No | +| [`vmad`] | No | +| [`vset`] | No | + +[`vadd, vsub, vabsdiff, vmin, vmax`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax +[`vshl, vshr`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr +[`vmad`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vmad +[`vset`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vset + +### [9.7.16.2. SIMD Video Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2`] | No | +| [`vset2`] | No | +| [`vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4`] | No | +| [`vset4`] | No | + +[`vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2 +[`vset2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset2 +[`vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4 +[`vset4`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset4 + +### [9.7.17. 
Miscellaneous Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions)

| Instruction | Available in libcu++ |
|------------------------------------------|----------------------|
| [`brkpt`] | No |
| [`nanosleep`] | No |
| [`pmevent`] | No |
| [`trap`] | No |
| [`setmaxnreg`] | No |

[`brkpt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-brkpt
[`nanosleep`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep
[`pmevent`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent
[`trap`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap
[`setmaxnreg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg

### Shared memory barrier (mbarrier)

| Instruction                             | Compute capability | CUDA Toolkit |
|-----------------------------------------|--------------------|--------------|
| `cuda::ptx::mbarrier_arrive_expect_tx`  | 9.0                | CTK 12.4     |

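The table above pins `cuda::ptx::mbarrier_arrive_expect_tx` to compute capability 9.0, so code that also compiles for older architectures needs a target guard. A minimal sketch, assuming `<nv/target>` and a build that includes sm_90 (the `arrive_if_sm90` helper is hypothetical):

```cuda
#include <cuda/ptx>
#include <cuda/std/cstdint>
#include <nv/target>

__device__ void arrive_if_sm90(cuda::std::uint64_t* bar)
{
  NV_IF_TARGET(NV_PROVIDES_SM_90, (
    // Only emitted when compiling for sm_90 or newer; on older targets
    // this function compiles to an empty body.
    cuda::ptx::mbarrier_arrive_expect_tx(
      cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, bar, 1);
  ))
}
```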
From e351c799eaabcfa8bfd3c95031f8a07ab435d973 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 15:08:13 +0200
Subject: [PATCH 16/49] Use numerical PTX ISA/SM target macros

---
 ..._and_communication_instructions_mbarrier.h |  8 +--
 .../__cuda/ptx/ptx_isa_target_macros.h        | 57 +++++++++----------
 2 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
index ade0cf411ca..2bcfdc4bd33 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
@@ -36,7 +36,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 // 9.7.12.15.13. Parallel Synchronization and Communication Instructions: mbarrier.arrive
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
-#if defined(_LIBCUDACXX_PTX_ISA_78_AVAILABLE) && defined(_LIBCUDACXX_PTX_SM_90_AVAILABLE)
+#if __cccl_ptx_sm >= 900 && __cccl_ptx_isa >= 780
 template <dot_scope _Sco>
 _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(
@@ -64,7 +66,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(
   }
   return __token;
 }
-#endif // _LIBCUDACXX_PTX_ISA_78_AVAILABLE
+#endif // __cccl_ptx_isa
 
-#if defined(_LIBCUDACXX_PTX_ISA_78_AVAILABLE) && defined(_LIBCUDACXX_PTX_SM_90_AVAILABLE)
+#if __cccl_ptx_sm >= 900 && __cccl_ptx_isa >= 780
 template <dot_scope _Sco>
 _LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx(
@@ -93,7 +95,7 @@ _LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx(
       : "memory");
   }
 }
-#endif // _LIBCUDACXX_PTX_ISA_78_AVAILABLE
+#endif // __cccl_ptx_isa
 
 // 9.7.12.15.14. Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
index 89b519513ca..e0306dbf627 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
@@ -21,43 +21,42 @@
  * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes
  */
 
-#if (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
-# define _LIBCUDACXX_PTX_SM_80_AVAILABLE
-#endif
-#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
-# define _LIBCUDACXX_PTX_SM_90_AVAILABLE
-#endif
+// SM version
 
-// PTX ISA 7.8 is available from CTK 11.8, driver r520
-#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__))
-# define _LIBCUDACXX_PTX_ISA_78_AVAILABLE
-#endif
-// PTX ISA 7.8 is available from CTK 11.8, driver r520 (so also from CTK 12.0 onwards)
-#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
-# define _LIBCUDACXX_PTX_ISA_78_AVAILABLE
-#endif
-// PTX ISA 8.0 is available from CTK 12.0, driver r525
-#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
-# define _LIBCUDACXX_PTX_ISA_80_AVAILABLE
-#endif
-// PTX ISA 8.1 is available from CTK 12.1, driver r530
-#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__))
-# define _LIBCUDACXX_PTX_ISA_81_AVAILABLE
-#endif
-// PTX ISA 8.2 is available from CTK 12.2, driver r535
-#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__))
-# define _LIBCUDACXX_PTX_ISA_82_AVAILABLE
-#endif
-// PTX ISA 8.3 is available from CTK 12.3, driver r545
-#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__))
-# define _LIBCUDACXX_PTX_ISA_83_AVAILABLE
-#endif
+#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
+# define __cccl_ptx_sm 900ULL
+#elif (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
+# define __cccl_ptx_sm 800ULL
+// Fallback case. Define the SM version to be zero. This ensures that the macro is always defined.
+#else
+# define __cccl_ptx_sm 0ULL
+#endif
+
+// PTX ISA version
+
+// PTX ISA 8.3 is available from CTK 12.3, driver r545
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 830ULL
+// PTX ISA 8.2 is available from CTK 12.2, driver r535
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 820ULL
+// PTX ISA 8.1 is available from CTK 12.1, driver r530
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 810ULL
+// PTX ISA 8.0 is available from CTK 12.0, driver r525
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 800ULL
+// PTX ISA 7.8 is available from CTK 11.8, driver r520
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 780ULL
+// Fallback case. Define the ISA version to be zero. This ensures that the macro is always defined.
+#else
+# define __cccl_ptx_isa 0ULL
+#endif
 
 #endif // PTX_ISA_TARGET_MACROS_H_
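With numeric values, gating a wrapper on toolkit and architecture support becomes a pair of ordinary comparisons instead of a growing list of `defined()` checks. A minimal sketch of the intended pattern (the function is a hypothetical placeholder, not part of the patch):

```cuda
// Compiled only when ptxas understands PTX ISA 8.3 *and* the lowest
// targeted architecture is sm_90, mirroring the guards used above.
#if __cccl_ptx_sm >= 900 && __cccl_ptx_isa >= 830
_LIBCUDACXX_DEVICE inline void __future_wrapper()  // hypothetical
{
  // inline asm for an instruction introduced in PTX ISA 8.3 would go here
}
#endif // __cccl_ptx_isa
```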
From 9006317cdb2368eb74a0ac8780c15f3898e1570d Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 15:17:18 +0200
Subject: [PATCH 17/49] Move bulk of ptx header into detail/ptx.h

---
 libcudacxx/include/cuda/ptx                    | 696 +----------------
 .../std/detail/libcxx/include/__cuda/ptx.h     | 711 ++++++++++++++++++
 2 files changed, 715 insertions(+), 692 deletions(-)
 create mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h

diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
index 1f3fb868679..ab6ed62d9d2 100644
--- a/libcudacxx/include/cuda/ptx
+++ b/libcudacxx/include/cuda/ptx
@@ -12,700 +12,12 @@
 #ifndef _CUDA_PTX
 #define _CUDA_PTX
 
-#include "std/cstdint" // uint32_t
-#include "std/type_traits" // std::integral_constant
-#include "../nv/target" // __CUDA_MINIMUM_ARCH__ and friends
+#include "std/detail/__config"
 
-// The following includes depend on the includes above:
-#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h"
-#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h"
-#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h"
-#include "cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h"
+#include "std/detail/__pragma_push"
 
+#include "std/detail/libcxx/include/__cuda/ptx.h"
 
-/*
- * The cuda::ptx namespace intends to provide PTX wrappers for new hardware
- * features and new PTX instructions so that they can be experimented with
- * before higher-level C++ APIs are designed and developed.
- *
- * The wrappers have the following responsibilities:
- *
- * - They must prevent any PTX assembler errors, that is:
- *   - They are defined only for versions of the CUDA Toolkit in which nvcc/ptxas
- *     actually recognizes the instruction.
- *   - Sizes and types of parameters are correct.
- *   - They must convert state spaces correctly.
- * - They adhere to the libcu++ coding standards of using:
- *   - Reserved identifiers for all parameters, variables. E.g. `__meow` or `_Woof`
- *   - _CUDA_VSTD:: namespace for types
- *
- * The wrappers should not do the following:
- *
- * - Use any non-native types. For example, an mbarrier instruction wrapper
- *   takes the barrier address as a uint64_t pointer.
- *
- * This header is intended for:
- *
- * - internal consumption by higher-level APIs such as cuda::barrier,
- * - outside developers who want to experiment with the latest features of the
- *   hardware.
- *
- * Stability:
- *
- * - These headers are intended to present a stable API (not ABI) within one
- *   major version of the CTK. This means that:
- *   - All functions are marked inline
- *   - The type of a function parameter can be changed to be more generic if
- *     that means that code that called the original version can still be
- *     compiled. 
- * - * - Good exposure of the PTX should be high priority. If, at a new major - * version, we face a difficult choice between breaking backward-compatibility - * and an improvement of the PTX exposure, we will tend to the latter option - * more easily than in other parts of libcu++. - */ +#include "std/detail/libcxx/include/__cuda/ptx.h" -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX - -/* - * Instructions - * - * The organization of the instructions below follows that of the PTX ISA documentation: - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#instructions - * - * To improve code organization, some sections are separated into their own - * header. For instance, the mbarrier instructions are found in: - * __cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h - * - */ - -/* - * 9.7.1. Integer Arithmetic Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions - * - */ - -// 9.7.1.7. Integer Arithmetic Instructions: sad -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sad - -// 9.7.1.8. Integer Arithmetic Instructions: div -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-div - -// 9.7.1.9. Integer Arithmetic Instructions: rem -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-rem - -// 9.7.1.10. Integer Arithmetic Instructions: abs -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-abs - -// 9.7.1.11. Integer Arithmetic Instructions: neg -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-neg - -// 9.7.1.12. Integer Arithmetic Instructions: min -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-min - -// 9.7.1.13. Integer Arithmetic Instructions: max -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-max - -// 9.7.1.14. Integer Arithmetic Instructions: popc -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc - -// 9.7.1.15. Integer Arithmetic Instructions: clz -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz - -// 9.7.1.16. Integer Arithmetic Instructions: bfind -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind - -// 9.7.1.17. Integer Arithmetic Instructions: fns -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns - -// 9.7.1.18. Integer Arithmetic Instructions: brev -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev - -// 9.7.1.19. Integer Arithmetic Instructions: bfe -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe - -// 9.7.1.20. Integer Arithmetic Instructions: bfi -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi - -// 9.7.1.21. Integer Arithmetic Instructions: szext -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-szext - -// 9.7.1.22. Integer Arithmetic Instructions: bmsk -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bmsk - -// 9.7.1.23. 
Integer Arithmetic Instructions: dp4a -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp4a - -// 9.7.1.24. Integer Arithmetic Instructions: dp2a -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp2a - - -/* - * 9.7.2. Extended-Precision Integer Arithmetic Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-integer-arithmetic-instructions - * - */ - -// 9.7.2.1. Extended-Precision Arithmetic Instructions: add.cc -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-add-cc - -// 9.7.2.2. Extended-Precision Arithmetic Instructions: addc -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-addc - -// 9.7.2.3. Extended-Precision Arithmetic Instructions: sub.cc -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-sub-cc - -// 9.7.2.4. Extended-Precision Arithmetic Instructions: subc -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-subc - -// 9.7.2.5. Extended-Precision Arithmetic Instructions: mad.cc -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-mad-cc - -// 9.7.2.6. Extended-Precision Arithmetic Instructions: madc -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc - - -/* - * 9.7.3. Floating-Point Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions - * - */ - -// 9.7.3.1. Floating Point Instructions: testp -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-testp - -// 9.7.3.2. Floating Point Instructions: copysign -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign - -// 9.7.3.3. Floating Point Instructions: add -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add - -// 9.7.3.4. Floating Point Instructions: sub -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sub - -// 9.7.3.5. Floating Point Instructions: mul -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul - -// 9.7.3.6. Floating Point Instructions: fma -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma - -// 9.7.3.7. Floating Point Instructions: mad -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad - -// 9.7.3.8. Floating Point Instructions: div -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div - -// 9.7.3.9. Floating Point Instructions: abs -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-abs - -// 9.7.3.10. Floating Point Instructions: neg -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg - -// 9.7.3.11. Floating Point Instructions: min -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min - -// 9.7.3.12. 
Floating Point Instructions: max -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max - -// 9.7.3.13. Floating Point Instructions: rcp -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp - -// 9.7.3.14. Floating Point Instructions: rcp.approx.ftz.f64 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp-approx-ftz-f64 - -// 9.7.3.15. Floating Point Instructions: sqrt -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sqrt - -// 9.7.3.16. Floating Point Instructions: rsqrt -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt - -// 9.7.3.17. Floating Point Instructions: rsqrt.approx.ftz.f64 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt-approx-ftz-f64 - -// 9.7.3.18. Floating Point Instructions: sin -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sin - -// 9.7.3.19. Floating Point Instructions: cos -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos - -// 9.7.3.20. Floating Point Instructions: lg2 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2 - -// 9.7.3.21. Floating Point Instructions: ex2 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2 - -// 9.7.3.22. Floating Point Instructions: tanh -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-tanh - - -/* - * 9.7.4. Half Precision Floating-Point Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions - * - */ - -// 9.7.4.1. Half Precision Floating Point Instructions: add -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add - -// 9.7.4.2. Half Precision Floating Point Instructions: sub -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-sub - -// 9.7.4.3. Half Precision Floating Point Instructions: mul -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-mul - -// 9.7.4.4. Half Precision Floating Point Instructions: fma -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-fma - -// 9.7.4.5. Half Precision Floating Point Instructions: neg -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-neg - -// 9.7.4.6. Half Precision Floating Point Instructions: abs -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs - -// 9.7.4.7. Half Precision Floating Point Instructions: min -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-min - -// 9.7.4.8. Half Precision Floating Point Instructions: max -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-max - -// 9.7.4.9. Half Precision Floating Point Instructions: tanh -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-tanh - -// 9.7.4.10. 
Half Precision Floating Point Instructions: ex2 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-ex2 - - -/* - * 9.7.5. Comparison and Selection Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions - * - */ - -// 9.7.5.1. Comparison and Selection Instructions: set -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-set - -// 9.7.5.2. Comparison and Selection Instructions: setp -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-setp - -// 9.7.5.3. Comparison and Selection Instructions: selp -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp - -// 9.7.5.4. Comparison and Selection Instructions: slct -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-slct - - -/* - * 9.7.6. Half Precision Comparison Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions - * - */ - -// 9.7.6.1. Half Precision Comparison Instructions: set -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-set - -// 9.7.6.2. Half Precision Comparison Instructions: setp -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-setp - - -/* - * 9.7.7. Logic and Shift Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions - * - */ - -// 9.7.7.1. Logic and Shift Instructions: and -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and - -// 9.7.7.2. Logic and Shift Instructions: or -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or - -// 9.7.7.3. Logic and Shift Instructions: xor -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-xor - -// 9.7.7.4. Logic and Shift Instructions: not -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not - -// 9.7.7.5. Logic and Shift Instructions: cnot -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-cnot - -// 9.7.7.6. Logic and Shift Instructions: lop3 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3 - -// 9.7.7.7. Logic and Shift Instructions: shf -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shf - -// 9.7.7.8. Logic and Shift Instructions: shl -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shl - -// 9.7.7.9. Logic and Shift Instructions: shr -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shr - - -/* - * 9.7.8. Data Movement and Conversion Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions - * - */ - -// 9.7.8.3. Data Movement and Conversion Instructions: mov -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov - -// 9.7.8.4. 
Data Movement and Conversion Instructions: mov -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2 - -// 9.7.8.5. Data Movement and Conversion Instructions: shfl (deprecated) -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated - -// 9.7.8.6. Data Movement and Conversion Instructions: shfl.sync -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync - -// 9.7.8.7. Data Movement and Conversion Instructions: prmt -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt - -// 9.7.8.8. Data Movement and Conversion Instructions: ld -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld - -// 9.7.8.9. Data Movement and Conversion Instructions: ld.global.nc -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc - -// 9.7.8.10. Data Movement and Conversion Instructions: ldu -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu - -// 9.7.8.11. Data Movement and Conversion Instructions: st -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st - -// 9.7.8.12. Data Movement and Conversion Instructions: st.async -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async - -// 9.7.8.13. Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red - -// 9.7.8.14. Data Movement and Conversion Instructions: prefetch, prefetchu -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu - -// 9.7.8.15. Data Movement and Conversion Instructions: applypriority -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority - -// 9.7.8.16. Data Movement and Conversion Instructions: discard -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-discard - -// 9.7.8.17. Data Movement and Conversion Instructions: createpolicy -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-createpolicy - -// 9.7.8.18. Data Movement and Conversion Instructions: isspacep -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep - -// 9.7.8.19. Data Movement and Conversion Instructions: cvta -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta - -// 9.7.8.20. Data Movement and Conversion Instructions: cvt -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt - -// 9.7.8.21. Data Movement and Conversion Instructions: cvt.pack -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack - -// 9.7.8.22. 
Data Movement and Conversion Instructions: mapa -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa - -// 9.7.8.23. Data Movement and Conversion Instructions: getctarank -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank - - -/* - * 9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy - * - */ - -// 9.7.8.24.3. Data Movement and Conversion Instructions: cp.async -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async - -// 9.7.8.24.4. Data Movement and Conversion Instructions: cp.async.commit_group -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group - -// 9.7.8.24.5. Data Movement and Conversion Instructions: cp.async.wait_group / cp.async.wait_all -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all - -// 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk - -// 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk - -// 9.7.8.24.8. Data Movement and Conversion Instructions: cp.async.bulk.prefetch -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch - -// 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor - -// 9.7.8.24.10. Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor - -// 9.7.8.24.11. Data Movement and Conversion Instructions: cp.async.bulk.prefetch.tensor -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor - -// 9.7.8.24.12. Data Movement and Conversion Instructions: cp.async.bulk.commit_group -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group - -// 9.7.8.24.13. Data Movement and Conversion Instructions: cp.async.bulk.wait_group -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group - -// 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace - - -/* - * 9.7.9. Texture Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions - * - */ - -// 9.7.9.3. Texture Instructions: tex -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex - -// 9.7.9.4. 
Texture Instructions: tld4 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tld4 - -// 9.7.9.5. Texture Instructions: txq -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-txq - -// 9.7.9.6. Texture Instructions: istypep -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-istypep - - -/* - * 9.7.10. Surface Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions - * - */ - -// 9.7.10.1. Surface Instructions: suld -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suld - -// 9.7.10.2. Surface Instructions: sust -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust - -// 9.7.10.3. Surface Instructions: sured -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sured - -// 9.7.10.4. Surface Instructions: suq -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suq - - -/* - * 9.7.11. Control Flow Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions - * - */ - -// 9.7.11.1. Control Flow Instructions: {} -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-curly-braces - -// 9.7.11.2. Control Flow Instructions: @ -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-at - -// 9.7.11.3. Control Flow Instructions: bra -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra - -// 9.7.11.4. Control Flow Instructions: brx.idx -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-brx-idx - -// 9.7.11.5. Control Flow Instructions: call -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call - -// 9.7.11.6. Control Flow Instructions: ret -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret - -// 9.7.11.7. Control Flow Instructions: exit -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit - - -/* - * 9.7.12. Parallel Synchronization and Communication Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions - * - */ - -// 9.7.12.1. Parallel Synchronization and Communication Instructions: bar, barrier -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier - -// 9.7.12.2. Parallel Synchronization and Communication Instructions: bar.warp.sync -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync - -// 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster - -// 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence - -// 9.7.12.5. 
Parallel Synchronization and Communication Instructions: atom -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom - -// 9.7.12.6. Parallel Synchronization and Communication Instructions: red -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red - -// 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async - -// 9.7.12.8. Parallel Synchronization and Communication Instructions: vote (deprecated) -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated - -// 9.7.12.9. Parallel Synchronization and Communication Instructions: vote.sync -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync - -// 9.7.12.10. Parallel Synchronization and Communication Instructions: match.sync -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync - -// 9.7.12.11. Parallel Synchronization and Communication Instructions: activemask -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask - -// 9.7.12.12. Parallel Synchronization and Communication Instructions: redux.sync -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync - -// 9.7.12.13. Parallel Synchronization and Communication Instructions: griddepcontrol -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol - -// 9.7.12.14. Parallel Synchronization and Communication Instructions: elect.sync -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync - -/* - * 9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier - * - * Contained in: __cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h - */ - -// 9.7.12.15.18. Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy - - -/* - * 9.7.13. Warp Level Matrix Multiply-Accumulate Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-accumulate-instructions - * - */ - -// 9.7.13.3.3. Warp-level Matrix Load Instruction: wmma.load -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-wmma-load - -// 9.7.13.3.4. Warp-level Matrix Store Instruction: wmma.store -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-wmma-store - -// 9.7.13.3.5. 
Warp-level Matrix Multiply-and-Accumulate Instruction: wmma.mma -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-and-accumulate-instruction-wmma-mma - -// 9.7.13.4.14. Multiply-and-Accumulate Instruction: mma -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma - -// 9.7.13.4.15. Warp-level matrix load instruction: ldmatrix -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-ldmatrix - -// 9.7.13.4.16. Warp-level matrix store instruction: stmatrix -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix - -// 9.7.13.4.17. Warp-level matrix transpose instruction: movmatrix -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-transpose-instruction-movmatrix - -// 9.7.13.5.3. Multiply-and-Accumulate Instruction: mma.sp -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma-sp - - -/* - * 9.7.14. Asynchronous Warpgroup Level Matrix Multiply-Accumulate Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-multiply-accumulate-instructions - * - */ - -// 9.7.14.5.2. Asynchronous Multiply-and-Accumulate Instruction: wgmma.mma_async -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async - -// 9.7.14.6.4. Asynchronous Multiply-and-Accumulate Instruction: wgmma.mma_async.sp -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async-sp - -// 9.7.14.7.1. Asynchronous Multiply-and-Accumulate Instruction: wgmma.fence -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-fence - -// 9.7.14.7.2. Asynchronous Multiply-and-Accumulate Instruction: wgmma.commit_group -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-commit-group - -// 9.7.14.7.3. Asynchronous Multiply-and-Accumulate Instruction: wgmma.wait_group -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-wait-group - - -/* - * 9.7.15. Stack Manipulation Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions - * - */ - -// 9.7.15.1. Stack Manipulation Instructions: stacksave -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stacksave - -// 9.7.15.2. Stack Manipulation Instructions: stackrestore -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stackrestore - -// 9.7.15.3. Stack Manipulation Instructions: alloca -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca - - -/* - * 9.7.16. Video Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#video-instructions - * - */ - -// 9.7.16.1.1. Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax - -// 9.7.16.1.2. 
Scalar Video Instructions: vshl, vshr -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr - -// 9.7.16.1.3. Scalar Video Instructions: vmad -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vmad - -// 9.7.16.1.4. Scalar Video Instructions: vset -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vset - - -/* - * 9.7.16.2. SIMD Video Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions - * - */ - -// 9.7.16.2.1. SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2 - -// 9.7.16.2.2. SIMD Video Instructions: vset2 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset2 - -// 9.7.16.2.3. SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4 - -// 9.7.16.2.4. SIMD Video Instructions: vset4 -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset4 - - -/* - * 9.7.17. Miscellaneous Instructions - * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions - * - */ - -// 9.7.17.1. Miscellaneous Instructions: brkpt -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-brkpt - -// 9.7.17.2. Miscellaneous Instructions: nanosleep -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep - -// 9.7.17.3. Miscellaneous Instructions: pmevent -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent - -// 9.7.17.4. Miscellaneous Instructions: trap -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap - -// 9.7.17.5. Miscellaneous Instructions: setmaxnreg -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg - -_LIBCUDACXX_END_NAMESPACE_CUDA_PTX +#include "std/detail/__pragma_pop" #endif // _CUDA_PTX diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h new file mode 100644 index 00000000000..efa73e15f09 --- /dev/null +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -0,0 +1,711 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCUDACXX___CUDA_PTX_H
+#define _LIBCUDACXX___CUDA_PTX_H
+
+#include "../cstdint" // uint32_t
+#include "../type_traits" // std::integral_constant
+#include "../../../../../../nv/target" // __CUDA_MINIMUM_ARCH__ and friends
+
+// The following includes depend on the includes above:
+#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h"
+#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h"
+#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h"
+#include "cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h"
+
+/*
+ * The cuda::ptx namespace intends to provide PTX wrappers for new hardware
+ * features and new PTX instructions so that they can be experimented with
+ * before higher-level C++ APIs are designed and developed.
+ *
+ * The wrappers have the following responsibilities:
+ *
+ * - They must prevent any PTX assembler errors, that is:
+ *   - They are defined only for versions of the CUDA Toolkit in which
+ *     nvcc/ptxas actually recognizes the instruction.
+ *   - Sizes and types of parameters are correct.
+ * - They must convert state spaces correctly.
+ * - They adhere to the libcu++ coding standards of using:
+ *   - Reserved identifiers for all parameters and variables, e.g. `__meow`
+ *     or `_Woof`
+ *   - _CUDA_VSTD:: namespace for types
+ *
+ * The wrappers should not do the following:
+ *
+ * - Use any non-native types. For example, an mbarrier instruction wrapper
+ *   takes the barrier address as a uint64_t pointer.
+ *
+ * This header is intended for:
+ *
+ * - internal consumption by higher-level APIs such as cuda::barrier,
+ * - outside developers who want to experiment with the latest features of
+ *   the hardware.
+ *
+ * Stability:
+ *
+ * - These headers are intended to present a stable API (not ABI) within one
+ *   major version of the CTK. This means that:
+ *   - All functions are marked inline.
+ *   - The type of a function parameter can be changed to be more generic as
+ *     long as code that called the original version can still be compiled.
+ *
+ * - Good exposure of the PTX should be a high priority. If, at a new major
+ *   version, we face a difficult choice between breaking backward
+ *   compatibility and improving the PTX exposure, we will lean toward the
+ *   latter more readily than in other parts of libcu++.
+ */
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+/*
+ * Instructions
+ *
+ * The organization of the instructions below follows that of the PTX ISA documentation:
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#instructions
+ *
+ * To improve code organization, some sections are separated into their own
+ * header. For instance, the mbarrier instructions are found in:
+ * __cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
+ *
+ */
+
+/*
+ * 9.7.1. Integer Arithmetic Instructions
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions
+ *
+ */
+
+// 9.7.1.7. Integer Arithmetic Instructions: sad
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sad
+
+// 9.7.1.8. Integer Arithmetic Instructions: div
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-div
+
+// 9.7.1.9.
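
Before the table of contents continues, a minimal sketch of the shape such a wrapper takes, for concreteness. It is illustrative rather than this patch's actual code: the function name is hypothetical, and the real wrappers additionally take the .sem/.scope/.space tag types as parameters and spell identifiers with reserved names, per the coding standards above. It shows the essential pattern: an inline device function over native types wrapping a single inline-asm statement, compiled only where the instruction exists (mbarrier.arrive.expect_tx assumes sm_90 and PTX ISA 8.0):

    #include <cstdint>

    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
    // Hypothetical simplified wrapper: fixed .release semantics, .cta scope,
    // and .shared::cta state space.
    __device__ inline std::uint64_t example_mbarrier_arrive_expect_tx(
        std::uint64_t* bar,      // generic pointer to an mbarrier in shared memory
        std::uint32_t tx_count)  // transaction count to expect
    {
      std::uint64_t state;
      asm volatile(
          "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
          : "=l"(state)
          : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(bar))),
            "r"(tx_count)
          : "memory");
      return state;
    }
    #endif

Everything else in this series (the dot-variant tag types, the ISA/target macros, helpers like __as_b64) exists so that a family of such asm statements can be selected safely at compile time.
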
Integer Arithmetic Instructions: rem +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-rem + +// 9.7.1.10. Integer Arithmetic Instructions: abs +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-abs + +// 9.7.1.11. Integer Arithmetic Instructions: neg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-neg + +// 9.7.1.12. Integer Arithmetic Instructions: min +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-min + +// 9.7.1.13. Integer Arithmetic Instructions: max +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-max + +// 9.7.1.14. Integer Arithmetic Instructions: popc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc + +// 9.7.1.15. Integer Arithmetic Instructions: clz +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz + +// 9.7.1.16. Integer Arithmetic Instructions: bfind +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind + +// 9.7.1.17. Integer Arithmetic Instructions: fns +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns + +// 9.7.1.18. Integer Arithmetic Instructions: brev +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev + +// 9.7.1.19. Integer Arithmetic Instructions: bfe +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe + +// 9.7.1.20. Integer Arithmetic Instructions: bfi +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi + +// 9.7.1.21. Integer Arithmetic Instructions: szext +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-szext + +// 9.7.1.22. Integer Arithmetic Instructions: bmsk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bmsk + +// 9.7.1.23. Integer Arithmetic Instructions: dp4a +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp4a + +// 9.7.1.24. Integer Arithmetic Instructions: dp2a +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp2a + + +/* + * 9.7.2. Extended-Precision Integer Arithmetic Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-integer-arithmetic-instructions + * + */ + +// 9.7.2.1. Extended-Precision Arithmetic Instructions: add.cc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-add-cc + +// 9.7.2.2. Extended-Precision Arithmetic Instructions: addc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-addc + +// 9.7.2.3. Extended-Precision Arithmetic Instructions: sub.cc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-sub-cc + +// 9.7.2.4. Extended-Precision Arithmetic Instructions: subc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-subc + +// 9.7.2.5. 
Extended-Precision Arithmetic Instructions: mad.cc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-mad-cc + +// 9.7.2.6. Extended-Precision Arithmetic Instructions: madc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc + + +/* + * 9.7.3. Floating-Point Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions + * + */ + +// 9.7.3.1. Floating Point Instructions: testp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-testp + +// 9.7.3.2. Floating Point Instructions: copysign +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign + +// 9.7.3.3. Floating Point Instructions: add +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add + +// 9.7.3.4. Floating Point Instructions: sub +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sub + +// 9.7.3.5. Floating Point Instructions: mul +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul + +// 9.7.3.6. Floating Point Instructions: fma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma + +// 9.7.3.7. Floating Point Instructions: mad +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad + +// 9.7.3.8. Floating Point Instructions: div +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div + +// 9.7.3.9. Floating Point Instructions: abs +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-abs + +// 9.7.3.10. Floating Point Instructions: neg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg + +// 9.7.3.11. Floating Point Instructions: min +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min + +// 9.7.3.12. Floating Point Instructions: max +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max + +// 9.7.3.13. Floating Point Instructions: rcp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp + +// 9.7.3.14. Floating Point Instructions: rcp.approx.ftz.f64 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp-approx-ftz-f64 + +// 9.7.3.15. Floating Point Instructions: sqrt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sqrt + +// 9.7.3.16. Floating Point Instructions: rsqrt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt + +// 9.7.3.17. Floating Point Instructions: rsqrt.approx.ftz.f64 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt-approx-ftz-f64 + +// 9.7.3.18. Floating Point Instructions: sin +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sin + +// 9.7.3.19. Floating Point Instructions: cos +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos + +// 9.7.3.20. 
Floating Point Instructions: lg2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2 + +// 9.7.3.21. Floating Point Instructions: ex2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2 + +// 9.7.3.22. Floating Point Instructions: tanh +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-tanh + + +/* + * 9.7.4. Half Precision Floating-Point Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions + * + */ + +// 9.7.4.1. Half Precision Floating Point Instructions: add +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add + +// 9.7.4.2. Half Precision Floating Point Instructions: sub +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-sub + +// 9.7.4.3. Half Precision Floating Point Instructions: mul +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-mul + +// 9.7.4.4. Half Precision Floating Point Instructions: fma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-fma + +// 9.7.4.5. Half Precision Floating Point Instructions: neg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-neg + +// 9.7.4.6. Half Precision Floating Point Instructions: abs +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs + +// 9.7.4.7. Half Precision Floating Point Instructions: min +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-min + +// 9.7.4.8. Half Precision Floating Point Instructions: max +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-max + +// 9.7.4.9. Half Precision Floating Point Instructions: tanh +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-tanh + +// 9.7.4.10. Half Precision Floating Point Instructions: ex2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-ex2 + + +/* + * 9.7.5. Comparison and Selection Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions + * + */ + +// 9.7.5.1. Comparison and Selection Instructions: set +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-set + +// 9.7.5.2. Comparison and Selection Instructions: setp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-setp + +// 9.7.5.3. Comparison and Selection Instructions: selp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp + +// 9.7.5.4. Comparison and Selection Instructions: slct +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-slct + + +/* + * 9.7.6. Half Precision Comparison Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions + * + */ + +// 9.7.6.1. 
Half Precision Comparison Instructions: set +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-set + +// 9.7.6.2. Half Precision Comparison Instructions: setp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-setp + + +/* + * 9.7.7. Logic and Shift Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions + * + */ + +// 9.7.7.1. Logic and Shift Instructions: and +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and + +// 9.7.7.2. Logic and Shift Instructions: or +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or + +// 9.7.7.3. Logic and Shift Instructions: xor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-xor + +// 9.7.7.4. Logic and Shift Instructions: not +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not + +// 9.7.7.5. Logic and Shift Instructions: cnot +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-cnot + +// 9.7.7.6. Logic and Shift Instructions: lop3 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3 + +// 9.7.7.7. Logic and Shift Instructions: shf +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shf + +// 9.7.7.8. Logic and Shift Instructions: shl +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shl + +// 9.7.7.9. Logic and Shift Instructions: shr +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shr + + +/* + * 9.7.8. Data Movement and Conversion Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions + * + */ + +// 9.7.8.3. Data Movement and Conversion Instructions: mov +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov + +// 9.7.8.4. Data Movement and Conversion Instructions: mov +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2 + +// 9.7.8.5. Data Movement and Conversion Instructions: shfl (deprecated) +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated + +// 9.7.8.6. Data Movement and Conversion Instructions: shfl.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync + +// 9.7.8.7. Data Movement and Conversion Instructions: prmt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt + +// 9.7.8.8. Data Movement and Conversion Instructions: ld +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld + +// 9.7.8.9. Data Movement and Conversion Instructions: ld.global.nc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc + +// 9.7.8.10. Data Movement and Conversion Instructions: ldu +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu + +// 9.7.8.11. 
Data Movement and Conversion Instructions: st +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st + +// 9.7.8.12. Data Movement and Conversion Instructions: st.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async + +// 9.7.8.13. Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red + +// 9.7.8.14. Data Movement and Conversion Instructions: prefetch, prefetchu +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu + +// 9.7.8.15. Data Movement and Conversion Instructions: applypriority +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority + +// 9.7.8.16. Data Movement and Conversion Instructions: discard +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-discard + +// 9.7.8.17. Data Movement and Conversion Instructions: createpolicy +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-createpolicy + +// 9.7.8.18. Data Movement and Conversion Instructions: isspacep +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep + +// 9.7.8.19. Data Movement and Conversion Instructions: cvta +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta + +// 9.7.8.20. Data Movement and Conversion Instructions: cvt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt + +// 9.7.8.21. Data Movement and Conversion Instructions: cvt.pack +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack + +// 9.7.8.22. Data Movement and Conversion Instructions: mapa +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa + +// 9.7.8.23. Data Movement and Conversion Instructions: getctarank +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank + + +/* + * 9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy + * + */ + +// 9.7.8.24.3. Data Movement and Conversion Instructions: cp.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async + +// 9.7.8.24.4. Data Movement and Conversion Instructions: cp.async.commit_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group + +// 9.7.8.24.5. Data Movement and Conversion Instructions: cp.async.wait_group / cp.async.wait_all +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all + +// 9.7.8.24.6. 
Data Movement and Conversion Instructions: cp.async.bulk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk + +// 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk + +// 9.7.8.24.8. Data Movement and Conversion Instructions: cp.async.bulk.prefetch +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch + +// 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor + +// 9.7.8.24.10. Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor + +// 9.7.8.24.11. Data Movement and Conversion Instructions: cp.async.bulk.prefetch.tensor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor + +// 9.7.8.24.12. Data Movement and Conversion Instructions: cp.async.bulk.commit_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group + +// 9.7.8.24.13. Data Movement and Conversion Instructions: cp.async.bulk.wait_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group + +// 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace + + +/* + * 9.7.9. Texture Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions + * + */ + +// 9.7.9.3. Texture Instructions: tex +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex + +// 9.7.9.4. Texture Instructions: tld4 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tld4 + +// 9.7.9.5. Texture Instructions: txq +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-txq + +// 9.7.9.6. Texture Instructions: istypep +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-istypep + + +/* + * 9.7.10. Surface Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions + * + */ + +// 9.7.10.1. Surface Instructions: suld +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suld + +// 9.7.10.2. Surface Instructions: sust +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust + +// 9.7.10.3. Surface Instructions: sured +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sured + +// 9.7.10.4. Surface Instructions: suq +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suq + + +/* + * 9.7.11. Control Flow Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions + * + */ + +// 9.7.11.1. 
Control Flow Instructions: {} +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-curly-braces + +// 9.7.11.2. Control Flow Instructions: @ +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-at + +// 9.7.11.3. Control Flow Instructions: bra +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra + +// 9.7.11.4. Control Flow Instructions: brx.idx +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-brx-idx + +// 9.7.11.5. Control Flow Instructions: call +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call + +// 9.7.11.6. Control Flow Instructions: ret +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret + +// 9.7.11.7. Control Flow Instructions: exit +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit + + +/* + * 9.7.12. Parallel Synchronization and Communication Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions + * + */ + +// 9.7.12.1. Parallel Synchronization and Communication Instructions: bar, barrier +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier + +// 9.7.12.2. Parallel Synchronization and Communication Instructions: bar.warp.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync + +// 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster + +// 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence + +// 9.7.12.5. Parallel Synchronization and Communication Instructions: atom +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom + +// 9.7.12.6. Parallel Synchronization and Communication Instructions: red +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red + +// 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async + +// 9.7.12.8. Parallel Synchronization and Communication Instructions: vote (deprecated) +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated + +// 9.7.12.9. Parallel Synchronization and Communication Instructions: vote.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync + +// 9.7.12.10. Parallel Synchronization and Communication Instructions: match.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync + +// 9.7.12.11. 
Parallel Synchronization and Communication Instructions: activemask +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask + +// 9.7.12.12. Parallel Synchronization and Communication Instructions: redux.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync + +// 9.7.12.13. Parallel Synchronization and Communication Instructions: griddepcontrol +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol + +// 9.7.12.14. Parallel Synchronization and Communication Instructions: elect.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync + +/* + * 9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier + * + * Contained in: __cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h + */ + +// 9.7.12.15.18. Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy + + +/* + * 9.7.13. Warp Level Matrix Multiply-Accumulate Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-accumulate-instructions + * + */ + +// 9.7.13.3.3. Warp-level Matrix Load Instruction: wmma.load +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-wmma-load + +// 9.7.13.3.4. Warp-level Matrix Store Instruction: wmma.store +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-wmma-store + +// 9.7.13.3.5. Warp-level Matrix Multiply-and-Accumulate Instruction: wmma.mma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-and-accumulate-instruction-wmma-mma + +// 9.7.13.4.14. Multiply-and-Accumulate Instruction: mma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma + +// 9.7.13.4.15. Warp-level matrix load instruction: ldmatrix +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-ldmatrix + +// 9.7.13.4.16. Warp-level matrix store instruction: stmatrix +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix + +// 9.7.13.4.17. Warp-level matrix transpose instruction: movmatrix +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-transpose-instruction-movmatrix + +// 9.7.13.5.3. Multiply-and-Accumulate Instruction: mma.sp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma-sp + + +/* + * 9.7.14. Asynchronous Warpgroup Level Matrix Multiply-Accumulate Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-multiply-accumulate-instructions + * + */ + +// 9.7.14.5.2. 
Asynchronous Multiply-and-Accumulate Instruction: wgmma.mma_async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async + +// 9.7.14.6.4. Asynchronous Multiply-and-Accumulate Instruction: wgmma.mma_async.sp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async-sp + +// 9.7.14.7.1. Asynchronous Multiply-and-Accumulate Instruction: wgmma.fence +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-fence + +// 9.7.14.7.2. Asynchronous Multiply-and-Accumulate Instruction: wgmma.commit_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-commit-group + +// 9.7.14.7.3. Asynchronous Multiply-and-Accumulate Instruction: wgmma.wait_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-wait-group + + +/* + * 9.7.15. Stack Manipulation Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions + * + */ + +// 9.7.15.1. Stack Manipulation Instructions: stacksave +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stacksave + +// 9.7.15.2. Stack Manipulation Instructions: stackrestore +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stackrestore + +// 9.7.15.3. Stack Manipulation Instructions: alloca +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca + + +/* + * 9.7.16. Video Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#video-instructions + * + */ + +// 9.7.16.1.1. Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax + +// 9.7.16.1.2. Scalar Video Instructions: vshl, vshr +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr + +// 9.7.16.1.3. Scalar Video Instructions: vmad +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vmad + +// 9.7.16.1.4. Scalar Video Instructions: vset +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vset + + +/* + * 9.7.16.2. SIMD Video Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions + * + */ + +// 9.7.16.2.1. SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2 + +// 9.7.16.2.2. SIMD Video Instructions: vset2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset2 + +// 9.7.16.2.3. SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4 + +// 9.7.16.2.4. SIMD Video Instructions: vset4 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset4 + + +/* + * 9.7.17. 
Miscellaneous Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions + * + */ + +// 9.7.17.1. Miscellaneous Instructions: brkpt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-brkpt + +// 9.7.17.2. Miscellaneous Instructions: nanosleep +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep + +// 9.7.17.3. Miscellaneous Instructions: pmevent +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent + +// 9.7.17.4. Miscellaneous Instructions: trap +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap + +// 9.7.17.5. Miscellaneous Instructions: setmaxnreg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _LIBCUDACXX___CUDA_PTX_H From 42710f931ee6e3c7da02bafd04978aa08fdf8c7f Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 25 Oct 2023 15:22:00 +0200 Subject: [PATCH 18/49] Rename include guards --- ...ynchronization_and_communication_instructions_mbarrier.h | 6 +++--- .../std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h | 6 +++--- .../detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h | 6 +++--- .../libcxx/include/__cuda/ptx/ptx_isa_target_macros.h | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 2bcfdc4bd33..ca5f9f09f42 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -10,8 +10,8 @@ //===----------------------------------------------------------------------===// -#ifndef PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ -#define PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ +#ifndef _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ +#define _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX @@ -112,4 +112,4 @@ _LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( _LIBCUDACXX_END_NAMESPACE_CUDA_PTX -#endif // PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ +#endif // _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h index c91a2512847..13f48692e4f 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h @@ -10,8 +10,8 @@ //===----------------------------------------------------------------------===// -#ifndef PTX_DOT_VARIANTS_H_ -#define PTX_DOT_VARIANTS_H_ +#ifndef _CUDA_PTX_DOT_VARIANTS_H_ +#define _CUDA_PTX_DOT_VARIANTS_H_ /* * Public integral constant types and values for ".variant"s: @@ -136,4 +136,4 @@ static constexpr scope_sys_t 
scope_sys{}; _LIBCUDACXX_END_NAMESPACE_CUDA_PTX -#endif // PTX_DOT_VARIANTS_H_ +#endif // _CUDA_PTX_DOT_VARIANTS_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h index 02ac1370d3d..6f2f8aa3060 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h @@ -9,8 +9,8 @@ // //===----------------------------------------------------------------------===// -#ifndef PTX_HELPER_FUNCTIONS_H_ -#define PTX_HELPER_FUNCTIONS_H_ +#ifndef _CUDA_PTX_HELPER_FUNCTIONS_H_ +#define _CUDA_PTX_HELPER_FUNCTIONS_H_ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX @@ -44,4 +44,4 @@ inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val) _LIBCUDACXX_END_NAMESPACE_CUDA_PTX -#endif // PTX_HELPER_FUNCTIONS_H_ +#endif // _CUDA_PTX_HELPER_FUNCTIONS_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h index e0306dbf627..a1f88e3423f 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h @@ -10,8 +10,8 @@ //===----------------------------------------------------------------------===// -#ifndef PTX_ISA_TARGET_MACROS_H_ -#define PTX_ISA_TARGET_MACROS_H_ +#ifndef _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ +#define _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ /* @@ -59,4 +59,4 @@ # define __cccl_ptx_isa 0ULL #endif -#endif // PTX_ISA_TARGET_MACROS_H_ +#endif // _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ From 4144d43f22c4bd875bbc3093cb8a29f87f2a8fb1 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 25 Oct 2023 15:26:27 +0200 Subject: [PATCH 19/49] Fix missing includes --- .../include/cuda/std/detail/libcxx/include/__cuda/ptx.h | 2 -- ...synchronization_and_communication_instructions_mbarrier.h | 5 +++++ .../std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h | 2 ++ .../detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h | 4 ++++ .../detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h | 1 + 5 files changed, 12 insertions(+), 2 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index efa73e15f09..9c8a33c18dd 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -13,10 +13,8 @@ #define _LIBCUDACXX___CUDA_PTX_H #include "../cstdint" // uint32_t -#include "../type_traits" // std::integral_constant #include "../../../../../../nv/target" // __CUDA_MINIMUM_ARCH__ and friends -// The following includes depend on the includes above: #include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h" #include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h" #include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h" diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index ca5f9f09f42..47ad09f5d66 100644 --- 
a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
@@ -13,6 +13,11 @@
 #ifndef _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_
 #define _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_
 
+#include "ptx_dot_variants.h"
+#include "ptx_helper_functions.h"
+#include "ptx_isa_target_macros.h"
+#include "../../cstdint"
+
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 
 /*
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
index 13f48692e4f..3ed1fca8bd2 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
@@ -13,6 +13,8 @@
 #ifndef _CUDA_PTX_DOT_VARIANTS_H_
 #define _CUDA_PTX_DOT_VARIANTS_H_
 
+#include "../../type_traits" // std::integral_constant
+
 /*
  * Public integral constant types and values for ".variant"s:
  *
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
index 6f2f8aa3060..f2bf91615e2 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
@@ -12,6 +12,8 @@
 #ifndef _CUDA_PTX_HELPER_FUNCTIONS_H_
 #define _CUDA_PTX_HELPER_FUNCTIONS_H_
 
+#include "../../cstdint" // uint32_t
+
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 
 // Private helper functions
@@ -32,6 +34,7 @@ template <typename _Tp>
 inline _LIBCUDACXX_DEVICE int __as_b32(_Tp __val)
 {
   static_assert(sizeof(_Tp) == 4, "");
+  // Consider using std::bit_cast
   return *reinterpret_cast<int*>(&__val);
 }
 
@@ -39,6 +42,7 @@ template <typename _Tp>
 inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val)
 {
   static_assert(sizeof(_Tp) == 8, "");
+  // Consider using std::bit_cast
   return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val);
 }
 
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
index a1f88e3423f..4e996a02eb4 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
@@ -13,6 +13,7 @@
 #ifndef _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
 #define _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
 
+#include "../../../../../../../nv/target" // __CUDA_MINIMUM_ARCH__ and friends
 
 /*
  * Targeting macros

From 8a609cd8f1783eede2c12fe2f841b0338b5dc508 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 15:28:00 +0200
Subject: [PATCH 20/49] Remove redundant comment

---
 .../std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
index f2bf91615e2..731c07aee12 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
@@ -16,7 +16,6 @@
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX -// Private helper functions inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_smem_ptr(const void* __ptr) { return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); From 6953ea0027e3899df78b301387eb2443171a69cd Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 25 Oct 2023 15:32:35 +0200 Subject: [PATCH 21/49] Rename __as_smem_ptr -> __as_ptr_smem for disambiguation --- ...ization_and_communication_instructions_mbarrier.h | 8 ++++---- .../libcxx/include/__cuda/ptx/ptx_helper_functions.h | 12 +++++++++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 47ad09f5d66..66c10cfb762 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -58,14 +58,14 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( asm ( "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;" : "=l"(__token) - : "r"(__as_smem_ptr(__addr)), + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) : "memory"); } else { asm ( "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;" : "=l"(__token) - : "r"(__as_smem_ptr(__addr)), + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) : "memory"); } @@ -88,14 +88,14 @@ _LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( asm ( "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;" : - : "r"(__as_smem_ptr(__addr)), + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__tx_count) : "memory"); } else { asm ( "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;" : - : "r"(__as_smem_ptr(__addr)), + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__tx_count) : "memory"); } diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h index 731c07aee12..2d80f18b746 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h @@ -16,16 +16,22 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX -inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_smem_ptr(const void* __ptr) +inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_ptr_smem(const void* __ptr) { + // Consider adding debug asserts here. return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); } -inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_remote_dsmem_ptr(const void* __ptr) + +inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_ptr_remote_dsmem(const void* __ptr) { + // No difference in implementation to __as_ptr_smem. + // Consider adding debug asserts here. return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); } -inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_gmem_ptr(const void* __ptr) + +inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_ptr_gmem(const void* __ptr) { + // Consider adding debug asserts here. 
 return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr));
 }
 
From eae5df6f754f712d262f14851c86535352ff6571 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 15:36:07 +0200
Subject: [PATCH 22/49] Use uint32_t

---
 .../detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
index 2d80f18b746..41826081a54 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h
@@ -36,11 +36,11 @@ inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_ptr_gmem(const void* __ptr)
 }
 
 template <typename _Tp>
-inline _LIBCUDACXX_DEVICE int __as_b32(_Tp __val)
+inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_b32(_Tp __val)
 {
   static_assert(sizeof(_Tp) == 4, "");
   // Consider using std::bit_cast
-  return *reinterpret_cast<int*>(&__val);
+  return *reinterpret_cast<_CUDA_VSTD::uint32_t*>(&__val);
 }
 
 template <typename _Tp>
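(The "Consider using std::bit_cast" comments refer to the C++20 facility, spelled bit_cast. A minimal sketch of what such a helper could look like, assuming cuda::std::bit_cast from <cuda/std/bit> is available; the name as_b32 is a stand-in for the library's reserved __as_b32, not part of the patch:)

#include <cuda/std/bit>     // cuda::std::bit_cast
#include <cuda/std/cstdint> // cuda::std::uint32_t

// Hypothetical bit_cast-based variant: same size check as __as_b32, but
// without type punning through reinterpret_cast.
template <typename T>
__device__ inline cuda::std::uint32_t as_b32(T val)
{
  static_assert(sizeof(T) == 4, "as_b32 requires a 4-byte type");
  return cuda::std::bit_cast<cuda::std::uint32_t>(val);
}
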
From eda6d9335e225377233307d8a218d1c64044a93a Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 15:59:12 +0200
Subject: [PATCH 23/49] Update
 libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h

Co-authored-by: Michael Schellenberger Costa
---
 .../include/cuda/std/detail/libcxx/include/__cuda/barrier.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
index f5a65400d1e..c4bba0222dd 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
@@ -27,7 +27,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER
 #include "../cstdlib"        // _LIBCUDACXX_UNREACHABLE
 #include "../__type_traits/void_t.h" // _CUDA_VSTD::__void_t
 
-#include <cuda/ptx> // cuda::ptx::*
+#include "../__cuda/ptx.h" // cuda::ptx::*
 
 #if defined(_LIBCUDACXX_COMPILER_NVRTC)
 #define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type *)0)->member))

From f262f6cc9f07dc1108b1ac23adb7b47cbc92fefb Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 17:43:01 +0200
Subject: [PATCH 24/49] Apply suggestions from code review

Co-authored-by: Michael Schellenberger Costa
---
 libcudacxx/include/cuda/std/detail/libcxx/include/__config | 1 +
 .../std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config
index f5f24aa95fb..bc0398eddf6 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config
@@ -1507,6 +1507,7 @@ typedef __char32_t char32_t;
 #define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE } } }
 #define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX namespace cuda { namespace ptx { inline namespace _LIBCUDACXX_ABI_NAMESPACE {
 #define _LIBCUDACXX_END_NAMESPACE_CUDA_PTX } } }
+#define _CUDA_VPTX ::cuda::ptx::_LIBCUDACXX_ABI_NAMESPACE
 #define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL namespace cuda { namespace device { namespace experimental { inline namespace _LIBCUDACXX_ABI_NAMESPACE {
 #define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL } } } }
 #endif
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
index 3ed1fca8bd2..433254e1e1b 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h
@@ -13,7 +13,7 @@
 #ifndef _CUDA_PTX_DOT_VARIANTS_H_
 #define _CUDA_PTX_DOT_VARIANTS_H_
 
-#include "../../type_traits" // std::integral_constant
+#include "../../__type_traits/integral_constant.h" // std::integral_constant
 
 /*
  * Public integral constant types and values for ".variant"s:

From 5701b9f78c0fc586ce4bb023474e6d06d4b960bd Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 17:35:11 +0200
Subject: [PATCH 25/49] Use <nv/target>

---
 .../detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
index 4e996a02eb4..c53ee6a9679 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
@@ -13,7 +13,7 @@
 #ifndef _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
 #define _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
 
-#include "../../../../../../../nv/target" // __CUDA_MINIMUM_ARCH__ and friends
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
 
 /*
  * Targeting macros

From 7a54b1996ae82830b642154246cbf25900627e90 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 25 Oct 2023 17:41:09 +0200
Subject: [PATCH 26/49] Reorder PTX ISA target macros

---
 .../libcxx/include/__cuda/ptx/ptx_isa_target_macros.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
index c53ee6a9679..592cc5f96e6 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h
@@ -40,18 +40,15 @@
 // PTX ISA 8.3 is available from CTK 12.3, driver r545
 #if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 830ULL
-// PTX ISA 8.1 is available from CTK 12.1, driver r530
+// PTX ISA 8.2 is available from CTK 12.2, driver r535
 #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 820ULL
-// PTX ISA 8.2 is available from CTK 12.2, driver r535
+// PTX ISA 8.1 is available from CTK 12.1, driver r530
 #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 810ULL
 // PTX ISA 8.0 is available from CTK 12.0, driver r525
 #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 800ULL
-// PTX ISA 7.8 is available from CTK 11.8, driver r520 (so also from CTK 12.0 onwards)
-#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
-# define 
__cccl_ptx_isa 780ULL // PTX ISA 7.8 is available from CTK 11.8, driver r520 #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 780ULL From d4ec10f1cd729b4d5f3eefcca35df388ee0e5b13 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 25 Oct 2023 20:06:51 +0200 Subject: [PATCH 27/49] Add .op --- .../include/__cuda/ptx/ptx_dot_variants.h | 57 ++++++++++++++++--- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h index 433254e1e1b..aca4eac097e 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h @@ -18,9 +18,10 @@ /* * Public integral constant types and values for ".variant"s: * - * - .sem - * - .space - * - .scope + * - .sem: acquire, release, .. + * - .space: global, shared, constant, .. + * - .scope: cta, cluster, gpu, .. + * - .op: add, min, cas, .. * * For each .variant, the code below defines: * - An enum `dot_variant` with each possible value @@ -86,8 +87,22 @@ enum class dot_scope sys }; -template -using sem_t = _CUDA_VSTD::integral_constant; +enum class dot_op +{ + add, + dec, + inc, + max, + min, + and_op, // Using and_op, as `and, or, xor` are reserved in C++. + or_op, + xor_op, + cas, + exch +}; + +template +using sem_t = _CUDA_VSTD::integral_constant; using sem_acq_rel_t = sem_t; using sem_acquire_t = sem_t; using sem_relaxed_t = sem_t; @@ -102,8 +117,8 @@ static constexpr sem_release_t sem_release{}; static constexpr sem_sc_t sem_sc{}; static constexpr sem_weak_t sem_weak{}; -template -using space_t = _CUDA_VSTD::integral_constant; +template +using space_t = _CUDA_VSTD::integral_constant; using space_const_mem_t = space_t; using space_global_t = space_t; using space_local_t = space_t; @@ -124,8 +139,8 @@ static constexpr space_shared_cluster_t space_shared_cluster{}; static constexpr space_sreg_t space_sreg{}; static constexpr space_tex_t space_tex{}; -template -using scope_t = _CUDA_VSTD::integral_constant; +template +using scope_t = _CUDA_VSTD::integral_constant; using scope_cluster_t = scope_t; using scope_cta_t = scope_t; using scope_gpu_t = scope_t; @@ -136,6 +151,30 @@ static constexpr scope_cta_t scope_cta{}; static constexpr scope_gpu_t scope_gpu{}; static constexpr scope_sys_t scope_sys{}; +template +using op_t = _CUDA_VSTD::integral_constant; +using op_add_t = op_t; +using op_dec_t = op_t; +using op_inc_t = op_t; +using op_max_t = op_t; +using op_min_t = op_t; +using op_and_op_t = op_t; +using op_or_op_t = op_t; +using op_xor_op_t = op_t; +using op_cas_t = op_t; +using op_exch_t = op_t; + +static constexpr op_add_t op_add{}; +static constexpr op_dec_t op_dec{}; +static constexpr op_inc_t op_inc{}; +static constexpr op_max_t op_max{}; +static constexpr op_min_t op_min{}; +static constexpr op_and_op_t op_and_op{}; +static constexpr op_or_op_t op_or_op{}; +static constexpr op_xor_op_t op_xor_op{}; +static constexpr op_cas_t op_cas{}; +static constexpr op_exch_t op_exch{}; + _LIBCUDACXX_END_NAMESPACE_CUDA_PTX #endif // _CUDA_PTX_DOT_VARIANTS_H_ From dd5764873f09151117d83987f51d6f1f31956aca Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 26 Oct 2023 19:34:02 +0200 Subject: [PATCH 28/49] Improve backward compatibility and docs Previous versions were using 
PTX that did not compile on CTK 11. --- .../test/cuda/ptx/sm90.ptx.compile.pass.cpp | 29 +- libcudacxx/docs/extended_api/ptx.md | 153 ++++++++-- ..._and_communication_instructions_mbarrier.h | 287 +++++++++++++++--- 3 files changed, 399 insertions(+), 70 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp index 787a9c1f327..83335510a52 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp @@ -8,7 +8,6 @@ // //===----------------------------------------------------------------------===// // UNSUPPORTED: libcpp-has-no-threads -// UNSUPPORTED: pre-sm-90 // @@ -37,11 +36,31 @@ int main(int, char**) using cuda::ptx::scope_cta; __shared__ uint64_t bar; - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); + uint64_t state; - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1); - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1); + // TODO: check PTX ISA version. + + NV_IF_TARGET(NV_PROVIDES_SM_80, ( + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 1. + state = cuda::ptx::mbarrier_arrive_no_complete(sem_release, scope_cta, space_shared, &bar, 1); // 2. + )); + + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3. + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 4. + + cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 5. + cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 5. + + state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 6. + state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 6. + + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 7. + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 7. + )); + + state += 1; // "Use" state to prevent compiler warnings + (void) state; } )); diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index feb4040d724..b90d8900a7e 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -4,6 +4,70 @@ The `cuda::ptx` namespace contains functions that map one-to-one to [PTX instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html). These can be used for maximal control of the generated code, or to experiment with new hardware features before a high-level C++ API is available. +### Variants + +### Versions and compatibility + +The `cuda/ptx` header is intended to present a stable API (not ABI) within one +major version of the CTK on a best effort basis. This means that: + +- All functions are marked inline. + +- The type of a function parameter can be changed to be more generic if + that means that code that called the original version can still be + compiled. + +- Good exposure of the PTX should be high priority. 
If, at a new major
+  version, we face a difficult choice between breaking backward-compatibility
+  and an improvement of the PTX exposure, we will lean toward the latter option
+  more readily than in other parts of libcu++.
+
+API stability is not taken to the extreme. Call functions like below to ensure
+forward-compatibility:
+
+```
+// Use arguments to drive overload resolution:
+cuda::ptx::mbarrier_arrive_expect_tx(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1);
+
+// Specifying templates directly is not forward-compatible, as order and number
+// of template parameters may change in a minor release:
+cuda::ptx::mbarrier_arrive_expect_tx<cuda::ptx::dot_scope::cta>(
+    cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1
+);
+```
+
+**PTX ISA version and compute capability.** Each binding notes under which PTX
+ISA version and SM version it may be used. Example:
+
+```cuda
+// mbarrier.arrive.shared::cta.b64 state, [addr]; // 1. PTX ISA 70, SM_80
+__device__ inline uint64_t mbarrier_arrive(
+    cuda::ptx::sem_release_t sem,
+    cuda::ptx::scope_cta_t scope,
+    cuda::ptx::space_shared_t space,
+    uint64_t* addr);
+```
+
+To check if the current compiler is recent enough, use:
+```cuda
+#if __cccl_ptx_isa >= 700
+cuda::ptx::mbarrier_arrive(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1);
+#endif
+```
+
+Ensure that you only call the function when compiling for a recent enough
+compute capability (SM version), like this:
+```
+NV_IF_TARGET(NV_PROVIDES_SM_80,(
+  cuda::ptx::mbarrier_arrive(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1);
+));
+```
+
+For more information on which compilers correspond to which PTX ISA, see the
+[PTX ISA release
+notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes).
+
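Putting the two checks together: a minimal sketch of a kernel that guards one call on both the PTX ISA version and the target architecture (an illustration only, not part of the generated listing; it assumes the four-argument `mbarrier_arrive` overload shown above and omits mbarrier initialization for brevity):

```cuda
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void kernel() {
#if __cccl_ptx_isa >= 700           // the compiler can assemble the instruction
  NV_IF_TARGET(NV_PROVIDES_SM_80,(  // the target architecture can execute it
    __shared__ cuda::std::uint64_t bar;
    // Proper mbarrier initialization is omitted to keep the sketch short.
    cuda::ptx::mbarrier_arrive(
      cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar);
  ));
#endif
}
```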
+
 
 ### [9.7.1. Integer Arithmetic Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions)
 
 | Instruction | Available in libcu++ |
@@ -380,17 +444,69 @@ experiment with new hardware features before a high-level C++ API is available.
 
 
 ```cuda
-template <dot_scope _Sco>
-__device__ inline
-uint64_t mbarrier_arrive_expect_tx(sem_release_t sem, scope_t<_Sco> scope, space_shared_t spc, uint64_t* addr, uint32_t tx_count);
-
-template <dot_scope _Sco>
-__device__ inline
-void mbarrier_arrive_expect_tx(sem_release_t sem, scope_t<_Sco> scope, space_shared_cluster_t spc, uint64_t* addr, uint32_t tx_count);
+// mbarrier.arrive.shared::cta.b64 state, [addr]; // 1. PTX ISA 70, SM_80
+__device__ inline uint64_t mbarrier_arrive(
+    cuda::ptx::sem_release_t sem,
+    cuda::ptx::scope_cta_t scope,
+    cuda::ptx::space_shared_t space,
+    uint64_t* addr);
+
+// mbarrier.arrive.noComplete.shared::cta.b64 state, [addr], count; // 2. PTX ISA 70, SM_80
+__device__ inline uint64_t mbarrier_arrive_no_complete(
+    cuda::ptx::sem_release_t sem,
+    cuda::ptx::scope_cta_t scope,
+    cuda::ptx::space_shared_t space,
+    uint64_t* addr,
+    uint32_t count);
+
+// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 3. PTX ISA 78, SM_90
+__device__ inline uint64_t mbarrier_arrive(
+    cuda::ptx::sem_release_t sem,
+    cuda::ptx::scope_cta_t scope,
+    cuda::ptx::space_shared_t space,
+    uint64_t* addr,
+    uint32_t count);
+
+// mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr], count; // 4. PTX ISA 80, SM_90
+__device__ inline uint64_t mbarrier_arrive(
+    cuda::ptx::sem_release_t sem,
+    cuda::ptx::scope_cluster_t scope,
+    cuda::ptx::space_shared_t space,
+    uint64_t* addr,
+    uint32_t count);
+
+// mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90
+// .scope = { .cta, .cluster }
+template <cuda::ptx::dot_scope Scope>
+__device__ inline void mbarrier_arrive(
+    cuda::ptx::sem_release_t sem,
+    cuda::ptx::scope_t<Scope> scope,
+    cuda::ptx::space_shared_cluster_t space,
+    uint64_t* addr,
+    uint32_t count);
+
+// mbarrier.arrive.expect_tx.release{.scope}.shared::cta.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90
+// .scope = { .cta, .cluster }
+template <cuda::ptx::dot_scope Scope>
+__device__ inline uint64_t mbarrier_arrive_expect_tx(
+    cuda::ptx::sem_release_t sem,
+    cuda::ptx::scope_t<Scope> scope,
+    cuda::ptx::space_shared_t space,
+    uint64_t* addr,
+    uint32_t tx_count);
+
+// mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90
+// .scope = { .cta, .cluster }
+template <cuda::ptx::dot_scope Scope>
+__device__ inline void mbarrier_arrive_expect_tx(
+    cuda::ptx::sem_release_t sem,
+    cuda::ptx::scope_t<Scope> scope,
+    cuda::ptx::space_shared_cluster_t space,
+    uint64_t* addr,
+    uint32_t tx_count);
 ```
 
 Usage:
-
 ```cuda
 #include <cuda/ptx>
 #include <cuda/std/cstdint>
 
@@ -522,24 +638,3 @@ __global__ void kernel() {
 [`pmevent`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent
 [`trap`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap
 [`setmaxnreg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg
-
-
-
-
-
-
-
-
-
-
-
-### Shared memory barrier (mbarrier)
-
-| Instruction                             | Compute capability | CUDA Toolkit |
-|-----------------------------------------|--------------------|--------------|
-| `cuda::ptx::mbarrier_arrive_expect_tx`  | 9.0                | CTK 12.4     |
-
-
-
-
-
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
index 66c10cfb762..c752abcf65f 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h
@@ -41,67 +41,282 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 // 9.7.12.15.13. Parallel Synchronization and Communication Instructions: mbarrier.arrive
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
 
-#if __cccl_ptx_sm >= 900 && __cccl_ptx_isa >= 780
-template <dot_scope _Sco>
+// mbarrier.arrive{.sem}{.scope}{.shared{::cta}}.b64 state, [addr]{, count};
+// mbarrier.arrive{.sem}{.scope}{.shared::cluster}.b64 _, [addr] {,count}
+// mbarrier.arrive.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], txCount;
+// mbarrier.arrive.expect_tx{.sem}{.scope}{.shared::cluster}.b64 _, [addr], txCount;
+// mbarrier.arrive.noComplete{.sem}{.cta}{.shared{::cta}}.b64 state, [addr], count;
+//
+// .sem = { .release }
+// .scope = { .cta, .cluster }
+
+/*
+// mbarrier.arrive.shared::cta.b64 state, [addr]; // 1. 
PTX ISA 70, SM_80 +__device__ inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t sem, + cuda::ptx::scope_cta_t scope, + cuda::ptx::space_shared_t space, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 +_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( + sem_release_t __sem, + scope_cta_t __scope, + space_shared_t __space, + _CUDA_VSTD::uint64_t* __addr) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cta (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + + _CUDA_VSTD::uint64_t __state; + + asm ( + "mbarrier.arrive.shared::cta.b64 %0, [%1]; // 1. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); + return __state; +} +#endif // __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 +/* +// mbarrier.arrive.noComplete.shared::cta.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 +__device__ inline uint64_t mbarrier_arrive_no_complete( + cuda::ptx::sem_release_t sem, + cuda::ptx::scope_cta_t scope, + cuda::ptx::space_shared_t space, + uint64_t* addr, + uint32_t count); +*/ +#if __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 +_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( + sem_release_t __sem, + scope_cta_t __scope, + space_shared_t __space, + _CUDA_VSTD::uint64_t* __addr, + _CUDA_VSTD::uint32_t __count) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cta (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + + _CUDA_VSTD::uint64_t __state; + + asm ( + "mbarrier.arrive.noComplete.shared::cta.b64 %0, [%1], %2; // 2. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + return __state; +} +#endif // __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 +/* +// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 +__device__ inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t sem, + cuda::ptx::scope_cta_t scope, + cuda::ptx::space_shared_t space, + uint64_t* addr, + uint32_t count); +*/ +#if __cccl_ptx_isa >= 780 && __cccl_ptx_sm >= 900 +_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( + sem_release_t __sem, + scope_cta_t __scope, + space_shared_t __space, + _CUDA_VSTD::uint64_t* __addr, + _CUDA_VSTD::uint32_t __count) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cta (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + + _CUDA_VSTD::uint64_t __state; + + asm ( + "mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 3." + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + return __state; +} +#endif // __cccl_ptx_isa >= 780 && __cccl_ptx_sm >= 900 +/* +// mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr], count; // 4. 
PTX ISA 80, SM_90 +__device__ inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t sem, + cuda::ptx::scope_cluster_t scope, + cuda::ptx::space_shared_t space, + uint64_t* addr, + uint32_t count); +*/ +#if __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( + sem_release_t __sem, + scope_cluster_t __scope, + space_shared_t __space, + _CUDA_VSTD::uint64_t* __addr, + _CUDA_VSTD::uint32_t __count) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + + _CUDA_VSTD::uint64_t __state; + + asm ( + "mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 4." + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + return __state; +} +#endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +/* +// mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90 +// .scope = { .cta, .cluster } +template +__device__ inline void mbarrier_arrive( + cuda::ptx::sem_release_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_cluster_t space, + uint64_t* addr, + uint32_t count); +*/ +#if __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +template +_LIBCUDACXX_DEVICE inline void mbarrier_arrive( + sem_release_t __sem, + scope_t<_Scope> __scope, + space_shared_cluster_t __space, + _CUDA_VSTD::uint64_t* __addr, + _CUDA_VSTD::uint32_t __count) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared_cluster (due to parameter type constraint) + + + + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.release.cta.shared::cluster.b64 _, [%0], %1; // 5. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 5. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + } + +} +#endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +/* +// mbarrier.arrive.expect_tx.release{.scope}.shared::cta.b64 state, [addr], tx_count; // 6. 
PTX ISA 80, SM_90 +// .scope = { .cta, .cluster } +template +__device__ inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t space, + uint64_t* addr, + uint32_t tx_count); +*/ +#if __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +template _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( sem_release_t __sem, - scope_t<_Sco> __scope, - space_shared_t __spc, + scope_t<_Scope> __scope, + space_shared_t __space, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count) { - // Arrive on local shared memory barrier + // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - _CUDA_VSTD::uint64_t __token; + // __space == space_shared (due to parameter type constraint) + + _CUDA_VSTD::uint64_t __state; if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(__as_ptr_smem(__addr)), - "r"(__tx_count) - : "memory"); - } else { asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) + "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 6. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__tx_count) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 6. " + : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); + : "memory" + ); } - return __token; + return __state; } -#endif // __cccl_ptx_isa - -#if __cccl_ptx_sm >= 900 && __cccl_ptx_isa >= 780 -template +#endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +/* +// mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90 +// .scope = { .cta, .cluster } +template +__device__ inline void mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_cluster_t space, + uint64_t* addr, + uint32_t tx_count); +*/ +#if __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +template _LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( sem_release_t __sem, - scope_t<_Sco> __scope, - space_shared_cluster_t __spc, + scope_t<_Scope> __scope, + space_shared_cluster_t __space, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __tx_count) { - // Arrive on remote cluster barrier + // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared_cluster (due to parameter type constraint) + + + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1;" - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__tx_count) - : "memory"); - } else { asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1;" + "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1; // 7. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__tx_count) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 7. 
" : - : "r"(__as_ptr_remote_dsmem(__addr)), + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); + : "memory" + ); } -} -#endif // __cccl_ptx_isa +} +#endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 // 9.7.12.15.14. Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop From e1688151083a1c3bf5d1ad22dc1db7e2b10ce9ef Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 26 Oct 2023 21:44:57 +0200 Subject: [PATCH 29/49] Use cuda code-blocks for syntax highlighting --- libcudacxx/docs/extended_api/ptx.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index b90d8900a7e..64bcbb5d2e0 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -25,7 +25,7 @@ major version of the CTK on a best effort basis. This means that: API stability is not taken to the extreme. Call functions like below to ensure forward-compatibility: -``` +```cuda // Use arguments to driver overload resolution: cuda::ptx::mbarrier_arrive_expect_tx(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1); @@ -57,7 +57,7 @@ cuda::ptx::mbarrier_arrive(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::p Ensure that you only call the function when compiling for a recent enough compute capability (SM version), like this: -``` +```cuda NV_IF_TARGET(NV_PROVIDES_SM_80,( cuda::ptx::mbarrier_arrive(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1); )); From bd967f0e55eb19308c2eb242497324727ad38635 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 12:16:17 +0200 Subject: [PATCH 30/49] Use backward-compatible PTX spelling --- ..._and_communication_instructions_mbarrier.h | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index c752abcf65f..002b0990b1a 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -51,7 +51,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // .scope = { .cta, .cluster } /* -// mbarrier.arrive.shared::cta.b64 state, [addr]; // 1. PTX ISA 70, SM_80 +// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 __device__ inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, @@ -72,7 +72,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( _CUDA_VSTD::uint64_t __state; asm ( - "mbarrier.arrive.shared::cta.b64 %0, [%1]; // 1. " + "mbarrier.arrive.shared.b64 %0, [%1]; // 1. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)) : "memory" @@ -81,7 +81,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( } #endif // __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 /* -// mbarrier.arrive.noComplete.shared::cta.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 +// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 2. 
PTX ISA 70, SM_80 __device__ inline uint64_t mbarrier_arrive_no_complete( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, @@ -104,7 +104,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( _CUDA_VSTD::uint64_t __state; asm ( - "mbarrier.arrive.noComplete.shared::cta.b64 %0, [%1], %2; // 2. " + "mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 2. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -114,7 +114,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( } #endif // __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 /* -// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 +// mbarrier.arrive.shared.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 __device__ inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, @@ -137,7 +137,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( _CUDA_VSTD::uint64_t __state; asm ( - "mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 3." + "mbarrier.arrive.shared.b64 %0, [%1], %2; // 3." : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -147,7 +147,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( } #endif // __cccl_ptx_isa >= 780 && __cccl_ptx_sm >= 900 /* -// mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 +// mbarrier.arrive.release.cluster.shared.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 __device__ inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cluster_t scope, @@ -170,7 +170,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( _CUDA_VSTD::uint64_t __state; asm ( - "mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 4." + "mbarrier.arrive.release.cluster.shared.b64 %0, [%1], %2; // 4." : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -226,7 +226,7 @@ _LIBCUDACXX_DEVICE inline void mbarrier_arrive( } #endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 /* -// mbarrier.arrive.expect_tx.release{.scope}.shared::cta.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx.release{.scope}.shared.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template __device__ inline uint64_t mbarrier_arrive_expect_tx( @@ -253,7 +253,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 6. " + "mbarrier.arrive.expect_tx.release.cta.shared.b64 %0, [%1], %2; // 6. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) @@ -261,7 +261,7 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( ); } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 6. " + "mbarrier.arrive.expect_tx.release.cluster.shared.b64 %0, [%1], %2; // 6. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) From db33678f36502eaef4121c5812ad3a256781c021 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 12:17:29 +0200 Subject: [PATCH 31/49] Use linker-error trick to enable architecture selection All functions are now function templates. If they are instantiated on an unsupported architecture, they call a declared, but undefined function. This leads to a linker error. 
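(A minimal, self-contained sketch of the trick, with made-up names, simplified from what the patch below actually generates:)

#include <nv/target>

// Declared but intentionally never defined. If the template below is
// instantiated for an unsupported architecture, the call survives into the
// final object file and linking fails, with this descriptive name showing
// up in the error message.
__device__ void my_instruction_is_not_supported_before_SM_90__();

template <typename = void>
__device__ static inline void my_instruction()
{
  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
    asm volatile("// real PTX would go here" ::: "memory");
  ),(
    // Unsupported architecture: provoke a linker error with a readable name.
    my_instruction_is_not_supported_before_SM_90__();
  ));
}
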
In addition, functions are marked static to avoid them being linked. --- ..._and_communication_instructions_mbarrier.h | 283 +++++++++++------- 1 file changed, 170 insertions(+), 113 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 002b0990b1a..92dce26cbee 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -52,14 +52,16 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX /* // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 -__device__ inline uint64_t mbarrier_arrive( +template +__device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, cuda::ptx::space_shared_t space, uint64_t* addr); */ -#if __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 -_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( +#if __cccl_ptx_isa >= 700 +template +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( sem_release_t __sem, scope_cta_t __scope, space_shared_t __space, @@ -71,26 +73,35 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( _CUDA_VSTD::uint64_t __state; - asm ( - "mbarrier.arrive.shared.b64 %0, [%1]; // 1. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory" - ); - return __state; + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( + asm ( + "mbarrier.arrive.shared.b64 %0, [%1]; // 1. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); + )); } -#endif // __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 +#endif // __cccl_ptx_isa >= 700 + /* // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 -__device__ inline uint64_t mbarrier_arrive_no_complete( +template +__device__ static inline uint64_t mbarrier_arrive_no_complete( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, uint32_t count); */ -#if __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 -_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( +#if __cccl_ptx_isa >= 700 +template +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( sem_release_t __sem, scope_cta_t __scope, space_shared_t __space, @@ -103,27 +114,36 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( _CUDA_VSTD::uint64_t __state; - asm ( - "mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 2. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); - return __state; + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( + asm ( + "mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 2. 
" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); + )); } -#endif // __cccl_ptx_isa >= 700 && __cccl_ptx_sm >= 800 +#endif // __cccl_ptx_isa >= 700 + /* // mbarrier.arrive.shared.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 -__device__ inline uint64_t mbarrier_arrive( +template +__device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, uint32_t count); */ -#if __cccl_ptx_isa >= 780 && __cccl_ptx_sm >= 900 -_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( +#if __cccl_ptx_isa >= 780 +template +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( sem_release_t __sem, scope_cta_t __scope, space_shared_t __space, @@ -136,27 +156,36 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( _CUDA_VSTD::uint64_t __state; - asm ( - "mbarrier.arrive.shared.b64 %0, [%1], %2; // 3." - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); - return __state; + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "mbarrier.arrive.shared.b64 %0, [%1], %2; // 3." + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + )); } -#endif // __cccl_ptx_isa >= 780 && __cccl_ptx_sm >= 900 +#endif // __cccl_ptx_isa >= 780 + /* // mbarrier.arrive.release.cluster.shared.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 -__device__ inline uint64_t mbarrier_arrive( +template +__device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cluster_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, uint32_t count); */ -#if __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 -_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( +#if __cccl_ptx_isa >= 800 +template +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( sem_release_t __sem, scope_cluster_t __scope, space_shared_t __space, @@ -169,30 +198,37 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive( _CUDA_VSTD::uint64_t __state; - asm ( - "mbarrier.arrive.release.cluster.shared.b64 %0, [%1], %2; // 4." - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); - return __state; + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "mbarrier.arrive.release.cluster.shared.b64 %0, [%1], %2; // 4." 
+ : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + )); } -#endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +#endif // __cccl_ptx_isa >= 800 + /* // mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template -__device__ inline void mbarrier_arrive( +__device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_t scope, cuda::ptx::space_shared_cluster_t space, uint64_t* addr, uint32_t count); */ -#if __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +#if __cccl_ptx_isa >= 800 template -_LIBCUDACXX_DEVICE inline void mbarrier_arrive( +_LIBCUDACXX_DEVICE static inline void mbarrier_arrive( sem_release_t __sem, scope_t<_Scope> __scope, space_shared_cluster_t __space, @@ -205,40 +241,47 @@ _LIBCUDACXX_DEVICE inline void mbarrier_arrive( - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.release.cta.shared::cluster.b64 _, [%0], %1; // 5. " - : - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); - } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { - asm ( - "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 5. " - : - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); - } - + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.release.cta.shared::cluster.b64 _, [%0], %1; // 5. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 5. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + } + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __device__ void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + )); } -#endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +#endif // __cccl_ptx_isa >= 800 + /* // mbarrier.arrive.expect_tx.release{.scope}.shared.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template -__device__ inline uint64_t mbarrier_arrive_expect_tx( +__device__ static inline uint64_t mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t sem, cuda::ptx::scope_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, uint32_t tx_count); */ -#if __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +#if __cccl_ptx_isa >= 800 template -_LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( sem_release_t __sem, scope_t<_Scope> __scope, space_shared_t __space, @@ -251,40 +294,47 @@ _LIBCUDACXX_DEVICE inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( _CUDA_VSTD::uint64_t __state; - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.expect_tx.release.cta.shared.b64 %0, [%1], %2; // 6. 
" - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), - "r"(__tx_count) - : "memory" - ); - } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { - asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared.b64 %0, [%1], %2; // 6. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), - "r"(__tx_count) - : "memory" - ); - } - return __state; + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.expect_tx.release.cta.shared.b64 %0, [%1], %2; // 6. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__tx_count) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.expect_tx.release.cluster.shared.b64 %0, [%1], %2; // 6. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__tx_count) + : "memory" + ); + } + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); + )); } -#endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +#endif // __cccl_ptx_isa >= 800 + /* // mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template -__device__ inline void mbarrier_arrive_expect_tx( +__device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t sem, cuda::ptx::scope_t scope, cuda::ptx::space_shared_cluster_t space, uint64_t* addr, uint32_t tx_count); */ -#if __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +#if __cccl_ptx_isa >= 800 template -_LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( +_LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( sem_release_t __sem, scope_t<_Scope> __scope, space_shared_cluster_t __space, @@ -297,26 +347,33 @@ _LIBCUDACXX_DEVICE inline void mbarrier_arrive_expect_tx( - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1; // 7. " - : - : "r"(__as_ptr_smem(__addr)), - "r"(__tx_count) - : "memory" - ); - } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { - asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 7. " - : - : "r"(__as_ptr_smem(__addr)), - "r"(__tx_count) - : "memory" - ); - } - + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1; // 7. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__tx_count) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 7. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__tx_count) + : "memory" + ); + } + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __device__ void __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); + return __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); + )); } -#endif // __cccl_ptx_isa >= 800 && __cccl_ptx_sm >= 900 +#endif // __cccl_ptx_isa >= 800 + // 9.7.12.15.14. 
Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop From 6a1b36e3dc73a5cc1dbdc7eff66c2407c1ad2719 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 12:32:53 +0200 Subject: [PATCH 32/49] Use const references --- libcudacxx/docs/extended_api/ptx.md | 46 ++++++++++--------- ..._and_communication_instructions_mbarrier.h | 24 +++++----- 2 files changed, 37 insertions(+), 33 deletions(-) diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index 64bcbb5d2e0..349149eaf42 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -444,66 +444,70 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release ```cuda -// mbarrier.arrive.shared::cta.b64 state, [addr]; // 1. PTX ISA 70, SM_80 -__device__ inline uint64_t mbarrier_arrive( +// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, cuda::ptx::space_shared_t space, uint64_t* addr); -// mbarrier.arrive.noComplete.shared::cta.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 -__device__ inline uint64_t mbarrier_arrive_no_complete( +// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive_no_complete( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, - uint32_t count); + const uint32_t& count); -// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 -__device__ inline uint64_t mbarrier_arrive( +// mbarrier.arrive.shared.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 +template +__device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cta_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, - uint32_t count); - -// mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 -__device__ inline uint64_t mbarrier_arrive( + const uint32_t& count); + +// mbarrier.arrive.release.cluster.shared.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 +template +__device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_cluster_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, - uint32_t count); + const uint32_t& count); -// mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90 +// mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template -__device__ inline void mbarrier_arrive( +__device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t sem, cuda::ptx::scope_t scope, cuda::ptx::space_shared_cluster_t space, uint64_t* addr, - uint32_t count); + const uint32_t& count); -// mbarrier.arrive.expect_tx.release{.scope}.shared::cta.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx.release{.scope}.shared.b64 state, [addr], tx_count; // 6. 
PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template -__device__ inline uint64_t mbarrier_arrive_expect_tx( +__device__ static inline uint64_t mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t sem, cuda::ptx::scope_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, - uint32_t tx_count); + const uint32_t& tx_count); -// mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template -__device__ inline void mbarrier_arrive_expect_tx( +__device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t sem, cuda::ptx::scope_t scope, cuda::ptx::space_shared_cluster_t space, uint64_t* addr, - uint32_t tx_count); + const uint32_t& tx_count); ``` Usage: diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 92dce26cbee..7313bff28ce 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -97,7 +97,7 @@ __device__ static inline uint64_t mbarrier_arrive_no_complete( cuda::ptx::scope_cta_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, - uint32_t count); + const uint32_t& count); */ #if __cccl_ptx_isa >= 700 template @@ -106,7 +106,7 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complet scope_cta_t __scope, space_shared_t __space, _CUDA_VSTD::uint64_t* __addr, - _CUDA_VSTD::uint32_t __count) + const _CUDA_VSTD::uint32_t& __count) { // __sem == sem_release (due to parameter type constraint) // __scope == scope_cta (due to parameter type constraint) @@ -139,7 +139,7 @@ __device__ static inline uint64_t mbarrier_arrive( cuda::ptx::scope_cta_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, - uint32_t count); + const uint32_t& count); */ #if __cccl_ptx_isa >= 780 template @@ -148,7 +148,7 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( scope_cta_t __scope, space_shared_t __space, _CUDA_VSTD::uint64_t* __addr, - _CUDA_VSTD::uint32_t __count) + const _CUDA_VSTD::uint32_t& __count) { // __sem == sem_release (due to parameter type constraint) // __scope == scope_cta (due to parameter type constraint) @@ -181,7 +181,7 @@ __device__ static inline uint64_t mbarrier_arrive( cuda::ptx::scope_cluster_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, - uint32_t count); + const uint32_t& count); */ #if __cccl_ptx_isa >= 800 template @@ -190,7 +190,7 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( scope_cluster_t __scope, space_shared_t __space, _CUDA_VSTD::uint64_t* __addr, - _CUDA_VSTD::uint32_t __count) + const _CUDA_VSTD::uint32_t& __count) { // __sem == sem_release (due to parameter type constraint) // __scope == scope_cluster (due to parameter type constraint) @@ -224,7 +224,7 @@ __device__ static inline void mbarrier_arrive( cuda::ptx::scope_t scope, cuda::ptx::space_shared_cluster_t space, uint64_t* addr, - uint32_t count); + const uint32_t& count); */ #if __cccl_ptx_isa >= 800 template @@ -233,7 +233,7 @@ _LIBCUDACXX_DEVICE static inline void 
mbarrier_arrive( scope_t<_Scope> __scope, space_shared_cluster_t __space, _CUDA_VSTD::uint64_t* __addr, - _CUDA_VSTD::uint32_t __count) + const _CUDA_VSTD::uint32_t& __count) { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); @@ -277,7 +277,7 @@ __device__ static inline uint64_t mbarrier_arrive_expect_tx( cuda::ptx::scope_t scope, cuda::ptx::space_shared_t space, uint64_t* addr, - uint32_t tx_count); + const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 template @@ -286,7 +286,7 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( scope_t<_Scope> __scope, space_shared_t __space, _CUDA_VSTD::uint64_t* __addr, - _CUDA_VSTD::uint32_t __tx_count) + const _CUDA_VSTD::uint32_t& __tx_count) { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); @@ -330,7 +330,7 @@ __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::scope_t scope, cuda::ptx::space_shared_cluster_t space, uint64_t* addr, - uint32_t tx_count); + const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 template @@ -339,7 +339,7 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( scope_t<_Scope> __scope, space_shared_cluster_t __space, _CUDA_VSTD::uint64_t* __addr, - _CUDA_VSTD::uint32_t __tx_count) + const _CUDA_VSTD::uint32_t& __tx_count) { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); From 594c82fb472af101b714f6b4b68b6aafd7a09478 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 12:39:19 +0200 Subject: [PATCH 33/49] Do not name unused parameters --- libcudacxx/docs/extended_api/ptx.md | 52 +++++++------- ..._and_communication_instructions_mbarrier.h | 72 +++++++++---------- 2 files changed, 62 insertions(+), 62 deletions(-) diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index 349149eaf42..baf859f044a 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -444,68 +444,68 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release ```cuda -// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 +// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 template __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t sem, - cuda::ptx::scope_cta_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::sem_release_t, + cuda::ptx::scope_cta_t, + cuda::ptx::space_shared_t, uint64_t* addr); -// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 +// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 template __device__ static inline uint64_t mbarrier_arrive_no_complete( - cuda::ptx::sem_release_t sem, - cuda::ptx::scope_cta_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::sem_release_t, + cuda::ptx::scope_cta_t, + cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& count); -// mbarrier.arrive.shared.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 +// mbarrier.arrive.shared.b64 state, [addr], count; // 3. 
PTX ISA 78, SM_90 template __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t sem, - cuda::ptx::scope_cta_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::sem_release_t, + cuda::ptx::scope_cta_t, + cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& count); - -// mbarrier.arrive.release.cluster.shared.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 + +// mbarrier.arrive.release.cluster.shared.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 template __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t sem, - cuda::ptx::scope_cluster_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& count); -// mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90 +// mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template __device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t sem, + cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_cluster_t space, + cuda::ptx::space_shared_cluster_t, uint64_t* addr, const uint32_t& count); -// mbarrier.arrive.expect_tx.release{.scope}.shared.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx.release{.scope}.shared.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template __device__ static inline uint64_t mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t sem, + cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& tx_count); -// mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90 // .scope = { .cta, .cluster } template __device__ static inline void mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t sem, + cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_cluster_t space, + cuda::ptx::space_shared_cluster_t, uint64_t* addr, const uint32_t& tx_count); ``` diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 7313bff28ce..1a1418dfeae 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -54,17 +54,17 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // mbarrier.arrive.shared.b64 state, [addr]; // 1. 
PTX ISA 70, SM_80 template __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t sem, - cuda::ptx::scope_cta_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::sem_release_t, + cuda::ptx::scope_cta_t, + cuda::ptx::space_shared_t, uint64_t* addr); */ #if __cccl_ptx_isa >= 700 template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( - sem_release_t __sem, - scope_cta_t __scope, - space_shared_t __space, + sem_release_t, + scope_cta_t, + space_shared_t, _CUDA_VSTD::uint64_t* __addr) { // __sem == sem_release (due to parameter type constraint) @@ -93,18 +93,18 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 template __device__ static inline uint64_t mbarrier_arrive_no_complete( - cuda::ptx::sem_release_t sem, - cuda::ptx::scope_cta_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::sem_release_t, + cuda::ptx::scope_cta_t, + cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& count); */ #if __cccl_ptx_isa >= 700 template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( - sem_release_t __sem, - scope_cta_t __scope, - space_shared_t __space, + sem_release_t, + scope_cta_t, + space_shared_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { @@ -135,18 +135,18 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complet // mbarrier.arrive.shared.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 template __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t sem, - cuda::ptx::scope_cta_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::sem_release_t, + cuda::ptx::scope_cta_t, + cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& count); */ #if __cccl_ptx_isa >= 780 template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( - sem_release_t __sem, - scope_cta_t __scope, - space_shared_t __space, + sem_release_t, + scope_cta_t, + space_shared_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { @@ -177,18 +177,18 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // mbarrier.arrive.release.cluster.shared.b64 state, [addr], count; // 4. 
PTX ISA 80, SM_90 template __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t sem, - cuda::ptx::scope_cluster_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& count); */ #if __cccl_ptx_isa >= 800 template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( - sem_release_t __sem, - scope_cluster_t __scope, - space_shared_t __space, + sem_release_t, + scope_cluster_t, + space_shared_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { @@ -220,18 +220,18 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // .scope = { .cta, .cluster } template __device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t sem, + cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_cluster_t space, + cuda::ptx::space_shared_cluster_t, uint64_t* addr, const uint32_t& count); */ #if __cccl_ptx_isa >= 800 template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( - sem_release_t __sem, + sem_release_t, scope_t<_Scope> __scope, - space_shared_cluster_t __space, + space_shared_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { @@ -273,18 +273,18 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( // .scope = { .cta, .cluster } template __device__ static inline uint64_t mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t sem, + cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t space, + cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( - sem_release_t __sem, + sem_release_t, scope_t<_Scope> __scope, - space_shared_t __space, + space_shared_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) { @@ -326,18 +326,18 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( // .scope = { .cta, .cluster } template __device__ static inline void mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t sem, + cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_cluster_t space, + cuda::ptx::space_shared_cluster_t, uint64_t* addr, const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( - sem_release_t __sem, + sem_release_t, scope_t<_Scope> __scope, - space_shared_cluster_t __space, + space_shared_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) { From 6b4d380f2decd12c0c98bd5e294b7f15e11e9e92 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 14:23:27 +0200 Subject: [PATCH 34/49] Add PTX ISA target macros for CUDA 11.X --- .../__cuda/ptx/ptx_isa_target_macros.h | 34 ++++++++++++++++--- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h index 592cc5f96e6..ab59d8d33f7 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h @@ -37,21 +37,45 @@ // PTX ISA version -// PTX ISA 8.3 is available from CTK 12.3, driver r545 +// PTX ISA 8.3 is available from CUDA 12.3, driver r545 #if 
(defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 830ULL -// PTX ISA 8.2 is available from CTK 12.2, driver r535 +// PTX ISA 8.2 is available from CUDA 12.2, driver r535 #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 820ULL -// PTX ISA 8.1 is available from CTK 12.1, driver r530 +// PTX ISA 8.1 is available from CUDA 12.1, driver r530 #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 810ULL -// PTX ISA 8.0 is available from CTK 12.0, driver r525 +// PTX ISA 8.0 is available from CUDA 12.0, driver r525 #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 800ULL -// PTX ISA 7.8 is available from CTK 11.8, driver r520 +// PTX ISA 7.8 is available from CUDA 11.8, driver r520 #elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 780ULL +// PTX ISA 7.7 is available from CUDA 11.7, driver r515 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 770ULL +// PTX ISA 7.6 is available from CUDA 11.6, driver r510 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 760ULL +// PTX ISA 7.5 is available from CUDA 11.5, driver r495 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 750ULL +// PTX ISA 7.4 is available from CUDA 11.4, driver r470 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 740ULL +// PTX ISA 7.3 is available from CUDA 11.3, driver r465 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 730ULL +// PTX ISA 7.2 is available from CUDA 11.2, driver r460 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 720ULL +// PTX ISA 7.1 is available from CUDA 11.1, driver r455 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 710ULL +// PTX ISA 7.0 is available from CUDA 11.0, driver r445 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 700ULL // Fallback case. Define the ISA version to be zero. This ensures that the macro is always defined. 
#else # define __cccl_ptx_isa 0ULL From 87f300c595a3461bc6ddd5cba710f2cc7cd13e71 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 14:23:44 +0200 Subject: [PATCH 35/49] Use _CUDA_VPTX in barrier.h --- .../include/cuda/std/detail/libcxx/include/__cuda/barrier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index c4bba0222dd..3e4052a7feb 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -607,8 +607,8 @@ barrier::arrival_token barrier_arrive_tx( auto __native_handle = barrier_native_handle(__b); auto __bh = __cvta_generic_to_shared(__native_handle); if (__arrive_count_update == 1) { - __token = cuda::ptx::mbarrier_arrive_expect_tx( - cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, __native_handle, __transaction_count_update + __token = _CUDA_VPTX::mbarrier_arrive_expect_tx( + _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __transaction_count_update ); } else { asm ( From 3535036444c1cebd81f0091a760f30d65d08b55a Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 15:00:04 +0200 Subject: [PATCH 36/49] Replace internal use of mbarrier.arrive with cuda::ptx::mbarrier_arrive --- .../detail/libcxx/include/__cuda/barrier.h | 39 ++++++++----------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index 3e4052a7feb..c388d802b30 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -207,29 +207,27 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity; else if (!__isShared(&__barrier)) { __trap(); } - - asm volatile ("mbarrier.arrive.shared.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__update)) - : "memory"); + // Cannot use cuda::device::barrier_native_handle here, as it is + // only defined for block-scope barriers. This barrier may be a + // non-block scoped barrier. 
+ auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); + __token = _CUDA_VPTX::mbarrier_arrive( + _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __bh, __update + ); ), NV_PROVIDES_SM_80, ( if (!__isShared(&__barrier)) { return __barrier.arrive(__update); } - + auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); // Need 2 instructions, can't finish barrier with arrive > 1 if (__update > 1) { - asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__update - 1)) - : "memory"); + ___CUDA_VPTX::mbarrier_arrive_no_complete( + _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, + __bh, __update - 1); } - asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))) - : "memory"); + __token = _CUDA_VPTX::mbarrier_arrive( + _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __bh + ); ), NV_IS_DEVICE, ( if (!__isShared(&__barrier)) { return __barrier.arrive(__update); @@ -617,12 +615,9 @@ barrier::arrival_token barrier_arrive_tx( : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) : "memory"); - asm ( - "mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__arrive_count_update)) - : "memory"); + __token = _CUDA_VPTX::mbarrier_arrive( + _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __arrive_count_update + ); } ) ); From 82db00d6eadb4a711875f17b3c0ed707ddb2aac8 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 15:16:30 +0200 Subject: [PATCH 37/49] Guard for PTX ISA version in test --- .../test/cuda/ptx/sm90.ptx.compile.pass.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp index 83335510a52..a643ce78222 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp @@ -38,15 +38,18 @@ int main(int, char**) __shared__ uint64_t bar; uint64_t state; - // TODO: check PTX ISA version. - NV_IF_TARGET(NV_PROVIDES_SM_80, ( +#if __cccl_ptx_isa >= 700 state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 1. state = cuda::ptx::mbarrier_arrive_no_complete(sem_release, scope_cta, space_shared, &bar, 1); // 2. +#endif )); NV_IF_TARGET(NV_PROVIDES_SM_90, ( +#if __cccl_ptx_isa >= 780 // This guard is redundant: before PTX ISA 7.8, there was no support for SM_90 state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3. +#endif +#if __cccl_ptx_isa >= 800 state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 4. cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 5. @@ -57,6 +60,7 @@ int main(int, char**) cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 7. cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 7. 
+#endif )); state += 1; // "Use" state to prevent compiler warnings From e9abe97ab0d580bfa1c21fb3fe366762b686b435 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 15:20:54 +0200 Subject: [PATCH 38/49] Remove __cccl_ptx_sm targeting macros They are not used anymore --- .../include/__cuda/ptx/ptx_isa_target_macros.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h index ab59d8d33f7..f3b412bb6b6 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h @@ -22,21 +22,6 @@ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes */ - -// SM version - -#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) -# define __cccl_ptx_sm 900ULL -#elif (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__)) -# define __cccl_ptx_sm 800ULL -// Fallback case. Define the SM version to be zero. This ensures that the macro is always defined. -#else -# define __cccl_ptx_sm 0ULL -#endif - - -// PTX ISA version - // PTX ISA 8.3 is available from CUDA 12.3, driver r545 #if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) # define __cccl_ptx_isa 830ULL From f806ca0254536bcdb0c328454b7319ec52c77de4 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 17:38:43 +0200 Subject: [PATCH 39/49] Prevent unused compiler warnings in test --- .../test/cuda/ptx/sm90.ptx.compile.pass.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp index a643ce78222..ffc4d671dad 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp @@ -19,6 +19,9 @@ #include "cuda_space_selector.h" #include "test_macros.h" +template +__device__ inline bool __unused(_Ty...) 
{ return true; } + int main(int, char**) { NV_IF_TARGET(NV_IS_DEVICE, ( @@ -36,7 +39,8 @@ int main(int, char**) using cuda::ptx::scope_cta; __shared__ uint64_t bar; - uint64_t state; + bar = 1; + uint64_t state = 1; NV_IF_TARGET(NV_PROVIDES_SM_80, ( #if __cccl_ptx_isa >= 700 @@ -63,8 +67,7 @@ int main(int, char**) #endif )); - state += 1; // "Use" state to prevent compiler warnings - (void) state; + __unused(bar, state); } )); From 6917e60e6cac82909169ed2de15dca2aa5a99a1f Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 20:16:42 +0200 Subject: [PATCH 40/49] Use extern "C" error function declaration --- ...ation_and_communication_instructions_mbarrier.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 1a1418dfeae..9fda72e1980 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -60,6 +60,7 @@ __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr); */ #if __cccl_ptx_isa >= 700 +extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( sem_release_t, @@ -83,7 +84,6 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( return __state; ),( // Unsupported architectures will have a linker error with a semi-decent error message - __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); )); } @@ -100,6 +100,7 @@ __device__ static inline uint64_t mbarrier_arrive_no_complete( const uint32_t& count); */ #if __cccl_ptx_isa >= 700 +extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( sem_release_t, @@ -125,7 +126,6 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complet return __state; ),( // Unsupported architectures will have a linker error with a semi-decent error message - __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); )); } @@ -142,6 +142,7 @@ __device__ static inline uint64_t mbarrier_arrive( const uint32_t& count); */ #if __cccl_ptx_isa >= 780 +extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( sem_release_t, @@ -167,7 +168,6 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( return __state; ),( // Unsupported architectures will have a linker error with a semi-decent error message - __device__ _CUDA_VSTD::uint64_t 
___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); )); } @@ -184,6 +184,7 @@ __device__ static inline uint64_t mbarrier_arrive( const uint32_t& count); */ #if __cccl_ptx_isa >= 800 +extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( sem_release_t, @@ -209,7 +210,6 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( return __state; ),( // Unsupported architectures will have a linker error with a semi-decent error message - __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); )); } @@ -227,6 +227,7 @@ __device__ static inline void mbarrier_arrive( const uint32_t& count); */ #if __cccl_ptx_isa >= 800 +extern "C" __device__ void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( sem_release_t, @@ -262,7 +263,6 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( ),( // Unsupported architectures will have a linker error with a semi-decent error message - __device__ void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); )); } @@ -280,6 +280,7 @@ __device__ static inline uint64_t mbarrier_arrive_expect_tx( const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 +extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( sem_release_t, @@ -315,7 +316,6 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( return __state; ),( // Unsupported architectures will have a linker error with a semi-decent error message - __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); )); } @@ -333,6 +333,7 @@ __device__ static inline void mbarrier_arrive_expect_tx( const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 +extern "C" __device__ void __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( sem_release_t, @@ -368,7 +369,6 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( ),( // Unsupported architectures will have a linker error with a semi-decent error message - __device__ void __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); return __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); )); } From 6a5b42304c559b4209ae0ed303aa3f80b0c20935 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 27 Oct 2023 20:31:38 +0200 Subject: [PATCH 41/49] Fix wrapping of ifdef and NV_IF_TARGET for Windows The MS C++ compiler apparently chokes on an ifdef within NV_IF_TARGET --- .../test/cuda/ptx/sm90.ptx.compile.pass.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp 
b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp index ffc4d671dad..bfeea02a359 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp @@ -42,18 +42,21 @@ int main(int, char**) bar = 1; uint64_t state = 1; - NV_IF_TARGET(NV_PROVIDES_SM_80, ( #if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, ( state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 1. state = cuda::ptx::mbarrier_arrive_no_complete(sem_release, scope_cta, space_shared, &bar, 1); // 2. -#endif )); +#endif - NV_IF_TARGET(NV_PROVIDES_SM_90, ( #if __cccl_ptx_isa >= 780 // This guard is redundant: before PTX ISA 7.8, there was no support for SM_90 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3. + )); #endif + #if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 4. cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 5. @@ -64,8 +67,8 @@ int main(int, char**) cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 7. cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 7. -#endif )); +#endif __unused(bar, state); } From 7d6d4d5e99914eca6322cec8ee68391af7638faa Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 1 Nov 2023 16:05:18 +0100 Subject: [PATCH 42/49] Try and fix CI issues --- .../test/cuda/ptx/sm90.ptx.compile.pass.cpp | 73 ++++++++----------- .../test/support/concurrent_agents.h | 2 + ..._and_communication_instructions_mbarrier.h | 24 ++---- 3 files changed, 39 insertions(+), 60 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp index bfeea02a359..da086fe3811 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp @@ -22,57 +22,48 @@ template __device__ inline bool __unused(_Ty...) { return true; } -int main(int, char**) -{ - NV_IF_TARGET(NV_IS_DEVICE, ( - // Do not execute. Just check if below PTX compiles (that is: assembles) without error. - - // This condition always evaluates to false, but the compiler does not - // reason through it. This avoids dead code elimination. - const bool non_eliminated_false = threadIdx.x > 1024; - - if (non_eliminated_false) { - using cuda::ptx::sem_release; - using cuda::ptx::space_shared_cluster; - using cuda::ptx::space_shared; - using cuda::ptx::scope_cluster; - using cuda::ptx::scope_cta; +__device__ void test_compilation() { + using cuda::ptx::sem_release; + using cuda::ptx::space_shared_cluster; + using cuda::ptx::space_shared; + using cuda::ptx::scope_cluster; + using cuda::ptx::scope_cta; - __shared__ uint64_t bar; - bar = 1; - uint64_t state = 1; + __shared__ uint64_t bar; + bar = 1; + uint64_t state = 1; #if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, ( - state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 1. - state = cuda::ptx::mbarrier_arrive_no_complete(sem_release, scope_cta, space_shared, &bar, 1); // 2. - )); -#endif + NV_IF_TARGET(NV_PROVIDES_SM_80, ( + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 1. 
+ state = cuda::ptx::mbarrier_arrive_no_complete(sem_release, scope_cta, space_shared, &bar, 1); // 2. + )); +#endif // __cccl_ptx_isa >= 700 #if __cccl_ptx_isa >= 780 // This guard is redundant: before PTX ISA 7.8, there was no support for SM_90 - NV_IF_TARGET(NV_PROVIDES_SM_90, ( - state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3. - )); -#endif + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3. + )); +#endif // __cccl_ptx_isa >= 780 #if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, ( - state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 4. + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 4. - cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 5. - cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 5. + cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 5. + cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 5. - state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 6. - state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 6. + state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 6. + state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 6. - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 7. - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 7. - )); -#endif - - __unused(bar, state); - } - )); + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 7. + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 7. 
+ )); +#endif // __cccl_ptx_isa >= 800 + __unused(bar, state); +} +int main(int, char**) +{ return 0; } diff --git a/libcudacxx/.upstream-tests/test/support/concurrent_agents.h b/libcudacxx/.upstream-tests/test/support/concurrent_agents.h index d0d3163c88f..33b338ff712 100644 --- a/libcudacxx/.upstream-tests/test/support/concurrent_agents.h +++ b/libcudacxx/.upstream-tests/test/support/concurrent_agents.h @@ -19,6 +19,8 @@ #endif #endif +#include + #include "test_macros.h" TEST_EXEC_CHECK_DISABLE diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 9fda72e1980..fed0f732555 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -71,10 +71,8 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // __sem == sem_release (due to parameter type constraint) // __scope == scope_cta (due to parameter type constraint) // __space == space_shared (due to parameter type constraint) - - _CUDA_VSTD::uint64_t __state; - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( + _CUDA_VSTD::uint64_t __state; asm ( "mbarrier.arrive.shared.b64 %0, [%1]; // 1. " : "=l"(__state) @@ -112,10 +110,8 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complet // __sem == sem_release (due to parameter type constraint) // __scope == scope_cta (due to parameter type constraint) // __space == space_shared (due to parameter type constraint) - - _CUDA_VSTD::uint64_t __state; - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( + _CUDA_VSTD::uint64_t __state; asm ( "mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 2. " : "=l"(__state) @@ -154,10 +150,8 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // __sem == sem_release (due to parameter type constraint) // __scope == scope_cta (due to parameter type constraint) // __space == space_shared (due to parameter type constraint) - - _CUDA_VSTD::uint64_t __state; - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint64_t __state; asm ( "mbarrier.arrive.shared.b64 %0, [%1], %2; // 3." : "=l"(__state) @@ -196,10 +190,8 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // __sem == sem_release (due to parameter type constraint) // __scope == scope_cluster (due to parameter type constraint) // __space == space_shared (due to parameter type constraint) - - _CUDA_VSTD::uint64_t __state; - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint64_t __state; asm ( "mbarrier.arrive.release.cluster.shared.b64 %0, [%1], %2; // 4." 
: "=l"(__state) @@ -240,8 +232,6 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( static_assert(__scope == scope_cta || __scope == scope_cluster, ""); // __space == space_shared_cluster (due to parameter type constraint) - - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { asm ( @@ -292,10 +282,8 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); // __space == space_shared (due to parameter type constraint) - - _CUDA_VSTD::uint64_t __state; - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint64_t __state; if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { asm ( "mbarrier.arrive.expect_tx.release.cta.shared.b64 %0, [%1], %2; // 6. " @@ -346,8 +334,6 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( static_assert(__scope == scope_cta || __scope == scope_cluster, ""); // __space == space_shared_cluster (due to parameter type constraint) - - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { asm ( From bd242653bae356c633b001e63c032e7d166280a3 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 2 Nov 2023 11:40:36 +0100 Subject: [PATCH 43/49] Rename space_shared_cluster -> space_cluster --- .../test/cuda/ptx/sm90.ptx.compile.pass.cpp | 13 ++--- libcudacxx/docs/extended_api/ptx.md | 10 ++-- ..._and_communication_instructions_mbarrier.h | 12 ++--- .../include/__cuda/ptx/ptx_dot_variants.h | 47 +++++++------------ 4 files changed, 36 insertions(+), 46 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp index da086fe3811..2fc4e346507 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp @@ -24,7 +24,7 @@ __device__ inline bool __unused(_Ty...) { return true; } __device__ void test_compilation() { using cuda::ptx::sem_release; - using cuda::ptx::space_shared_cluster; + using cuda::ptx::space_cluster; using cuda::ptx::space_shared; using cuda::ptx::scope_cluster; using cuda::ptx::scope_cta; @@ -40,7 +40,8 @@ __device__ void test_compilation() { )); #endif // __cccl_ptx_isa >= 700 -#if __cccl_ptx_isa >= 780 // This guard is redundant: before PTX ISA 7.8, there was no support for SM_90 + // This guard is redundant: before PTX ISA 7.8, there was no support for SM_90 +#if __cccl_ptx_isa >= 780 NV_IF_TARGET(NV_PROVIDES_SM_90, ( state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3. )); @@ -50,14 +51,14 @@ __device__ void test_compilation() { NV_IF_TARGET(NV_PROVIDES_SM_90, ( state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 4. - cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 5. - cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 5. + cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_cluster, &bar, 1); // 5. + cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar, 1); // 5. state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 6. state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 6. 
- cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1); // 7. - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1); // 7. + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_cluster, &bar, 1); // 7. + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, &bar, 1); // 7. )); #endif // __cccl_ptx_isa >= 800 __unused(bar, state); diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index baf859f044a..c5529684330 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -485,7 +485,7 @@ template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_cluster_t, + cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& count); @@ -505,7 +505,7 @@ template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_cluster_t, + cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& tx_count); ``` @@ -518,7 +518,7 @@ Usage: __global__ void kernel() { using cuda::ptx::sem_release; - using cuda::ptx::space_shared_cluster; + using cuda::ptx::space_cluster; using cuda::ptx::space_shared; using cuda::ptx::scope_cluster; using cuda::ptx::scope_cta; @@ -544,8 +544,8 @@ __global__ void kernel() { cluster.sync(); // Arrive on remote cluster barrier: - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, remote_bar, 1); - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, remote_bar, 1); + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_cluster, remote_bar, 1); + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, remote_bar, 1); ) } ``` diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index fed0f732555..5ff96d974dd 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -214,7 +214,7 @@ template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_cluster_t, + cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& count); */ @@ -224,13 +224,13 @@ template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( sem_release_t, scope_t<_Scope> __scope, - space_shared_cluster_t, + space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { @@ -316,7 +316,7 @@ template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, cuda::ptx::scope_t scope, - cuda::ptx::space_shared_cluster_t, + cuda::ptx::space_cluster_t, uint64_t* addr, const 
uint32_t& tx_count); */ @@ -326,13 +326,13 @@ template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( sem_release_t, scope_t<_Scope> __scope, - space_shared_cluster_t, + space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h index aca4eac097e..18f67c37479 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h @@ -59,23 +59,24 @@ enum class dot_sem release, sc, weak - // mmio? - // volatile? }; // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#state-spaces enum class dot_space { - reg, - sreg, - const_mem, // Using const_mem as `const` is reserved in C++. global, - local, - param, - shared, // The PTX spelling is shared::cta - shared_cluster, // The PTX spelling is shared::cluster, but we might want to go for cluster here. - tex // deprecated - // generic? + cluster, // The PTX spelling is shared::cluster + shared, // The PTX spelling is shared::cta + + // The following state spaces are unlikely to be used in cuda::ptx in the near + // future, so they are not exposed: + + // reg, + // sreg, + // const_mem, // Using const_mem as `const` is reserved in C++. 
+ // local, + // param, + // tex // deprecated }; // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scope @@ -118,26 +119,14 @@ static constexpr sem_sc_t sem_sc{}; static constexpr sem_weak_t sem_weak{}; template -using space_t = _CUDA_VSTD::integral_constant; -using space_const_mem_t = space_t; -using space_global_t = space_t; -using space_local_t = space_t; -using space_param_t = space_t; -using space_reg_t = space_t; -using space_shared_t = space_t; -using space_shared_cluster_t = space_t; -using space_sreg_t = space_t; -using space_tex_t = space_t; - -static constexpr space_const_mem_t space_const_mem{}; +using space_t = _CUDA_VSTD::integral_constant; +using space_global_t = space_t; +using space_shared_t = space_t; +using space_cluster_t = space_t; + static constexpr space_global_t space_global{}; -static constexpr space_local_t space_local{}; -static constexpr space_param_t space_param{}; -static constexpr space_reg_t space_reg{}; static constexpr space_shared_t space_shared{}; -static constexpr space_shared_cluster_t space_shared_cluster{}; -static constexpr space_sreg_t space_sreg{}; -static constexpr space_tex_t space_tex{}; +static constexpr space_cluster_t space_cluster{}; template using scope_t = _CUDA_VSTD::integral_constant; From 4f26aa2a6f905b6a36ec3106842f397bfd4b2c14 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 2 Nov 2023 11:51:45 +0100 Subject: [PATCH 44/49] Ensure PTX test is actually assembled --- .../.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp index 2fc4e346507..b7a1a5d9b52 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp @@ -22,7 +22,7 @@ template __device__ inline bool __unused(_Ty...) { return true; } -__device__ void test_compilation() { +__global__ void test_compilation() { using cuda::ptx::sem_release; using cuda::ptx::space_cluster; using cuda::ptx::space_shared; From 9555532b45d4db07f34dc0498f276890e4d7a580 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 2 Nov 2023 11:54:21 +0100 Subject: [PATCH 45/49] Rename test --- ....ptx.compile.pass.cpp => ptx.mbarrier.arrive.compile.pass.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename libcudacxx/.upstream-tests/test/cuda/ptx/{sm90.ptx.compile.pass.cpp => ptx.mbarrier.arrive.compile.pass.cpp} (100%) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp similarity index 100% rename from libcudacxx/.upstream-tests/test/cuda/ptx/sm90.ptx.compile.pass.cpp rename to libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp From ffa1f304543fb802f5dc98e8ddb5995617b92d8f Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 2 Nov 2023 15:43:32 +0100 Subject: [PATCH 46/49] Stay closer to original PTX exposure Use the original spellings as in PTX ISA 70 and 78 and also expose in C++ as such. 
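For orientation, a minimal caller-side sketch of the overload set this commit settles on, mirroring the calls in the compile test below. The kernel name is invented, the barrier is assumed to be initialized elsewhere (e.g. via mbarrier.init), and the calls assume compilation for sm_90 with a PTX ISA 8.0 capable toolkit:

```cuda
// Sketch, not part of the patch: the three mbarrier_arrive spellings side by side.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void arrive_overload_sketch() {
  if (false) { // compile/assemble only, as in the test; never executed
    __shared__ cuda::std::uint64_t bar; // assumed initialized elsewhere
    cuda::std::uint64_t state;
    state = cuda::ptx::mbarrier_arrive(&bar);    // 1. PTX ISA 70 spelling, no count
    state = cuda::ptx::mbarrier_arrive(&bar, 1); // 2. PTX ISA 78 spelling, adds count
    state = cuda::ptx::mbarrier_arrive(          // 3b. PTX ISA 80: tags select .sem/.scope/.space
      cuda::ptx::sem_release, cuda::ptx::scope_cluster, cuda::ptx::space_shared, &bar, 1);
    (void)state;
  }
}
```

Passing sem/scope/space as tag arguments rather than as explicit template parameters is what keeps such call sites forward-compatible when further variants are added.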
--- .../ptx/ptx.mbarrier.arrive.compile.pass.cpp | 28 +- libcudacxx/docs/extended_api/ptx.md | 67 ++-- .../detail/libcxx/include/__cuda/barrier.h | 14 +- ..._and_communication_instructions_mbarrier.h | 324 +++++++++++++----- 4 files changed, 307 insertions(+), 126 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp index b7a1a5d9b52..4666467cad5 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp @@ -12,7 +12,6 @@ // #include - #include #include "concurrent_agents.h" @@ -35,30 +34,37 @@ __global__ void test_compilation() { #if __cccl_ptx_isa >= 700 NV_IF_TARGET(NV_PROVIDES_SM_80, ( - state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 1. - state = cuda::ptx::mbarrier_arrive_no_complete(sem_release, scope_cta, space_shared, &bar, 1); // 2. + state = cuda::ptx::mbarrier_arrive(&bar); // 1. + state = cuda::ptx::mbarrier_arrive_no_complete(&bar, 1); // 5. )); #endif // __cccl_ptx_isa >= 700 // This guard is redundant: before PTX ISA 7.8, there was no support for SM_90 #if __cccl_ptx_isa >= 780 NV_IF_TARGET(NV_PROVIDES_SM_90, ( - state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3. + state = cuda::ptx::mbarrier_arrive(&bar, 1); // 2. )); #endif // __cccl_ptx_isa >= 780 #if __cccl_ptx_isa >= 800 NV_IF_TARGET(NV_PROVIDES_SM_90, ( - state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 4. + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 3a. + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar); // 3a. + + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3b. + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 3b. + + cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_cluster, &bar); // 4a. + cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar); // 4a. - cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_cluster, &bar, 1); // 5. - cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar, 1); // 5. + cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_cluster, &bar, 1); // 4b. + cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar, 1); // 4b. - state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 6. - state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 6. + state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 8. + state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 8. - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_cluster, &bar, 1); // 7. - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, &bar, 1); // 7. + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_cluster, &bar, 1); // 9. + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, &bar, 1); // 9. 
)); #endif // __cccl_ptx_isa >= 800 __unused(bar, state); diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index c5529684330..8b9efe694f0 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -442,45 +442,56 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release - PTX ISA: [mbarrier.arrive](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) - ```cuda -// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 +// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 template __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cta_t, - cuda::ptx::space_shared_t, uint64_t* addr); -// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 +// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 template -__device__ static inline uint64_t mbarrier_arrive_no_complete( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cta_t, - cuda::ptx::space_shared_t, +__device__ static inline uint64_t mbarrier_arrive( uint64_t* addr, const uint32_t& count); -// mbarrier.arrive.shared.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 -template +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template __device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t, - cuda::ptx::scope_cta_t, + cuda::ptx::scope_t scope, cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& count); + uint64_t* addr); -// mbarrier.arrive.release.cluster.shared.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 -template +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template __device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, + cuda::ptx::scope_t scope, cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& count); -// mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90 +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 +// .sem = { .release } // .scope = { .cta, .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_cluster_t, + uint64_t* addr); + +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cluster } template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, @@ -488,9 +499,21 @@ __device__ static inline void mbarrier_arrive( cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& count); +``` -// mbarrier.arrive.expect_tx.release{.scope}.shared.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90 +```cuda +// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive_no_complete( + uint64_t* addr, + const uint32_t& count); +``` + +```cuda +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. 
PTX ISA 80, SM_90 +// .sem = { .release } // .scope = { .cta, .cluster } +// .space = { .shared::cta } template __device__ static inline uint64_t mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, @@ -499,8 +522,10 @@ __device__ static inline uint64_t mbarrier_arrive_expect_tx( uint64_t* addr, const uint32_t& tx_count); -// mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 +// .sem = { .release } // .scope = { .cta, .cluster } +// .space = { .shared::cluster } template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index c388d802b30..c4f8cedba71 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -27,7 +27,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER #include "../cstdlib" // _LIBCUDACXX_UNREACHABLE #include "../__type_traits/void_t.h" // _CUDA_VSTD::__void_t -#include "../__cuda/ptx.h" // cuda::ptx::* +#include "../__cuda/ptx.h" // cuda::ptx::* #if defined(_LIBCUDACXX_COMPILER_NVRTC) #define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type *)0)->member)) @@ -211,9 +211,7 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity; // only defined for block-scope barriers. This barrier may be a // non-block scoped barrier. auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); - __token = _CUDA_VPTX::mbarrier_arrive( - _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __bh, __update - ); + __token = _CUDA_VPTX::mbarrier_arrive(__bh, __update); ), NV_PROVIDES_SM_80, ( if (!__isShared(&__barrier)) { return __barrier.arrive(__update); @@ -221,13 +219,9 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity; auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); // Need 2 instructions, can't finish barrier with arrive > 1 if (__update > 1) { - _CUDA_VPTX::mbarrier_arrive_no_complete( - _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, - __bh, __update - 1); + _CUDA_VPTX::mbarrier_arrive_no_complete(__bh, __update - 1); } - __token = _CUDA_VPTX::mbarrier_arrive( - _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __bh - ); + __token = _CUDA_VPTX::mbarrier_arrive(__bh); ), NV_IS_DEVICE, ( if (!__isShared(&__barrier)) { return __barrier.arrive(__update); diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 5ff96d974dd..d6b5c72e38d 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -41,40 +41,93 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.13.
Parallel Synchronization and Communication Instructions: mbarrier.arrive // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive -// mbarrier.arrive{.sem}{.scope}{.shared{::cta}}.b64 state, [addr]{, count}; -// mbarrier.arrive{.sem}{.scope}{.shared::cluster}.b64 _, [addr] {,count} -// mbarrier.arrive.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], txCount; -// mbarrier.arrive.expect_tx{.sem}{.scope}{.shared::cluster}.b64 _, [addr], txCount; -// mbarrier.arrive.noComplete{.sem}{.cta}{.shared{::cta}}.b64 state, [addr], count; -// -// .sem = { .release } -// .scope = { .cta, .cluster } +/* +PTX ISA docs: + +// mbarrier.arrive: +mbarrier.arrive{.shared}.b64 state, [addr]; // 1. PTX ISA 70, SM_80 +mbarrier.arrive{.shared{::cta}}.b64 state, [addr]{, count}; // 2. PTX ISA 78, SM_90 (due to count) + +mbarrier.arrive{.sem}{.scope}{.shared{::cta}}.b64 state, [addr]{, count}; // 3. PTX ISA 80, SM_90 (some variants are SM_80, but are covered by 1) +mbarrier.arrive{.sem}{.scope}{.shared::cluster}.b64 _, [addr] {,count} // 4. PTX ISA 80, SM_90 + +.sem = { .release } +.scope = { .cta, .cluster } + + +// mbarrier.arrive.noComplete: +mbarrier.arrive.noComplete{.shared}.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 +mbarrier.arrive.noComplete{.shared{::cta}}.b64 state, [addr], count; // 6. PTX ISA 78, Not exposed. Just a spelling change (shared -> shared::cta) +mbarrier.arrive.noComplete{.sem}{.cta}{.shared{::cta}}.b64 state, [addr], count; // 7. PTX ISA 80, Not exposed. Adds .release, and .cta scope. + + +// mbarrier.arrive.expect_tx: +mbarrier.arrive.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 +mbarrier.arrive.expect_tx{.sem}{.scope}{.shared::cluster}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 + +.sem = { .release } +.scope = { .cta, .cluster } + + +Corresponding Exposure: + +// mbarrier_arrive: +mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80, !memory +// count is non-optional, otherwise 3 would not be distinguishable from 1 +mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90, !memory +mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90, !memory +.space = { .shared::cta} +.sem = { .release } +.scope = { .cta, .cluster } + +mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90, !memory +.space = { .shared::cta} +.sem = { .release } +.scope = { .cta, .cluster } + +mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90, !memory +.space = { .shared::cluster} +.sem = { .release } +.scope = { .cta, .cluster } + +mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90, !memory +.space = { .shared::cluster} +.sem = { .release } +.scope = { .cta, .cluster } + + +// mbarrier_arrive_no_complete: +mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80, !memory + + +mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90, !memory +.space = { .shared::cta } +.sem = { .release } +.scope = { .cta, .cluster } + +mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90, !memory +.space = { .shared::cluster } +.sem = { .release } +.scope = { .cta, .cluster } + +*/ /* -// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 +// mbarrier.arrive.shared.b64 state, [addr]; // 1. 
PTX ISA 70, SM_80 template __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cta_t, - cuda::ptx::space_shared_t, uint64_t* addr); */ #if __cccl_ptx_isa >= 700 -extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( - sem_release_t, - scope_cta_t, - space_shared_t, _CUDA_VSTD::uint64_t* __addr) { - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cta (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( _CUDA_VSTD::uint64_t __state; asm ( - "mbarrier.arrive.shared.b64 %0, [%1]; // 1. " + "mbarrier.arrive.shared.b64 %0, [%1]; // 1. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)) : "memory" @@ -88,32 +141,23 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( #endif // __cccl_ptx_isa >= 700 /* -// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 2. PTX ISA 70, SM_80 +// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 template -__device__ static inline uint64_t mbarrier_arrive_no_complete( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cta_t, - cuda::ptx::space_shared_t, +__device__ static inline uint64_t mbarrier_arrive( uint64_t* addr, const uint32_t& count); */ -#if __cccl_ptx_isa >= 700 -extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); +#if __cccl_ptx_isa >= 780 +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); template -_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( - sem_release_t, - scope_cta_t, - space_shared_t, +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cta (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( _CUDA_VSTD::uint64_t __state; asm ( - "mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 2. " + "mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 2. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -122,83 +166,107 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complet return __state; ),( // Unsupported architectures will have a linker error with a semi-decent error message - return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); )); } -#endif // __cccl_ptx_isa >= 700 +#endif // __cccl_ptx_isa >= 780 /* -// mbarrier.arrive.shared.b64 state, [addr], count; // 3. PTX ISA 78, SM_90 -template +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. 
PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template __device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t, - cuda::ptx::scope_cta_t, + cuda::ptx::scope_t scope, cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& count); + uint64_t* addr); */ -#if __cccl_ptx_isa >= 780 -extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( sem_release_t, - scope_cta_t, + scope_t<_Scope> __scope, space_shared_t, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint32_t& __count) + _CUDA_VSTD::uint64_t* __addr) { // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cta (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( _CUDA_VSTD::uint64_t __state; - asm ( - "mbarrier.arrive.shared.b64 %0, [%1], %2; // 3." - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.release.cta.shared.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.release.cluster.shared.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); + } return __state; ),( // Unsupported architectures will have a linker error with a semi-decent error message return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); )); } -#endif // __cccl_ptx_isa >= 780 +#endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive.release.cluster.shared.b64 state, [addr], count; // 4. PTX ISA 80, SM_90 -template +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template __device__ static inline uint64_t mbarrier_arrive( cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, + cuda::ptx::scope_t scope, cuda::ptx::space_shared_t, uint64_t* addr, const uint32_t& count); */ #if __cccl_ptx_isa >= 800 -extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( sem_release_t, - scope_cluster_t, + scope_t<_Scope> __scope, space_shared_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( _CUDA_VSTD::uint64_t __state; - asm ( - "mbarrier.arrive.release.cluster.shared.b64 %0, [%1], %2; // 4." 
- : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.release.cta.shared.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.release.cluster.shared.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + } return __state; ),( // Unsupported architectures will have a linker error with a semi-decent error message @@ -208,8 +276,59 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive.release{.scope}.shared::cluster.b64 _, [addr], count; // 5. PTX ISA 80, SM_90 +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_cluster_t, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void mbarrier_arrive( + sem_release_t, + scope_t<_Scope> __scope, + space_cluster_t, + _CUDA_VSTD::uint64_t* __addr) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_cluster (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.release.cta.shared::cluster.b64 _, [%0]; // 4a. " + : + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. " + : + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); + } + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 +// .sem = { .release } // .scope = { .cta, .cluster } +// .space = { .shared::cluster } template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, @@ -219,7 +338,7 @@ __device__ static inline void mbarrier_arrive( const uint32_t& count); */ #if __cccl_ptx_isa >= 800 -extern "C" __device__ void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( sem_release_t, @@ -235,7 +354,7 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { asm ( - "mbarrier.arrive.release.cta.shared::cluster.b64 _, [%0], %1; // 5. " + "mbarrier.arrive.release.cta.shared::cluster.b64 _, [%0], %1; // 4b. 
" : : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -243,7 +362,7 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( ); } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { asm ( - "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 5. " + "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " : : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -259,8 +378,41 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive.expect_tx.release{.scope}.shared.b64 state, [addr], tx_count; // 6. PTX ISA 80, SM_90 +// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive_no_complete( + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); +template +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __count) +{ + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( + _CUDA_VSTD::uint64_t __state; + asm ( + "mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 5. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); + )); +} +#endif // __cccl_ptx_isa >= 700 + +/* +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 +// .sem = { .release } // .scope = { .cta, .cluster } +// .space = { .shared::cta } template __device__ static inline uint64_t mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, @@ -270,7 +422,7 @@ __device__ static inline uint64_t mbarrier_arrive_expect_tx( const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 -extern "C" __device__ _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( sem_release_t, @@ -282,11 +434,12 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( _CUDA_VSTD::uint64_t __state; if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { asm ( - "mbarrier.arrive.expect_tx.release.cta.shared.b64 %0, [%1], %2; // 6. " + "mbarrier.arrive.expect_tx.release.cta.shared.b64 %0, [%1], %2; // 8. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) @@ -294,7 +447,7 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( ); } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared.b64 %0, [%1], %2; // 6. " + "mbarrier.arrive.expect_tx.release.cluster.shared.b64 %0, [%1], %2; // 8. 
" : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) @@ -310,8 +463,10 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive.expect_tx.release{.scope}.shared::cluster.b64 _, [addr], tx_count; // 7. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 +// .sem = { .release } // .scope = { .cta, .cluster } +// .space = { .shared::cluster } template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, @@ -321,7 +476,7 @@ __device__ static inline void mbarrier_arrive_expect_tx( const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 -extern "C" __device__ void __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( sem_release_t, @@ -337,7 +492,7 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1; // 7. " + "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1; // 9. " : : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) @@ -345,7 +500,7 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( ); } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 7. " + "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " : : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) @@ -361,6 +516,7 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( #endif // __cccl_ptx_isa >= 800 + // 9.7.12.15.14. Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop From 8b03da35467ab89965d78e610524155f5ae441fd Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 3 Nov 2023 08:30:32 +0100 Subject: [PATCH 47/49] Address review feedback --- libcudacxx/docs/extended_api/ptx.md | 4 +-- .../std/detail/libcxx/include/__cuda/ptx.h | 26 ++++++++++++++----- ..._and_communication_instructions_mbarrier.h | 7 ++++- .../include/__cuda/ptx/ptx_dot_variants.h | 7 ++++- .../include/__cuda/ptx/ptx_helper_functions.h | 6 +++++ .../__cuda/ptx/ptx_isa_target_macros.h | 6 +++++ 6 files changed, 46 insertions(+), 10 deletions(-) diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index 8b9efe694f0..1201b09748f 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -11,7 +11,7 @@ experiment with new hardware features before a high-level C++ API is available. The `cuda/ptx` header is intended to present a stable API (not ABI) within one major version of the CTK on a best effort basis. This means that: -- All functions are marked inline. +- All functions are marked static inline. - The type of a function parameter can be changed to be more generic if that means that code that called the original version can still be @@ -26,7 +26,7 @@ API stability is not taken to the extreme. 
Call functions like below to ensure forward-compatibility: ```cuda -// Use arguments to driver overload resolution: +// Use arguments to drive overload resolution: cuda::ptx::mbarrier_arrive_expect_tx(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1); // Specifying templates directly is not forward-compatible, as order and number diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index 9c8a33c18dd..4ad22be7419 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -12,13 +12,27 @@ #ifndef _LIBCUDACXX___CUDA_PTX_H #define _LIBCUDACXX___CUDA_PTX_H +#ifndef __cuda_std__ +#error "<__cuda/ptx.h> should only be included from <cuda/ptx>" +#endif // __cuda_std__ + +#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 +# error "CUDA synchronization primitives are only supported for sm_70 and up." +#endif + +#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) +#pragma GCC system_header +#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv +_CCCL_IMPLICIT_SYSTEM_HEADER +#endif // !_CCCL_COMPILER_NVHPC + +#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends + +#include "../__cuda/ptx/ptx_isa_target_macros.h" +#include "../__cuda/ptx/ptx_dot_variants.h" +#include "../__cuda/ptx/ptx_helper_functions.h" +#include "../__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h" #include "../cstdint" // uint32_t -#include "../../../../../../nv/target" // __CUDA_MINIMUM_ARCH__ and friends - -#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h" -#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h" -#include "cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h" -#include "cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h" /* * The cuda::ptx namespace intends to provide PTX wrappers for new hardware diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index d6b5c72e38d..01cba7cd0b1 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -9,7 +9,6 @@ // //===----------------------------------------------------------------------===// - #ifndef _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ #define _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ @@ -18,6 +17,12 @@ #include "ptx_isa_target_macros.h" #include "../../cstdint" +#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) +#pragma GCC system_header +#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv +_CCCL_IMPLICIT_SYSTEM_HEADER +#endif // !_CCCL_COMPILER_NVHPC + _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX /* diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h index 18f67c37479..442c484e8eb 100644 --- 
a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h @@ -9,12 +9,17 @@ // //===----------------------------------------------------------------------===// - #ifndef _CUDA_PTX_DOT_VARIANTS_H_ #define _CUDA_PTX_DOT_VARIANTS_H_ #include "../../__type_traits/integral_constant.h" // std::integral_constant +#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) +#pragma GCC system_header +#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv +_CCCL_IMPLICIT_SYSTEM_HEADER +#endif // !_CCCL_COMPILER_NVHPC + /* * Public integral constant types and values for ".variant"s: * diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h index 41826081a54..f6ec0b3959e 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h @@ -14,6 +14,12 @@ #include "../../cstdint" // uint32_t +#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) +#pragma GCC system_header +#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv +_CCCL_IMPLICIT_SYSTEM_HEADER +#endif // !_CCCL_COMPILER_NVHPC + _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_ptr_smem(const void* __ptr) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h index f3b412bb6b6..ca5297e4de4 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h @@ -15,6 +15,12 @@ #include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends +#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) +#pragma GCC system_header +#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv +_CCCL_IMPLICIT_SYSTEM_HEADER +#endif // !_CCCL_COMPILER_NVHPC + /* * Targeting macros * From 614326b9260f5c4022c6adc69e0125d52d3978fd Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 3 Nov 2023 09:06:34 +0100 Subject: [PATCH 48/49] Do not require set arch --- .../include/cuda/std/detail/libcxx/include/__cuda/ptx.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index 4ad22be7419..384f3ba14b3 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -16,10 +16,6 @@ #error "<__cuda/ptx.h> should only be included from <cuda/ptx>" #endif // __cuda_std__ -#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 -# error "CUDA synchronization primitives are only supported for sm_70 and up."
-#endif - #if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) #pragma GCC system_header #else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv From 9e9fb70b712a6799791997d3c70db6a28a71af72 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 3 Nov 2023 12:22:23 +0100 Subject: [PATCH 49/49] Do not expose remote mbarrier arrive with .cta scope --- .../ptx/ptx.mbarrier.arrive.compile.pass.cpp | 3 - libcudacxx/docs/extended_api/ptx.md | 18 +-- ..._and_communication_instructions_mbarrier.h | 115 +++++++----------- 3 files changed, 54 insertions(+), 82 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp index 4666467cad5..4316b3604fa 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp @@ -54,16 +54,13 @@ __global__ void test_compilation() { state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3b. state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 3b. - cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_cluster, &bar); // 4a. cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar); // 4a. - cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_cluster, &bar, 1); // 4b. cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar, 1); // 4b. state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 8. state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 8. - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_cluster, &bar, 1); // 9. cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, &bar, 1); // 9. )); #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index 1201b09748f..e45eed54a42 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -479,23 +479,23 @@ __device__ static inline uint64_t mbarrier_arrive( // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 // .sem = { .release } -// .scope = { .cta, .cluster } +// .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, + cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t* addr); // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 // .sem = { .release } -// .scope = { .cta, .cluster } +// .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, + cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& count); @@ -524,12 +524,12 @@ __device__ static inline uint64_t mbarrier_arrive_expect_tx( // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. 
PTX ISA 80, SM_90 // .sem = { .release } -// .scope = { .cta, .cluster } +// .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, + cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& tx_count); diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 01cba7cd0b1..39bab140414 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -90,15 +90,17 @@ mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], coun .sem = { .release } .scope = { .cta, .cluster } +// NOTE: .scope=.cta is dropped on purpose mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90, !memory .space = { .shared::cluster} .sem = { .release } -.scope = { .cta, .cluster } +.scope = { .cluster } +// NOTE: .scope=.cta is dropped on purpose mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90, !memory .space = { .shared::cluster} .sem = { .release } -.scope = { .cta, .cluster } +.scope = { .cluster } // mbarrier_arrive_no_complete: @@ -110,10 +112,11 @@ mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_cou .sem = { .release } .scope = { .cta, .cluster } +// NOTE: .scope=.cta is dropped on purpose mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90, !memory .space = { .shared::cluster } .sem = { .release } -.scope = { .cta, .cluster } +.scope = { .cluster } */ @@ -283,44 +286,35 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( /* // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 // .sem = { .release } -// .scope = { .cta, .cluster } +// .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, + cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t* addr); */ #if __cccl_ptx_isa >= 800 extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template +template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( sem_release_t, - scope_t<_Scope> __scope, + scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr) { // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __scope == scope_cluster (due to parameter type constraint) // __space == space_cluster (due to parameter type constraint) NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.release.cta.shared::cluster.b64 _, [%0]; // 4a. " - : - : "r"(__as_ptr_smem(__addr)) - : "memory" - ); - } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { - asm ( - "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. 
" - : - : "r"(__as_ptr_smem(__addr)) - : "memory" - ); - } + asm ( + "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. " + : + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); ),( // Unsupported architectures will have a linker error with a semi-decent error message @@ -332,48 +326,38 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( /* // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 // .sem = { .release } -// .scope = { .cta, .cluster } +// .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, + cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& count); */ #if __cccl_ptx_isa >= 800 extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template +template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive( sem_release_t, - scope_t<_Scope> __scope, + scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __scope == scope_cluster (due to parameter type constraint) // __space == space_cluster (due to parameter type constraint) NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.release.cta.shared::cluster.b64 _, [%0], %1; // 4b. " - : - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); - } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { - asm ( - "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " - : - : "r"(__as_ptr_smem(__addr)), - "r"(__count) - : "memory" - ); - } + asm ( + "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); ),( // Unsupported architectures will have a linker error with a semi-decent error message @@ -470,48 +454,38 @@ _LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( /* // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 // .sem = { .release } -// .scope = { .cta, .cluster } +// .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, + cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& tx_count); */ #if __cccl_ptx_isa >= 800 extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); -template +template _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( sem_release_t, - scope_t<_Scope> __scope, + scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) { // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __scope == scope_cluster (due to parameter type constraint) // __space == space_cluster (due to parameter type constraint) NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( - if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { - asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cluster.b64 _, [%0], %1; // 9. 
" - : - : "r"(__as_ptr_smem(__addr)), - "r"(__tx_count) - : "memory" - ); - } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { - asm ( - "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " - : - : "r"(__as_ptr_smem(__addr)), - "r"(__tx_count) - : "memory" - ); - } + asm ( + "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__tx_count) + : "memory" + ); ),( // Unsupported architectures will have a linker error with a semi-decent error message @@ -522,6 +496,7 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( + // 9.7.12.15.14. Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop