Add cuda::ptx::* namespace #574

Merged on Nov 3, 2023 (52 commits).
The changes shown below are from 6 of the 52 commits.

Commits:
- 16ad54a: Initial proof-of-concept for PTX header (ahendriksen, Oct 17, 2023)
- 9b31cc8: Add docs (ahendriksen, Oct 17, 2023)
- 229704a: Reformat docs (ahendriksen, Oct 17, 2023)
- dad93de: Use PTX wrapper in internal code (ahendriksen, Oct 17, 2023)
- 220d475: Apply suggestions from code review (ahendriksen, Oct 18, 2023)
- ae1a084: Address review comments (ahendriksen, Oct 18, 2023)
- ecbb6fe: Apply suggestions from code review (ahendriksen, Oct 18, 2023)
- cf19e53: Address review comments (ahendriksen, Oct 18, 2023)
- b159338: Merge branch 'main' into pr/ahendriksen/574 (miscco, Oct 25, 2023)
- 1d57b02: Fix typo (miscco, Oct 25, 2023)
- 21050e8: Add targeting macros and a few more helper functions (ahendriksen, Oct 18, 2023)
- 986d990: Add PTX ISA 8.3 macro (ahendriksen, Oct 25, 2023)
- 82d1b85: Improve code organization (ahendriksen, Oct 25, 2023)
- e356271: Format code (ahendriksen, Oct 25, 2023)
- bb91eb7: Fix test and ifdefs (ahendriksen, Oct 25, 2023)
- b514e2d: Update ptx.md (ahendriksen, Oct 25, 2023)
- e351c79: Use numerical PTX ISA/SM target macros (ahendriksen, Oct 25, 2023)
- 9006317: Move bulk of ptx header into detail/ptx.h (ahendriksen, Oct 25, 2023)
- 42710f9: Rename include guards (ahendriksen, Oct 25, 2023)
- 4144d43: Fix missing includes (ahendriksen, Oct 25, 2023)
- 8a609cd: Remove redundant comment (ahendriksen, Oct 25, 2023)
- 6953ea0: Rename __as_smem_ptr -> __as_ptr_smem for disambiguation (ahendriksen, Oct 25, 2023)
- eae5df6: Use uint32_t (ahendriksen, Oct 25, 2023)
- eda6d93: Update libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barri… (ahendriksen, Oct 25, 2023)
- f262f6c: Apply suggestions from code review (ahendriksen, Oct 25, 2023)
- 5701b9f: Use <nv/target> (ahendriksen, Oct 25, 2023)
- 7a54b19: Reorder PTX ISA target macros (ahendriksen, Oct 25, 2023)
- d4ec10f: Add .op (ahendriksen, Oct 25, 2023)
- dd57648: Improve backward compatibility and docs (ahendriksen, Oct 26, 2023)
- e168815: Use cuda code-blocks for syntax highlighting (ahendriksen, Oct 26, 2023)
- bd967f0: Use backward-compatible PTX spelling (ahendriksen, Oct 27, 2023)
- db33678: Use linker-error trick to enable architecture selection (ahendriksen, Oct 27, 2023)
- 6a1b36e: Use const references (ahendriksen, Oct 27, 2023)
- 594c82f: Do not name unused parameters (ahendriksen, Oct 27, 2023)
- 6b4d380: Add PTX ISA target macros for CUDA 11.X (ahendriksen, Oct 27, 2023)
- 87f300c: Use _CUDA_VPTX in barrier.h (ahendriksen, Oct 27, 2023)
- 3535036: Replace internal use of mbarrier.arrive with cuda::ptx::mbarrier_arrive (ahendriksen, Oct 27, 2023)
- 82db00d: Guard for PTX ISA version in test (ahendriksen, Oct 27, 2023)
- e9abe97: Remove __cccl_ptx_sm targeting macros (ahendriksen, Oct 27, 2023)
- f806ca0: Prevent unused compiler warnings in test (ahendriksen, Oct 27, 2023)
- 6917e60: Use extern "C" error function declaration (ahendriksen, Oct 27, 2023)
- 6a5b423: Fix wrapping of ifdef and NV_IF_TARGET for Windows (ahendriksen, Oct 27, 2023)
- d376cba: Merge branch 'main' into pr/ahendriksen/574 (miscco, Nov 1, 2023)
- 7d6d4d5: Try and fix CI issues (miscco, Nov 1, 2023)
- bd24265: Rename space_shared_cluster -> space_cluster (ahendriksen, Nov 2, 2023)
- 4f26aa2: Ensure PTX test is actually assembled (ahendriksen, Nov 2, 2023)
- 9555532: Rename test (ahendriksen, Nov 2, 2023)
- ffa1f30: Stay closer to original PTX exposure (ahendriksen, Nov 2, 2023)
- 90df5a4: Merge branch 'main' into pr/ahendriksen/574 (miscco, Nov 3, 2023)
- 8b03da3: Address review feedback (miscco, Nov 3, 2023)
- 614326b: Do not require set arch (miscco, Nov 3, 2023)
- 9e9fb70: Do not expose remote mbarrier arrive with .cta scope (ahendriksen, Nov 3, 2023)
@@ -0,0 +1,52 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-90

// <cuda/ptx>

#include <cuda/ptx>

#include <cuda/std/utility>

#include "concurrent_agents.h"
#include "cuda_space_selector.h"
#include "test_macros.h"

int main(int, char**)
{
  NV_DISPATCH_TARGET(
    NV_IS_HOST, (
      // Required by concurrent_agents_launch to know how many we're
      // launching. This can only be an int, because the nvrtc tests use grep
      // to figure out how many threads to launch.
      cuda_thread_count = 1;
    ),
    NV_IS_DEVICE, (
      // Do not execute. Just check if this compiles (that is: assembles) without error.
      if (false) {
        using cuda::ptx::sem_release;
        using cuda::ptx::space_shared_cluster;
        using cuda::ptx::space_shared;
        using cuda::ptx::scope_cluster;
        using cuda::ptx::scope_cta;

        __shared__ uint64_t bar;
        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1);
        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1);

        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, &bar, 1);
        cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, &bar, 1);
      }
    )
  );

  return 0;
}
2 changes: 2 additions & 0 deletions libcudacxx/docs/extended_api.md
@@ -21,6 +21,8 @@ nav_order: 3

{% include_relative extended_api/functional.md %}

{% include_relative extended_api/ptx.md %}

[Thread Scopes]: ./extended_api/memory_model.md#thread-scopes
[Thread Groups]: ./extended_api/thread_groups.md

68 changes: 68 additions & 0 deletions libcudacxx/docs/extended_api/ptx.md
@@ -0,0 +1,68 @@
## PTX instructions

The `cuda::ptx` namespace contains functions that map one-to-one to PTX
[instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html). These can be used for maximal control of the generated code, or to
experiment with new hardware features before a high-level C++ API is available.
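
Each wrapper essentially expands to a single inline-PTX (`asm`) statement. As an illustration (a hand-written sketch, not the actual `<cuda/ptx>` implementation; the helper name `my_mbarrier_arrive_expect_tx` is made up for this example), the `mbarrier_arrive_expect_tx` variant documented below for `sem_release`, `scope_cta`, and `space_shared` can be thought of as:

```cuda
#include <cstdint>

// Sketch only: roughly what one cuda::ptx wrapper variant boils down to.
// Assumes compilation for sm_90 with PTX ISA 8.0 or newer.
__device__ inline std::uint64_t my_mbarrier_arrive_expect_tx(
  std::uint64_t* addr, std::uint32_t tx_count)
{
  std::uint64_t state;
  asm volatile(
    "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
    : "=l"(state)  // returned arrival token
    : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(addr))),
      "r"(tx_count)
    : "memory");
  return state;
}
```

In practice, call the `cuda::ptx` overloads instead of writing inline PTX by hand; the `sem`, `scope`, and `space` tag arguments select the instruction variant at compile time, and the header guards each variant with the required PTX ISA and architecture checks.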

### Shared memory barrier (mbarrier)

| Instruction | Compute capability | CUDA Toolkit |
|----------------------------------------|--------------------|--------------|
| `cuda::ptx::mbarrier_arrive_expect_tx` | 9.0 | CTK 12.4 |


#### [`cuda::ptx::mbarrier_arrive_expect_tx`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)

```cuda
template <dot_scope _Sco>
__device__ inline
uint64_t mbarrier_arrive_expect_tx(sem_release_t sem, scope_t<_Sco> scope, space_shared_t spc, uint64_t* addr, uint32_t tx_count);

template <dot_scope _Sco>
__device__ inline
void mbarrier_arrive_expect_tx(sem_release_t sem, scope_t<_Sco> scope, space_shared_cluster_t spc, uint64_t* addr, uint32_t tx_count);
```

Usage:

```cuda
#include <cuda/ptx>
#include <cuda/barrier>
#include <cooperative_groups.h>

__global__ void kernel() {
  using cuda::ptx::sem_release;
  using cuda::ptx::space_shared_cluster;
  using cuda::ptx::space_shared;
  using cuda::ptx::scope_cluster;
  using cuda::ptx::scope_cta;

  using barrier_t = cuda::barrier<cuda::thread_scope_block>;
  __shared__ barrier_t bar;
  init(&bar, blockDim.x);
  __syncthreads();

  NV_IF_TARGET(NV_PROVIDES_SM_90, (
    // The PTX wrappers take a raw pointer to the uint64_t mbarrier object;
    // obtain it from the cuda::barrier with barrier_native_handle.
    uint64_t* native_bar = cuda::device::barrier_native_handle(bar);

    // Arrive on local shared memory barrier:
    uint64_t token;
    token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, native_bar, 1);
    token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, native_bar, 1);

    // Get address of remote cluster barrier:
    namespace cg = cooperative_groups;
    cg::cluster_group cluster = cg::this_cluster();
    unsigned int other_block_rank = cluster.block_rank() ^ 1;
    uint64_t* remote_bar = cluster.map_shared_rank(native_bar, other_block_rank);

    // Sync cluster to ensure remote barrier is initialized.
    cluster.sync();

    // Arrive on remote cluster barrier:
    cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared_cluster, remote_bar, 1);
    cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared_cluster, remote_bar, 1);
  ));
}
```


