From bea203d9cd9525291b012c44f4258399d7de1b66 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 3 Nov 2023 22:15:20 +0100 Subject: [PATCH] Add `cuda::ptx::*` namespace (#574) --- .../ptx/ptx.mbarrier.arrive.compile.pass.cpp | 73 ++ .../test/support/concurrent_agents.h | 2 + libcudacxx/docs/extended_api.md | 2 + libcudacxx/docs/extended_api/ptx.md | 669 ++++++++++++++++ libcudacxx/include/cuda/ptx | 23 + .../cuda/std/detail/libcxx/include/__config | 3 + .../detail/libcxx/include/__cuda/barrier.h | 46 +- .../std/detail/libcxx/include/__cuda/ptx.h | 719 ++++++++++++++++++ ..._and_communication_instructions_mbarrier.h | 514 +++++++++++++ .../include/__cuda/ptx/ptx_dot_variants.h | 174 +++++ .../include/__cuda/ptx/ptx_helper_functions.h | 62 ++ .../__cuda/ptx/ptx_isa_target_macros.h | 75 ++ 12 files changed, 2333 insertions(+), 29 deletions(-) create mode 100644 libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp create mode 100644 libcudacxx/docs/extended_api/ptx.md create mode 100644 libcudacxx/include/cuda/ptx create mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h create mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h create mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h create mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h create mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp new file mode 100644 index 00000000000..4316b3604fa --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "concurrent_agents.h" +#include "cuda_space_selector.h" +#include "test_macros.h" + +template +__device__ inline bool __unused(_Ty...) { return true; } + +__global__ void test_compilation() { + using cuda::ptx::sem_release; + using cuda::ptx::space_cluster; + using cuda::ptx::space_shared; + using cuda::ptx::scope_cluster; + using cuda::ptx::scope_cta; + + __shared__ uint64_t bar; + bar = 1; + uint64_t state = 1; + +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, ( + state = cuda::ptx::mbarrier_arrive(&bar); // 1. + state = cuda::ptx::mbarrier_arrive_no_complete(&bar, 1); // 5. + )); +#endif // __cccl_ptx_isa >= 700 + + // This guard is redundant: before PTX ISA 7.8, there was no support for SM_90 +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + state = cuda::ptx::mbarrier_arrive(&bar, 1); // 2. + )); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 3a. + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar); // 3a. + + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3b. + state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 3b. + + cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar); // 4a. + + cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar, 1); // 4b. + + state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 8. + state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 8. + + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, &bar, 1); // 9. + )); +#endif // __cccl_ptx_isa >= 800 + __unused(bar, state); +} + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/support/concurrent_agents.h b/libcudacxx/.upstream-tests/test/support/concurrent_agents.h index d0d3163c88f..33b338ff712 100644 --- a/libcudacxx/.upstream-tests/test/support/concurrent_agents.h +++ b/libcudacxx/.upstream-tests/test/support/concurrent_agents.h @@ -19,6 +19,8 @@ #endif #endif +#include + #include "test_macros.h" TEST_EXEC_CHECK_DISABLE diff --git a/libcudacxx/docs/extended_api.md b/libcudacxx/docs/extended_api.md index 952b7c81e51..6f71683edc7 100644 --- a/libcudacxx/docs/extended_api.md +++ b/libcudacxx/docs/extended_api.md @@ -21,6 +21,8 @@ nav_order: 3 {% include_relative extended_api/functional.md %} +{% include_relative extended_api/ptx.md %} + [Thread Scopes]: ./extended_api/memory_model.md#thread-scopes [Thread Groups]: ./extended_api/thread_groups.md diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md new file mode 100644 index 00000000000..e45eed54a42 --- /dev/null +++ b/libcudacxx/docs/extended_api/ptx.md @@ -0,0 +1,669 @@ +## PTX instructions + +The `cuda::ptx` namespace contains functions that map one-to-one to +[PTX instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html). These can be used for maximal control of the generated code, or to +experiment with new hardware features before a high-level C++ API is available. + +### Variants + +### Versions and compatibility + +The `cuda/ptx` header is intended to present a stable API (not ABI) within one +major version of the CTK on a best effort basis. This means that: + +- All functions are marked static inline. + +- The type of a function parameter can be changed to be more generic if + that means that code that called the original version can still be + compiled. + +- Good exposure of the PTX should be high priority. If, at a new major + version, we face a difficult choice between breaking backward-compatibility + and an improvement of the PTX exposure, we will tend to the latter option + more easily than in other parts of libcu++. + +API stability is not taken to the extreme. Call functions like below to ensure +forward-compatibility: + +```cuda +// Use arguments to drive overload resolution: +cuda::ptx::mbarrier_arrive_expect_tx(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1); + +// Specifying templates directly is not forward-compatible, as order and number +// of template parameters may change in a minor release: +cuda::ptx::mbarrier_arrive_expect_tx( + cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1 +); +``` + +**PTX ISA version and compute capability.** Each binding notes under which PTX +ISA version and SM version it may be used. Example: + +```cuda +// mbarrier.arrive.shared::cta.b64 state, [addr]; // 1. PTX ISA 70, SM_80 +__device__ inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t sem, + cuda::ptx::scope_cta_t scope, + cuda::ptx::space_shared_t space, + uint64_t* addr); +``` + +To check if the current compiler is recent enough, use: +```cuda +#if __cccl_ptx_isa >= 700 +cuda::ptx::mbarrier_arrive(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1); +#endif +``` + +Ensure that you only call the function when compiling for a recent enough +compute capability (SM version), like this: +```cuda +NV_IF_TARGET(NV_PROVIDES_SM_80,( + cuda::ptx::mbarrier_arrive(cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1); +)); +``` + +For more information on which compilers correspond to which PTX ISA, see the +[PTX ISA release +notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes). + + +### [9.7.1. Integer Arithmetic Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`sad`] | No | +| [`div`] | No | +| [`rem`] | No | +| [`abs`] | No | +| [`neg`] | No | +| [`min`] | No | +| [`max`] | No | +| [`popc`] | No | +| [`clz`] | No | +| [`bfind`] | No | +| [`fns`] | No | +| [`brev`] | No | +| [`bfe`] | No | +| [`bfi`] | No | +| [`szext`] | No | +| [`bmsk`] | No | +| [`dp4a`] | No | +| [`dp2a`] | No | + +[`sad`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sad +[`div`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-div +[`rem`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-rem +[`abs`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-abs +[`neg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-neg +[`min`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-min +[`max`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-max +[`popc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc +[`clz`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz +[`bfind`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind +[`fns`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns +[`brev`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev +[`bfe`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe +[`bfi`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi +[`szext`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-szext +[`bmsk`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bmsk +[`dp4a`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp4a +[`dp2a`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp2a + +### [9.7.2. Extended-Precision Integer Arithmetic Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-integer-arithmetic-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`add.cc`] | No | +| [`addc`] | No | +| [`sub.cc`] | No | +| [`subc`] | No | +| [`mad.cc`] | No | +| [`madc`] | No | + +[`add.cc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-add-cc +[`addc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-addc +[`sub.cc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-sub-cc +[`subc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-subc +[`mad.cc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-mad-cc +[`madc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc + +### [9.7.3. Floating-Point Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`testp`] | No | +| [`copysign`] | No | +| [`add`] | No | +| [`sub`] | No | +| [`mul`] | No | +| [`fma`] | No | +| [`mad`] | No | +| [`div`] | No | +| [`abs`] | No | +| [`neg`] | No | +| [`min`] | No | +| [`max`] | No | +| [`rcp`] | No | +| [`rcp.approx.ftz.f64`] | No | +| [`sqrt`] | No | +| [`rsqrt`] | No | +| [`rsqrt.approx.ftz.f64`] | No | +| [`sin`] | No | +| [`cos`] | No | +| [`lg2`] | No | +| [`ex2`] | No | +| [`tanh`] | No | + +[`testp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-testp +[`copysign`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign +[`add`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add +[`sub`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sub +[`mul`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul +[`fma`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma +[`mad`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad +[`div`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div +[`abs`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-abs +[`neg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg +[`min`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min +[`max`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max +[`rcp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp +[`rcp.approx.ftz.f64`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp-approx-ftz-f64 +[`sqrt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sqrt +[`rsqrt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt +[`rsqrt.approx.ftz.f64`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt-approx-ftz-f64 +[`sin`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sin +[`cos`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos +[`lg2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2 +[`ex2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2 +[`tanh`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-tanh + +### [9.7.4. Half Precision Floating-Point Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`add`] | No | +| [`sub`] | No | +| [`mul`] | No | +| [`fma`] | No | +| [`neg`] | No | +| [`abs`] | No | +| [`min`] | No | +| [`max`] | No | +| [`tanh`] | No | +| [`ex2`] | No | + +[`add`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add +[`sub`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-sub +[`mul`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-mul +[`fma`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-fma +[`neg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-neg +[`abs`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs +[`min`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-min +[`max`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-max +[`tanh`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-tanh +[`ex2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-ex2 + +### [9.7.5. Comparison and Selection Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`set`] | No | +| [`setp`] | No | +| [`selp`] | No | +| [`slct`] | No | + +[`set`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-set +[`setp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-setp +[`selp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp +[`slct`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-slct + +### [9.7.6. Half Precision Comparison Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`set`] | No | +| [`setp`] | No | + +[`set`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-set +[`setp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-setp + +### [9.7.7. Logic and Shift Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`and`] | No | +| [`or`] | No | +| [`xor`] | No | +| [`not`] | No | +| [`cnot`] | No | +| [`lop3`] | No | +| [`shf`] | No | +| [`shl`] | No | +| [`shr`] | No | + +[`and`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and +[`or`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or +[`xor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-xor +[`not`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not +[`cnot`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-cnot +[`lop3`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3 +[`shf`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shf +[`shl`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shl +[`shr`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shr + +### [9.7.8. Data Movement and Conversion Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`mov`] | No | +| [`mov`] | No | +| [`shfl (deprecated)`] | No | +| [`shfl.sync`] | No | +| [`prmt`] | No | +| [`ld`] | No | +| [`ld.global.nc`] | No | +| [`ldu`] | No | +| [`st`] | No | +| [`st.async`] | No | +| [`multimem.ld_reduce, multimem.st, multimem.red`] | No | +| [`prefetch, prefetchu`] | No | +| [`applypriority`] | No | +| [`discard`] | No | +| [`createpolicy`] | No | +| [`isspacep`] | No | +| [`cvta`] | No | +| [`cvt`] | No | +| [`cvt.pack`] | No | +| [`mapa`] | No | +| [`getctarank`] | No | + +[`mov`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2 +[`shfl (deprecated)`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated +[`shfl.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync +[`prmt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt +[`ld`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld +[`ld.global.nc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc +[`ldu`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu +[`st`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st +[`st.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async +[`multimem.ld_reduce, multimem.st, multimem.red`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red +[`prefetch, prefetchu`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu +[`applypriority`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority +[`discard`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-discard +[`createpolicy`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-createpolicy +[`isspacep`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep +[`cvta`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta +[`cvt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt +[`cvt.pack`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack +[`mapa`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa +[`getctarank`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank + +### [9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`cp.async`] | No | +| [`cp.async.commit_group`] | No | +| [`cp.async.wait_group / cp.async.wait_all`] | No | +| [`cp.async.bulk`] | No | +| [`cp.reduce.async.bulk`] | No | +| [`cp.async.bulk.prefetch`] | No | +| [`cp.async.bulk.tensor`] | No | +| [`cp.reduce.async.bulk.tensor`] | No | +| [`cp.async.bulk.prefetch.tensor`] | No | +| [`cp.async.bulk.commit_group`] | No | +| [`cp.async.bulk.wait_group`] | No | +| [`tensormap.replace`] | No | + +[`cp.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async +[`cp.async.commit_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group +[`cp.async.wait_group / cp.async.wait_all`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all +[`cp.async.bulk`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk +[`cp.reduce.async.bulk`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk +[`cp.async.bulk.prefetch`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch +[`cp.async.bulk.tensor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor +[`cp.reduce.async.bulk.tensor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor +[`cp.async.bulk.prefetch.tensor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor +[`cp.async.bulk.commit_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group +[`cp.async.bulk.wait_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group +[`tensormap.replace`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace + +### [9.7.9. Texture Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`tex`] | No | +| [`tld4`] | No | +| [`txq`] | No | +| [`istypep`] | No | + +[`tex`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex +[`tld4`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tld4 +[`txq`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-txq +[`istypep`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-istypep + +### [9.7.10. Surface Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`suld`] | No | +| [`sust`] | No | +| [`sured`] | No | +| [`suq`] | No | + +[`suld`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suld +[`sust`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust +[`sured`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sured +[`suq`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suq + +### [9.7.11. Control Flow Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`{}`] | No | +| [`@`] | No | +| [`bra`] | No | +| [`brx.idx`] | No | +| [`call`] | No | +| [`ret`] | No | +| [`exit`] | No | + +[`{}`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-curly-braces +[`@`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-at +[`bra`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra +[`brx.idx`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-brx-idx +[`call`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call +[`ret`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret +[`exit`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit + +### [9.7.12. Parallel Synchronization and Communication Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`bar, barrier`] | No | +| [`bar.warp.sync`] | No | +| [`barrier.cluster`] | No | +| [`membar/fence`] | No | +| [`atom`] | No | +| [`red`] | No | +| [`red.async`] | No | +| [`vote (deprecated)`] | No | +| [`vote.sync`] | No | +| [`match.sync`] | No | +| [`activemask`] | No | +| [`redux.sync`] | No | +| [`griddepcontrol`] | No | +| [`elect.sync`] | No | + +[`bar, barrier`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier +[`bar.warp.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync +[`barrier.cluster`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster +[`membar/fence`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence +[`atom`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom +[`red`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red +[`red.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async +[`vote (deprecated)`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated +[`vote.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync +[`match.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync +[`activemask`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask +[`redux.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync +[`griddepcontrol`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol +[`elect.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync + +### [9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`mbarrier.init`] | No | +| [`mbarrier.inval`] | No | +| [`mbarrier.expect_tx`] | No | +| [`mbarrier.complete_tx`] | No | +| [`mbarrier.arrive`] | CTK-FUTURE, CCCL v2.3.0 | +| [`mbarrier.arrive_drop`] | No | +| [`cp.async.mbarrier.arrive`] | No | +| [`mbarrier.test_wait/mbarrier.try_wait`] | No | +| [`mbarrier.pending_count`] | No | +| [`tensormap.cp_fenceproxy`] | No | + +[`mbarrier.init`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init +[`mbarrier.inval`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval +[`mbarrier.expect_tx`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx +[`mbarrier.complete_tx`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx +[`mbarrier.arrive`]: #mbarrierarrive +[`mbarrier.arrive_drop`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop +[`cp.async.mbarrier.arrive`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive +[`mbarrier.test_wait/mbarrier.try_wait`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait +[`mbarrier.pending_count`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-pending-count +[`tensormap.cp_fenceproxy`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy + + +#### `mbarrier.arrive` + +- PTX ISA: [mbarrier.arrive](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) + +```cuda +// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr); + +// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 +template +__device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr, + const uint32_t& count); + +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr); + +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& count); + +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr); + +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& count); +``` + +```cuda +// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive_no_complete( + uint64_t* addr, + const uint32_t& count); +``` + +```cuda +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& tx_count); + +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& tx_count); +``` + +Usage: +```cuda +#include +#include +#include + +__global__ void kernel() { + using cuda::ptx::sem_release; + using cuda::ptx::space_cluster; + using cuda::ptx::space_shared; + using cuda::ptx::scope_cluster; + using cuda::ptx::scope_cta; + + using barrier_t = cuda::barrier; + __shared__ barrier_t bar; + init(&bar, blockDim.x); + __syncthreads(); + + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + // Arrive on local shared memory barrier: + uint64_t token; + token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); + token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); + + // Get address of remote cluster barrier: + namespace cg = cooperative_groups; + cg::cluster_group cluster = cg::this_cluster(); + unsigned int other_block_rank = cluster.block_rank() ^ 1; + uint64_t * remote_bar = cluster.map_shared_rank(&bar, other_block_rank); + + // Sync cluster to ensure remote barrier is initialized. + cluster.sync(); + + // Arrive on remote cluster barrier: + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_cluster, remote_bar, 1); + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, remote_bar, 1); + ) +} +``` +### [9.7.13. Warp Level Matrix Multiply-Accumulate Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-accumulate-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`wmma.load`] | No | +| [`wmma.store`] | No | +| [`wmma.mma`] | No | +| [`mma`] | No | +| [`ldmatrix`] | No | +| [`stmatrix`] | No | +| [`movmatrix`] | No | +| [`mma.sp`] | No | + +[`wmma.load`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-wmma-load +[`wmma.store`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-wmma-store +[`wmma.mma`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-and-accumulate-instruction-wmma-mma +[`mma`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma +[`ldmatrix`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-ldmatrix +[`stmatrix`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix +[`movmatrix`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-transpose-instruction-movmatrix +[`mma.sp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma-sp + +### [9.7.14. Asynchronous Warpgroup Level Matrix Multiply-Accumulate Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-multiply-accumulate-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`wgmma.mma_async`] | No | +| [`wgmma.mma_async.sp`] | No | +| [`wgmma.fence`] | No | +| [`wgmma.commit_group`] | No | +| [`wgmma.wait_group`] | No | + +[`wgmma.mma_async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async +[`wgmma.mma_async.sp`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async-sp +[`wgmma.fence`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-fence +[`wgmma.commit_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-commit-group +[`wgmma.wait_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-wait-group + +### [9.7.15. Stack Manipulation Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`stacksave`] | No | +| [`stackrestore`] | No | +| [`alloca`] | No | + +[`stacksave`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stacksave +[`stackrestore`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stackrestore +[`alloca`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca + +### [9.7.16. Video Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#video-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`vadd, vsub, vabsdiff, vmin, vmax`] | No | +| [`vshl, vshr`] | No | +| [`vmad`] | No | +| [`vset`] | No | + +[`vadd, vsub, vabsdiff, vmin, vmax`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax +[`vshl, vshr`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr +[`vmad`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vmad +[`vset`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vset + +### [9.7.16.2. SIMD Video Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2`] | No | +| [`vset2`] | No | +| [`vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4`] | No | +| [`vset4`] | No | + +[`vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2 +[`vset2`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset2 +[`vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4 +[`vset4`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset4 + +### [9.7.17. Miscellaneous Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions) + +| Instruction | Available in libcu++ | +|------------------------------------------|----------------------| +| [`brkpt`] | No | +| [`nanosleep`] | No | +| [`pmevent`] | No | +| [`trap`] | No | +| [`setmaxnreg`] | No | + +[`brkpt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-brkpt +[`nanosleep`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep +[`pmevent`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent +[`trap`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap +[`setmaxnreg`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx new file mode 100644 index 00000000000..ab6ed62d9d2 --- /dev/null +++ b/libcudacxx/include/cuda/ptx @@ -0,0 +1,23 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX +#define _CUDA_PTX + +#include "std/detail/__config" + +#include "std/detail/__pragma_push" + +#include "std/detail/libcxx/include/__cuda/ptx.h" + +#include "std/detail/__pragma_pop" + +#endif // _CUDA_PTX diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config index a7993a145a3..fe0d305d5b0 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config @@ -1505,6 +1505,9 @@ typedef __char32_t char32_t; #define _LIBCUDACXX_END_NAMESPACE_CUDA } } #define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE namespace cuda { namespace device { inline namespace _LIBCUDACXX_ABI_NAMESPACE { #define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE } } } +#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX namespace cuda { namespace ptx { inline namespace _LIBCUDACXX_ABI_NAMESPACE { +#define _LIBCUDACXX_END_NAMESPACE_CUDA_PTX } } } +#define _CUDA_VPTX ::cuda::ptx::_LIBCUDACXX_ABI_NAMESPACE #define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL namespace cuda { namespace device { namespace experimental { inline namespace _LIBCUDACXX_ABI_NAMESPACE { #define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL } } } } #endif diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index 9fd883659bc..65cea8b77e9 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -27,6 +27,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER #include "../cstdlib" // _LIBCUDACXX_UNREACHABLE #include "../__type_traits/void_t.h" // _CUDA_VSTD::__void_t +#include "../__cuda/ptx.h" // cuda::ptx::* #if defined(_LIBCUDACXX_COMPILER_NVRTC) #define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type *)0)->member)) @@ -206,29 +207,21 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity; else if (!__isShared(&__barrier)) { __trap(); } - - asm volatile ("mbarrier.arrive.shared.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__update)) - : "memory"); + // Cannot use cuda::device::barrier_native_handle here, as it is + // only defined for block-scope barriers. This barrier may be a + // non-block scoped barrier. + auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); + __token = _CUDA_VPTX::mbarrier_arrive(__bh, __update); ), NV_PROVIDES_SM_80, ( if (!__isShared(&__barrier)) { return __barrier.arrive(__update); } - + auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier); // Need 2 instructions, can't finish barrier with arrive > 1 if (__update > 1) { - asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__update - 1)) - : "memory"); + ___CUDA_VPTX::mbarrier_arrive_no_complete(__bh, __update - 1); } - asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))) - : "memory"); + __token = _CUDA_VPTX::mbarrier_arrive( __bh); ), NV_IS_DEVICE, ( if (!__isShared(&__barrier)) { return __barrier.arrive(__update); @@ -603,14 +596,12 @@ barrier::arrival_token barrier_arrive_tx( // us in release builds. In debug builds, the error would be caught // by the asserts at the top of this function. - auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); + auto __native_handle = barrier_native_handle(__b); + auto __bh = __cvta_generic_to_shared(__native_handle); if (__arrive_count_update == 1) { - asm ( - "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) - : "memory"); + __token = _CUDA_VPTX::mbarrier_arrive_expect_tx( + _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __transaction_count_update + ); } else { asm ( "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" @@ -618,12 +609,9 @@ barrier::arrival_token barrier_arrive_tx( : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) : "memory"); - asm ( - "mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2;" - : "=l"(__token) - : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), - "r"(static_cast<_CUDA_VSTD::uint32_t>(__arrive_count_update)) - : "memory"); + __token = _CUDA_VPTX::mbarrier_arrive( + _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __arrive_count_update + ); } ) ); diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h new file mode 100644 index 00000000000..384f3ba14b3 --- /dev/null +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -0,0 +1,719 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___CUDA_PTX_H +#define _LIBCUDACXX___CUDA_PTX_H + +#ifndef __cuda_std__ +#error "<__cuda/ptx.h> should only be included in from " +#endif // __cuda_std__ + +#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) +#pragma GCC system_header +#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv +_CCCL_IMPLICIT_SYSTEM_HEADER +#endif // !_CCCL_COMPILER_NVHPC + +#include // __CUDA_MINIMUM_ARCH__ and friends + +#include "../__cuda/ptx/ptx_isa_target_macros.h" +#include "../__cuda/ptx/ptx_dot_variants.h" +#include "../__cuda/ptx/ptx_helper_functions.h" +#include "../__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h" +#include "../cstdint" // uint32_t + +/* + * The cuda::ptx namespace intends to provide PTX wrappers for new hardware + * features and new PTX instructions so that they can be experimented with + * before higher-level C++ APIs are designed and developed. + * + * The wrappers have the following responsibilities: + * + * - They must prevent any PTX assembler errors, that is: + * - They are defined only for versions of the CUDA Toolkit in which nvcc/ptxas + * actually recognizes the instruction. + * - Sizes and types of parameters are correct. + * - They must convert state spaces correctly. + * - They adhere to the libcu++ coding standards of using: + * - Reserved identifiers for all parameters, variables. E.g. `__meow` or `_Woof` + * - _CUDA_VSTD:: namespace for types + * + * The wrappers should not do the following: + * + * - Use any non-native types. For example, an mbarrier instruction wrapper + * takes the barrier address as a uint64_t pointer. + * + * This header is intended for: + * + * - internal consumption by higher-level APIs such as cuda::barrier, + * - outside developers who want to experiment with the latest features of the + * hardware. + * + * Stability: + * + * - These headers are intended to present a stable API (not ABI) within one + * major version of the CTK. This means that: + * - All functions are marked inline + * - The type of a function parameter can be changed to be more generic if + * that means that code that called the original version can still be + * compiled. + * + * - Good exposure of the PTX should be high priority. If, at a new major + * version, we face a difficult choice between breaking backward-compatibility + * and an improvement of the PTX exposure, we will tend to the latter option + * more easily than in other parts of libcu++. + */ + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +/* + * Instructions + * + * The organization of the instructions below follows that of the PTX ISA documentation: + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#instructions + * + * To improve code organization, some sections are separated into their own + * header. For instance, the mbarrier instructions are found in: + * __cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h + * + */ + +/* + * 9.7.1. Integer Arithmetic Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions + * + */ + +// 9.7.1.7. Integer Arithmetic Instructions: sad +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sad + +// 9.7.1.8. Integer Arithmetic Instructions: div +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-div + +// 9.7.1.9. Integer Arithmetic Instructions: rem +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-rem + +// 9.7.1.10. Integer Arithmetic Instructions: abs +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-abs + +// 9.7.1.11. Integer Arithmetic Instructions: neg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-neg + +// 9.7.1.12. Integer Arithmetic Instructions: min +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-min + +// 9.7.1.13. Integer Arithmetic Instructions: max +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-max + +// 9.7.1.14. Integer Arithmetic Instructions: popc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc + +// 9.7.1.15. Integer Arithmetic Instructions: clz +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz + +// 9.7.1.16. Integer Arithmetic Instructions: bfind +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind + +// 9.7.1.17. Integer Arithmetic Instructions: fns +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns + +// 9.7.1.18. Integer Arithmetic Instructions: brev +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev + +// 9.7.1.19. Integer Arithmetic Instructions: bfe +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe + +// 9.7.1.20. Integer Arithmetic Instructions: bfi +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi + +// 9.7.1.21. Integer Arithmetic Instructions: szext +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-szext + +// 9.7.1.22. Integer Arithmetic Instructions: bmsk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bmsk + +// 9.7.1.23. Integer Arithmetic Instructions: dp4a +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp4a + +// 9.7.1.24. Integer Arithmetic Instructions: dp2a +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp2a + + +/* + * 9.7.2. Extended-Precision Integer Arithmetic Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-integer-arithmetic-instructions + * + */ + +// 9.7.2.1. Extended-Precision Arithmetic Instructions: add.cc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-add-cc + +// 9.7.2.2. Extended-Precision Arithmetic Instructions: addc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-addc + +// 9.7.2.3. Extended-Precision Arithmetic Instructions: sub.cc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-sub-cc + +// 9.7.2.4. Extended-Precision Arithmetic Instructions: subc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-subc + +// 9.7.2.5. Extended-Precision Arithmetic Instructions: mad.cc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-mad-cc + +// 9.7.2.6. Extended-Precision Arithmetic Instructions: madc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc + + +/* + * 9.7.3. Floating-Point Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions + * + */ + +// 9.7.3.1. Floating Point Instructions: testp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-testp + +// 9.7.3.2. Floating Point Instructions: copysign +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign + +// 9.7.3.3. Floating Point Instructions: add +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add + +// 9.7.3.4. Floating Point Instructions: sub +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sub + +// 9.7.3.5. Floating Point Instructions: mul +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul + +// 9.7.3.6. Floating Point Instructions: fma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma + +// 9.7.3.7. Floating Point Instructions: mad +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad + +// 9.7.3.8. Floating Point Instructions: div +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div + +// 9.7.3.9. Floating Point Instructions: abs +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-abs + +// 9.7.3.10. Floating Point Instructions: neg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg + +// 9.7.3.11. Floating Point Instructions: min +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min + +// 9.7.3.12. Floating Point Instructions: max +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max + +// 9.7.3.13. Floating Point Instructions: rcp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp + +// 9.7.3.14. Floating Point Instructions: rcp.approx.ftz.f64 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp-approx-ftz-f64 + +// 9.7.3.15. Floating Point Instructions: sqrt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sqrt + +// 9.7.3.16. Floating Point Instructions: rsqrt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt + +// 9.7.3.17. Floating Point Instructions: rsqrt.approx.ftz.f64 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt-approx-ftz-f64 + +// 9.7.3.18. Floating Point Instructions: sin +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sin + +// 9.7.3.19. Floating Point Instructions: cos +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos + +// 9.7.3.20. Floating Point Instructions: lg2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2 + +// 9.7.3.21. Floating Point Instructions: ex2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2 + +// 9.7.3.22. Floating Point Instructions: tanh +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-tanh + + +/* + * 9.7.4. Half Precision Floating-Point Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions + * + */ + +// 9.7.4.1. Half Precision Floating Point Instructions: add +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add + +// 9.7.4.2. Half Precision Floating Point Instructions: sub +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-sub + +// 9.7.4.3. Half Precision Floating Point Instructions: mul +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-mul + +// 9.7.4.4. Half Precision Floating Point Instructions: fma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-fma + +// 9.7.4.5. Half Precision Floating Point Instructions: neg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-neg + +// 9.7.4.6. Half Precision Floating Point Instructions: abs +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs + +// 9.7.4.7. Half Precision Floating Point Instructions: min +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-min + +// 9.7.4.8. Half Precision Floating Point Instructions: max +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-max + +// 9.7.4.9. Half Precision Floating Point Instructions: tanh +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-tanh + +// 9.7.4.10. Half Precision Floating Point Instructions: ex2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-ex2 + + +/* + * 9.7.5. Comparison and Selection Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions + * + */ + +// 9.7.5.1. Comparison and Selection Instructions: set +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-set + +// 9.7.5.2. Comparison and Selection Instructions: setp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-setp + +// 9.7.5.3. Comparison and Selection Instructions: selp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp + +// 9.7.5.4. Comparison and Selection Instructions: slct +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-slct + + +/* + * 9.7.6. Half Precision Comparison Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions + * + */ + +// 9.7.6.1. Half Precision Comparison Instructions: set +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-set + +// 9.7.6.2. Half Precision Comparison Instructions: setp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-setp + + +/* + * 9.7.7. Logic and Shift Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions + * + */ + +// 9.7.7.1. Logic and Shift Instructions: and +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and + +// 9.7.7.2. Logic and Shift Instructions: or +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or + +// 9.7.7.3. Logic and Shift Instructions: xor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-xor + +// 9.7.7.4. Logic and Shift Instructions: not +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not + +// 9.7.7.5. Logic and Shift Instructions: cnot +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-cnot + +// 9.7.7.6. Logic and Shift Instructions: lop3 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3 + +// 9.7.7.7. Logic and Shift Instructions: shf +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shf + +// 9.7.7.8. Logic and Shift Instructions: shl +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shl + +// 9.7.7.9. Logic and Shift Instructions: shr +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shr + + +/* + * 9.7.8. Data Movement and Conversion Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions + * + */ + +// 9.7.8.3. Data Movement and Conversion Instructions: mov +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov + +// 9.7.8.4. Data Movement and Conversion Instructions: mov +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2 + +// 9.7.8.5. Data Movement and Conversion Instructions: shfl (deprecated) +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated + +// 9.7.8.6. Data Movement and Conversion Instructions: shfl.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync + +// 9.7.8.7. Data Movement and Conversion Instructions: prmt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt + +// 9.7.8.8. Data Movement and Conversion Instructions: ld +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld + +// 9.7.8.9. Data Movement and Conversion Instructions: ld.global.nc +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc + +// 9.7.8.10. Data Movement and Conversion Instructions: ldu +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu + +// 9.7.8.11. Data Movement and Conversion Instructions: st +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st + +// 9.7.8.12. Data Movement and Conversion Instructions: st.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async + +// 9.7.8.13. Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red + +// 9.7.8.14. Data Movement and Conversion Instructions: prefetch, prefetchu +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu + +// 9.7.8.15. Data Movement and Conversion Instructions: applypriority +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority + +// 9.7.8.16. Data Movement and Conversion Instructions: discard +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-discard + +// 9.7.8.17. Data Movement and Conversion Instructions: createpolicy +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-createpolicy + +// 9.7.8.18. Data Movement and Conversion Instructions: isspacep +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep + +// 9.7.8.19. Data Movement and Conversion Instructions: cvta +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta + +// 9.7.8.20. Data Movement and Conversion Instructions: cvt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt + +// 9.7.8.21. Data Movement and Conversion Instructions: cvt.pack +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack + +// 9.7.8.22. Data Movement and Conversion Instructions: mapa +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa + +// 9.7.8.23. Data Movement and Conversion Instructions: getctarank +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank + + +/* + * 9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy + * + */ + +// 9.7.8.24.3. Data Movement and Conversion Instructions: cp.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async + +// 9.7.8.24.4. Data Movement and Conversion Instructions: cp.async.commit_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group + +// 9.7.8.24.5. Data Movement and Conversion Instructions: cp.async.wait_group / cp.async.wait_all +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all + +// 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk + +// 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk + +// 9.7.8.24.8. Data Movement and Conversion Instructions: cp.async.bulk.prefetch +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch + +// 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor + +// 9.7.8.24.10. Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor + +// 9.7.8.24.11. Data Movement and Conversion Instructions: cp.async.bulk.prefetch.tensor +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor + +// 9.7.8.24.12. Data Movement and Conversion Instructions: cp.async.bulk.commit_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group + +// 9.7.8.24.13. Data Movement and Conversion Instructions: cp.async.bulk.wait_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group + +// 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace + + +/* + * 9.7.9. Texture Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions + * + */ + +// 9.7.9.3. Texture Instructions: tex +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex + +// 9.7.9.4. Texture Instructions: tld4 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tld4 + +// 9.7.9.5. Texture Instructions: txq +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-txq + +// 9.7.9.6. Texture Instructions: istypep +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-istypep + + +/* + * 9.7.10. Surface Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions + * + */ + +// 9.7.10.1. Surface Instructions: suld +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suld + +// 9.7.10.2. Surface Instructions: sust +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust + +// 9.7.10.3. Surface Instructions: sured +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sured + +// 9.7.10.4. Surface Instructions: suq +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suq + + +/* + * 9.7.11. Control Flow Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions + * + */ + +// 9.7.11.1. Control Flow Instructions: {} +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-curly-braces + +// 9.7.11.2. Control Flow Instructions: @ +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-at + +// 9.7.11.3. Control Flow Instructions: bra +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra + +// 9.7.11.4. Control Flow Instructions: brx.idx +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-brx-idx + +// 9.7.11.5. Control Flow Instructions: call +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call + +// 9.7.11.6. Control Flow Instructions: ret +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret + +// 9.7.11.7. Control Flow Instructions: exit +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit + + +/* + * 9.7.12. Parallel Synchronization and Communication Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions + * + */ + +// 9.7.12.1. Parallel Synchronization and Communication Instructions: bar, barrier +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier + +// 9.7.12.2. Parallel Synchronization and Communication Instructions: bar.warp.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync + +// 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster + +// 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence + +// 9.7.12.5. Parallel Synchronization and Communication Instructions: atom +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom + +// 9.7.12.6. Parallel Synchronization and Communication Instructions: red +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red + +// 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async + +// 9.7.12.8. Parallel Synchronization and Communication Instructions: vote (deprecated) +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated + +// 9.7.12.9. Parallel Synchronization and Communication Instructions: vote.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync + +// 9.7.12.10. Parallel Synchronization and Communication Instructions: match.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync + +// 9.7.12.11. Parallel Synchronization and Communication Instructions: activemask +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask + +// 9.7.12.12. Parallel Synchronization and Communication Instructions: redux.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync + +// 9.7.12.13. Parallel Synchronization and Communication Instructions: griddepcontrol +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol + +// 9.7.12.14. Parallel Synchronization and Communication Instructions: elect.sync +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync + +/* + * 9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier + * + * Contained in: __cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h + */ + +// 9.7.12.15.18. Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy + + +/* + * 9.7.13. Warp Level Matrix Multiply-Accumulate Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-accumulate-instructions + * + */ + +// 9.7.13.3.3. Warp-level Matrix Load Instruction: wmma.load +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-wmma-load + +// 9.7.13.3.4. Warp-level Matrix Store Instruction: wmma.store +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-wmma-store + +// 9.7.13.3.5. Warp-level Matrix Multiply-and-Accumulate Instruction: wmma.mma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-and-accumulate-instruction-wmma-mma + +// 9.7.13.4.14. Multiply-and-Accumulate Instruction: mma +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma + +// 9.7.13.4.15. Warp-level matrix load instruction: ldmatrix +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-ldmatrix + +// 9.7.13.4.16. Warp-level matrix store instruction: stmatrix +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix + +// 9.7.13.4.17. Warp-level matrix transpose instruction: movmatrix +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-transpose-instruction-movmatrix + +// 9.7.13.5.3. Multiply-and-Accumulate Instruction: mma.sp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multiply-and-accumulate-instruction-mma-sp + + +/* + * 9.7.14. Asynchronous Warpgroup Level Matrix Multiply-Accumulate Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-multiply-accumulate-instructions + * + */ + +// 9.7.14.5.2. Asynchronous Multiply-and-Accumulate Instruction: wgmma.mma_async +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async + +// 9.7.14.6.4. Asynchronous Multiply-and-Accumulate Instruction: wgmma.mma_async.sp +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async-sp + +// 9.7.14.7.1. Asynchronous Multiply-and-Accumulate Instruction: wgmma.fence +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-fence + +// 9.7.14.7.2. Asynchronous Multiply-and-Accumulate Instruction: wgmma.commit_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-commit-group + +// 9.7.14.7.3. Asynchronous Multiply-and-Accumulate Instruction: wgmma.wait_group +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-multiply-and-accumulate-instruction-wgmma-wait-group + + +/* + * 9.7.15. Stack Manipulation Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions + * + */ + +// 9.7.15.1. Stack Manipulation Instructions: stacksave +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stacksave + +// 9.7.15.2. Stack Manipulation Instructions: stackrestore +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stackrestore + +// 9.7.15.3. Stack Manipulation Instructions: alloca +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca + + +/* + * 9.7.16. Video Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#video-instructions + * + */ + +// 9.7.16.1.1. Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax + +// 9.7.16.1.2. Scalar Video Instructions: vshl, vshr +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr + +// 9.7.16.1.3. Scalar Video Instructions: vmad +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vmad + +// 9.7.16.1.4. Scalar Video Instructions: vset +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vset + + +/* + * 9.7.16.2. SIMD Video Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions + * + */ + +// 9.7.16.2.1. SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2 + +// 9.7.16.2.2. SIMD Video Instructions: vset2 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset2 + +// 9.7.16.2.3. SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4 + +// 9.7.16.2.4. SIMD Video Instructions: vset4 +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset4 + + +/* + * 9.7.17. Miscellaneous Instructions + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions + * + */ + +// 9.7.17.1. Miscellaneous Instructions: brkpt +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-brkpt + +// 9.7.17.2. Miscellaneous Instructions: nanosleep +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep + +// 9.7.17.3. Miscellaneous Instructions: pmevent +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent + +// 9.7.17.4. Miscellaneous Instructions: trap +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap + +// 9.7.17.5. Miscellaneous Instructions: setmaxnreg +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _LIBCUDACXX___CUDA_PTX_H diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h new file mode 100644 index 00000000000..39bab140414 --- /dev/null +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -0,0 +1,514 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ +#define _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ + +#include "ptx_dot_variants.h" +#include "ptx_helper_functions.h" +#include "ptx_isa_target_macros.h" +#include "../../cstdint" + +#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) +#pragma GCC system_header +#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv +_CCCL_IMPLICIT_SYSTEM_HEADER +#endif // !_CCCL_COMPILER_NVHPC + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +/* + * 9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier + * + */ + +// 9.7.12.15.9. Parallel Synchronization and Communication Instructions: mbarrier.init +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init + +// 9.7.12.15.10. Parallel Synchronization and Communication Instructions: mbarrier.inval +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval + +// 9.7.12.15.11. Parallel Synchronization and Communication Instructions: mbarrier.expect_tx +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx + +// 9.7.12.15.12. Parallel Synchronization and Communication Instructions: mbarrier.complete_tx +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx + +// 9.7.12.15.13. Parallel Synchronization and Communication Instructions: mbarrier.arrive +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive + +/* +PTX ISA docs: + +// mbarrier.arrive: +mbarrier.arrive{.shared}.b64 state, [addr]; // 1. PTX ISA 70, SM_80 +mbarrier.arrive{.shared{::cta}}.b64 state, [addr]{, count}; // 2. PTX ISA 78, SM_90 (due to count) + +mbarrier.arrive{.sem}{.scope}{.shared{::cta}}.b64 state, [addr]{, count}; // 3. PTX ISA 80, SM_90 (some variants are SM_80, but are covered by 1) +mbarrier.arrive{.sem}{.scope}{.shared::cluster}.b64 _, [addr] {,count} // 4. PTX ISA 80, SM_90 + +.sem = { .release } +.scope = { .cta, .cluster } + + +// mbarrier.arrive.noComplete: +mbarrier.arrive.noComplete{.shared}.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 +mbarrier.arrive.noComplete{.shared{::cta}}.b64 state, [addr], count; // 6. PTX ISA 78, Not exposed. Just a spelling change (shared -> shared::cta) +mbarrier.arrive.noComplete{.sem}{.cta}{.shared{::cta}}.b64 state, [addr], count; // 7. PTX ISA 80, Not exposed. Adds .release, and .cta scope. + + +// mbarrier.arrive.expect_tx: +mbarrier.arrive.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 +mbarrier.arrive.expect_tx{.sem}{.scope}{.shared::cluster}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 + +.sem = { .release } +.scope = { .cta, .cluster } + + +Corresponding Exposure: + +// mbarrier_arrive: +mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80, !memory +// count is non-optional, otherwise 3 would not be distinguishable from 1 +mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90, !memory +mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90, !memory +.space = { .shared::cta} +.sem = { .release } +.scope = { .cta, .cluster } + +mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90, !memory +.space = { .shared::cta} +.sem = { .release } +.scope = { .cta, .cluster } + +// NOTE: .scope=.cta is dropped on purpose +mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90, !memory +.space = { .shared::cluster} +.sem = { .release } +.scope = { .cluster } + +// NOTE: .scope=.cta is dropped on purpose +mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90, !memory +.space = { .shared::cluster} +.sem = { .release } +.scope = { .cluster } + + +// mbarrier_arrive_no_complete: +mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80, !memory + + +mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90, !memory +.space = { .shared::cta } +.sem = { .release } +.scope = { .cta, .cluster } + +// NOTE: .scope=.cta is dropped on purpose +mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90, !memory +.space = { .shared::cluster } +.sem = { .release } +.scope = { .cluster } + +*/ + +/* +// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); +template +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( + _CUDA_VSTD::uint64_t* __addr) +{ + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( + _CUDA_VSTD::uint64_t __state; + asm ( + "mbarrier.arrive.shared.b64 %0, [%1]; // 1. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); + )); +} +#endif // __cccl_ptx_isa >= 700 + +/* +// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 +template +__device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __count) +{ + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint64_t __state; + asm ( + "mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 2. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( + sem_release_t, + scope_t<_Scope> __scope, + space_shared_t, + _CUDA_VSTD::uint64_t* __addr) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint64_t __state; + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.release.cta.shared.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.release.cluster.shared.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); + } + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( + sem_release_t, + scope_t<_Scope> __scope, + space_shared_t, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __count) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint64_t __state; + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.release.cta.shared.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.release.cluster.shared.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + } + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void mbarrier_arrive( + sem_release_t, + scope_cluster_t, + space_cluster_t, + _CUDA_VSTD::uint64_t* __addr) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. " + : + : "r"(__as_ptr_smem(__addr)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void mbarrier_arrive( + sem_release_t, + scope_cluster_t, + space_cluster_t, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __count) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive_no_complete( + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); +template +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete( + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __count) +{ + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( + _CUDA_VSTD::uint64_t __state; + asm ( + "mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 5. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__count) + : "memory" + ); + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); + )); +} +#endif // __cccl_ptx_isa >= 700 + +/* +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& tx_count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( + sem_release_t, + scope_t<_Scope> __scope, + space_shared_t, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __tx_count) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint64_t __state; + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "mbarrier.arrive.expect_tx.release.cta.shared.b64 %0, [%1], %2; // 8. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__tx_count) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "mbarrier.arrive.expect_tx.release.cluster.shared.b64 %0, [%1], %2; // 8. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), + "r"(__tx_count) + : "memory" + ); + } + return __state; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return ___cuda_vstd_uint64_t__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& tx_count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( + sem_release_t, + scope_cluster_t, + space_cluster_t, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __tx_count) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " + : + : "r"(__as_ptr_smem(__addr)), + "r"(__tx_count) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 800 + + + + +// 9.7.12.15.14. Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop + +// 9.7.12.15.15. Parallel Synchronization and Communication Instructions: cp.async.mbarrier.arrive +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive + +// 9.7.12.15.16. Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait + +// 9.7.12.15.17. Parallel Synchronization and Communication Instructions: mbarrier.pending_count +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-pending-count + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_PARALLEL_SYNCHRONIZATION_AND_COMMUNICATION_INSTRUCTIONS_MBARRIER_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h new file mode 100644 index 00000000000..442c484e8eb --- /dev/null +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_dot_variants.h @@ -0,0 +1,174 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_DOT_VARIANTS_H_ +#define _CUDA_PTX_DOT_VARIANTS_H_ + +#include "../../__type_traits/integral_constant.h" // std::integral_constant + +#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) +#pragma GCC system_header +#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv +_CCCL_IMPLICIT_SYSTEM_HEADER +#endif // !_CCCL_COMPILER_NVHPC + +/* + * Public integral constant types and values for ".variant"s: + * + * - .sem: acquire, release, .. + * - .space: global, shared, constant, .. + * - .scope: cta, cluster, gpu, .. + * - .op: add, min, cas, .. + * + * For each .variant, the code below defines: + * - An enum `dot_variant` with each possible value + * - A type template `variant_t` + * - Types `variant_A_t`, ..., `variant_Z_t` + * - Constexpr values `variant_A` of type `variant_A_t` + * + * These types enable specifying fine-grained overloads of a PTX binding. If a + * binding can handle multiple variants, then it is defined as: + * + * template + * [...] void ptx_binding(variant_t __v) { ... } + * + * If it only handles a single variant, then it is defined as: + * + * [...] void ptx_binding(variant_A __v) { ... } + * + * If two variants have different behaviors or return types (see .space + * overloads of mbarrier.arrive.expect_tx for an example), then these can be + * provided as separate overloads of the same function: + * + * [...] void ptx_binding(variant_A __v) { ... } + * [...] int ptx_binding(variant_B __v) { ... } + * + */ + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#operation-types +enum class dot_sem +{ + acq_rel, + acquire, + relaxed, + release, + sc, + weak +}; + +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#state-spaces +enum class dot_space +{ + global, + cluster, // The PTX spelling is shared::cluster + shared, // The PTX spelling is shared::cta + + // The following state spaces are unlikely to be used in cuda::ptx in the near + // future, so they are not exposed: + + // reg, + // sreg, + // const_mem, // Using const_mem as `const` is reserved in C++. + // local, + // param, + // tex // deprecated +}; + +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scope +enum class dot_scope +{ + cta, + cluster, + gpu, + sys +}; + +enum class dot_op +{ + add, + dec, + inc, + max, + min, + and_op, // Using and_op, as `and, or, xor` are reserved in C++. + or_op, + xor_op, + cas, + exch +}; + +template +using sem_t = _CUDA_VSTD::integral_constant; +using sem_acq_rel_t = sem_t; +using sem_acquire_t = sem_t; +using sem_relaxed_t = sem_t; +using sem_release_t = sem_t; +using sem_sc_t = sem_t; +using sem_weak_t = sem_t; + +static constexpr sem_acq_rel_t sem_acq_rel{}; +static constexpr sem_acquire_t sem_acquire{}; +static constexpr sem_relaxed_t sem_relaxed{}; +static constexpr sem_release_t sem_release{}; +static constexpr sem_sc_t sem_sc{}; +static constexpr sem_weak_t sem_weak{}; + +template +using space_t = _CUDA_VSTD::integral_constant; +using space_global_t = space_t; +using space_shared_t = space_t; +using space_cluster_t = space_t; + +static constexpr space_global_t space_global{}; +static constexpr space_shared_t space_shared{}; +static constexpr space_cluster_t space_cluster{}; + +template +using scope_t = _CUDA_VSTD::integral_constant; +using scope_cluster_t = scope_t; +using scope_cta_t = scope_t; +using scope_gpu_t = scope_t; +using scope_sys_t = scope_t; + +static constexpr scope_cluster_t scope_cluster{}; +static constexpr scope_cta_t scope_cta{}; +static constexpr scope_gpu_t scope_gpu{}; +static constexpr scope_sys_t scope_sys{}; + +template +using op_t = _CUDA_VSTD::integral_constant; +using op_add_t = op_t; +using op_dec_t = op_t; +using op_inc_t = op_t; +using op_max_t = op_t; +using op_min_t = op_t; +using op_and_op_t = op_t; +using op_or_op_t = op_t; +using op_xor_op_t = op_t; +using op_cas_t = op_t; +using op_exch_t = op_t; + +static constexpr op_add_t op_add{}; +static constexpr op_dec_t op_dec{}; +static constexpr op_inc_t op_inc{}; +static constexpr op_max_t op_max{}; +static constexpr op_min_t op_min{}; +static constexpr op_and_op_t op_and_op{}; +static constexpr op_or_op_t op_or_op{}; +static constexpr op_xor_op_t op_xor_op{}; +static constexpr op_cas_t op_cas{}; +static constexpr op_exch_t op_exch{}; + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_DOT_VARIANTS_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h new file mode 100644 index 00000000000..f6ec0b3959e --- /dev/null +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h @@ -0,0 +1,62 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_HELPER_FUNCTIONS_H_ +#define _CUDA_PTX_HELPER_FUNCTIONS_H_ + +#include "../../cstdint" // uint32_t + +#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) +#pragma GCC system_header +#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv +_CCCL_IMPLICIT_SYSTEM_HEADER +#endif // !_CCCL_COMPILER_NVHPC + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_ptr_smem(const void* __ptr) +{ + // Consider adding debug asserts here. + return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); +} + +inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_ptr_remote_dsmem(const void* __ptr) +{ + // No difference in implementation to __as_ptr_smem. + // Consider adding debug asserts here. + return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr)); +} + +inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_ptr_gmem(const void* __ptr) +{ + // Consider adding debug asserts here. + return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr)); +} + +template +inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint32_t __as_b32(_Tp __val) +{ + static_assert(sizeof(_Tp) == 4, ""); + // Consider using std::bitcast + return *reinterpret_cast<_CUDA_VSTD::uint32_t*>(&__val); +} + +template +inline _LIBCUDACXX_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val) +{ + static_assert(sizeof(_Tp) == 8, ""); + // Consider using std::bitcast + return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_HELPER_FUNCTIONS_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h new file mode 100644 index 00000000000..ca5297e4de4 --- /dev/null +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h @@ -0,0 +1,75 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + + +#ifndef _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ +#define _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_ + +#include // __CUDA_MINIMUM_ARCH__ and friends + +#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER) +#pragma GCC system_header +#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv +_CCCL_IMPLICIT_SYSTEM_HEADER +#endif // !_CCCL_COMPILER_NVHPC + +/* + * Targeting macros + * + * Information from: + * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes + */ + +// PTX ISA 8.3 is available from CUDA 12.3, driver r545 +#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 830ULL +// PTX ISA 8.2 is available from CUDA 12.2, driver r535 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 820ULL +// PTX ISA 8.1 is available from CUDA 12.1, driver r530 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 810ULL +// PTX ISA 8.0 is available from CUDA 12.0, driver r525 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 800ULL +// PTX ISA 7.8 is available from CUDA 11.8, driver r520 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 780ULL +// PTX ISA 7.7 is available from CUDA 11.7, driver r515 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 770ULL +// PTX ISA 7.6 is available from CUDA 11.6, driver r510 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 760ULL +// PTX ISA 7.5 is available from CUDA 11.5, driver r495 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 750ULL +// PTX ISA 7.4 is available from CUDA 11.4, driver r470 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 740ULL +// PTX ISA 7.3 is available from CUDA 11.3, driver r465 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 730ULL +// PTX ISA 7.2 is available from CUDA 11.2, driver r460 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 720ULL +// PTX ISA 7.1 is available from CUDA 11.1, driver r455 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 710ULL +// PTX ISA 7.0 is available from CUDA 11.0, driver r445 +#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__)) +# define __cccl_ptx_isa 700ULL +// Fallback case. Define the ISA version to be zero. This ensures that the macro is always defined. +#else +# define __cccl_ptx_isa 0ULL +#endif + +#endif // _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_