From abe6d1cf882205eaef064103c0e8bc653f3ee951 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 3 Nov 2023 16:39:22 +0100
Subject: [PATCH] Fix C++11 support of recently added tests (#651)

---
 .../test/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp |  1 +
 .../test/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp |  1 +
 .../test/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp |  1 +
 .../test/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp |  1 +
 .../test/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp |  1 +
 .../cuda/std/detail/libcxx/include/__cuda/barrier.h    | 10 ++++++++++
 6 files changed, 15 insertions(+)

diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp
index 4af32de2114..6d890edc9bc 100644
--- a/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp
@@ -8,6 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
+// UNSUPPORTED: c++11
 // UNSUPPORTED: libcpp-has-no-threads
 // UNSUPPORTED: pre-sm-90
 // UNSUPPORTED: nvrtc
diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp
index be0c29f5eeb..e394515666a 100644
--- a/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp
@@ -8,6 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
+// UNSUPPORTED: c++11
 // UNSUPPORTED: libcpp-has-no-threads
 // UNSUPPORTED: pre-sm-90
 // UNSUPPORTED: nvrtc
diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp
index 0b3a12f3539..b10c2bfc269 100644
--- a/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_3d.pass.cpp
@@ -8,6 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
+// UNSUPPORTED: c++11
 // UNSUPPORTED: libcpp-has-no-threads
 // UNSUPPORTED: pre-sm-90
 // UNSUPPORTED: nvrtc
diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp
index 68371a45ca0..8e7886db6a8 100644
--- a/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_4d.pass.cpp
@@ -8,6 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
+// UNSUPPORTED: c++11
 // UNSUPPORTED: libcpp-has-no-threads
 // UNSUPPORTED: pre-sm-90
 // UNSUPPORTED: nvrtc
diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp
index cbf6141a0af..8be14c21c14 100644
--- a/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_5d.pass.cpp
@@ -8,6 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
+// UNSUPPORTED: c++11
 // UNSUPPORTED: libcpp-has-no-threads
 // UNSUPPORTED: pre-sm-90
 // UNSUPPORTED: nvrtc
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
index 115e57a27d9..9fd883659bc 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
@@ -955,7 +955,13 @@ template <size_t _Copy_size>
 inline __device__
 void __cp_async_shared_global(char * __dest, const char * __src) {
     // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async
+
+    // If `if constexpr` is not available, this function gets instantiated even
+    // if it is not called. Do not static_assert in that case.
+#if _LIBCUDACXX_STD_VER >= 17
     static_assert(_Copy_size == 4 || _Copy_size == 8 || _Copy_size == 16, "cp.async.shared.global requires a copy size of 4, 8, or 16.");
+#endif // _LIBCUDACXX_STD_VER >= 17
+
     asm volatile(
         "cp.async.ca.shared.global [%0], [%1], %2, %2;"
         :
@@ -982,7 +988,11 @@ void __cp_async_shared_global<16>(char * __dest, const char * __src) {
 template <size_t _Alignment, typename _Group>
 inline __device__
 void __cp_async_shared_global_mechanism(_Group __g, char * __dest, const char * __src, _CUDA_VSTD::size_t __size) {
+    // If `if constexpr` is not available, this function gets instantiated even
+    // if it is not called. Do not static_assert in that case.
+#if _LIBCUDACXX_STD_VER >= 17
     static_assert(4 <= _Alignment, "cp.async requires at least 4-byte alignment");
+#endif // _LIBCUDACXX_STD_VER >= 17
 
     // Maximal copy size is 16.
     constexpr int __copy_size = (_Alignment > 16) ? 16 : _Alignment;