From b60bf182b3b5bd425cbc1ad49a92de72010afc98 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 5 Mar 2024 13:55:38 -0800 Subject: [PATCH] Clean up usage of __CUDA_ARCH__ and other macros. (#15218) Closes #15030. This PR cleans up references to `__CUDA_ARCH__` and other macros. - We can safely drop Pascal support now that the required minimum is Volta (`__CUDA_ARCH__` of 700). - Removed a leftover reference to CUDA 10. - Removed an instance of `#if 1` that was no longer needed. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Michael Schellenberger Costa (https://github.com/miscco) - Nghia Truong (https://github.com/ttnghia) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/15218 --- .../cudf/detail/utilities/device_atomics.cuh | 3 - cpp/src/filling/repeat.cu | 9 +- cpp/src/hash/managed.cuh | 4 - cpp/src/io/comp/snap.cu | 10 -- cpp/src/io/fst/agent_dfa.cuh | 2 +- cpp/src/transform/row_conversion.cu | 100 +----------------- 6 files changed, 8 insertions(+), 120 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index 1e3fe3d08dc..6f23abc59a8 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -170,8 +170,6 @@ struct genericAtomicOperationImpl { } }; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600) -// `atomicAdd(double)` is supported after cuda architecture 6.0 template <> struct genericAtomicOperationImpl { using T = double; @@ -180,7 +178,6 @@ struct genericAtomicOperationImpl { return atomicAdd(addr, update_value); } }; -#endif template <> struct genericAtomicOperationImpl { diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index bd53eeddbb5..87cc0f21d0e 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -55,13 +55,8 @@ struct count_accessor { std::enable_if_t, cudf::size_type> operator()(rmm::cuda_stream_view stream) { using ScalarType = cudf::scalar_type_t; -#if 1 - // TODO: temporary till cudf::scalar's value() function is marked as const - auto p_count = const_cast(static_cast(this->p_scalar)); -#else - auto p_count = static_cast(this->p_scalar); -#endif - auto count = p_count->value(stream); + auto p_count = static_cast(this->p_scalar); + auto count = p_count->value(stream); // static_cast is necessary due to bool CUDF_EXPECTS(static_cast(count) <= std::numeric_limits::max(), "count should not exceed the column size limit", diff --git a/cpp/src/hash/managed.cuh b/cpp/src/hash/managed.cuh index aa7bff85ea6..9797c83c47c 100644 --- a/cpp/src/hash/managed.cuh +++ b/cpp/src/hash/managed.cuh @@ -37,9 +37,5 @@ struct managed { inline bool isPtrManaged(cudaPointerAttributes attr) { -#if CUDART_VERSION >= 10000 return (attr.type == cudaMemoryTypeManaged); -#else - return attr.isManaged; -#endif } diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu index 252c96f496a..7d4dcffa713 100644 --- a/cpp/src/io/comp/snap.cu +++ b/cpp/src/io/comp/snap.cu @@ -153,17 +153,7 @@ static __device__ uint8_t* StoreCopy(uint8_t* dst, */ static inline __device__ uint32_t HashMatchAny(uint32_t v, uint32_t t) { -#if (__CUDA_ARCH__ >= 700) return __match_any_sync(~0, v); -#else - uint32_t err_map = 0; - for (uint32_t i = 0; i < hash_bits; i++, v >>= 1) { - uint32_t b = v & 1; - uint32_t match_b = ballot(b); - err_map |= match_b ^ -(int32_t)b; - } - return ~err_map; -#endif } /** diff --git 
a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 9ba8696370a..2171764decd 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -91,7 +91,7 @@ class DFASimulationCallbackWrapper { { uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); if (write) { -#if __CUDA_ARCH__ > 0 +#if defined(__CUDA_ARCH__) #pragma unroll 1 #endif for (uint32_t out_char = 0; out_char < count; out_char++) { diff --git a/cpp/src/transform/row_conversion.cu b/cpp/src/transform/row_conversion.cu index 32faa097d0e..359e1ccb80d 100644 --- a/cpp/src/transform/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -39,24 +39,14 @@ #include #include +#include +#include #include #include #include #include #include -#include - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 -#define ASYNC_MEMCPY_SUPPORTED -#endif - -#if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) -#include -#endif // #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) - -#include - #include #include #include @@ -65,6 +55,7 @@ #include #include #include +#include namespace { @@ -90,13 +81,6 @@ using detail::make_device_uvector_async; using detail::make_device_uvector_sync; using rmm::device_uvector; -#ifdef ASYNC_MEMCPY_SUPPORTED -using cuda::aligned_size_t; -#else -template -using aligned_size_t = size_t; // Local stub for cuda::aligned_size_t. -#endif // ASYNC_MEMCPY_SUPPORTED - namespace cudf { namespace detail { @@ -569,12 +553,6 @@ CUDF_KERNEL void copy_to_rows_fixed_width_optimized(const size_type start_row, } } -#ifdef ASYNC_MEMCPY_SUPPORTED -#define MEMCPY(dst, src, size, barrier) cuda::memcpy_async(dst, src, size, barrier) -#else -#define MEMCPY(dst, src, size, barrier) memcpy(dst, src, size) -#endif // ASYNC_MEMCPY_SUPPORTED - /** * @brief copy data from cudf columns into JCUDF format, which is row-based * @@ -615,11 +593,9 @@ CUDF_KERNEL void copy_to_rows(const size_type num_rows, auto const warp = cooperative_groups::tiled_partition(group); extern __shared__ int8_t shared_data[]; -#ifdef ASYNC_MEMCPY_SUPPORTED __shared__ cuda::barrier tile_barrier; if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED auto const tile = tile_infos[blockIdx.x]; auto const num_tile_cols = tile.num_cols(); @@ -702,21 +678,11 @@ CUDF_KERNEL void copy_to_rows(const size_type num_rows, auto const src = &shared_data[tile_row_size * copy_row]; auto const dst = tile_output_buffer + row_offsets(copy_row + tile.start_row, row_batch_start) + starting_column_offset; -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, dst, src, tile_row_size, tile_barrier); -#else - for (int b = warp.thread_rank(); b < tile_row_size; b += warp.size()) { - dst[b] = src[b]; - } -#endif } -#ifdef ASYNC_MEMCPY_SUPPORTED // wait on the last copies to complete tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -752,12 +718,10 @@ CUDF_KERNEL void copy_validity_to_rows(const size_type num_rows, auto const group = cooperative_groups::this_thread_block(); auto const warp = cooperative_groups::tiled_partition(group); -#ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. 
__shared__ cuda::barrier shared_tile_barrier; if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED auto tile = tile_infos[blockIdx.x]; auto const num_tile_cols = tile.num_cols(); @@ -822,21 +786,11 @@ CUDF_KERNEL void copy_validity_to_rows(const size_type num_rows, relative_row += warp.meta_group_size()) { auto const src = &shared_data[validity_data_row_length * relative_row]; auto const dst = output_data_base + row_offsets(relative_row + tile.start_row, row_batch_start); -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, dst, src, row_bytes, shared_tile_barrier); -#else - for (int b = warp.thread_rank(); b < row_bytes; b += warp.size()) { - dst[b] = src[b]; - } -#endif } -#ifdef ASYNC_MEMCPY_SUPPORTED // wait for tile of data to arrive shared_tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -871,9 +825,7 @@ CUDF_KERNEL void copy_strings_to_rows(size_type const num_rows, // memcpy of the string data. auto const my_block = cooperative_groups::this_thread_block(); auto const warp = cooperative_groups::tiled_partition(my_block); -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::barrier block_barrier; -#endif auto const start_row = blockIdx.x * NUM_STRING_ROWS_PER_BLOCK_TO_ROWS + warp.meta_group_rank() + batch_row_offset; @@ -896,13 +848,7 @@ CUDF_KERNEL void copy_strings_to_rows(size_type const num_rows, auto string_output_dest = &output_data[base_row_offset + offset]; auto string_output_src = &variable_input_data[col][string_start_offset]; warp.sync(); -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, string_output_dest, string_output_src, string_length, block_barrier); -#else - for (int c = warp.thread_rank(); c < string_length; c += warp.size()) { - string_output_dest[c] = string_output_src[c]; - } -#endif offset += string_length; } } @@ -950,12 +896,10 @@ CUDF_KERNEL void copy_from_rows(const size_type num_rows, auto const warp = cooperative_groups::tiled_partition(group); extern __shared__ int8_t shared[]; -#ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. __shared__ cuda::barrier tile_barrier; if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED { auto const fetch_tile = tile_infos[blockIdx.x]; @@ -973,13 +917,7 @@ CUDF_KERNEL void copy_from_rows(const size_type num_rows, auto dst = &shared[shared_offset]; auto src = &input_data[row_offsets(absolute_row, row_batch_start) + starting_col_offset]; // copy the data -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, dst, src, fetch_tile_row_size, tile_barrier); -#else - for (int b = warp.thread_rank(); b < fetch_tile_row_size; b += warp.size()) { - dst[b] = src[b]; - } -#endif } } @@ -989,12 +927,8 @@ CUDF_KERNEL void copy_from_rows(const size_type num_rows, auto const cols_in_tile = tile.num_cols(); auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); -#ifdef ASYNC_MEMCPY_SUPPORTED // ensure our data is ready tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED // Now we copy from shared memory to final destination. The data is laid out in rows in shared // memory, so the reads for a column will be "vertical". 
Because of this and the different sizes @@ -1017,17 +951,13 @@ CUDF_KERNEL void copy_from_rows(const size_type num_rows, int8_t* shmem_src = &shared[shared_memory_offset]; int8_t* dst = &output_data[absolute_col][absolute_row * column_size]; - MEMCPY(dst, shmem_src, column_size, tile_barrier); + cuda::memcpy_async(dst, shmem_src, column_size, tile_barrier); } } } -#ifdef ASYNC_MEMCPY_SUPPORTED // wait on the last copies to complete tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -1077,12 +1007,10 @@ CUDF_KERNEL void copy_validity_from_rows(const size_type num_rows, auto const group = cooperative_groups::this_thread_block(); auto const warp = cooperative_groups::tiled_partition(group); -#ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. __shared__ cuda::barrier shared_tile_barrier; if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED auto const tile = tile_infos[blockIdx.x]; auto const tile_start_col = tile.start_col; @@ -1147,22 +1075,12 @@ CUDF_KERNEL void copy_validity_from_rows(const size_type num_rows, auto const src = reinterpret_cast(&shared[validity_data_col_length * relative_col]); -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async( - warp, dst, src, aligned_size_t<4>(validity_data_col_length), shared_tile_barrier); -#else - for (int b = warp.thread_rank(); b < col_words; b += warp.size()) { - dst[b] = src[b]; - } -#endif + warp, dst, src, cuda::aligned_size_t<4>(validity_data_col_length), shared_tile_barrier); } -#ifdef ASYNC_MEMCPY_SUPPORTED // wait for tile of data to arrive shared_tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -1193,9 +1111,7 @@ CUDF_KERNEL void copy_strings_from_rows(RowOffsetFunctor row_offsets, // Traversing in row-major order to coalesce the offsets and size reads. auto my_block = cooperative_groups::this_thread_block(); auto warp = cooperative_groups::tiled_partition(my_block); -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::barrier block_barrier; -#endif // workaround for not being able to take a reference to a constexpr host variable auto const ROWS_PER_BLOCK = NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS; @@ -1216,13 +1132,7 @@ CUDF_KERNEL void copy_strings_from_rows(RowOffsetFunctor row_offsets, auto const src = &row_data[row_offsets(row, 0) + str_row_off[row]]; auto dst = &str_col_data[str_col_off[row]]; -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, dst, src, str_len[row], block_barrier); -#else - for (int c = warp.thread_rank(); c < str_len[row]; c += warp.size()) { - dst[c] = src[c]; - } -#endif } } }
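
A note on the `agent_dfa.cuh` hunk: `__CUDA_ARCH__` is only defined during NVCC's device compilation pass, so `#if defined(__CUDA_ARCH__)` states the intent directly; the old `#if __CUDA_ARCH__ > 0` worked only because an undefined macro evaluates to 0 in preprocessor arithmetic. A minimal sketch of the host/device split (example code, not from this patch; the function names are illustrative):

#include <cstdio>

__host__ __device__ void where_am_i()  // illustrative name, not a cudf API
{
#if defined(__CUDA_ARCH__)
  // Device pass: __CUDA_ARCH__ expands to the target architecture,
  // e.g. 700 on Volta, the new required minimum.
  printf("device pass, __CUDA_ARCH__ = %d\n", __CUDA_ARCH__);
#else
  printf("host pass\n");
#endif
}

__global__ void probe() { where_am_i(); }

int main()
{
  where_am_i();       // compiled by the host pass: prints "host pass"
  probe<<<1, 1>>>();  // device pass: prints the architecture, e.g. 700
  cudaDeviceSynchronize();
  return 0;
}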
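
The fallback deleted from `snap.cu` emulated `__match_any_sync` with a per-bit ballot loop for pre-Volta architectures; on sm_70 and newer the intrinsic returns, for each lane, the mask of lanes in the warp holding an equal value. A sketch of its behavior (assumes a full, non-divergent warp; the values are made up for illustration):

#include <cstdio>

__global__ void match_demo()
{
  unsigned const lane = threadIdx.x % 32;
  unsigned const v    = lane / 8;  // lanes 0-7 share 0, 8-15 share 1, ...
  // Mask of all lanes whose `v` equals this lane's `v`.
  unsigned const peers = __match_any_sync(0xffffffffu, v);
  if (lane % 8 == 0) { printf("lane %2u: peers = 0x%08x\n", lane, peers); }
}

int main()
{
  // Expected masks: 0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000.
  match_demo<<<1, 32>>>();
  cudaDeviceSynchronize();
  return 0;
}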
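
Most of the lines removed from `row_conversion.cu` were `#else` branches that re-implemented `cuda::memcpy_async` as a per-thread copy loop followed by `group.sync()`. With Volta as the floor, the kernels can rely on the libcu++ barrier pattern unconditionally. A minimal sketch of that pattern (the staging-buffer size and kernel name are illustrative, not taken from cudf):

#include <cooperative_groups.h>

#include <cuda/barrier>

#include <cstdint>

constexpr int tile_bytes = 1024;  // illustrative staging size

__global__ void stage_and_write(int8_t const* src, int8_t* dst)
{
  auto const group = cooperative_groups::this_thread_block();
  __shared__ int8_t staged[tile_bytes];
  __shared__ cuda::barrier<cuda::thread_scope_block> tile_barrier;
  if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); }
  group.sync();

  // All threads cooperatively launch the asynchronous copy to shared memory.
  cuda::memcpy_async(group, staged, src, tile_bytes, tile_barrier);

  // Blocks until the copy has landed: this one call replaces the old
  // fallback's manual element loop plus group.sync().
  tile_barrier.arrive_and_wait();

  for (int i = group.thread_rank(); i < tile_bytes; i += group.size()) {
    dst[i] = staged[i];
  }
}

When the copy size is known to be aligned, wrapping it as the validity kernels do (`cuda::aligned_size_t<4>(...)`) lets the implementation issue wider transactions.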