From b60bf182b3b5bd425cbc1ad49a92de72010afc98 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 5 Mar 2024 13:55:38 -0800 Subject: [PATCH] Clean up usage of __CUDA_ARCH__ and other macros. (#15218) Closes #15030. This PR cleans up references to `__CUDA_ARCH__` and other macros. - We can safely drop Pascal support now that the required minimum is Volta (`__CUDA_ARCH__` of 700). - Removed a leftover reference to CUDA 10. - Removed an instance of `#if 1` that was no longer needed. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Michael Schellenberger Costa (https://github.com/miscco) - Nghia Truong (https://github.com/ttnghia) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/15218 --- .../cudf/detail/utilities/device_atomics.cuh | 3 - cpp/src/filling/repeat.cu | 9 +- cpp/src/hash/managed.cuh | 4 - cpp/src/io/comp/snap.cu | 10 -- cpp/src/io/fst/agent_dfa.cuh | 2 +- cpp/src/transform/row_conversion.cu | 100 +----------------- 6 files changed, 8 insertions(+), 120 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index 1e3fe3d08dc..6f23abc59a8 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -170,8 +170,6 @@ struct genericAtomicOperationImpl { } }; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600) -// `atomicAdd(double)` is supported after cuda architecture 6.0 template <> struct genericAtomicOperationImpl { using T = double; @@ -180,7 +178,6 @@ struct genericAtomicOperationImpl { return atomicAdd(addr, update_value); } }; -#endif template <> struct genericAtomicOperationImpl { diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index bd53eeddbb5..87cc0f21d0e 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -55,13 +55,8 @@ struct count_accessor { std::enable_if_t, cudf::size_type> operator()(rmm::cuda_stream_view stream) { using ScalarType = cudf::scalar_type_t; -#if 1 - // TODO: temporary till cudf::scalar's value() function is marked as const - auto p_count = const_cast(static_cast(this->p_scalar)); -#else - auto p_count = static_cast(this->p_scalar); -#endif - auto count = p_count->value(stream); + auto p_count = static_cast(this->p_scalar); + auto count = p_count->value(stream); // static_cast is necessary due to bool CUDF_EXPECTS(static_cast(count) <= std::numeric_limits::max(), "count should not exceed the column size limit", diff --git a/cpp/src/hash/managed.cuh b/cpp/src/hash/managed.cuh index aa7bff85ea6..9797c83c47c 100644 --- a/cpp/src/hash/managed.cuh +++ b/cpp/src/hash/managed.cuh @@ -37,9 +37,5 @@ struct managed { inline bool isPtrManaged(cudaPointerAttributes attr) { -#if CUDART_VERSION >= 10000 return (attr.type == cudaMemoryTypeManaged); -#else - return attr.isManaged; -#endif } diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu index 252c96f496a..7d4dcffa713 100644 --- a/cpp/src/io/comp/snap.cu +++ b/cpp/src/io/comp/snap.cu @@ -153,17 +153,7 @@ static __device__ uint8_t* StoreCopy(uint8_t* dst, */ static inline __device__ uint32_t HashMatchAny(uint32_t v, uint32_t t) { -#if (__CUDA_ARCH__ >= 700) return __match_any_sync(~0, v); -#else - uint32_t err_map = 0; - for (uint32_t i = 0; i < hash_bits; i++, v >>= 1) { - uint32_t b = v & 1; - uint32_t match_b = ballot(b); - err_map |= match_b ^ -(int32_t)b; - } - return ~err_map; -#endif } /** diff --git 
a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 9ba8696370a..2171764decd 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -91,7 +91,7 @@ class DFASimulationCallbackWrapper { { uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); if (write) { -#if __CUDA_ARCH__ > 0 +#if defined(__CUDA_ARCH__) #pragma unroll 1 #endif for (uint32_t out_char = 0; out_char < count; out_char++) { diff --git a/cpp/src/transform/row_conversion.cu b/cpp/src/transform/row_conversion.cu index 32faa097d0e..359e1ccb80d 100644 --- a/cpp/src/transform/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -39,24 +39,14 @@ #include #include +#include +#include #include #include #include #include #include -#include - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 -#define ASYNC_MEMCPY_SUPPORTED -#endif - -#if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) -#include -#endif // #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) - -#include - #include #include #include @@ -65,6 +55,7 @@ #include #include #include +#include namespace { @@ -90,13 +81,6 @@ using detail::make_device_uvector_async; using detail::make_device_uvector_sync; using rmm::device_uvector; -#ifdef ASYNC_MEMCPY_SUPPORTED -using cuda::aligned_size_t; -#else -template -using aligned_size_t = size_t; // Local stub for cuda::aligned_size_t. -#endif // ASYNC_MEMCPY_SUPPORTED - namespace cudf { namespace detail { @@ -569,12 +553,6 @@ CUDF_KERNEL void copy_to_rows_fixed_width_optimized(const size_type start_row, } } -#ifdef ASYNC_MEMCPY_SUPPORTED -#define MEMCPY(dst, src, size, barrier) cuda::memcpy_async(dst, src, size, barrier) -#else -#define MEMCPY(dst, src, size, barrier) memcpy(dst, src, size) -#endif // ASYNC_MEMCPY_SUPPORTED - /** * @brief copy data from cudf columns into JCUDF format, which is row-based * @@ -615,11 +593,9 @@ CUDF_KERNEL void copy_to_rows(const size_type num_rows, auto const warp = cooperative_groups::tiled_partition(group); extern __shared__ int8_t shared_data[]; -#ifdef ASYNC_MEMCPY_SUPPORTED __shared__ cuda::barrier tile_barrier; if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED auto const tile = tile_infos[blockIdx.x]; auto const num_tile_cols = tile.num_cols(); @@ -702,21 +678,11 @@ CUDF_KERNEL void copy_to_rows(const size_type num_rows, auto const src = &shared_data[tile_row_size * copy_row]; auto const dst = tile_output_buffer + row_offsets(copy_row + tile.start_row, row_batch_start) + starting_column_offset; -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, dst, src, tile_row_size, tile_barrier); -#else - for (int b = warp.thread_rank(); b < tile_row_size; b += warp.size()) { - dst[b] = src[b]; - } -#endif } -#ifdef ASYNC_MEMCPY_SUPPORTED // wait on the last copies to complete tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -752,12 +718,10 @@ CUDF_KERNEL void copy_validity_to_rows(const size_type num_rows, auto const group = cooperative_groups::this_thread_block(); auto const warp = cooperative_groups::tiled_partition(group); -#ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. 
__shared__ cuda::barrier shared_tile_barrier; if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED auto tile = tile_infos[blockIdx.x]; auto const num_tile_cols = tile.num_cols(); @@ -822,21 +786,11 @@ CUDF_KERNEL void copy_validity_to_rows(const size_type num_rows, relative_row += warp.meta_group_size()) { auto const src = &shared_data[validity_data_row_length * relative_row]; auto const dst = output_data_base + row_offsets(relative_row + tile.start_row, row_batch_start); -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, dst, src, row_bytes, shared_tile_barrier); -#else - for (int b = warp.thread_rank(); b < row_bytes; b += warp.size()) { - dst[b] = src[b]; - } -#endif } -#ifdef ASYNC_MEMCPY_SUPPORTED // wait for tile of data to arrive shared_tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -871,9 +825,7 @@ CUDF_KERNEL void copy_strings_to_rows(size_type const num_rows, // memcpy of the string data. auto const my_block = cooperative_groups::this_thread_block(); auto const warp = cooperative_groups::tiled_partition(my_block); -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::barrier block_barrier; -#endif auto const start_row = blockIdx.x * NUM_STRING_ROWS_PER_BLOCK_TO_ROWS + warp.meta_group_rank() + batch_row_offset; @@ -896,13 +848,7 @@ CUDF_KERNEL void copy_strings_to_rows(size_type const num_rows, auto string_output_dest = &output_data[base_row_offset + offset]; auto string_output_src = &variable_input_data[col][string_start_offset]; warp.sync(); -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, string_output_dest, string_output_src, string_length, block_barrier); -#else - for (int c = warp.thread_rank(); c < string_length; c += warp.size()) { - string_output_dest[c] = string_output_src[c]; - } -#endif offset += string_length; } } @@ -950,12 +896,10 @@ CUDF_KERNEL void copy_from_rows(const size_type num_rows, auto const warp = cooperative_groups::tiled_partition(group); extern __shared__ int8_t shared[]; -#ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. __shared__ cuda::barrier tile_barrier; if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED { auto const fetch_tile = tile_infos[blockIdx.x]; @@ -973,13 +917,7 @@ CUDF_KERNEL void copy_from_rows(const size_type num_rows, auto dst = &shared[shared_offset]; auto src = &input_data[row_offsets(absolute_row, row_batch_start) + starting_col_offset]; // copy the data -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, dst, src, fetch_tile_row_size, tile_barrier); -#else - for (int b = warp.thread_rank(); b < fetch_tile_row_size; b += warp.size()) { - dst[b] = src[b]; - } -#endif } } @@ -989,12 +927,8 @@ CUDF_KERNEL void copy_from_rows(const size_type num_rows, auto const cols_in_tile = tile.num_cols(); auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); -#ifdef ASYNC_MEMCPY_SUPPORTED // ensure our data is ready tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED // Now we copy from shared memory to final destination. The data is laid out in rows in shared // memory, so the reads for a column will be "vertical". 
Because of this and the different sizes @@ -1017,17 +951,13 @@ CUDF_KERNEL void copy_from_rows(const size_type num_rows, int8_t* shmem_src = &shared[shared_memory_offset]; int8_t* dst = &output_data[absolute_col][absolute_row * column_size]; - MEMCPY(dst, shmem_src, column_size, tile_barrier); + cuda::memcpy_async(dst, shmem_src, column_size, tile_barrier); } } } -#ifdef ASYNC_MEMCPY_SUPPORTED // wait on the last copies to complete tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -1077,12 +1007,10 @@ CUDF_KERNEL void copy_validity_from_rows(const size_type num_rows, auto const group = cooperative_groups::this_thread_block(); auto const warp = cooperative_groups::tiled_partition(group); -#ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. __shared__ cuda::barrier shared_tile_barrier; if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED auto const tile = tile_infos[blockIdx.x]; auto const tile_start_col = tile.start_col; @@ -1147,22 +1075,12 @@ CUDF_KERNEL void copy_validity_from_rows(const size_type num_rows, auto const src = reinterpret_cast(&shared[validity_data_col_length * relative_col]); -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async( - warp, dst, src, aligned_size_t<4>(validity_data_col_length), shared_tile_barrier); -#else - for (int b = warp.thread_rank(); b < col_words; b += warp.size()) { - dst[b] = src[b]; - } -#endif + warp, dst, src, cuda::aligned_size_t<4>(validity_data_col_length), shared_tile_barrier); } -#ifdef ASYNC_MEMCPY_SUPPORTED // wait for tile of data to arrive shared_tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -1193,9 +1111,7 @@ CUDF_KERNEL void copy_strings_from_rows(RowOffsetFunctor row_offsets, // Traversing in row-major order to coalesce the offsets and size reads. auto my_block = cooperative_groups::this_thread_block(); auto warp = cooperative_groups::tiled_partition(my_block); -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::barrier block_barrier; -#endif // workaround for not being able to take a reference to a constexpr host variable auto const ROWS_PER_BLOCK = NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS; @@ -1216,13 +1132,7 @@ CUDF_KERNEL void copy_strings_from_rows(RowOffsetFunctor row_offsets, auto const src = &row_data[row_offsets(row, 0) + str_row_off[row]]; auto dst = &str_col_data[str_col_off[row]]; -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, dst, src, str_len[row], block_barrier); -#else - for (int c = warp.thread_rank(); c < str_len[row]; c += warp.size()) { - dst[c] = src[c]; - } -#endif } } }
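
A note on the `agent_dfa.cuh` hunk: `__CUDA_ARCH__` is only defined during NVCC's device compilation pass, so `#if defined(__CUDA_ARCH__)` states the intent directly; the old `#if __CUDA_ARCH__ > 0` worked only because an undefined macro evaluates to 0 in preprocessor arithmetic. A minimal sketch of the host/device split (example code, not from this patch; the function names are illustrative):

#include <cstdio>

__host__ __device__ void where_am_i()  // illustrative name, not a cudf API
{
#if defined(__CUDA_ARCH__)
  // Device pass: __CUDA_ARCH__ expands to the target architecture,
  // e.g. 700 on Volta, the new required minimum.
  printf("device pass, __CUDA_ARCH__ = %d\n", __CUDA_ARCH__);
#else
  printf("host pass\n");
#endif
}

__global__ void probe() { where_am_i(); }

int main()
{
  where_am_i();       // compiled by the host pass: prints "host pass"
  probe<<<1, 1>>>();  // device pass: prints the architecture, e.g. 700
  cudaDeviceSynchronize();
  return 0;
}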
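
The fallback deleted from `snap.cu` emulated `__match_any_sync` with a per-bit ballot loop for pre-Volta architectures; on sm_70 and newer the intrinsic returns, for each lane, the mask of lanes in the warp holding an equal value. A sketch of its behavior (assumes a full, non-divergent warp; the values are made up for illustration):

#include <cstdio>

__global__ void match_demo()
{
  unsigned const lane = threadIdx.x % 32;
  unsigned const v    = lane / 8;  // lanes 0-7 share 0, 8-15 share 1, ...
  // Mask of all lanes whose `v` equals this lane's `v`.
  unsigned const peers = __match_any_sync(0xffffffffu, v);
  if (lane % 8 == 0) { printf("lane %2u: peers = 0x%08x\n", lane, peers); }
}

int main()
{
  // Expected masks: 0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000.
  match_demo<<<1, 32>>>();
  cudaDeviceSynchronize();
  return 0;
}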
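
Most of the lines removed from `row_conversion.cu` were `#else` branches that re-implemented `cuda::memcpy_async` as a per-thread copy loop followed by `group.sync()`. With Volta as the floor, the kernels can rely on the libcu++ barrier pattern unconditionally. A minimal sketch of that pattern (the staging-buffer size and kernel name are illustrative, not taken from cudf):

#include <cooperative_groups.h>

#include <cuda/barrier>

#include <cstdint>

constexpr int tile_bytes = 1024;  // illustrative staging size

__global__ void stage_and_write(int8_t const* src, int8_t* dst)
{
  auto const group = cooperative_groups::this_thread_block();
  __shared__ int8_t staged[tile_bytes];
  __shared__ cuda::barrier<cuda::thread_scope_block> tile_barrier;
  if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); }
  group.sync();

  // All threads cooperatively launch the asynchronous copy to shared memory.
  cuda::memcpy_async(group, staged, src, tile_bytes, tile_barrier);

  // Blocks until the copy has landed: this one call replaces the old
  // fallback's manual element loop plus group.sync().
  tile_barrier.arrive_and_wait();

  for (int i = group.thread_rank(); i < tile_bytes; i += group.size()) {
    dst[i] = staged[i];
  }
}

When the copy size is known to be aligned, wrapping it as the validity kernels do (`cuda::aligned_size_t<4>(...)`) lets the implementation issue wider transactions.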