Clean up usage of __CUDA_ARCH__ and other macros. (#15218)
Closes #15030.

This PR cleans up references to `__CUDA_ARCH__` and other macros.

- We can safely drop Pascal support now that the required minimum architecture is Volta (`__CUDA_ARCH__` of 700); a brief sketch of why these guards are now dead follows this list.
- Removed a leftover reference to CUDA 10.
- Removed an instance of `#if 1` that was no longer needed.
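
As a hedged illustration of the first point: with every device pass now compiling for Volta or newer, an architecture guard below the floor can never be false, so the guarded and unguarded paths are identical. A minimal sketch (not cudf code; the `arch_floor_check` kernel is hypothetical):

```cuda
// Minimal sketch, not cudf code: with a Volta-or-newer build floor,
// __CUDA_ARCH__ is at least 700 in every device pass, so guards such as
// `#if __CUDA_ARCH__ >= 600` are always taken and can simply be dropped.
__global__ void arch_floor_check()
{
#if defined(__CUDA_ARCH__)
  static_assert(__CUDA_ARCH__ >= 700, "this sketch assumes Volta (sm_70) or newer");
#endif
}
```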

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Michael Schellenberger Costa (https://github.com/miscco)
  - Nghia Truong (https://github.com/ttnghia)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: #15218
bdice authored Mar 5, 2024
1 parent 2d1e3c7 commit b60bf18
Showing 6 changed files with 8 additions and 120 deletions.
3 changes: 0 additions & 3 deletions cpp/include/cudf/detail/utilities/device_atomics.cuh
@@ -170,8 +170,6 @@ struct genericAtomicOperationImpl<float, DeviceSum, 4> {
}
};

#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
// `atomicAdd(double)` is supported after cuda architecture 6.0
template <>
struct genericAtomicOperationImpl<double, DeviceSum, 8> {
using T = double;
@@ -180,7 +178,6 @@ struct genericAtomicOperationImpl<double, DeviceSum, 8> {
return atomicAdd(addr, update_value);
}
};
#endif

template <>
struct genericAtomicOperationImpl<int32_t, DeviceSum, 4> {
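
For context on the hunk above, a minimal standalone sketch (not the cudf `genericAtomicOperationImpl` specialization) of the `atomicAdd(double*, double)` overload whose guard was removed; the hardware double-precision atomic exists from sm_60 onward, so it is unconditionally available on a Volta-or-newer floor:

```cuda
#include <cstddef>

// Minimal sketch, not cudf code: atomicAdd on double is provided by the
// hardware from sm_60 onward, so no __CUDA_ARCH__ guard is needed once the
// minimum supported architecture is sm_70.
__global__ void sum_kernel(double* sum, double const* values, std::size_t n)
{
  std::size_t const i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { atomicAdd(sum, values[i]); }
}
```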
9 changes: 2 additions & 7 deletions cpp/src/filling/repeat.cu
@@ -55,13 +55,8 @@ struct count_accessor {
std::enable_if_t<std::is_integral_v<T>, cudf::size_type> operator()(rmm::cuda_stream_view stream)
{
using ScalarType = cudf::scalar_type_t<T>;
#if 1
// TODO: temporary till cudf::scalar's value() function is marked as const
auto p_count = const_cast<ScalarType*>(static_cast<ScalarType const*>(this->p_scalar));
#else
auto p_count = static_cast<ScalarType const*>(this->p_scalar);
#endif
auto count = p_count->value(stream);
auto p_count = static_cast<ScalarType const*>(this->p_scalar);
auto count = p_count->value(stream);
// static_cast is necessary due to bool
CUDF_EXPECTS(static_cast<int64_t>(count) <= std::numeric_limits<cudf::size_type>::max(),
"count should not exceed the column size limit",
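
The `#if 1` block above existed only to cast away constness before `cudf::scalar`'s `value()` was const-qualified. A small sketch of the general pattern, using a hypothetical `scalar_like` type rather than the real cudf class:

```cuda
// Sketch with a hypothetical scalar_like type, not cudf::scalar: once the
// accessor is const-qualified, the const_cast workaround is unnecessary.
struct scalar_like {
  int value() const { return v; }  // stand-in for scalar_type_t<T>::value(stream)
  int v{};
};

int read_count(scalar_like const* p_scalar)
{
  // Old workaround: auto* p = const_cast<scalar_like*>(p_scalar);
  return p_scalar->value();  // callable directly through a pointer-to-const
}
```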
4 changes: 0 additions & 4 deletions cpp/src/hash/managed.cuh
@@ -37,9 +37,5 @@ struct managed {

inline bool isPtrManaged(cudaPointerAttributes attr)
{
#if CUDART_VERSION >= 10000
return (attr.type == cudaMemoryTypeManaged);
#else
return attr.isManaged;
#endif
}
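
The removed branch targeted CUDA runtimes older than 10.0, where `cudaPointerAttributes` still exposed an `isManaged` field. A minimal standalone sketch of the modern query kept by this change (not cudf code; `is_managed_ptr` is a hypothetical helper):

```cuda
#include <cuda_runtime.h>

// Minimal sketch, not cudf code: since CUDA 10 the memory kind is reported
// through cudaPointerAttributes::type, so the isManaged fallback is obsolete.
bool is_managed_ptr(void const* ptr)
{
  cudaPointerAttributes attr{};
  if (cudaPointerGetAttributes(&attr, ptr) != cudaSuccess) { return false; }
  return attr.type == cudaMemoryTypeManaged;
}
```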
10 changes: 0 additions & 10 deletions cpp/src/io/comp/snap.cu
@@ -153,17 +153,7 @@ static __device__ uint8_t* StoreCopy(uint8_t* dst,
*/
static inline __device__ uint32_t HashMatchAny(uint32_t v, uint32_t t)
{
#if (__CUDA_ARCH__ >= 700)
return __match_any_sync(~0, v);
#else
uint32_t err_map = 0;
for (uint32_t i = 0; i < hash_bits; i++, v >>= 1) {
uint32_t b = v & 1;
uint32_t match_b = ballot(b);
err_map |= match_b ^ -(int32_t)b;
}
return ~err_map;
#endif
}

/**
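
`__match_any_sync` is a Volta-and-newer warp intrinsic, which is exactly why the pre-Volta ballot fallback could be deleted. A minimal sketch of its behavior (not cudf code; assumes full 32-thread warps):

```cuda
#include <cstdint>

// Minimal sketch, not cudf code: __match_any_sync returns, for each calling
// thread, the mask of lanes in its warp holding the same value. It requires
// sm_70 or newer, which the build floor now guarantees.
__global__ void match_peers(uint32_t const* values, uint32_t* peer_masks)
{
  uint32_t const lane_value = values[threadIdx.x];
  peer_masks[threadIdx.x]   = __match_any_sync(0xffffffffu, lane_value);
}
```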
2 changes: 1 addition & 1 deletion cpp/src/io/fst/agent_dfa.cuh
@@ -91,7 +91,7 @@ class DFASimulationCallbackWrapper {
{
uint32_t const count = transducer_table(old_state, symbol_id, read_symbol);
if (write) {
#if __CUDA_ARCH__ > 0
#if defined(__CUDA_ARCH__)
#pragma unroll 1
#endif
for (uint32_t out_char = 0; out_char < count; out_char++) {
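
`#if __CUDA_ARCH__ > 0` and `#if defined(__CUDA_ARCH__)` select the same code, since an undefined macro evaluates to 0 inside `#if`; the new spelling simply states the intent. A small sketch of the idiom (not the cudf kernel; `copy_chars` is hypothetical):

```cuda
// Sketch, not cudf code: the unroll pragma only matters to the device
// compiler, so it is emitted only when __CUDA_ARCH__ is defined; the host
// pass of the same translation unit just sees a plain loop.
__device__ void copy_chars(char* out, char const* in, unsigned int count)
{
#if defined(__CUDA_ARCH__)
#pragma unroll 1
#endif
  for (unsigned int i = 0; i < count; ++i) {
    out[i] = in[i];
  }
}
```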
100 changes: 5 additions & 95 deletions cpp/src/transform/row_conversion.cu
@@ -39,24 +39,14 @@
#include <rmm/exec_policy.hpp>

#include <cooperative_groups.h>
#include <cuda/barrier>
#include <cuda/functional>
#include <thrust/binary_search.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/scan.h>

#include <type_traits>

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
#define ASYNC_MEMCPY_SUPPORTED
#endif

#if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED)
#include <cuda/barrier>
#endif // #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED)

#include <cuda/functional>

#include <algorithm>
#include <cstdarg>
#include <cstdint>
@@ -65,6 +55,7 @@
#include <limits>
#include <optional>
#include <tuple>
#include <type_traits>

namespace {

@@ -90,13 +81,6 @@ using detail::make_device_uvector_async;
using detail::make_device_uvector_sync;
using rmm::device_uvector;

#ifdef ASYNC_MEMCPY_SUPPORTED
using cuda::aligned_size_t;
#else
template <std::size_t>
using aligned_size_t = size_t; // Local stub for cuda::aligned_size_t.
#endif // ASYNC_MEMCPY_SUPPORTED

namespace cudf {
namespace detail {

@@ -569,12 +553,6 @@ CUDF_KERNEL void copy_to_rows_fixed_width_optimized(const size_type start_row,
}
}

#ifdef ASYNC_MEMCPY_SUPPORTED
#define MEMCPY(dst, src, size, barrier) cuda::memcpy_async(dst, src, size, barrier)
#else
#define MEMCPY(dst, src, size, barrier) memcpy(dst, src, size)
#endif // ASYNC_MEMCPY_SUPPORTED

/**
* @brief copy data from cudf columns into JCUDF format, which is row-based
*
@@ -615,11 +593,9 @@ CUDF_KERNEL void copy_to_rows(const size_type num_rows,
auto const warp = cooperative_groups::tiled_partition<cudf::detail::warp_size>(group);
extern __shared__ int8_t shared_data[];

#ifdef ASYNC_MEMCPY_SUPPORTED
__shared__ cuda::barrier<cuda::thread_scope_block> tile_barrier;
if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); }
group.sync();
#endif // ASYNC_MEMCPY_SUPPORTED

auto const tile = tile_infos[blockIdx.x];
auto const num_tile_cols = tile.num_cols();
@@ -702,21 +678,11 @@ CUDF_KERNEL void copy_to_rows(const size_type num_rows,
auto const src = &shared_data[tile_row_size * copy_row];
auto const dst = tile_output_buffer + row_offsets(copy_row + tile.start_row, row_batch_start) +
starting_column_offset;
#ifdef ASYNC_MEMCPY_SUPPORTED
cuda::memcpy_async(warp, dst, src, tile_row_size, tile_barrier);
#else
for (int b = warp.thread_rank(); b < tile_row_size; b += warp.size()) {
dst[b] = src[b];
}
#endif
}

#ifdef ASYNC_MEMCPY_SUPPORTED
// wait on the last copies to complete
tile_barrier.arrive_and_wait();
#else
group.sync();
#endif // ASYNC_MEMCPY_SUPPORTED
}

/**
@@ -752,12 +718,10 @@ CUDF_KERNEL void copy_validity_to_rows(const size_type num_rows,
auto const group = cooperative_groups::this_thread_block();
auto const warp = cooperative_groups::tiled_partition<cudf::detail::warp_size>(group);

#ifdef ASYNC_MEMCPY_SUPPORTED
// Initialize cuda barriers for each tile.
__shared__ cuda::barrier<cuda::thread_scope_block> shared_tile_barrier;
if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); }
group.sync();
#endif // ASYNC_MEMCPY_SUPPORTED

auto tile = tile_infos[blockIdx.x];
auto const num_tile_cols = tile.num_cols();
@@ -822,21 +786,11 @@ CUDF_KERNEL void copy_validity_to_rows(const size_type num_rows,
relative_row += warp.meta_group_size()) {
auto const src = &shared_data[validity_data_row_length * relative_row];
auto const dst = output_data_base + row_offsets(relative_row + tile.start_row, row_batch_start);
#ifdef ASYNC_MEMCPY_SUPPORTED
cuda::memcpy_async(warp, dst, src, row_bytes, shared_tile_barrier);
#else
for (int b = warp.thread_rank(); b < row_bytes; b += warp.size()) {
dst[b] = src[b];
}
#endif
}

#ifdef ASYNC_MEMCPY_SUPPORTED
// wait for tile of data to arrive
shared_tile_barrier.arrive_and_wait();
#else
group.sync();
#endif // ASYNC_MEMCPY_SUPPORTED
}

/**
@@ -871,9 +825,7 @@ CUDF_KERNEL void copy_strings_to_rows(size_type const num_rows,
// memcpy of the string data.
auto const my_block = cooperative_groups::this_thread_block();
auto const warp = cooperative_groups::tiled_partition<cudf::detail::warp_size>(my_block);
#ifdef ASYNC_MEMCPY_SUPPORTED
cuda::barrier<cuda::thread_scope_block> block_barrier;
#endif

auto const start_row =
blockIdx.x * NUM_STRING_ROWS_PER_BLOCK_TO_ROWS + warp.meta_group_rank() + batch_row_offset;
@@ -896,13 +848,7 @@ CUDF_KERNEL void copy_strings_to_rows(size_type const num_rows,
auto string_output_dest = &output_data[base_row_offset + offset];
auto string_output_src = &variable_input_data[col][string_start_offset];
warp.sync();
#ifdef ASYNC_MEMCPY_SUPPORTED
cuda::memcpy_async(warp, string_output_dest, string_output_src, string_length, block_barrier);
#else
for (int c = warp.thread_rank(); c < string_length; c += warp.size()) {
string_output_dest[c] = string_output_src[c];
}
#endif
offset += string_length;
}
}
@@ -950,12 +896,10 @@ CUDF_KERNEL void copy_from_rows(const size_type num_rows,
auto const warp = cooperative_groups::tiled_partition<cudf::detail::warp_size>(group);
extern __shared__ int8_t shared[];

#ifdef ASYNC_MEMCPY_SUPPORTED
// Initialize cuda barriers for each tile.
__shared__ cuda::barrier<cuda::thread_scope_block> tile_barrier;
if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); }
group.sync();
#endif // ASYNC_MEMCPY_SUPPORTED

{
auto const fetch_tile = tile_infos[blockIdx.x];
@@ -973,13 +917,7 @@ CUDF_KERNEL void copy_from_rows(const size_type num_rows,
auto dst = &shared[shared_offset];
auto src = &input_data[row_offsets(absolute_row, row_batch_start) + starting_col_offset];
// copy the data
#ifdef ASYNC_MEMCPY_SUPPORTED
cuda::memcpy_async(warp, dst, src, fetch_tile_row_size, tile_barrier);
#else
for (int b = warp.thread_rank(); b < fetch_tile_row_size; b += warp.size()) {
dst[b] = src[b];
}
#endif
}
}

@@ -989,12 +927,8 @@ CUDF_KERNEL void copy_from_rows(const size_type num_rows,
auto const cols_in_tile = tile.num_cols();
auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes);

#ifdef ASYNC_MEMCPY_SUPPORTED
// ensure our data is ready
tile_barrier.arrive_and_wait();
#else
group.sync();
#endif // ASYNC_MEMCPY_SUPPORTED

// Now we copy from shared memory to final destination. The data is laid out in rows in shared
// memory, so the reads for a column will be "vertical". Because of this and the different sizes
@@ -1017,17 +951,13 @@ CUDF_KERNEL void copy_from_rows(const size_type num_rows,
int8_t* shmem_src = &shared[shared_memory_offset];
int8_t* dst = &output_data[absolute_col][absolute_row * column_size];

MEMCPY(dst, shmem_src, column_size, tile_barrier);
cuda::memcpy_async(dst, shmem_src, column_size, tile_barrier);
}
}
}

#ifdef ASYNC_MEMCPY_SUPPORTED
// wait on the last copies to complete
tile_barrier.arrive_and_wait();
#else
group.sync();
#endif // ASYNC_MEMCPY_SUPPORTED
}

/**
@@ -1077,12 +1007,10 @@ CUDF_KERNEL void copy_validity_from_rows(const size_type num_rows,
auto const group = cooperative_groups::this_thread_block();
auto const warp = cooperative_groups::tiled_partition<cudf::detail::warp_size>(group);

#ifdef ASYNC_MEMCPY_SUPPORTED
// Initialize cuda barriers for each tile.
__shared__ cuda::barrier<cuda::thread_scope_block> shared_tile_barrier;
if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); }
group.sync();
#endif // ASYNC_MEMCPY_SUPPORTED

auto const tile = tile_infos[blockIdx.x];
auto const tile_start_col = tile.start_col;
@@ -1147,22 +1075,12 @@ CUDF_KERNEL void copy_validity_from_rows(const size_type num_rows,
auto const src =
reinterpret_cast<bitmask_type*>(&shared[validity_data_col_length * relative_col]);

#ifdef ASYNC_MEMCPY_SUPPORTED
cuda::memcpy_async(
warp, dst, src, aligned_size_t<4>(validity_data_col_length), shared_tile_barrier);
#else
for (int b = warp.thread_rank(); b < col_words; b += warp.size()) {
dst[b] = src[b];
}
#endif
warp, dst, src, cuda::aligned_size_t<4>(validity_data_col_length), shared_tile_barrier);
}

#ifdef ASYNC_MEMCPY_SUPPORTED
// wait for tile of data to arrive
shared_tile_barrier.arrive_and_wait();
#else
group.sync();
#endif // ASYNC_MEMCPY_SUPPORTED
}

/**
@@ -1193,9 +1111,7 @@ CUDF_KERNEL void copy_strings_from_rows(RowOffsetFunctor row_offsets,
// Traversing in row-major order to coalesce the offsets and size reads.
auto my_block = cooperative_groups::this_thread_block();
auto warp = cooperative_groups::tiled_partition<cudf::detail::warp_size>(my_block);
#ifdef ASYNC_MEMCPY_SUPPORTED
cuda::barrier<cuda::thread_scope_block> block_barrier;
#endif

// workaround for not being able to take a reference to a constexpr host variable
auto const ROWS_PER_BLOCK = NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS;
@@ -1216,13 +1132,7 @@ CUDF_KERNEL void copy_strings_from_rows(RowOffsetFunctor row_offsets,
auto const src = &row_data[row_offsets(row, 0) + str_row_off[row]];
auto dst = &str_col_data[str_col_off[row]];

#ifdef ASYNC_MEMCPY_SUPPORTED
cuda::memcpy_async(warp, dst, src, str_len[row], block_barrier);
#else
for (int c = warp.thread_rank(); c < str_len[row]; c += warp.size()) {
dst[c] = src[c];
}
#endif
}
}
}
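
With `ASYNC_MEMCPY_SUPPORTED` gone, the kernels above rely unconditionally on the libcu++ block-scoped barrier plus `cuda::memcpy_async`. A minimal standalone sketch of that staging pattern (not one of the cudf kernels; `stage_through_shared` is hypothetical):

```cuda
#include <cooperative_groups.h>
#include <cuda/barrier>
#include <cstddef>
#include <cstdint>

// Minimal sketch, not cudf code: stage bytes into dynamic shared memory with
// cuda::memcpy_async, wait on a block-scoped barrier, then write them out.
__global__ void stage_through_shared(int8_t* dst, int8_t const* src, std::size_t bytes)
{
  extern __shared__ int8_t staging[];
  auto const block = cooperative_groups::this_thread_block();

  __shared__ cuda::barrier<cuda::thread_scope_block> barrier;
  if (block.thread_rank() == 0) { init(&barrier, block.size()); }
  block.sync();

  // Asynchronously copy the input into shared memory and wait for it to land.
  cuda::memcpy_async(block, staging, src, bytes, barrier);
  barrier.arrive_and_wait();

  // Stand-in for the real row/column shuffle: copy the staged bytes back out.
  for (std::size_t i = block.thread_rank(); i < bytes; i += block.size()) {
    dst[i] = staging[i];
  }
}
```

A launch would pass the staging size as dynamic shared memory, e.g. `stage_through_shared<<<1, 256, bytes>>>(dst, src, bytes)`.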
