diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index e60c47fae2b..ef2141ed934 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -92,7 +92,8 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2"))) + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 4a662ed0f43..7599616a0c5 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -128,7 +128,8 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2"))) + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request script: "ci/build_wheel_dask_cudf.sh" wheel-tests-dask-cudf: @@ -136,7 +137,8 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2"))) + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request script: ci/test_wheel_dask_cudf.sh devcontainer: @@ -154,7 +156,8 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2"))) + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request script: ci/cudf_pandas_scripts/run_tests.sh # pandas-tests: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index e66b2e1f872..bc5eeb2777b 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -99,7 +99,8 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2"))) + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index 1e3fe3d08dc..6f23abc59a8 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -170,8 +170,6 @@ struct genericAtomicOperationImpl { } }; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600) -// `atomicAdd(double)` is supported after cuda architecture 6.0 template <> struct genericAtomicOperationImpl { using T = double; @@ -180,7 +178,6 @@ struct genericAtomicOperationImpl { return atomicAdd(addr, update_value); } }; -#endif template <> struct genericAtomicOperationImpl { diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index bd53eeddbb5..87cc0f21d0e 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -55,13 +55,8 @@ struct count_accessor { std::enable_if_t, cudf::size_type> operator()(rmm::cuda_stream_view stream) { using ScalarType = cudf::scalar_type_t; -#if 1 - // TODO: temporary till cudf::scalar's value() function is marked as const - auto p_count = const_cast(static_cast(this->p_scalar)); -#else - auto p_count = static_cast(this->p_scalar); -#endif - auto count = p_count->value(stream); + auto p_count = static_cast(this->p_scalar); + auto count = p_count->value(stream); // static_cast is necessary due to bool CUDF_EXPECTS(static_cast(count) <= std::numeric_limits::max(), "count should not exceed the column size limit", diff --git a/cpp/src/hash/managed.cuh b/cpp/src/hash/managed.cuh index aa7bff85ea6..9797c83c47c 100644 --- a/cpp/src/hash/managed.cuh +++ b/cpp/src/hash/managed.cuh @@ -37,9 +37,5 @@ struct managed { inline bool isPtrManaged(cudaPointerAttributes attr) { -#if CUDART_VERSION >= 10000 return (attr.type == cudaMemoryTypeManaged); -#else - return attr.isManaged; -#endif } diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu index 252c96f496a..7d4dcffa713 100644 --- a/cpp/src/io/comp/snap.cu +++ b/cpp/src/io/comp/snap.cu @@ -153,17 +153,7 @@ static __device__ uint8_t* StoreCopy(uint8_t* dst, */ static inline __device__ uint32_t HashMatchAny(uint32_t v, uint32_t t) { -#if (__CUDA_ARCH__ >= 700) return __match_any_sync(~0, v); -#else - uint32_t err_map = 0; - for (uint32_t i = 0; i < hash_bits; i++, v >>= 1) { - uint32_t b = v & 1; - uint32_t match_b = ballot(b); - err_map |= match_b ^ -(int32_t)b; - } - return ~err_map; -#endif } /** diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 9ba8696370a..2171764decd 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -91,7 +91,7 @@ class DFASimulationCallbackWrapper { { uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); if (write) { -#if __CUDA_ARCH__ > 0 +#if defined(__CUDA_ARCH__) #pragma unroll 1 #endif for (uint32_t out_char = 0; out_char < count; out_char++) { diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 506d7b6cddc..b03e0dd452b 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -140,10 +140,11 @@ size_type find_first_delimiter_in_chunk(host_span= source_size); } /** @@ -168,7 +169,7 @@ auto get_record_range_raw_input(host_span> sources, reader_opts.get_byte_range_offset(), reader_opts.get_byte_range_size(), stream); - if (should_load_whole_source(reader_opts)) return buffer; + if (should_load_whole_source(reader_opts, sources[0]->size())) return buffer; auto first_delim_pos = reader_opts.get_byte_range_offset() == 0 ? 0 : find_first_delimiter(buffer, '\n', stream); if (first_delim_pos == -1) { @@ -212,7 +213,7 @@ table_with_metadata read_json(host_span> sources, return legacy::read_json(sources, reader_opts, stream, mr); } - if (not should_load_whole_source(reader_opts)) { + if (reader_opts.get_byte_range_offset() != 0 or reader_opts.get_byte_range_size() != 0) { CUDF_EXPECTS(reader_opts.is_enabled_lines(), "Specifying a byte range is supported only for JSON Lines"); CUDF_EXPECTS(sources.size() == 1, diff --git a/cpp/src/transform/row_conversion.cu b/cpp/src/transform/row_conversion.cu index 32faa097d0e..359e1ccb80d 100644 --- a/cpp/src/transform/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -39,24 +39,14 @@ #include #include +#include +#include #include #include #include #include #include -#include - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 -#define ASYNC_MEMCPY_SUPPORTED -#endif - -#if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) -#include -#endif // #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) - -#include - #include #include #include @@ -65,6 +55,7 @@ #include #include #include +#include namespace { @@ -90,13 +81,6 @@ using detail::make_device_uvector_async; using detail::make_device_uvector_sync; using rmm::device_uvector; -#ifdef ASYNC_MEMCPY_SUPPORTED -using cuda::aligned_size_t; -#else -template -using aligned_size_t = size_t; // Local stub for cuda::aligned_size_t. -#endif // ASYNC_MEMCPY_SUPPORTED - namespace cudf { namespace detail { @@ -569,12 +553,6 @@ CUDF_KERNEL void copy_to_rows_fixed_width_optimized(const size_type start_row, } } -#ifdef ASYNC_MEMCPY_SUPPORTED -#define MEMCPY(dst, src, size, barrier) cuda::memcpy_async(dst, src, size, barrier) -#else -#define MEMCPY(dst, src, size, barrier) memcpy(dst, src, size) -#endif // ASYNC_MEMCPY_SUPPORTED - /** * @brief copy data from cudf columns into JCUDF format, which is row-based * @@ -615,11 +593,9 @@ CUDF_KERNEL void copy_to_rows(const size_type num_rows, auto const warp = cooperative_groups::tiled_partition(group); extern __shared__ int8_t shared_data[]; -#ifdef ASYNC_MEMCPY_SUPPORTED __shared__ cuda::barrier tile_barrier; if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED auto const tile = tile_infos[blockIdx.x]; auto const num_tile_cols = tile.num_cols(); @@ -702,21 +678,11 @@ CUDF_KERNEL void copy_to_rows(const size_type num_rows, auto const src = &shared_data[tile_row_size * copy_row]; auto const dst = tile_output_buffer + row_offsets(copy_row + tile.start_row, row_batch_start) + starting_column_offset; -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, dst, src, tile_row_size, tile_barrier); -#else - for (int b = warp.thread_rank(); b < tile_row_size; b += warp.size()) { - dst[b] = src[b]; - } -#endif } -#ifdef ASYNC_MEMCPY_SUPPORTED // wait on the last copies to complete tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -752,12 +718,10 @@ CUDF_KERNEL void copy_validity_to_rows(const size_type num_rows, auto const group = cooperative_groups::this_thread_block(); auto const warp = cooperative_groups::tiled_partition(group); -#ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. __shared__ cuda::barrier shared_tile_barrier; if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED auto tile = tile_infos[blockIdx.x]; auto const num_tile_cols = tile.num_cols(); @@ -822,21 +786,11 @@ CUDF_KERNEL void copy_validity_to_rows(const size_type num_rows, relative_row += warp.meta_group_size()) { auto const src = &shared_data[validity_data_row_length * relative_row]; auto const dst = output_data_base + row_offsets(relative_row + tile.start_row, row_batch_start); -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, dst, src, row_bytes, shared_tile_barrier); -#else - for (int b = warp.thread_rank(); b < row_bytes; b += warp.size()) { - dst[b] = src[b]; - } -#endif } -#ifdef ASYNC_MEMCPY_SUPPORTED // wait for tile of data to arrive shared_tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -871,9 +825,7 @@ CUDF_KERNEL void copy_strings_to_rows(size_type const num_rows, // memcpy of the string data. auto const my_block = cooperative_groups::this_thread_block(); auto const warp = cooperative_groups::tiled_partition(my_block); -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::barrier block_barrier; -#endif auto const start_row = blockIdx.x * NUM_STRING_ROWS_PER_BLOCK_TO_ROWS + warp.meta_group_rank() + batch_row_offset; @@ -896,13 +848,7 @@ CUDF_KERNEL void copy_strings_to_rows(size_type const num_rows, auto string_output_dest = &output_data[base_row_offset + offset]; auto string_output_src = &variable_input_data[col][string_start_offset]; warp.sync(); -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, string_output_dest, string_output_src, string_length, block_barrier); -#else - for (int c = warp.thread_rank(); c < string_length; c += warp.size()) { - string_output_dest[c] = string_output_src[c]; - } -#endif offset += string_length; } } @@ -950,12 +896,10 @@ CUDF_KERNEL void copy_from_rows(const size_type num_rows, auto const warp = cooperative_groups::tiled_partition(group); extern __shared__ int8_t shared[]; -#ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. __shared__ cuda::barrier tile_barrier; if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED { auto const fetch_tile = tile_infos[blockIdx.x]; @@ -973,13 +917,7 @@ CUDF_KERNEL void copy_from_rows(const size_type num_rows, auto dst = &shared[shared_offset]; auto src = &input_data[row_offsets(absolute_row, row_batch_start) + starting_col_offset]; // copy the data -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, dst, src, fetch_tile_row_size, tile_barrier); -#else - for (int b = warp.thread_rank(); b < fetch_tile_row_size; b += warp.size()) { - dst[b] = src[b]; - } -#endif } } @@ -989,12 +927,8 @@ CUDF_KERNEL void copy_from_rows(const size_type num_rows, auto const cols_in_tile = tile.num_cols(); auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); -#ifdef ASYNC_MEMCPY_SUPPORTED // ensure our data is ready tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED // Now we copy from shared memory to final destination. The data is laid out in rows in shared // memory, so the reads for a column will be "vertical". Because of this and the different sizes @@ -1017,17 +951,13 @@ CUDF_KERNEL void copy_from_rows(const size_type num_rows, int8_t* shmem_src = &shared[shared_memory_offset]; int8_t* dst = &output_data[absolute_col][absolute_row * column_size]; - MEMCPY(dst, shmem_src, column_size, tile_barrier); + cuda::memcpy_async(dst, shmem_src, column_size, tile_barrier); } } } -#ifdef ASYNC_MEMCPY_SUPPORTED // wait on the last copies to complete tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -1077,12 +1007,10 @@ CUDF_KERNEL void copy_validity_from_rows(const size_type num_rows, auto const group = cooperative_groups::this_thread_block(); auto const warp = cooperative_groups::tiled_partition(group); -#ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. __shared__ cuda::barrier shared_tile_barrier; if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED auto const tile = tile_infos[blockIdx.x]; auto const tile_start_col = tile.start_col; @@ -1147,22 +1075,12 @@ CUDF_KERNEL void copy_validity_from_rows(const size_type num_rows, auto const src = reinterpret_cast(&shared[validity_data_col_length * relative_col]); -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async( - warp, dst, src, aligned_size_t<4>(validity_data_col_length), shared_tile_barrier); -#else - for (int b = warp.thread_rank(); b < col_words; b += warp.size()) { - dst[b] = src[b]; - } -#endif + warp, dst, src, cuda::aligned_size_t<4>(validity_data_col_length), shared_tile_barrier); } -#ifdef ASYNC_MEMCPY_SUPPORTED // wait for tile of data to arrive shared_tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -1193,9 +1111,7 @@ CUDF_KERNEL void copy_strings_from_rows(RowOffsetFunctor row_offsets, // Traversing in row-major order to coalesce the offsets and size reads. auto my_block = cooperative_groups::this_thread_block(); auto warp = cooperative_groups::tiled_partition(my_block); -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::barrier block_barrier; -#endif // workaround for not being able to take a reference to a constexpr host variable auto const ROWS_PER_BLOCK = NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS; @@ -1216,13 +1132,7 @@ CUDF_KERNEL void copy_strings_from_rows(RowOffsetFunctor row_offsets, auto const src = &row_data[row_offsets(row, 0) + str_row_off[row]]; auto dst = &str_col_data[str_col_off[row]]; -#ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, dst, src, str_len[row], block_barrier); -#else - for (int c = warp.thread_rank(); c < str_len[row]; c += warp.size()) { - dst[c] = src[c]; - } -#endif } } }