From 5cf4c164224805b542f5d8411034574858224e64 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 2 Feb 2024 21:37:52 -0600 Subject: [PATCH 1/6] Refactor atomicAdd to cudf::detail::atomic_add. --- .../cudf/detail/aggregation/aggregation.cuh | 16 +++---- cpp/include/cudf/detail/copy_if_else.cuh | 3 +- cpp/include/cudf/detail/copy_range.cuh | 3 +- .../cudf/detail/labeling/label_segments.cuh | 5 +- cpp/include/cudf/detail/null_mask.cuh | 3 +- .../cudf/detail/utilities/device_atomics.cuh | 47 +++++++++++++------ cpp/include/cudf/detail/valid_if.cuh | 5 +- cpp/src/bitmask/null_mask.cu | 3 +- cpp/src/copying/concatenate.cu | 5 +- cpp/src/groupby/sort/group_quantiles.cu | 5 +- cpp/src/groupby/sort/group_std.cu | 5 +- cpp/src/io/avro/avro_gpu.cu | 4 +- cpp/src/io/csv/csv_gpu.cu | 19 ++++---- cpp/src/io/json/legacy/json_gpu.cu | 29 ++++++------ cpp/src/io/orc/stripe_data.cu | 14 ++++-- cpp/src/io/parquet/chunk_dict.cu | 5 +- cpp/src/io/utilities/data_casting.cu | 17 +++---- cpp/src/io/utilities/parsing_utils.cu | 3 +- cpp/src/io/utilities/type_inference.cu | 21 +++++---- cpp/src/join/join_common_utils.cuh | 9 ++-- cpp/src/json/json_path.cu | 3 +- cpp/src/partitioning/partitioning.cu | 9 ++-- cpp/src/replace/nulls.cu | 9 +++- cpp/src/replace/replace.cu | 9 +++- cpp/src/rolling/detail/rolling.cuh | 3 +- cpp/src/stream_compaction/distinct_helpers.cu | 5 +- cpp/src/strings/copying/concatenate.cu | 3 +- cpp/src/strings/replace/multi.cu | 3 +- cpp/src/transform/row_conversion.cu | 3 +- .../device_atomics/device_atomics_test.cu | 2 +- 30 files changed, 165 insertions(+), 105 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index f13166d5321..968fc83e763 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -242,8 +242,8 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
     using Target = target_type_t<Source, aggregation::SUM>;
-    atomicAdd(&target.element<Target>(target_index),
-              static_cast<Target>(source.element<Source>(source_index)));
+    cudf::detail::atomic_add(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -268,8 +268,8 @@ struct update_target_element<
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    atomicAdd(&target.element<DeviceTarget>(target_index),
-              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    cudf::detail::atomic_add(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -368,7 +368,7 @@ struct update_target_element<
 
     using Target = target_type_t<Source, aggregation::SUM_OF_SQUARES>;
     auto value   = static_cast<Target>(source.element<Source>(source_index));
-    atomicAdd(&target.element<Target>(target_index), value * value);
+    cudf::detail::atomic_add(&target.element<Target>(target_index), value * value);
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
 };
@@ -408,7 +408,7 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
     using Target = target_type_t<Source, aggregation::COUNT_VALID>;
-    atomicAdd(&target.element<Target>(target_index), Target{1});
+    cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});
 
     // It is assumed the output for COUNT_VALID is initialized to be all valid
   }
@@ -427,7 +427,7 @@ struct update_target_element<
                              size_type source_index) const noexcept
   {
     using Target = target_type_t<Source, aggregation::COUNT_ALL>;
-    atomicAdd(&target.element<Target>(target_index), Target{1});
+    cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});
 
     // It is assumed the output for COUNT_ALL is initialized to be all valid
   }
diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh
index 6162fa5ecf1..65b453e0bd1 100644
--- a/cpp/include/cudf/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/detail/copy_if_else.cuh
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include <cudf/detail/utilities/device_atomics.cuh>
 
 #include 
 #include 
@@ -94,7 +95,7 @@ __launch_bounds__(block_size) CUDF_KERNEL
     // block_valid_count will only be valid on thread 0
     if (threadIdx.x == 0) {
       // using an atomic here because there are multiple blocks doing this work
-      atomicAdd(valid_count, block_valid_count);
+      cudf::detail::atomic_add(valid_count, block_valid_count);
     }
   }
 }
diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh
index 4bfdaa94c53..192a60cf9b3 100644
--- a/cpp/include/cudf/detail/copy_range.cuh
+++ b/cpp/include/cudf/detail/copy_range.cuh
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include <cudf/detail/utilities/device_atomics.cuh>
 #include 
 #include 
 #include 
@@ -100,7 +101,7 @@ CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin,
       auto block_null_change =
         cudf::detail::single_lane_block_sum_reduce(warp_null_change);
       if (threadIdx.x == 0) {  // if the first thread in a block
-        atomicAdd(null_count, block_null_change);
+        cudf::detail::atomic_add(null_count, block_null_change);
       }
     }
   }
diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh
index 9051230a272..405d111ab35 100644
--- a/cpp/include/cudf/detail/labeling/label_segments.cuh
+++ b/cpp/include/cudf/detail/labeling/label_segments.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -103,7 +104,7 @@ void label_segments(InputIterator offsets_begin, // In case we have repeated offsets (i.e., we have empty segments), this `atomicAdd` call will // make sure the label values corresponding to these empty segments will be skipped in the // output. - if (dst_idx < num_labels) { atomicAdd(&output[dst_idx], OutputType{1}); } + if (dst_idx < num_labels) { cudf::detail::atomic_add(&output[dst_idx], OutputType{1}); } }); thrust::inclusive_scan(rmm::exec_policy(stream), label_begin, label_end, label_begin); } diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index e57d85f2998..3113b991a79 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -107,7 +108,7 @@ CUDF_KERNEL void offset_bitmask_binop(Binop op, __shared__ typename BlockReduce::TempStorage temp_storage; size_type block_count = BlockReduce(temp_storage).Sum(thread_count); - if (threadIdx.x == 0) { atomicAdd(count_ptr, block_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(count_ptr, block_count); } } /** diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index c56e88f07a8..3a5108a3503 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -158,8 +158,8 @@ struct genericAtomicOperationImpl { // ----------------------------------------------------------------------- // specialized functions for operators -// `atomicAdd` supports int32, float, double (signed int64 is not supported.) -// `atomicMin`, `atomicMax` support int32_t, int64_t +// `atomicAdd` supports int32_t, uint32_t, uint64_t, float, double (signed int64_t is not +// supported.) 
+// `atomicMin`, `atomicMax` support int32_t, uint32_t, uint64_t, int64_t
 template <>
 struct genericAtomicOperationImpl<float, DeviceSum> {
   using T = float;
@@ -169,8 +169,6 @@ struct genericAtomicOperationImpl<float, DeviceSum> {
   }
 };
 
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
-// `atomicAdd(double)` is supported after cuda architecture 6.0
 template <>
 struct genericAtomicOperationImpl<double, DeviceSum> {
   using T = double;
@@ -179,7 +177,27 @@ struct genericAtomicOperationImpl<double, DeviceSum> {
     return atomicAdd(addr, update_value);
   }
 };
-#endif
+
+template <>
+struct genericAtomicOperationImpl<uint32_t, DeviceSum> {
+  using T = uint32_t;
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op)
+  {
+    return atomicAdd(addr, update_value);
+  }
+};
+
+template <>
+struct genericAtomicOperationImpl<uint64_t, DeviceSum> {
+  using T = uint64_t;
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op)
+  {
+    using T_int = unsigned long long int;
+    static_assert(sizeof(T) == sizeof(T_int));
+    T ret = atomicAdd(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
+    return ret;
+  }
+};
 
 template <>
 struct genericAtomicOperationImpl<int32_t, DeviceSum> {
@@ -190,9 +208,9 @@ struct genericAtomicOperationImpl<int32_t, DeviceSum> {
   }
 };
 
-// Cuda natively supports `unsigned long long int` for `atomicAdd`,
-// but doesn't supports `signed long long int`.
-// However, since the signed integer is represented as Two's complement,
+// CUDA natively supports `unsigned long long int` for `atomicAdd`,
+// but doesn't support `signed long long int`.
+// However, since the signed integer is represented as two's complement,
 // the fundamental arithmetic operations of addition are identical to
 // those for unsigned binary numbers.
 // Then, this computes as `unsigned long long int` with `atomicAdd`
@@ -410,15 +428,14 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address,
   return T(fun(address, update_value, op));
 }
 
-}  // namespace cudf
-
+namespace detail {
 /**
- * @brief Overloads for `atomicAdd`
+ * @brief Overloads for `atomic_add`
  * reads the `old` located at the `address` in global or shared memory,
  * computes (old + val), and stores the result back to memory at the same
  * address. These three operations are performed in one atomic transaction.
* - * The supported cudf types for `atomicAdd` are: + * The supported cudf types for `atomic_add` are: * int8_t, int16_t, int32_t, int64_t, float, double, * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms cudf::timestamp_us, * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms, @@ -434,10 +451,12 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address, * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicAdd(T* address, T val) +__forceinline__ __device__ T atomic_add(T* address, T val) { return cudf::genericAtomicOperation(address, val, cudf::DeviceSum{}); } +} // namespace detail +} // namespace cudf /** * @brief Overloads for `atomicMul` diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index d0073177445..453a072fece 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -65,7 +66,7 @@ CUDF_KERNEL void valid_if_kernel( } size_type block_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { atomicAdd(valid_count, block_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(valid_count, block_count); } } /** @@ -183,7 +184,7 @@ CUDF_KERNEL void valid_if_n_kernel(InputIterator1 begin1, auto block_valid_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { atomicAdd(valid_counts + mask_idx, block_valid_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(valid_counts + mask_idx, block_valid_count); } } } diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index bb320e4b81a..52f5b0108a4 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -305,7 +306,7 @@ CUDF_KERNEL void count_set_bits_kernel(bitmask_type const* bitmask, __shared__ typename BlockReduce::TempStorage temp_storage; size_type block_count{BlockReduce(temp_storage).Sum(thread_count)}; - if (threadIdx.x == 0) { atomicAdd(global_count, block_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(global_count, block_count); } } } // namespace diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index b1d850e0b27..e76dac38f05 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -148,7 +149,7 @@ CUDF_KERNEL void concatenate_masks_kernel(column_device_view const* views, using detail::single_lane_block_sum_reduce; auto const block_valid_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(out_valid_count, block_valid_count); } } } // namespace @@ -233,7 +234,7 @@ CUDF_KERNEL void fused_concatenate_kernel(column_device_view const* input_views, if (Nullable) { using detail::single_lane_block_sum_reduce; auto block_valid_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(out_valid_count, block_valid_count); } } } diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index a456d4b5964..51c03375bb2 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ 
b/cpp/src/groupby/sort/group_quantiles.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -72,7 +73,7 @@ struct calculate_quantile_fn { [d_result = d_result, segment_size, offset, this](size_type j) { if (segment_size == 0) { d_result.set_null(offset + j); - atomicAdd(this->null_count, 1); + cudf::detail::atomic_add(this->null_count, 1); } else { d_result.set_valid(offset + j); } diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index 30b6f67dffe..5b37e31f949 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -150,7 +151,7 @@ struct var_functor { // fact. (1) is more work than it's worth without benchmarking, and // this approach should outperform (2) unless large amounts of the // data is null. - atomicAdd(d_null_count, 1); + cudf::detail::atomic_add(d_null_count, 1); } else { d_result.set_valid(i); } diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index 59177a68ee7..50241c2b0eb 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -17,6 +17,8 @@ #include +#include + #include using cudf::device_span; @@ -145,7 +147,7 @@ avro_decode_row(schemadesc_s const* schema, case type_null: if (dataptr != nullptr && dst_row >= 0) { atomicAnd(static_cast(dataptr) + (dst_row >> 5), ~(1 << (dst_row & 0x1f))); - atomicAdd(&schema_g[i].count, 1); + cudf::detail::atomic_add(&schema_g[i].count, 1U); *skipped_row = false; } break; diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 8252cccbdb9..1653382e375 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -201,12 +202,12 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) // points to last character in the field auto const field_len = static_cast(next_delimiter - field_start); if (serialized_trie_contains(opts.trie_na, {field_start, field_len})) { - atomicAdd(&d_column_data[actual_col].null_count, 1); + cudf::detail::atomic_add(&d_column_data[actual_col].null_count, 1); } else if (serialized_trie_contains(opts.trie_true, {field_start, field_len}) || serialized_trie_contains(opts.trie_false, {field_start, field_len})) { - atomicAdd(&d_column_data[actual_col].bool_count, 1); + cudf::detail::atomic_add(&d_column_data[actual_col].bool_count, 1); } else if (cudf::io::is_infinity(field_start, next_delimiter)) { - atomicAdd(&d_column_data[actual_col].float_count, 1); + cudf::detail::atomic_add(&d_column_data[actual_col].float_count, 1); } else { long count_number = 0; long count_decimal = 0; @@ -261,9 +262,9 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) if (column_flags[col] & column_parse::as_datetime) { // PANDAS uses `object` dtype if the date is unparseable if (is_datetime(count_string, count_decimal, count_colon, count_dash, count_slash)) { - atomicAdd(&d_column_data[actual_col].datetime_count, 1); 
+ cudf::detail::atomic_add(&d_column_data[actual_col].datetime_count, 1); } else { - atomicAdd(&d_column_data[actual_col].string_count, 1); + cudf::detail::atomic_add(&d_column_data[actual_col].string_count, 1); } } else if (count_number == int_req_number_cnt) { auto const is_negative = (*trimmed_field_range.first == '-'); @@ -271,16 +272,16 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) trimmed_field_range.first + (is_negative || (*trimmed_field_range.first == '+')); cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( data_begin, data_begin + count_number, is_negative, d_column_data[actual_col]); - atomicAdd(ptr, 1); + cudf::detail::atomic_add(ptr, 1); } else if (is_floatingpoint(trimmed_field_len, count_number, count_decimal, count_thousands, count_dash + count_plus, count_exponent)) { - atomicAdd(&d_column_data[actual_col].float_count, 1); + cudf::detail::atomic_add(&d_column_data[actual_col].float_count, 1); } else { - atomicAdd(&d_column_data[actual_col].string_count, 1); + cudf::detail::atomic_add(&d_column_data[actual_col].string_count, 1); } } actual_col++; @@ -372,7 +373,7 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) column_flags[col] & column_parse::as_hexadecimal)) { // set the valid bitmap - all bits were set to 0 to start set_bit(valids[actual_col], rec_id); - atomicAdd(&valid_counts[actual_col], 1); + cudf::detail::atomic_add(&valid_counts[actual_col], 1); } } } else if (dtypes[actual_col].id() == cudf::type_id::STRING) { diff --git a/cpp/src/io/json/legacy/json_gpu.cu b/cpp/src/io/json/legacy/json_gpu.cu index 4d5293e12fd..713252cc39a 100644 --- a/cpp/src/io/json/legacy/json_gpu.cu +++ b/cpp/src/io/json/legacy/json_gpu.cu @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -282,7 +283,7 @@ CUDF_KERNEL void convert_data_to_columns_kernel(parse_options_view opts, // set the valid bitmap - all bits were set to 0 to start set_bit(valid_fields[desc.column], rec_id); - atomicAdd(&num_valid_fields[desc.column], 1); + cudf::detail::atomic_add(&num_valid_fields[desc.column], 1); } else { if (cudf::type_dispatcher(column_types[desc.column], ConvertFunctor{}, @@ -295,7 +296,7 @@ CUDF_KERNEL void convert_data_to_columns_kernel(parse_options_view opts, false)) { // set the valid bitmap - all bits were set to 0 to start set_bit(valid_fields[desc.column], rec_id); - atomicAdd(&num_valid_fields[desc.column], 1); + cudf::detail::atomic_add(&num_valid_fields[desc.column], 1); } } } else if (column_types[desc.column].id() == type_id::STRING) { @@ -349,16 +350,16 @@ CUDF_KERNEL void detect_data_types_kernel( // Checking if the field is empty/valid if (serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { // Increase the null count for array rows, where the null count is initialized to zero. - if (!are_rows_objects) { atomicAdd(&column_infos[desc.column].null_count, 1); } + if (!are_rows_objects) { cudf::detail::atomic_add(&column_infos[desc.column].null_count, 1); } continue; } else if (are_rows_objects) { // For files with object rows, null count is initialized to row count. The value is decreased // here for every valid field. 
- atomicAdd(&column_infos[desc.column].null_count, -1); + cudf::detail::atomic_add(&column_infos[desc.column].null_count, -1); } // Don't need counts to detect strings, any field in quotes is deduced to be a string if (desc.is_quoted) { - atomicAdd(&column_infos[desc.column].string_count, 1); + cudf::detail::atomic_add(&column_infos[desc.column].string_count, 1); continue; } @@ -405,21 +406,21 @@ CUDF_KERNEL void detect_data_types_kernel( if (maybe_hex) { --int_req_number_cnt; } if (serialized_trie_contains(opts.trie_true, {desc.value_begin, value_len}) || serialized_trie_contains(opts.trie_false, {desc.value_begin, value_len})) { - atomicAdd(&column_infos[desc.column].bool_count, 1); + cudf::detail::atomic_add(&column_infos[desc.column].bool_count, 1); } else if (digit_count == int_req_number_cnt) { bool is_negative = (*desc.value_begin == '-'); char const* data_begin = desc.value_begin + (is_negative || (*desc.value_begin == '+')); cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( data_begin, data_begin + digit_count, is_negative, column_infos[desc.column]); - atomicAdd(ptr, 1); + cudf::detail::atomic_add(ptr, 1); } else if (is_like_float( value_len, digit_count, decimal_count, dash_count + plus_count, exponent_count)) { - atomicAdd(&column_infos[desc.column].float_count, 1); + cudf::detail::atomic_add(&column_infos[desc.column].float_count, 1); } // A date-time field cannot have more than 3 non-special characters // A number field cannot have more than one decimal point else if (other_count > 3 || decimal_count > 1) { - atomicAdd(&column_infos[desc.column].string_count, 1); + cudf::detail::atomic_add(&column_infos[desc.column].string_count, 1); } else { // A date field can have either one or two '-' or '\'; A legal combination will only have one // of them To simplify the process of auto column detection, we are not covering all the @@ -427,20 +428,20 @@ CUDF_KERNEL void detect_data_types_kernel( if ((dash_count > 0 && dash_count <= 2 && slash_count == 0) || (dash_count == 0 && slash_count > 0 && slash_count <= 2)) { if (colon_count <= 2) { - atomicAdd(&column_infos[desc.column].datetime_count, 1); + cudf::detail::atomic_add(&column_infos[desc.column].datetime_count, 1); } else { - atomicAdd(&column_infos[desc.column].string_count, 1); + cudf::detail::atomic_add(&column_infos[desc.column].string_count, 1); } } else { // Default field type is string - atomicAdd(&column_infos[desc.column].string_count, 1); + cudf::detail::atomic_add(&column_infos[desc.column].string_count, 1); } } } if (!are_rows_objects) { // For array rows, mark missing fields as null for (; input_field_index < num_columns; ++input_field_index) - atomicAdd(&column_infos[input_field_index].null_count, 1); + cudf::detail::atomic_add(&column_infos[input_field_index].null_count, 1); } } @@ -498,7 +499,7 @@ CUDF_KERNEL void collect_keys_info_kernel(parse_options_view const options, for (auto field_range = advance(row_data_range.first); field_range.key_begin < row_data_range.second; field_range = advance(field_range.value_end)) { - auto const idx = atomicAdd(keys_cnt, 1); + auto const idx = cudf::detail::atomic_add(keys_cnt, 1ULL); if (keys_info.has_value()) { auto const len = field_range.key_end - field_range.key_begin; keys_info->column(0).element(idx) = field_range.key_begin - data.begin(); diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 14072d79172..09f2744e324 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -16,12 +16,15 @@ #include 
"orc_gpu.hpp" -#include #include -#include +#include +#include + #include +#include + namespace cudf { namespace io { namespace orc { @@ -1399,10 +1402,10 @@ CUDF_KERNEL void __launch_bounds__(block_size) // If we have an index, seek to the initial run and update row positions if (num_rowgroups > 0) { if (s->top.data.index.strm_offset[0] > s->chunk.strm_len[CI_DATA]) { - atomicAdd(error_count, 1); + cudf::detail::atomic_add(error_count, 1); } if (s->top.data.index.strm_offset[1] > s->chunk.strm_len[CI_DATA2]) { - atomicAdd(error_count, 1); + cudf::detail::atomic_add(error_count, 1); } uint32_t ofs0 = min(s->top.data.index.strm_offset[0], s->chunk.strm_len[CI_DATA]); uint32_t ofs1 = min(s->top.data.index.strm_offset[1], s->chunk.strm_len[CI_DATA2]); @@ -1823,7 +1826,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (num_rowgroups > 0) { row_groups[blockIdx.y][blockIdx.x].num_child_rows = s->num_child_rows; } - atomicAdd(&chunks[chunk_id].num_child_rows, s->num_child_rows); + cudf::detail::atomic_add(&chunks[chunk_id].num_child_rows, + static_cast(s->num_child_rows)); } } diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index a43c6d4cbb6..643394ed8a1 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -175,9 +176,9 @@ CUDF_KERNEL void __launch_bounds__(block_size) __syncthreads(); auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); if (t == 0) { - total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique); + total_num_dict_entries = cudf::detail::atomic_add(&chunk->num_dict_entries, num_unique); total_num_dict_entries += num_unique; - atomicAdd(&chunk->uniq_data_size, uniq_data_size); + cudf::detail::atomic_add(&chunk->uniq_data_size, uniq_data_size); } __syncthreads(); diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 8fd860d9492..71ff52becc2 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -437,14 +438,14 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, auto get_next_string = [&]() { if constexpr (is_warp) { size_type istring; - if (lane == 0) { istring = atomicAdd(str_counter, 1); } + if (lane == 0) { istring = cudf::detail::atomic_add(str_counter, 1); } return __shfl_sync(0xffffffff, istring, 0); } else { // Ensure lane 0 doesn't update istring before all threads have read the previous iteration's // istring value __syncthreads(); __shared__ size_type istring; - if (lane == 0) { istring = atomicAdd(str_counter, 1); } + if (lane == 0) { istring = cudf::detail::atomic_add(str_counter, 1); } __syncthreads(); return istring; } @@ -474,7 +475,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, if (is_null_literal && null_mask != nullptr) { if (lane == 0) { clear_bit(null_mask, istring); - atomicAdd(null_count_data, 1); + cudf::detail::atomic_add(null_count_data, 1); if (!d_chars) d_offsets[istring] = 0; } continue; // gride-stride return; @@ -618,7 +619,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, if (!d_chars && lane == 0) { if (null_mask != nullptr) { clear_bit(null_mask, istring); - atomicAdd(null_count_data, 1); + cudf::detail::atomic_add(null_count_data, 1); } last_offset = 0; d_offsets[istring] = 0; @@ -759,7 +760,7 @@ struct string_parse { options.trie_na, {in_begin, 
static_cast(num_in_chars)}); if (is_null_literal && null_mask != nullptr) { clear_bit(null_mask, idx); - atomicAdd(null_count_data, 1); + cudf::detail::atomic_add(null_count_data, 1); if (!d_chars) d_offsets[idx] = 0; return; } @@ -770,7 +771,7 @@ struct string_parse { if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { if (null_mask != nullptr) { clear_bit(null_mask, idx); - atomicAdd(null_count_data, 1); + cudf::detail::atomic_add(null_count_data, 1); } if (!d_chars) d_offsets[idx] = 0; } else { @@ -953,7 +954,7 @@ std::unique_ptr parse_data( if (is_null_literal) { col.set_null(row); - atomicAdd(null_count_data, 1); + cudf::detail::atomic_add(null_count_data, 1); return; } @@ -971,7 +972,7 @@ std::unique_ptr parse_data( false); if (not is_parsed) { col.set_null(row); - atomicAdd(null_count_data, 1); + cudf::detail::atomic_add(null_count_data, 1); } }); diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index d02ce99e6e5..5eed1e5ca04 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -106,7 +107,7 @@ CUDF_KERNEL void count_and_set_positions(char const* data, // Process the data for (long i = 0; i < byteToProcess; i++) { if (raw[i] == key) { - auto const idx = atomicAdd(count, (cudf::size_type)1); + auto const idx = cudf::detail::atomic_add(count, static_cast(1)); setElement(positions, idx, did + offset + i, key); } } diff --git a/cpp/src/io/utilities/type_inference.cu b/cpp/src/io/utilities/type_inference.cu index b446ad41946..eead98ca1c8 100644 --- a/cpp/src/io/utilities/type_inference.cu +++ b/cpp/src/io/utilities/type_inference.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -204,16 +205,16 @@ CUDF_KERNEL void infer_column_type_kernel(OptionsView options, auto const block_type_histogram = BlockReduce(temp_storage).Reduce(thread_type_histogram, custom_sum{}); if (threadIdx.x == 0) { - atomicAdd(&column_info->null_count, block_type_histogram.null_count); - atomicAdd(&column_info->float_count, block_type_histogram.float_count); - atomicAdd(&column_info->datetime_count, block_type_histogram.datetime_count); - atomicAdd(&column_info->string_count, block_type_histogram.string_count); - atomicAdd(&column_info->negative_small_int_count, - block_type_histogram.negative_small_int_count); - atomicAdd(&column_info->positive_small_int_count, - block_type_histogram.positive_small_int_count); - atomicAdd(&column_info->big_int_count, block_type_histogram.big_int_count); - atomicAdd(&column_info->bool_count, block_type_histogram.bool_count); + cudf::detail::atomic_add(&column_info->null_count, block_type_histogram.null_count); + cudf::detail::atomic_add(&column_info->float_count, block_type_histogram.float_count); + cudf::detail::atomic_add(&column_info->datetime_count, block_type_histogram.datetime_count); + cudf::detail::atomic_add(&column_info->string_count, block_type_histogram.string_count); + cudf::detail::atomic_add(&column_info->negative_small_int_count, + block_type_histogram.negative_small_int_count); + cudf::detail::atomic_add(&column_info->positive_small_int_count, + block_type_histogram.positive_small_int_count); + cudf::detail::atomic_add(&column_info->big_int_count, block_type_histogram.big_int_count); + cudf::detail::atomic_add(&column_info->bool_count, block_type_histogram.bool_count); } } diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index 
f3ce6de4598..c87aa590385 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -280,7 +281,7 @@ __inline__ __device__ void add_pair_to_cache(size_type const first, size_type* joined_shared_l, size_type* joined_shared_r) { - size_type my_current_idx{atomicAdd(current_idx_shared + warp_id, size_type(1))}; + size_type my_current_idx{cudf::detail::atomic_add(current_idx_shared + warp_id, size_type(1))}; // its guaranteed to fit into the shared cache joined_shared_l[my_current_idx] = first; @@ -303,7 +304,9 @@ __device__ void flush_output_cache(unsigned int const activemask, int num_threads = __popc(activemask); cudf::size_type output_offset = 0; - if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); } + if (0 == lane_id) { + output_offset = cudf::detail::atomic_add(current_idx, current_idx_shared[warp_id]); + } // No warp sync is necessary here because we are assuming that ShuffleIndex // is internally using post-CUDA 9.0 synchronization-safe primitives diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index 146b54c0d87..5e67841b328 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -954,7 +955,7 @@ __launch_bounds__(block_size) CUDF_KERNEL if (out_valid_count) { size_type block_valid_count = cudf::detail::single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { atomicAdd(out_valid_count.value(), block_valid_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(out_valid_count.value(), block_valid_count); } } } diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 8d8f1a71672..0828a0fc772 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -158,7 +159,7 @@ CUDF_KERNEL void compute_row_partition_numbers(row_hasher_t the_hasher, row_partition_numbers[row_number] = partition_number; row_partition_offset[row_number] = - atomicAdd(&(shared_partition_sizes[partition_number]), size_type(1)); + cudf::detail::atomic_add(&(shared_partition_sizes[partition_number]), size_type(1)); tid += stride; } @@ -171,7 +172,7 @@ CUDF_KERNEL void compute_row_partition_numbers(row_hasher_t the_hasher, size_type const block_partition_size = shared_partition_sizes[partition_number]; // Update global size of each partition - atomicAdd(&global_partition_sizes[partition_number], block_partition_size); + cudf::detail::atomic_add(&global_partition_sizes[partition_number], block_partition_size); // Record the size of this partition in this block size_type const write_location = partition_number * gridDim.x + blockIdx.x; @@ -229,7 +230,7 @@ CUDF_KERNEL void compute_row_output_locations(size_type* __restrict__ row_partit // Get output location based on partition number by incrementing the // corresponding partition offset for this block size_type const row_output_location = - atomicAdd(&(shared_partition_offsets[partition_number]), size_type(1)); + cudf::detail::atomic_add(&(shared_partition_offsets[partition_number]), 
size_type(1)); // Store the row's output location in-place row_partition_numbers[row_number] = row_output_location; @@ -708,7 +709,7 @@ struct dispatch_map_type { partition_map.end(), scatter_map.begin(), [offsets = histogram.data()] __device__(auto partition_number) { - return atomicAdd(&offsets[partition_number], 1); + return cudf::detail::atomic_add(&offsets[partition_number], 1); }); // Scatter the rows into their partitions diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 8ea229368cc..95177f53a72 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -108,7 +109,9 @@ CUDF_KERNEL void replace_nulls_strings(cudf::column_device_view input, // Compute total valid count for this block and add it to global count uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count - if (threadIdx.x == 0) { atomicAdd(valid_counter, block_valid_count); } + if (threadIdx.x == 0) { + cudf::detail::atomic_add(valid_counter, static_cast(block_valid_count)); + } } template @@ -153,7 +156,9 @@ CUDF_KERNEL void replace_nulls(cudf::column_device_view input, uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count - if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); } + if (threadIdx.x == 0) { + cudf::detail::atomic_add(output_valid_count, static_cast(block_valid_count)); + } } } diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 184c30246c7..1b0a7316ad2 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -168,7 +169,9 @@ CUDF_KERNEL void replace_strings_first_pass(cudf::column_device_view input, // Compute total valid count for this block and add it to global count uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count - if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); } + if (threadIdx.x == 0) { + cudf::detail::atomic_add(output_valid_count, static_cast(block_valid_count)); + } } /** @@ -295,7 +298,9 @@ CUDF_KERNEL void replace_kernel(cudf::column_device_view input, uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count - if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); } + if (threadIdx.x == 0) { + cudf::detail::atomic_add(output_valid_count, static_cast(block_valid_count)); + } } } diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index 20845a97c7e..d14d4e3e94e 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -1077,7 +1078,7 @@ __launch_bounds__(block_size) CUDF_KERNEL size_type block_valid_count = cudf::detail::single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(output_valid_count, block_valid_count); } } /** diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu index 8f36ec98f4a..34688e8655b 100644 --- 
a/cpp/src/stream_compaction/distinct_helpers.cu +++ b/cpp/src/stream_compaction/distinct_helpers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include "distinct_helpers.hpp" #include +#include namespace cudf::detail { @@ -53,7 +54,7 @@ struct reduce_fn : reduce_by_row_fn_base #include #include +#include #include #include #include @@ -167,7 +168,7 @@ CUDF_KERNEL void fused_concatenate_string_offset_kernel( if (Nullable) { using cudf::detail::single_lane_block_sum_reduce; auto block_valid_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(out_valid_count, block_valid_count); } } } diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 3d0210d61b0..69fd407b6c2 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -330,7 +331,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in target_count, [d_string_indices, d_targets_offsets] __device__(size_type idx) { auto const str_idx = d_string_indices[idx] - 1; - atomicAdd(d_targets_offsets + str_idx, 1); + cudf::detail::atomic_add(d_targets_offsets + str_idx, 1); }); // finally, convert the counts into offsets thrust::exclusive_scan(rmm::exec_policy(stream), diff --git a/cpp/src/transform/row_conversion.cu b/cpp/src/transform/row_conversion.cu index b294369a90e..8c51bea3c88 100644 --- a/cpp/src/transform/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -250,7 +251,7 @@ build_string_row_offsets(table_view const& tbl, auto const col = element_idx / num_rows; auto const val = d_offsets_iterators[col][row + 1] - d_offsets_iterators[col][row]; - atomicAdd(&d_row_sizes[row], val); + cudf::detail::atomic_add(&d_row_sizes[row], val); }); // transform the row sizes to include fixed width size and alignment diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index 6e90d4462df..dc570efd7aa 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -37,7 +37,7 @@ CUDF_KERNEL void gpu_atomic_test(T* result, T* data, size_t size) size_t step = blockDim.x * gridDim.x; for (; id < size; id += step) { - atomicAdd(&result[0], data[id]); + cudf::detail::atomic_add(&result[0], data[id]); atomicMin(&result[1], data[id]); atomicMax(&result[2], data[id]); cudf::genericAtomicOperation(&result[3], data[id], cudf::DeviceSum{}); From 69dc774d546f1e29c99e9d84ae40060a7d55497d Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 2 Feb 2024 22:53:00 -0600 Subject: [PATCH 2/6] Refactor other atomic operators. 
---
 .../cudf/detail/aggregation/aggregation.cuh   | 30 ++++++-----
 .../cudf/detail/utilities/device_atomics.cuh  | 51 ++++++++++---------
 .../device_atomics/device_atomics_test.cu     |  6 +--
 3 files changed, 45 insertions(+), 42 deletions(-)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index 968fc83e763..ecf2f610697 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -144,8 +144,8 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
     using Target = target_type_t<Source, aggregation::MIN>;
-    atomicMin(&target.element<Target>(target_index),
-              static_cast<Target>(source.element<Source>(source_index)));
+    cudf::detail::atomic_min(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -170,8 +170,8 @@ struct update_target_element<
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    atomicMin(&target.element<DeviceTarget>(target_index),
-              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    cudf::detail::atomic_min(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -193,8 +193,8 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
     using Target = target_type_t<Source, aggregation::MAX>;
-    atomicMax(&target.element<Target>(target_index),
-              static_cast<Target>(source.element<Source>(source_index)));
+    cudf::detail::atomic_max(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -219,8 +219,8 @@ struct update_target_element<
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    atomicMax(&target.element<DeviceTarget>(target_index),
-              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    cudf::detail::atomic_max(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -387,8 +387,8 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
     using Target = target_type_t<Source, aggregation::PRODUCT>;
-    atomicMul(&target.element<Target>(target_index),
-              static_cast<Target>(source.element<Source>(source_index)));
+    cudf::detail::atomic_mul(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
 };
@@ -449,10 +449,11 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
     using Target = target_type_t<Source, aggregation::ARGMAX>;
-    auto old = atomicCAS(&target.element<Target>(target_index), ARGMAX_SENTINEL, source_index);
+    auto old = cudf::detail::atomic_cas(
+      &target.element<Target>(target_index), ARGMAX_SENTINEL, source_index);
     if (old != ARGMAX_SENTINEL) {
       while (source.element<Source>(source_index) > source.element<Source>(old)) {
-        old = atomicCAS(&target.element<Target>(target_index), old, source_index);
+        old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
       }
     }
 
@@ -476,10 +477,11 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
     using Target = target_type_t<Source, aggregation::ARGMIN>;
-    auto old = atomicCAS(&target.element<Target>(target_index), ARGMIN_SENTINEL, source_index);
+    auto old = cudf::detail::atomic_cas(
+      &target.element<Target>(target_index), ARGMIN_SENTINEL, source_index);
     if (old != ARGMIN_SENTINEL) {
      while (source.element<Source>(source_index)
< source.element(old)) { - old = atomicCAS(&target.element(target_index), old, source_index); + old = cudf::detail::atomic_cas(&target.element(target_index), old, source_index); } } diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index 3a5108a3503..93ad9e076d5 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -27,7 +27,8 @@ * cudf::duration_us, cudf::duration_ns and bool * where CUDA atomic operations are, `atomicAdd`, `atomicMin`, `atomicMax`, * `atomicCAS`. - * Also provides `cudf::genericAtomicOperation` which performs atomic operation + * + * Also provides `cudf::detail::genericAtomicOperation` which performs atomic operation * with the given binary operator. */ @@ -364,8 +365,6 @@ struct typesAtomicCASImpl { } }; -} // namespace detail - /** * @brief compute atomic binary operation * reads the `old` located at the `address` in global or shared memory, @@ -428,7 +427,6 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address, return T(fun(address, update_value, op)); } -namespace detail { /** * @brief Overloads for `atomic_add` * reads the `old` located at the `address` in global or shared memory, @@ -441,7 +439,7 @@ namespace detail { * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms, * cudf::duration_us, cudf::duration_ns and bool * - * Cuda natively supports `sint32`, `uint32`, `uint64`, `float`, `double. + * CUDA natively supports `int32_t`, `uint32_t`, `uint64_t`, `float`, `double. * (`double` is supported after Pascal). * Other types are implemented by `atomicCAS`. * @@ -453,13 +451,11 @@ namespace detail { template __forceinline__ __device__ T atomic_add(T* address, T val) { - return cudf::genericAtomicOperation(address, val, cudf::DeviceSum{}); + return cudf::detail::genericAtomicOperation(address, val, cudf::DeviceSum{}); } -} // namespace detail -} // namespace cudf /** - * @brief Overloads for `atomicMul` + * @brief Overloads for `atomic_mul` * reads the `old` located at the `address` in global or shared memory, * computes (old * val), and stores the result back to memory at the same * address. These three operations are performed in one atomic transaction. @@ -475,24 +471,25 @@ __forceinline__ __device__ T atomic_add(T* address, T val) * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMul(T* address, T val) +__forceinline__ __device__ T atomic_mul(T* address, T val) { - return cudf::genericAtomicOperation(address, val, cudf::DeviceProduct{}); + return cudf::detail::genericAtomicOperation(address, val, cudf::DeviceProduct{}); } /** - * @brief Overloads for `atomicMin` + * @brief Overloads for `atomic_min` * reads the `old` located at the `address` in global or shared memory, * computes the minimum of old and val, and stores the result back to memory * at the same address. * These three operations are performed in one atomic transaction. * - * The supported cudf types for `atomicMin` are: + * The supported cudf types for `atomic_min` are: * int8_t, int16_t, int32_t, int64_t, float, double, * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us, * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms, * cudf::duration_us, cudf::duration_ns and bool - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. + * + * CUDA natively supports `int32_t`, `uint32_t`, `int64_t`, `uint64_t`. 
* Other types are implemented by `atomicCAS`. * * @param[in] address The address of old value in global or shared memory @@ -501,24 +498,25 @@ __forceinline__ __device__ T atomicMul(T* address, T val) * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMin(T* address, T val) +__forceinline__ __device__ T atomic_min(T* address, T val) { - return cudf::genericAtomicOperation(address, val, cudf::DeviceMin{}); + return cudf::detail::genericAtomicOperation(address, val, cudf::DeviceMin{}); } /** - * @brief Overloads for `atomicMax` + * @brief Overloads for `atomic_max` * reads the `old` located at the `address` in global or shared memory, * computes the maximum of old and val, and stores the result back to memory * at the same address. * These three operations are performed in one atomic transaction. * - * The supported cudf types for `atomicMax` are: + * The supported cudf types for `atomic_max` are: * int8_t, int16_t, int32_t, int64_t, float, double, * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us, * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms, * cudf::duration_us, cudf::duration_ns and bool - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. + * + * CUDA natively supports `int32_t`, `uint32_t`, `int64_t`, `uint64_t`. * Other types are implemented by `atomicCAS`. * * @param[in] address The address of old value in global or shared memory @@ -527,24 +525,24 @@ __forceinline__ __device__ T atomicMin(T* address, T val) * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMax(T* address, T val) +__forceinline__ __device__ T atomic_max(T* address, T val) { - return cudf::genericAtomicOperation(address, val, cudf::DeviceMax{}); + return cudf::detail::genericAtomicOperation(address, val, cudf::DeviceMax{}); } /** - * @brief Overloads for `atomicCAS` + * @brief Overloads for `atomic_cas` * reads the `old` located at the `address` in global or shared memory, * computes (`old` == `compare` ? `val` : `old`), * and stores the result back to memory at the same address. * These three operations are performed in one atomic transaction. * - * The supported cudf types for `atomicCAS` are: + * The supported cudf types for `atomic_cas` are: * int8_t, int16_t, int32_t, int64_t, float, double, * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us, * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms, * cudf::duration_us, cudf::duration_ns and bool - * Cuda natively supports `sint32`, `uint32`, `uint64`. + * CUDA natively supports `int32_t`, `uint32_t`, `uint64_t`. * Other types are implemented by `atomicCAS`. 
* * @param[in] address The address of old value in global or shared memory @@ -554,7 +552,10 @@ __forceinline__ __device__ T atomicMax(T* address, T val) * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicCAS(T* address, T compare, T val) +__forceinline__ __device__ T atomic_cas(T* address, T compare, T val) { return cudf::detail::typesAtomicCASImpl()(address, compare, val); } + +} // namespace detail +} // namespace cudf diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index dc570efd7aa..316137068f2 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -40,9 +40,9 @@ CUDF_KERNEL void gpu_atomic_test(T* result, T* data, size_t size) cudf::detail::atomic_add(&result[0], data[id]); atomicMin(&result[1], data[id]); atomicMax(&result[2], data[id]); - cudf::genericAtomicOperation(&result[3], data[id], cudf::DeviceSum{}); - cudf::genericAtomicOperation(&result[4], data[id], cudf::DeviceMin{}); - cudf::genericAtomicOperation(&result[5], data[id], cudf::DeviceMax{}); + cudf::detail::genericAtomicOperation(&result[3], data[id], cudf::DeviceSum{}); + cudf::detail::genericAtomicOperation(&result[4], data[id], cudf::DeviceMin{}); + cudf::detail::genericAtomicOperation(&result[5], data[id], cudf::DeviceMax{}); } } From 25a112523719969ccfb5896e2e4ce97a3ad6cacc Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 2 Feb 2024 22:53:33 -0600 Subject: [PATCH 3/6] Refactor docs and clean up device_atomic.cuh. --- .../cudf/detail/utilities/device_atomics.cuh | 211 ++++++++++-------- 1 file changed, 122 insertions(+), 89 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index 93ad9e076d5..61c5f35d62a 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -20,16 +20,15 @@ * @brief overloads for CUDA atomic operations * @file device_atomics.cuh * - * Provides the overloads for all of possible cudf's data types, - * where cudf's data types are, int8_t, int16_t, int32_t, int64_t, float, double, - * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us, + * Provides the overloads for all of cudf's data types, specifically int8_t, + * int16_t, int32_t, int64_t, float, double, cudf::timestamp_D, + * cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us, * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms, - * cudf::duration_us, cudf::duration_ns and bool - * where CUDA atomic operations are, `atomicAdd`, `atomicMin`, `atomicMax`, - * `atomicCAS`. + * cudf::duration_us, cudf::duration_ns and bool for the CUDA atomic operations + * `atomicAdd`, `atomicMin`, `atomicMax`, `atomicCAS`. * - * Also provides `cudf::detail::genericAtomicOperation` which performs atomic operation - * with the given binary operator. + * Also provides `cudf::detail::genericAtomicOperation` which performs an + * atomic operation with the given binary operator. */ #include @@ -38,6 +37,7 @@ #include #include #include + #include namespace cudf { @@ -85,26 +85,22 @@ template struct genericAtomicOperationImpl { __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) { - using T_int = unsigned int; - bool is_32_align = (reinterpret_cast(addr) & 2) == 0; - auto* address_uint32 = - reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 
0 : 2)); + using T_int = unsigned short int; + static_assert(sizeof(T) == sizeof(T_int)); - T_int old = *address_uint32; + T old_value = *addr; T_int assumed; + T_int ret; do { - assumed = old; - T const target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); - uint16_t updating_value = type_reinterpret(op(target_value, update_value)); + T_int const new_value = type_reinterpret(op(old_value, update_value)); - T_int const new_value = (is_32_align) ? (old & 0xffff'0000) | updating_value - : (old & 0xffff) | (T_int(updating_value) << 16); - old = atomicCAS(address_uint32, assumed, new_value); - } while (assumed != old); + assumed = type_reinterpret(old_value); + ret = atomicCAS(reinterpret_cast(addr), assumed, new_value); + old_value = type_reinterpret(ret); + } while (assumed != ret); - return (is_32_align) ? T(old & 0xffff) : T(old >> 16); - ; + return old_value; } }; @@ -114,6 +110,7 @@ struct genericAtomicOperationImpl { __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) { using T_int = unsigned int; + static_assert(sizeof(T) == sizeof(T_int)); T old_value = *addr; T_int assumed; @@ -125,7 +122,6 @@ struct genericAtomicOperationImpl { assumed = type_reinterpret(old_value); ret = atomicCAS(reinterpret_cast(addr), assumed, new_value); old_value = type_reinterpret(ret); - } while (assumed != ret); return old_value; @@ -150,17 +146,17 @@ struct genericAtomicOperationImpl { assumed = type_reinterpret(old_value); ret = atomicCAS(reinterpret_cast(addr), assumed, new_value); old_value = type_reinterpret(ret); - } while (assumed != ret); return old_value; } }; -// ----------------------------------------------------------------------- -// specialized functions for operators -// `atomicAdd` supports int32_t, uint32_t, uint64_t, float, double (signed int64_t is not -// supported.) `atomicMin`, `atomicMax` support int32_t, uint32_t, uint64_t, int64_t +// Specialized functions for operators. + +// `atomicAdd` supports int32_t, uint32_t, uint64_t, float, double. +// `atomicAdd` does not support int64_t. + template <> struct genericAtomicOperationImpl { using T = float; @@ -180,17 +176,24 @@ struct genericAtomicOperationImpl { }; template <> -struct genericAtomicOperationImpl { - using T = uint32_t; +struct genericAtomicOperationImpl { + using T = int32_t; __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) { return atomicAdd(addr, update_value); } }; +// CUDA natively supports `unsigned long long int` for `atomicAdd`, +// but doesn't support `signed long long int`. +// However, since the signed integer is represented as two's complement, +// the fundamental arithmetic operations of addition are identical to +// those for unsigned binary numbers. 
+// Then, this computes as `unsigned long long int` with `atomicAdd` +// @sa https://en.wikipedia.org/wiki/Two%27s_complement template <> -struct genericAtomicOperationImpl { - using T = uint64_t; +struct genericAtomicOperationImpl { + using T = int64_t; __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) { using T_int = unsigned long long int; @@ -201,24 +204,17 @@ struct genericAtomicOperationImpl { }; template <> -struct genericAtomicOperationImpl { - using T = int32_t; +struct genericAtomicOperationImpl { + using T = uint32_t; __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) { return atomicAdd(addr, update_value); } }; -// CUDA natively supports `unsigned long long int` for `atomicAdd`, -// but doesn't support `signed long long int`. -// However, since the signed integer is represented as two's complement, -// the fundamental arithmetic operations of addition are identical to -// those for unsigned binary numbers. -// Then, this computes as `unsigned long long int` with `atomicAdd` -// @sa https://en.wikipedia.org/wiki/Two%27s_complement template <> -struct genericAtomicOperationImpl { - using T = int64_t; +struct genericAtomicOperationImpl { + using T = uint64_t; __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) { using T_int = unsigned long long int; @@ -228,6 +224,8 @@ struct genericAtomicOperationImpl { } }; +// `atomicMin`, `atomicMax` support int32_t, int64_t, uint32_t, uint64_t. + template <> struct genericAtomicOperationImpl { using T = int32_t; @@ -238,11 +236,11 @@ struct genericAtomicOperationImpl { }; template <> -struct genericAtomicOperationImpl { - using T = int32_t; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) +struct genericAtomicOperationImpl { + using T = uint32_t; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op) { - return atomicMax(addr, update_value); + return atomicMin(addr, update_value); } }; @@ -258,6 +256,36 @@ struct genericAtomicOperationImpl { } }; +template <> +struct genericAtomicOperationImpl { + using T = uint64_t; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op) + { + using T_int = unsigned long long int; + static_assert(sizeof(T) == sizeof(T_int)); + T ret = atomicMin(reinterpret_cast(addr), type_reinterpret(update_value)); + return ret; + } +}; + +template <> +struct genericAtomicOperationImpl { + using T = int32_t; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) + { + return atomicMax(addr, update_value); + } +}; + +template <> +struct genericAtomicOperationImpl { + using T = uint32_t; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) + { + return atomicMax(addr, update_value); + } +}; + template <> struct genericAtomicOperationImpl { using T = int64_t; @@ -269,6 +297,19 @@ struct genericAtomicOperationImpl { return ret; } }; + +template <> +struct genericAtomicOperationImpl { + using T = uint64_t; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) + { + using T_int = unsigned long long int; + static_assert(sizeof(T) == sizeof(T_int)); + T ret = atomicMax(reinterpret_cast(addr), type_reinterpret(update_value)); + return ret; + } +}; + // ----------------------------------------------------------------------- // the implementation of `typesAtomicCASImpl` template @@ -310,28 +351,14 @@ template struct 
typesAtomicCASImpl { __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) { - using T_int = unsigned int; - - bool is_32_align = (reinterpret_cast(addr) & 2) == 0; - auto* address_uint32 = - reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); - - T_int old = *address_uint32; - T_int assumed; - T target_value; - uint16_t u_val = type_reinterpret(update_value); - - do { - assumed = old; - target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); - if (target_value != compare) break; + using T_int = unsigned short int; + static_assert(sizeof(T) == sizeof(T_int)); - T_int new_value = - (is_32_align) ? (old & 0xffff'0000) | u_val : (old & 0xffff) | (T_int(u_val) << 16); - old = atomicCAS(address_uint32, assumed, new_value); - } while (assumed != old); + T_int ret = atomicCAS(reinterpret_cast(addr), + type_reinterpret(compare), + type_reinterpret(update_value)); - return target_value; + return type_reinterpret(ret); } }; @@ -340,6 +367,7 @@ struct typesAtomicCASImpl { __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) { using T_int = unsigned int; + static_assert(sizeof(T) == sizeof(T_int)); T_int ret = atomicCAS(reinterpret_cast(addr), type_reinterpret(compare), @@ -349,7 +377,6 @@ struct typesAtomicCASImpl { } }; -// 8 bytes atomic operation template struct typesAtomicCASImpl { __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) @@ -366,8 +393,9 @@ struct typesAtomicCASImpl { }; /** - * @brief compute atomic binary operation - * reads the `old` located at the `address` in global or shared memory, + * @brief Compute atomic binary operation + * + * Reads the `old` located at the `address` in global or shared memory, * computes 'BinaryOp'('old', 'update_value'), * and stores the result back to memory at the same address. * These three operations are performed in one atomic transaction. @@ -375,9 +403,9 @@ struct typesAtomicCASImpl { * The supported cudf types for `genericAtomicOperation` are: * int8_t, int16_t, int32_t, int64_t, float, double * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * @param[in] op The binary operator used for compute + * @param address The address of old value in global or shared memory + * @param val The value to be computed + * @param op The binary operator used for compute * * @returns The old value at `address` */ @@ -429,7 +457,8 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address, /** * @brief Overloads for `atomic_add` - * reads the `old` located at the `address` in global or shared memory, + * + * Reads the `old` located at the `address` in global or shared memory, * computes (old + val), and stores the result back to memory at the same * address. These three operations are performed in one atomic transaction. * @@ -443,8 +472,8 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address, * (`double` is supported after Pascal). * Other types are implemented by `atomicCAS`. 
* - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be added + * @param address The address of old value in global or shared memory + * @param val The value to be added * * @returns The old value at `address` */ @@ -456,7 +485,8 @@ __forceinline__ __device__ T atomic_add(T* address, T val) /** * @brief Overloads for `atomic_mul` - * reads the `old` located at the `address` in global or shared memory, + * + * Reads the `old` located at the `address` in global or shared memory, * computes (old * val), and stores the result back to memory at the same * address. These three operations are performed in one atomic transaction. * @@ -465,8 +495,8 @@ __forceinline__ __device__ T atomic_add(T* address, T val) * * All types are implemented by `atomicCAS`. * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be multiplied + * @param address The address of old value in global or shared memory + * @param val The value to be multiplied * * @returns The old value at `address` */ @@ -478,7 +508,8 @@ __forceinline__ __device__ T atomic_mul(T* address, T val) /** * @brief Overloads for `atomic_min` - * reads the `old` located at the `address` in global or shared memory, + * + * Reads the `old` located at the `address` in global or shared memory, * computes the minimum of old and val, and stores the result back to memory * at the same address. * These three operations are performed in one atomic transaction. @@ -492,8 +523,8 @@ __forceinline__ __device__ T atomic_mul(T* address, T val) * CUDA natively supports `int32_t`, `uint32_t`, `int64_t`, `uint64_t`. * Other types are implemented by `atomicCAS`. * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed + * @param address The address of old value in global or shared memory + * @param val The value to be computed * * @returns The old value at `address` */ @@ -505,7 +536,8 @@ __forceinline__ __device__ T atomic_min(T* address, T val) /** * @brief Overloads for `atomic_max` - * reads the `old` located at the `address` in global or shared memory, + * + * Reads the `old` located at the `address` in global or shared memory, * computes the maximum of old and val, and stores the result back to memory * at the same address. * These three operations are performed in one atomic transaction. @@ -519,8 +551,8 @@ __forceinline__ __device__ T atomic_min(T* address, T val) * CUDA natively supports `int32_t`, `uint32_t`, `int64_t`, `uint64_t`. * Other types are implemented by `atomicCAS`. * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed + * @param address The address of old value in global or shared memory + * @param val The value to be computed * * @returns The old value at `address` */ @@ -532,7 +564,8 @@ __forceinline__ __device__ T atomic_max(T* address, T val) /** * @brief Overloads for `atomic_cas` - * reads the `old` located at the `address` in global or shared memory, + * + * Reads the `old` located at the `address` in global or shared memory, * computes (`old` == `compare` ? `val` : `old`), * and stores the result back to memory at the same address. * These three operations are performed in one atomic transaction. @@ -545,9 +578,9 @@ __forceinline__ __device__ T atomic_max(T* address, T val) * CUDA natively supports `int32_t`, `uint32_t`, `uint64_t`. * Other types are implemented by `atomicCAS`. 
* - * @param[in] address The address of old value in global or shared memory - * @param[in] compare The value to be compared - * @param[in] val The value to be computed + * @param address The address of old value in global or shared memory + * @param compare The value to be compared + * @param val The value to be computed * * @returns The old value at `address` */ From 5ecf52f50c3f33d986bcac12e0589df4a740d45b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sat, 3 Feb 2024 20:38:06 -0600 Subject: [PATCH 4/6] Revert atomic_add in most places, update tests. --- cpp/benchmarks/join/generate_input_tables.cuh | 1 - cpp/include/cudf/detail/copy_if_else.cuh | 3 +- cpp/include/cudf/detail/copy_range.cuh | 3 +- .../cudf/detail/labeling/label_segments.cuh | 3 +- cpp/include/cudf/detail/null_mask.cuh | 3 +- cpp/include/cudf/detail/valid_if.cuh | 5 ++-- cpp/src/bitmask/null_mask.cu | 3 +- cpp/src/copying/concatenate.cu | 5 ++-- cpp/src/groupby/sort/group_quantiles.cu | 3 +- cpp/src/groupby/sort/group_std.cu | 3 +- cpp/src/io/avro/avro_gpu.cu | 4 +-- cpp/src/io/csv/csv_gpu.cu | 19 ++++++------ cpp/src/io/json/legacy/json_gpu.cu | 29 +++++++++---------- cpp/src/io/orc/stripe_data.cu | 8 ++--- cpp/src/io/parquet/chunk_dict.cu | 5 ++-- cpp/src/io/utilities/data_casting.cu | 17 +++++------ cpp/src/io/utilities/parsing_utils.cu | 3 +- cpp/src/io/utilities/type_inference.cu | 21 +++++++------- cpp/src/join/join_common_utils.cuh | 7 ++--- cpp/src/json/json_path.cu | 3 +- cpp/src/partitioning/partitioning.cu | 9 +++--- cpp/src/replace/nulls.cu | 5 ++-- cpp/src/replace/replace.cu | 5 ++-- cpp/src/rolling/detail/rolling.cuh | 3 +- cpp/src/stream_compaction/distinct_helpers.cu | 3 +- cpp/src/strings/copying/concatenate.cu | 3 +- cpp/src/strings/replace/multi.cu | 3 +- cpp/src/transform/row_conversion.cu | 3 +- .../device_atomics/device_atomics_test.cu | 6 ++-- 29 files changed, 78 insertions(+), 110 deletions(-) diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index b14541564dd..d9c428f86af 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -16,7 +16,6 @@ #pragma once -#include #include #include diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 65b453e0bd1..6162fa5ecf1 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -95,7 +94,7 @@ __launch_bounds__(block_size) CUDF_KERNEL // block_valid_count will only be valid on thread 0 if (threadIdx.x == 0) { // using an atomic here because there are multiple blocks doing this work - cudf::detail::atomic_add(valid_count, block_valid_count); + atomicAdd(valid_count, block_valid_count); } } } diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index 192a60cf9b3..4bfdaa94c53 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -101,7 +100,7 @@ CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin, auto block_null_change = cudf::detail::single_lane_block_sum_reduce(warp_null_change); if (threadIdx.x == 0) { // if the first thread in a block - cudf::detail::atomic_add(null_count, block_null_change); + atomicAdd(null_count, block_null_change); } } } diff --git 
a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 405d111ab35..f522abef0b1 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -104,7 +103,7 @@ void label_segments(InputIterator offsets_begin, // In case we have repeated offsets (i.e., we have empty segments), this `atomicAdd` call will // make sure the label values corresponding to these empty segments will be skipped in the // output. - if (dst_idx < num_labels) { cudf::detail::atomic_add(&output[dst_idx], OutputType{1}); } + if (dst_idx < num_labels) { atomicAdd(&output[dst_idx], OutputType{1}); } }); thrust::inclusive_scan(rmm::exec_policy(stream), label_begin, label_end, label_begin); } diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 3113b991a79..e57d85f2998 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -17,7 +17,6 @@ #include #include -#include #include #include #include @@ -108,7 +107,7 @@ CUDF_KERNEL void offset_bitmask_binop(Binop op, __shared__ typename BlockReduce::TempStorage temp_storage; size_type block_count = BlockReduce(temp_storage).Sum(thread_count); - if (threadIdx.x == 0) { cudf::detail::atomic_add(count_ptr, block_count); } + if (threadIdx.x == 0) { atomicAdd(count_ptr, block_count); } } /** diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index 453a072fece..d0073177445 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -66,7 +65,7 @@ CUDF_KERNEL void valid_if_kernel( } size_type block_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { cudf::detail::atomic_add(valid_count, block_count); } + if (threadIdx.x == 0) { atomicAdd(valid_count, block_count); } } /** @@ -184,7 +183,7 @@ CUDF_KERNEL void valid_if_n_kernel(InputIterator1 begin1, auto block_valid_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { cudf::detail::atomic_add(valid_counts + mask_idx, block_valid_count); } + if (threadIdx.x == 0) { atomicAdd(valid_counts + mask_idx, block_valid_count); } } } diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 52f5b0108a4..bb320e4b81a 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -306,7 +305,7 @@ CUDF_KERNEL void count_set_bits_kernel(bitmask_type const* bitmask, __shared__ typename BlockReduce::TempStorage temp_storage; size_type block_count{BlockReduce(temp_storage).Sum(thread_count)}; - if (threadIdx.x == 0) { cudf::detail::atomic_add(global_count, block_count); } + if (threadIdx.x == 0) { atomicAdd(global_count, block_count); } } } // namespace diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index e76dac38f05..b1d850e0b27 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -149,7 +148,7 @@ CUDF_KERNEL void concatenate_masks_kernel(column_device_view const* views, using detail::single_lane_block_sum_reduce; auto const block_valid_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 
0) { cudf::detail::atomic_add(out_valid_count, block_valid_count); } + if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); } } } // namespace @@ -234,7 +233,7 @@ CUDF_KERNEL void fused_concatenate_kernel(column_device_view const* input_views, if (Nullable) { using detail::single_lane_block_sum_reduce; auto block_valid_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { cudf::detail::atomic_add(out_valid_count, block_valid_count); } + if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); } } } diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index 51c03375bb2..35185c283f4 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -73,7 +72,7 @@ struct calculate_quantile_fn { [d_result = d_result, segment_size, offset, this](size_type j) { if (segment_size == 0) { d_result.set_null(offset + j); - cudf::detail::atomic_add(this->null_count, 1); + atomicAdd(this->null_count, 1); } else { d_result.set_valid(offset + j); } diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index 5b37e31f949..49ad34838e3 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -151,7 +150,7 @@ struct var_functor { // fact. (1) is more work than it's worth without benchmarking, and // this approach should outperform (2) unless large amounts of the // data is null. - cudf::detail::atomic_add(d_null_count, 1); + atomicAdd(d_null_count, 1); } else { d_result.set_valid(i); } diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index 50241c2b0eb..0f5952b0330 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -17,8 +17,6 @@ #include -#include - #include using cudf::device_span; @@ -147,7 +145,7 @@ avro_decode_row(schemadesc_s const* schema, case type_null: if (dataptr != nullptr && dst_row >= 0) { atomicAnd(static_cast(dataptr) + (dst_row >> 5), ~(1 << (dst_row & 0x1f))); - cudf::detail::atomic_add(&schema_g[i].count, 1U); + atomicAdd(&schema_g[i].count, 1U); *skipped_row = false; } break; diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 1653382e375..8252cccbdb9 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -21,7 +21,6 @@ #include #include -#include #include #include #include @@ -202,12 +201,12 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) // points to last character in the field auto const field_len = static_cast(next_delimiter - field_start); if (serialized_trie_contains(opts.trie_na, {field_start, field_len})) { - cudf::detail::atomic_add(&d_column_data[actual_col].null_count, 1); + atomicAdd(&d_column_data[actual_col].null_count, 1); } else if (serialized_trie_contains(opts.trie_true, {field_start, field_len}) || serialized_trie_contains(opts.trie_false, {field_start, field_len})) { - cudf::detail::atomic_add(&d_column_data[actual_col].bool_count, 1); + atomicAdd(&d_column_data[actual_col].bool_count, 1); } else if (cudf::io::is_infinity(field_start, next_delimiter)) { - cudf::detail::atomic_add(&d_column_data[actual_col].float_count, 1); + atomicAdd(&d_column_data[actual_col].float_count, 1); } else { long count_number = 0; long count_decimal = 0; @@ -262,9 +261,9 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) if 
(column_flags[col] & column_parse::as_datetime) { // PANDAS uses `object` dtype if the date is unparseable if (is_datetime(count_string, count_decimal, count_colon, count_dash, count_slash)) { - cudf::detail::atomic_add(&d_column_data[actual_col].datetime_count, 1); + atomicAdd(&d_column_data[actual_col].datetime_count, 1); } else { - cudf::detail::atomic_add(&d_column_data[actual_col].string_count, 1); + atomicAdd(&d_column_data[actual_col].string_count, 1); } } else if (count_number == int_req_number_cnt) { auto const is_negative = (*trimmed_field_range.first == '-'); @@ -272,16 +271,16 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) trimmed_field_range.first + (is_negative || (*trimmed_field_range.first == '+')); cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( data_begin, data_begin + count_number, is_negative, d_column_data[actual_col]); - cudf::detail::atomic_add(ptr, 1); + atomicAdd(ptr, 1); } else if (is_floatingpoint(trimmed_field_len, count_number, count_decimal, count_thousands, count_dash + count_plus, count_exponent)) { - cudf::detail::atomic_add(&d_column_data[actual_col].float_count, 1); + atomicAdd(&d_column_data[actual_col].float_count, 1); } else { - cudf::detail::atomic_add(&d_column_data[actual_col].string_count, 1); + atomicAdd(&d_column_data[actual_col].string_count, 1); } } actual_col++; @@ -373,7 +372,7 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) column_flags[col] & column_parse::as_hexadecimal)) { // set the valid bitmap - all bits were set to 0 to start set_bit(valids[actual_col], rec_id); - cudf::detail::atomic_add(&valid_counts[actual_col], 1); + atomicAdd(&valid_counts[actual_col], 1); } } } else if (dtypes[actual_col].id() == cudf::type_id::STRING) { diff --git a/cpp/src/io/json/legacy/json_gpu.cu b/cpp/src/io/json/legacy/json_gpu.cu index 713252cc39a..b196643e7e7 100644 --- a/cpp/src/io/json/legacy/json_gpu.cu +++ b/cpp/src/io/json/legacy/json_gpu.cu @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -283,7 +282,7 @@ CUDF_KERNEL void convert_data_to_columns_kernel(parse_options_view opts, // set the valid bitmap - all bits were set to 0 to start set_bit(valid_fields[desc.column], rec_id); - cudf::detail::atomic_add(&num_valid_fields[desc.column], 1); + atomicAdd(&num_valid_fields[desc.column], 1); } else { if (cudf::type_dispatcher(column_types[desc.column], ConvertFunctor{}, @@ -296,7 +295,7 @@ CUDF_KERNEL void convert_data_to_columns_kernel(parse_options_view opts, false)) { // set the valid bitmap - all bits were set to 0 to start set_bit(valid_fields[desc.column], rec_id); - cudf::detail::atomic_add(&num_valid_fields[desc.column], 1); + atomicAdd(&num_valid_fields[desc.column], 1); } } } else if (column_types[desc.column].id() == type_id::STRING) { @@ -350,16 +349,16 @@ CUDF_KERNEL void detect_data_types_kernel( // Checking if the field is empty/valid if (serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { // Increase the null count for array rows, where the null count is initialized to zero. - if (!are_rows_objects) { cudf::detail::atomic_add(&column_infos[desc.column].null_count, 1); } + if (!are_rows_objects) { atomicAdd(&column_infos[desc.column].null_count, 1); } continue; } else if (are_rows_objects) { // For files with object rows, null count is initialized to row count. The value is decreased // here for every valid field. 
- cudf::detail::atomic_add(&column_infos[desc.column].null_count, -1); + atomicAdd(&column_infos[desc.column].null_count, -1); } // Don't need counts to detect strings, any field in quotes is deduced to be a string if (desc.is_quoted) { - cudf::detail::atomic_add(&column_infos[desc.column].string_count, 1); + atomicAdd(&column_infos[desc.column].string_count, 1); continue; } @@ -406,21 +405,21 @@ CUDF_KERNEL void detect_data_types_kernel( if (maybe_hex) { --int_req_number_cnt; } if (serialized_trie_contains(opts.trie_true, {desc.value_begin, value_len}) || serialized_trie_contains(opts.trie_false, {desc.value_begin, value_len})) { - cudf::detail::atomic_add(&column_infos[desc.column].bool_count, 1); + atomicAdd(&column_infos[desc.column].bool_count, 1); } else if (digit_count == int_req_number_cnt) { bool is_negative = (*desc.value_begin == '-'); char const* data_begin = desc.value_begin + (is_negative || (*desc.value_begin == '+')); cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( data_begin, data_begin + digit_count, is_negative, column_infos[desc.column]); - cudf::detail::atomic_add(ptr, 1); + atomicAdd(ptr, 1); } else if (is_like_float( value_len, digit_count, decimal_count, dash_count + plus_count, exponent_count)) { - cudf::detail::atomic_add(&column_infos[desc.column].float_count, 1); + atomicAdd(&column_infos[desc.column].float_count, 1); } // A date-time field cannot have more than 3 non-special characters // A number field cannot have more than one decimal point else if (other_count > 3 || decimal_count > 1) { - cudf::detail::atomic_add(&column_infos[desc.column].string_count, 1); + atomicAdd(&column_infos[desc.column].string_count, 1); } else { // A date field can have either one or two '-' or '\'; A legal combination will only have one // of them To simplify the process of auto column detection, we are not covering all the @@ -428,20 +427,20 @@ CUDF_KERNEL void detect_data_types_kernel( if ((dash_count > 0 && dash_count <= 2 && slash_count == 0) || (dash_count == 0 && slash_count > 0 && slash_count <= 2)) { if (colon_count <= 2) { - cudf::detail::atomic_add(&column_infos[desc.column].datetime_count, 1); + atomicAdd(&column_infos[desc.column].datetime_count, 1); } else { - cudf::detail::atomic_add(&column_infos[desc.column].string_count, 1); + atomicAdd(&column_infos[desc.column].string_count, 1); } } else { // Default field type is string - cudf::detail::atomic_add(&column_infos[desc.column].string_count, 1); + atomicAdd(&column_infos[desc.column].string_count, 1); } } } if (!are_rows_objects) { // For array rows, mark missing fields as null for (; input_field_index < num_columns; ++input_field_index) - cudf::detail::atomic_add(&column_infos[input_field_index].null_count, 1); + atomicAdd(&column_infos[input_field_index].null_count, 1); } } @@ -499,7 +498,7 @@ CUDF_KERNEL void collect_keys_info_kernel(parse_options_view const options, for (auto field_range = advance(row_data_range.first); field_range.key_begin < row_data_range.second; field_range = advance(field_range.value_end)) { - auto const idx = cudf::detail::atomic_add(keys_cnt, 1ULL); + auto const idx = atomicAdd(keys_cnt, 1ULL); if (keys_info.has_value()) { auto const len = field_range.key_end - field_range.key_begin; keys_info->column(0).element(idx) = field_range.key_begin - data.begin(); diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 09f2744e324..966af14560a 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -18,7 +18,6 @@ #include 
-#include #include #include @@ -1402,10 +1401,10 @@ CUDF_KERNEL void __launch_bounds__(block_size) // If we have an index, seek to the initial run and update row positions if (num_rowgroups > 0) { if (s->top.data.index.strm_offset[0] > s->chunk.strm_len[CI_DATA]) { - cudf::detail::atomic_add(error_count, 1); + atomicAdd(error_count, 1); } if (s->top.data.index.strm_offset[1] > s->chunk.strm_len[CI_DATA2]) { - cudf::detail::atomic_add(error_count, 1); + atomicAdd(error_count, 1); } uint32_t ofs0 = min(s->top.data.index.strm_offset[0], s->chunk.strm_len[CI_DATA]); uint32_t ofs1 = min(s->top.data.index.strm_offset[1], s->chunk.strm_len[CI_DATA2]); @@ -1826,8 +1825,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (num_rowgroups > 0) { row_groups[blockIdx.y][blockIdx.x].num_child_rows = s->num_child_rows; } - cudf::detail::atomic_add(&chunks[chunk_id].num_child_rows, - static_cast(s->num_child_rows)); + atomicAdd(&chunks[chunk_id].num_child_rows, static_cast(s->num_child_rows)); } } diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 643394ed8a1..a43c6d4cbb6 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -176,9 +175,9 @@ CUDF_KERNEL void __launch_bounds__(block_size) __syncthreads(); auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); if (t == 0) { - total_num_dict_entries = cudf::detail::atomic_add(&chunk->num_dict_entries, num_unique); + total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique); total_num_dict_entries += num_unique; - cudf::detail::atomic_add(&chunk->uniq_data_size, uniq_data_size); + atomicAdd(&chunk->uniq_data_size, uniq_data_size); } __syncthreads(); diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 71ff52becc2..8fd860d9492 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -438,14 +437,14 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, auto get_next_string = [&]() { if constexpr (is_warp) { size_type istring; - if (lane == 0) { istring = cudf::detail::atomic_add(str_counter, 1); } + if (lane == 0) { istring = atomicAdd(str_counter, 1); } return __shfl_sync(0xffffffff, istring, 0); } else { // Ensure lane 0 doesn't update istring before all threads have read the previous iteration's // istring value __syncthreads(); __shared__ size_type istring; - if (lane == 0) { istring = cudf::detail::atomic_add(str_counter, 1); } + if (lane == 0) { istring = atomicAdd(str_counter, 1); } __syncthreads(); return istring; } @@ -475,7 +474,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, if (is_null_literal && null_mask != nullptr) { if (lane == 0) { clear_bit(null_mask, istring); - cudf::detail::atomic_add(null_count_data, 1); + atomicAdd(null_count_data, 1); if (!d_chars) d_offsets[istring] = 0; } continue; // gride-stride return; @@ -619,7 +618,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, if (!d_chars && lane == 0) { if (null_mask != nullptr) { clear_bit(null_mask, istring); - cudf::detail::atomic_add(null_count_data, 1); + atomicAdd(null_count_data, 1); } last_offset = 0; d_offsets[istring] = 0; @@ -760,7 +759,7 @@ struct string_parse { options.trie_na, {in_begin, static_cast(num_in_chars)}); if (is_null_literal && null_mask != nullptr) { clear_bit(null_mask, idx); - 
cudf::detail::atomic_add(null_count_data, 1); + atomicAdd(null_count_data, 1); if (!d_chars) d_offsets[idx] = 0; return; } @@ -771,7 +770,7 @@ struct string_parse { if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { if (null_mask != nullptr) { clear_bit(null_mask, idx); - cudf::detail::atomic_add(null_count_data, 1); + atomicAdd(null_count_data, 1); } if (!d_chars) d_offsets[idx] = 0; } else { @@ -954,7 +953,7 @@ std::unique_ptr parse_data( if (is_null_literal) { col.set_null(row); - cudf::detail::atomic_add(null_count_data, 1); + atomicAdd(null_count_data, 1); return; } @@ -972,7 +971,7 @@ std::unique_ptr parse_data( false); if (not is_parsed) { col.set_null(row); - cudf::detail::atomic_add(null_count_data, 1); + atomicAdd(null_count_data, 1); } }); diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index 5eed1e5ca04..ddd094b09e5 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -107,7 +106,7 @@ CUDF_KERNEL void count_and_set_positions(char const* data, // Process the data for (long i = 0; i < byteToProcess; i++) { if (raw[i] == key) { - auto const idx = cudf::detail::atomic_add(count, static_cast(1)); + auto const idx = atomicAdd(count, static_cast(1)); setElement(positions, idx, did + offset + i, key); } } diff --git a/cpp/src/io/utilities/type_inference.cu b/cpp/src/io/utilities/type_inference.cu index eead98ca1c8..b446ad41946 100644 --- a/cpp/src/io/utilities/type_inference.cu +++ b/cpp/src/io/utilities/type_inference.cu @@ -19,7 +19,6 @@ #include #include -#include #include #include @@ -205,16 +204,16 @@ CUDF_KERNEL void infer_column_type_kernel(OptionsView options, auto const block_type_histogram = BlockReduce(temp_storage).Reduce(thread_type_histogram, custom_sum{}); if (threadIdx.x == 0) { - cudf::detail::atomic_add(&column_info->null_count, block_type_histogram.null_count); - cudf::detail::atomic_add(&column_info->float_count, block_type_histogram.float_count); - cudf::detail::atomic_add(&column_info->datetime_count, block_type_histogram.datetime_count); - cudf::detail::atomic_add(&column_info->string_count, block_type_histogram.string_count); - cudf::detail::atomic_add(&column_info->negative_small_int_count, - block_type_histogram.negative_small_int_count); - cudf::detail::atomic_add(&column_info->positive_small_int_count, - block_type_histogram.positive_small_int_count); - cudf::detail::atomic_add(&column_info->big_int_count, block_type_histogram.big_int_count); - cudf::detail::atomic_add(&column_info->bool_count, block_type_histogram.bool_count); + atomicAdd(&column_info->null_count, block_type_histogram.null_count); + atomicAdd(&column_info->float_count, block_type_histogram.float_count); + atomicAdd(&column_info->datetime_count, block_type_histogram.datetime_count); + atomicAdd(&column_info->string_count, block_type_histogram.string_count); + atomicAdd(&column_info->negative_small_int_count, + block_type_histogram.negative_small_int_count); + atomicAdd(&column_info->positive_small_int_count, + block_type_histogram.positive_small_int_count); + atomicAdd(&column_info->big_int_count, block_type_histogram.big_int_count); + atomicAdd(&column_info->bool_count, block_type_histogram.bool_count); } } diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index c87aa590385..71b60e1477e 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -20,7 
+20,6 @@ #include #include #include -#include #include #include @@ -281,7 +280,7 @@ __inline__ __device__ void add_pair_to_cache(size_type const first, size_type* joined_shared_l, size_type* joined_shared_r) { - size_type my_current_idx{cudf::detail::atomic_add(current_idx_shared + warp_id, size_type(1))}; + size_type my_current_idx{atomicAdd(current_idx_shared + warp_id, size_type(1))}; // its guaranteed to fit into the shared cache joined_shared_l[my_current_idx] = first; @@ -304,9 +303,7 @@ __device__ void flush_output_cache(unsigned int const activemask, int num_threads = __popc(activemask); cudf::size_type output_offset = 0; - if (0 == lane_id) { - output_offset = cudf::detail::atomic_add(current_idx, current_idx_shared[warp_id]); - } + if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); } // No warp sync is necessary here because we are assuming that ShuffleIndex // is internally using post-CUDA 9.0 synchronization-safe primitives diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index 5e67841b328..146b54c0d87 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -955,7 +954,7 @@ __launch_bounds__(block_size) CUDF_KERNEL if (out_valid_count) { size_type block_valid_count = cudf::detail::single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { cudf::detail::atomic_add(out_valid_count.value(), block_valid_count); } + if (threadIdx.x == 0) { atomicAdd(out_valid_count.value(), block_valid_count); } } } diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 0828a0fc772..8d8f1a71672 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -159,7 +158,7 @@ CUDF_KERNEL void compute_row_partition_numbers(row_hasher_t the_hasher, row_partition_numbers[row_number] = partition_number; row_partition_offset[row_number] = - cudf::detail::atomic_add(&(shared_partition_sizes[partition_number]), size_type(1)); + atomicAdd(&(shared_partition_sizes[partition_number]), size_type(1)); tid += stride; } @@ -172,7 +171,7 @@ CUDF_KERNEL void compute_row_partition_numbers(row_hasher_t the_hasher, size_type const block_partition_size = shared_partition_sizes[partition_number]; // Update global size of each partition - cudf::detail::atomic_add(&global_partition_sizes[partition_number], block_partition_size); + atomicAdd(&global_partition_sizes[partition_number], block_partition_size); // Record the size of this partition in this block size_type const write_location = partition_number * gridDim.x + blockIdx.x; @@ -230,7 +229,7 @@ CUDF_KERNEL void compute_row_output_locations(size_type* __restrict__ row_partit // Get output location based on partition number by incrementing the // corresponding partition offset for this block size_type const row_output_location = - cudf::detail::atomic_add(&(shared_partition_offsets[partition_number]), size_type(1)); + atomicAdd(&(shared_partition_offsets[partition_number]), size_type(1)); // Store the row's output location in-place row_partition_numbers[row_number] = row_output_location; @@ -709,7 +708,7 @@ struct dispatch_map_type { partition_map.end(), scatter_map.begin(), [offsets = histogram.data()] __device__(auto partition_number) { - return cudf::detail::atomic_add(&offsets[partition_number], 1); + return atomicAdd(&offsets[partition_number], 
1); }); // Scatter the rows into their partitions diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 95177f53a72..014171f2b40 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -110,7 +109,7 @@ CUDF_KERNEL void replace_nulls_strings(cudf::column_device_view input, uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count if (threadIdx.x == 0) { - cudf::detail::atomic_add(valid_counter, static_cast(block_valid_count)); + atomicAdd(valid_counter, static_cast(block_valid_count)); } } @@ -157,7 +156,7 @@ CUDF_KERNEL void replace_nulls(cudf::column_device_view input, cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count if (threadIdx.x == 0) { - cudf::detail::atomic_add(output_valid_count, static_cast(block_valid_count)); + atomicAdd(output_valid_count, static_cast(block_valid_count)); } } } diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 1b0a7316ad2..88d5d3a2375 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -170,7 +169,7 @@ CUDF_KERNEL void replace_strings_first_pass(cudf::column_device_view input, uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count if (threadIdx.x == 0) { - cudf::detail::atomic_add(output_valid_count, static_cast(block_valid_count)); + atomicAdd(output_valid_count, static_cast(block_valid_count)); } } @@ -299,7 +298,7 @@ CUDF_KERNEL void replace_kernel(cudf::column_device_view input, cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count if (threadIdx.x == 0) { - cudf::detail::atomic_add(output_valid_count, static_cast(block_valid_count)); + atomicAdd(output_valid_count, static_cast(block_valid_count)); } } } diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index d14d4e3e94e..20845a97c7e 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -1078,7 +1077,7 @@ __launch_bounds__(block_size) CUDF_KERNEL size_type block_valid_count = cudf::detail::single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { cudf::detail::atomic_add(output_valid_count, block_valid_count); } + if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); } } /** diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu index 34688e8655b..c3127aa2d20 100644 --- a/cpp/src/stream_compaction/distinct_helpers.cu +++ b/cpp/src/stream_compaction/distinct_helpers.cu @@ -17,7 +17,6 @@ #include "distinct_helpers.hpp" #include -#include namespace cudf::detail { @@ -54,7 +53,7 @@ struct reduce_fn : reduce_by_row_fn_base #include #include -#include #include #include #include @@ -168,7 +167,7 @@ CUDF_KERNEL void fused_concatenate_string_offset_kernel( if (Nullable) { using cudf::detail::single_lane_block_sum_reduce; auto block_valid_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { cudf::detail::atomic_add(out_valid_count, block_valid_count); } + if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); } } } diff 
--git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 69fd407b6c2..3d0210d61b0 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -331,7 +330,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in target_count, [d_string_indices, d_targets_offsets] __device__(size_type idx) { auto const str_idx = d_string_indices[idx] - 1; - cudf::detail::atomic_add(d_targets_offsets + str_idx, 1); + atomicAdd(d_targets_offsets + str_idx, 1); }); // finally, convert the counts into offsets thrust::exclusive_scan(rmm::exec_policy(stream), diff --git a/cpp/src/transform/row_conversion.cu b/cpp/src/transform/row_conversion.cu index 8c51bea3c88..b294369a90e 100644 --- a/cpp/src/transform/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -251,7 +250,7 @@ build_string_row_offsets(table_view const& tbl, auto const col = element_idx / num_rows; auto const val = d_offsets_iterators[col][row + 1] - d_offsets_iterators[col][row]; - cudf::detail::atomic_add(&d_row_sizes[row], val); + atomicAdd(&d_row_sizes[row], val); }); // transform the row sizes to include fixed width size and alignment diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index 316137068f2..f215b4fce24 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -38,8 +38,8 @@ CUDF_KERNEL void gpu_atomic_test(T* result, T* data, size_t size) for (; id < size; id += step) { cudf::detail::atomic_add(&result[0], data[id]); - atomicMin(&result[1], data[id]); - atomicMax(&result[2], data[id]); + cudf::detail::atomic_min(&result[1], data[id]); + cudf::detail::atomic_max(&result[2], data[id]); cudf::detail::genericAtomicOperation(&result[3], data[id], cudf::DeviceSum{}); cudf::detail::genericAtomicOperation(&result[4], data[id], cudf::DeviceMin{}); cudf::detail::genericAtomicOperation(&result[5], data[id], cudf::DeviceMax{}); @@ -72,7 +72,7 @@ __device__ T atomic_op(T* addr, T const& value, BinaryOp op) assumed = old_value; T new_value = op(old_value, value); - old_value = atomicCAS(addr, assumed, new_value); + old_value = cudf::detail::atomic_cas(addr, assumed, new_value); } while (assumed != old_value); return old_value; From 0d46d99ba2725bda2e9840b54dd2bff6cd72769b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 8 Mar 2024 08:53:57 -0800 Subject: [PATCH 5/6] Fix copyright. --- cpp/include/cudf/detail/labeling/label_segments.cuh | 2 +- cpp/src/groupby/sort/group_std.cu | 2 +- cpp/src/stream_compaction/distinct_helpers.cu | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index f522abef0b1..9051230a272 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index 49ad34838e3..30b6f67dffe 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu index c3127aa2d20..8f36ec98f4a 100644 --- a/cpp/src/stream_compaction/distinct_helpers.cu +++ b/cpp/src/stream_compaction/distinct_helpers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 282a3415fd6ff77d66362d0ba678c0bdd9d1f1f0 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 8 Mar 2024 17:20:49 -0800 Subject: [PATCH 6/6] Add missing headers that were implicitly included before this PR. --- cpp/benchmarks/join/generate_input_tables.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index d9c428f86af..93401f01026 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -16,9 +16,11 @@ #pragma once +#include #include #include +#include #include #include
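
The doc comments reworked in PATCH 3/6 describe the fallback used for types without a native CUDA atomic: read the old value, apply the binary operator, and publish the result with `atomicCAS`, retrying whenever another thread intervened in between. The minimal, self-contained CUDA sketch below illustrates that CAS-loop pattern for a 4-byte float add together with a toy call site. It is not part of the patch series; the `cas_loop_add` helper, the `accumulate` kernel, and the launch configuration are illustrative assumptions rather than libcudf code (inside libcudf the equivalent work is done by the `genericAtomicOperationImpl` specializations and the `cudf::detail::atomic_add` overload refactored above).

// Illustrative sketch only; not part of the patch series. It mirrors the
// 4-byte CAS-loop fallback described in device_atomics.cuh and compiles on
// its own with nvcc. Names and launch configuration are hypothetical.
#include <cstdio>
#include <vector>

#include <cuda_runtime.h>

// Read the current bits, compute old + val, and try to swap the result in;
// retry whenever another thread modified the location first. Returns the
// previous value, matching atomicAdd semantics.
__device__ float cas_loop_add(float* addr, float val)
{
  auto* addr_as_uint    = reinterpret_cast<unsigned int*>(addr);
  unsigned int old_bits = *addr_as_uint;
  unsigned int assumed;
  do {
    assumed             = old_bits;
    float const updated = __uint_as_float(assumed) + val;
    old_bits            = atomicCAS(addr_as_uint, assumed, __float_as_uint(updated));
  } while (assumed != old_bits);
  return __uint_as_float(old_bits);
}

// Toy kernel: every thread folds one element into a single running total.
__global__ void accumulate(float* total, float const* data, int n)
{
  int const i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { cas_loop_add(total, data[i]); }
}

int main()
{
  constexpr int n = 1024;
  std::vector<float> h_data(n, 1.0f);

  float* d_total = nullptr;
  float* d_data  = nullptr;
  cudaMalloc(&d_total, sizeof(float));
  cudaMalloc(&d_data, n * sizeof(float));
  cudaMemset(d_total, 0, sizeof(float));
  cudaMemcpy(d_data, h_data.data(), n * sizeof(float), cudaMemcpyHostToDevice);

  accumulate<<<(n + 255) / 256, 256>>>(d_total, d_data, n);

  float h_total = 0.0f;
  cudaMemcpy(&h_total, d_total, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("sum = %f (expected %d)\n", h_total, n);

  cudaFree(d_data);
  cudaFree(d_total);
  return 0;
}

Compared with a plain `atomicAdd`, the CAS loop generates extra memory traffic under contention, which is why the specializations in device_atomics.cuh prefer the native `atomicAdd`, `atomicMin`, and `atomicMax` instructions whenever CUDA provides one for the type and fall back to `atomicCAS` only for the remaining types.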