From 155405b4454e64442562d04c8448a81fe8eca87b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 12 Mar 2024 18:08:09 -0500 Subject: [PATCH] Add missing atomic operators, refactor atomic operators, move atomic operators to detail namespace. (#14962) This PR does a thorough refactoring of `device_atomics.cuh`. - I moved all atomic-related functions to `cudf::detail::` (making this an API-breaking change, but most likely a low-impact break) - I added all missing operators for natively supported types to `atomicAdd`, `atomicMin`, `atomicMax`, etc. as discussed in #10149 and #14907. - This should prevent fallback to the `atomicCAS` path for types that are natively supported for those atomic operators, which we suspect as the root cause of the performance regression in #14886. - I kept `atomicAdd` rather than `cudf::detail::atomic_add` in locations where a native CUDA overload exists, and the same for min/max/CAS operations. Aggregations are the only place where we use the special overloads. We were previously calling the native CUDA function rather than our special overloads in many cases, so I retained the previous behavior. This avoids including the additional headers that implement an unnecessary level of wrapping for natively supported overloads. - I enabled native 2-byte CAS operations (on `unsigned short int`) that eliminate the do-while loop and extra alignment-checking logic - The CUDA docs don't state this, but some forum posts claim this is only supported by compute capability 7.0+. We now have 7.0 as a lower bound for RAPIDS so I'm not concerned by this as long as builds/tests pass. - I improved/cleaned the documentation and moved around some code so that the operators were in a logical order. - I assessed the existing tests and it looks like all the types are being covered. I'm not sure if there is a good way to enforce that certain types (like `uint64_t`) are passing through native `atomicAdd` calls. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - David Wendt (https://github.com/davidwendt) - Suraj Aralihalli (https://github.com/SurajAralihalli) URL: https://github.com/rapidsai/cudf/pull/14962 --- cpp/benchmarks/join/generate_input_tables.cuh | 3 +- .../cudf/detail/aggregation/aggregation.cuh | 46 +-- .../cudf/detail/utilities/device_atomics.cuh | 261 +++++++++++------- cpp/src/io/avro/avro_gpu.cu | 2 +- cpp/src/io/json/legacy/json_gpu.cu | 2 +- cpp/src/io/utilities/parsing_utils.cu | 2 +- cpp/src/replace/nulls.cu | 8 +- cpp/src/replace/replace.cu | 8 +- .../device_atomics/device_atomics_test.cu | 14 +- 9 files changed, 206 insertions(+), 140 deletions(-) diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index b14541564dd..93401f01026 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -16,10 +16,11 @@ #pragma once -#include +#include #include #include +#include #include #include diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index f13166d5321..ecf2f610697 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -144,8 +144,8 @@ struct update_target_element< if (source_has_nulls and source.is_null(source_index)) { return; } using Target = target_type_t; - atomicMin(&target.element(target_index), - static_cast(source.element(source_index))); + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source.element(source_index))); if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } } @@ -170,8 +170,8 @@ struct update_target_element< using DeviceTarget = device_storage_type_t; using DeviceSource = device_storage_type_t; - atomicMin(&target.element(target_index), - static_cast(source.element(source_index))); + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source.element(source_index))); if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } } @@ -193,8 +193,8 @@ struct update_target_element< if (source_has_nulls and source.is_null(source_index)) { return; } using Target = target_type_t; - atomicMax(&target.element(target_index), - static_cast(source.element(source_index))); + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source.element(source_index))); if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } } @@ -219,8 +219,8 @@ struct update_target_element< using DeviceTarget = device_storage_type_t; using DeviceSource = device_storage_type_t; - atomicMax(&target.element(target_index), - static_cast(source.element(source_index))); + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source.element(source_index))); if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } } @@ -242,8 +242,8 @@ struct update_target_element< if (source_has_nulls and source.is_null(source_index)) { return; } using Target = target_type_t; - atomicAdd(&target.element(target_index), - static_cast(source.element(source_index))); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source.element(source_index))); if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } } @@ -268,8 +268,8 @@ struct update_target_element< using DeviceTarget = device_storage_type_t; using DeviceSource = device_storage_type_t; - atomicAdd(&target.element(target_index), - static_cast(source.element(source_index))); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source.element(source_index))); if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } } @@ -368,7 +368,7 @@ struct update_target_element; auto value = static_cast(source.element(source_index)); - atomicAdd(&target.element(target_index), value * value); + cudf::detail::atomic_add(&target.element(target_index), value * value); if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } } }; @@ -387,8 +387,8 @@ struct update_target_element; - atomicMul(&target.element(target_index), - static_cast(source.element(source_index))); + cudf::detail::atomic_mul(&target.element(target_index), + static_cast(source.element(source_index))); if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } } }; @@ -408,7 +408,7 @@ struct update_target_element< if (source_has_nulls and source.is_null(source_index)) { return; } using Target = target_type_t; - atomicAdd(&target.element(target_index), Target{1}); + cudf::detail::atomic_add(&target.element(target_index), Target{1}); // It is assumed the output for COUNT_VALID is initialized to be all valid } @@ -427,7 +427,7 @@ struct update_target_element< size_type source_index) const noexcept { using Target = target_type_t; - atomicAdd(&target.element(target_index), Target{1}); + cudf::detail::atomic_add(&target.element(target_index), Target{1}); // It is assumed the output for COUNT_ALL is initialized to be all valid } @@ -449,10 +449,11 @@ struct update_target_element< if (source_has_nulls and source.is_null(source_index)) { return; } using Target = target_type_t; - auto old = atomicCAS(&target.element(target_index), ARGMAX_SENTINEL, source_index); + auto old = cudf::detail::atomic_cas( + &target.element(target_index), ARGMAX_SENTINEL, source_index); if (old != ARGMAX_SENTINEL) { while (source.element(source_index) > source.element(old)) { - old = atomicCAS(&target.element(target_index), old, source_index); + old = cudf::detail::atomic_cas(&target.element(target_index), old, source_index); } } @@ -476,10 +477,11 @@ struct update_target_element< if (source_has_nulls and source.is_null(source_index)) { return; } using Target = target_type_t; - auto old = atomicCAS(&target.element(target_index), ARGMIN_SENTINEL, source_index); + auto old = cudf::detail::atomic_cas( + &target.element(target_index), ARGMIN_SENTINEL, source_index); if (old != ARGMIN_SENTINEL) { while (source.element(source_index) < source.element(old)) { - old = atomicCAS(&target.element(target_index), old, source_index); + old = cudf::detail::atomic_cas(&target.element(target_index), old, source_index); } } diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index 6f23abc59a8..61c5f35d62a 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -20,15 +20,15 @@ * @brief overloads for CUDA atomic operations * @file device_atomics.cuh * - * Provides the overloads for all of possible cudf's data types, - * where cudf's data types are, int8_t, int16_t, int32_t, int64_t, float, double, - * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us, + * Provides the overloads for all of cudf's data types, specifically int8_t, + * int16_t, int32_t, int64_t, float, double, cudf::timestamp_D, + * cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us, * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms, - * cudf::duration_us, cudf::duration_ns and bool - * where CUDA atomic operations are, `atomicAdd`, `atomicMin`, `atomicMax`, - * `atomicCAS`. - * Also provides `cudf::genericAtomicOperation` which performs atomic operation - * with the given binary operator. + * cudf::duration_us, cudf::duration_ns and bool for the CUDA atomic operations + * `atomicAdd`, `atomicMin`, `atomicMax`, `atomicCAS`. + * + * Also provides `cudf::detail::genericAtomicOperation` which performs an + * atomic operation with the given binary operator. */ #include @@ -85,26 +85,22 @@ template struct genericAtomicOperationImpl { __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) { - using T_int = unsigned int; - bool is_32_align = (reinterpret_cast(addr) & 2) == 0; - auto* address_uint32 = - reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); + using T_int = unsigned short int; + static_assert(sizeof(T) == sizeof(T_int)); - T_int old = *address_uint32; + T old_value = *addr; T_int assumed; + T_int ret; do { - assumed = old; - T const target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); - uint16_t updating_value = type_reinterpret(op(target_value, update_value)); + T_int const new_value = type_reinterpret(op(old_value, update_value)); - T_int const new_value = (is_32_align) ? (old & 0xffff'0000) | updating_value - : (old & 0xffff) | (T_int(updating_value) << 16); - old = atomicCAS(address_uint32, assumed, new_value); - } while (assumed != old); + assumed = type_reinterpret(old_value); + ret = atomicCAS(reinterpret_cast(addr), assumed, new_value); + old_value = type_reinterpret(ret); + } while (assumed != ret); - return (is_32_align) ? T(old & 0xffff) : T(old >> 16); - ; + return old_value; } }; @@ -114,6 +110,7 @@ struct genericAtomicOperationImpl { __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) { using T_int = unsigned int; + static_assert(sizeof(T) == sizeof(T_int)); T old_value = *addr; T_int assumed; @@ -125,7 +122,6 @@ struct genericAtomicOperationImpl { assumed = type_reinterpret(old_value); ret = atomicCAS(reinterpret_cast(addr), assumed, new_value); old_value = type_reinterpret(ret); - } while (assumed != ret); return old_value; @@ -150,17 +146,17 @@ struct genericAtomicOperationImpl { assumed = type_reinterpret(old_value); ret = atomicCAS(reinterpret_cast(addr), assumed, new_value); old_value = type_reinterpret(ret); - } while (assumed != ret); return old_value; } }; -// ----------------------------------------------------------------------- -// specialized functions for operators -// `atomicAdd` supports int32, float, double (signed int64 is not supported.) -// `atomicMin`, `atomicMax` support int32_t, int64_t +// Specialized functions for operators. + +// `atomicAdd` supports int32_t, uint32_t, uint64_t, float, double. +// `atomicAdd` does not support int64_t. + template <> struct genericAtomicOperationImpl { using T = float; @@ -188,9 +184,9 @@ struct genericAtomicOperationImpl { } }; -// Cuda natively supports `unsigned long long int` for `atomicAdd`, -// but doesn't supports `signed long long int`. -// However, since the signed integer is represented as Two's complement, +// CUDA natively supports `unsigned long long int` for `atomicAdd`, +// but doesn't support `signed long long int`. +// However, since the signed integer is represented as two's complement, // the fundamental arithmetic operations of addition are identical to // those for unsigned binary numbers. // Then, this computes as `unsigned long long int` with `atomicAdd` @@ -207,6 +203,29 @@ struct genericAtomicOperationImpl { } }; +template <> +struct genericAtomicOperationImpl { + using T = uint32_t; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) + { + return atomicAdd(addr, update_value); + } +}; + +template <> +struct genericAtomicOperationImpl { + using T = uint64_t; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) + { + using T_int = unsigned long long int; + static_assert(sizeof(T) == sizeof(T_int)); + T ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); + return ret; + } +}; + +// `atomicMin`, `atomicMax` support int32_t, int64_t, uint32_t, uint64_t. + template <> struct genericAtomicOperationImpl { using T = int32_t; @@ -217,11 +236,11 @@ struct genericAtomicOperationImpl { }; template <> -struct genericAtomicOperationImpl { - using T = int32_t; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) +struct genericAtomicOperationImpl { + using T = uint32_t; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op) { - return atomicMax(addr, update_value); + return atomicMin(addr, update_value); } }; @@ -237,6 +256,36 @@ struct genericAtomicOperationImpl { } }; +template <> +struct genericAtomicOperationImpl { + using T = uint64_t; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op) + { + using T_int = unsigned long long int; + static_assert(sizeof(T) == sizeof(T_int)); + T ret = atomicMin(reinterpret_cast(addr), type_reinterpret(update_value)); + return ret; + } +}; + +template <> +struct genericAtomicOperationImpl { + using T = int32_t; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) + { + return atomicMax(addr, update_value); + } +}; + +template <> +struct genericAtomicOperationImpl { + using T = uint32_t; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) + { + return atomicMax(addr, update_value); + } +}; + template <> struct genericAtomicOperationImpl { using T = int64_t; @@ -248,6 +297,19 @@ struct genericAtomicOperationImpl { return ret; } }; + +template <> +struct genericAtomicOperationImpl { + using T = uint64_t; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) + { + using T_int = unsigned long long int; + static_assert(sizeof(T) == sizeof(T_int)); + T ret = atomicMax(reinterpret_cast(addr), type_reinterpret(update_value)); + return ret; + } +}; + // ----------------------------------------------------------------------- // the implementation of `typesAtomicCASImpl` template @@ -289,28 +351,14 @@ template struct typesAtomicCASImpl { __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) { - using T_int = unsigned int; - - bool is_32_align = (reinterpret_cast(addr) & 2) == 0; - auto* address_uint32 = - reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); - - T_int old = *address_uint32; - T_int assumed; - T target_value; - uint16_t u_val = type_reinterpret(update_value); - - do { - assumed = old; - target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); - if (target_value != compare) break; + using T_int = unsigned short int; + static_assert(sizeof(T) == sizeof(T_int)); - T_int new_value = - (is_32_align) ? (old & 0xffff'0000) | u_val : (old & 0xffff) | (T_int(u_val) << 16); - old = atomicCAS(address_uint32, assumed, new_value); - } while (assumed != old); + T_int ret = atomicCAS(reinterpret_cast(addr), + type_reinterpret(compare), + type_reinterpret(update_value)); - return target_value; + return type_reinterpret(ret); } }; @@ -319,6 +367,7 @@ struct typesAtomicCASImpl { __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) { using T_int = unsigned int; + static_assert(sizeof(T) == sizeof(T_int)); T_int ret = atomicCAS(reinterpret_cast(addr), type_reinterpret(compare), @@ -328,7 +377,6 @@ struct typesAtomicCASImpl { } }; -// 8 bytes atomic operation template struct typesAtomicCASImpl { __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) @@ -344,11 +392,10 @@ struct typesAtomicCASImpl { } }; -} // namespace detail - /** - * @brief compute atomic binary operation - * reads the `old` located at the `address` in global or shared memory, + * @brief Compute atomic binary operation + * + * Reads the `old` located at the `address` in global or shared memory, * computes 'BinaryOp'('old', 'update_value'), * and stores the result back to memory at the same address. * These three operations are performed in one atomic transaction. @@ -356,9 +403,9 @@ struct typesAtomicCASImpl { * The supported cudf types for `genericAtomicOperation` are: * int8_t, int16_t, int32_t, int64_t, float, double * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * @param[in] op The binary operator used for compute + * @param address The address of old value in global or shared memory + * @param val The value to be computed + * @param op The binary operator used for compute * * @returns The old value at `address` */ @@ -408,38 +455,38 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address, return T(fun(address, update_value, op)); } -} // namespace cudf - /** - * @brief Overloads for `atomicAdd` - * reads the `old` located at the `address` in global or shared memory, + * @brief Overloads for `atomic_add` + * + * Reads the `old` located at the `address` in global or shared memory, * computes (old + val), and stores the result back to memory at the same * address. These three operations are performed in one atomic transaction. * - * The supported cudf types for `atomicAdd` are: + * The supported cudf types for `atomic_add` are: * int8_t, int16_t, int32_t, int64_t, float, double, * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms cudf::timestamp_us, * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms, * cudf::duration_us, cudf::duration_ns and bool * - * Cuda natively supports `sint32`, `uint32`, `uint64`, `float`, `double. + * CUDA natively supports `int32_t`, `uint32_t`, `uint64_t`, `float`, `double. * (`double` is supported after Pascal). * Other types are implemented by `atomicCAS`. * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be added + * @param address The address of old value in global or shared memory + * @param val The value to be added * * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicAdd(T* address, T val) +__forceinline__ __device__ T atomic_add(T* address, T val) { - return cudf::genericAtomicOperation(address, val, cudf::DeviceSum{}); + return cudf::detail::genericAtomicOperation(address, val, cudf::DeviceSum{}); } /** - * @brief Overloads for `atomicMul` - * reads the `old` located at the `address` in global or shared memory, + * @brief Overloads for `atomic_mul` + * + * Reads the `old` located at the `address` in global or shared memory, * computes (old * val), and stores the result back to memory at the same * address. These three operations are performed in one atomic transaction. * @@ -448,92 +495,100 @@ __forceinline__ __device__ T atomicAdd(T* address, T val) * * All types are implemented by `atomicCAS`. * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be multiplied + * @param address The address of old value in global or shared memory + * @param val The value to be multiplied * * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMul(T* address, T val) +__forceinline__ __device__ T atomic_mul(T* address, T val) { - return cudf::genericAtomicOperation(address, val, cudf::DeviceProduct{}); + return cudf::detail::genericAtomicOperation(address, val, cudf::DeviceProduct{}); } /** - * @brief Overloads for `atomicMin` - * reads the `old` located at the `address` in global or shared memory, + * @brief Overloads for `atomic_min` + * + * Reads the `old` located at the `address` in global or shared memory, * computes the minimum of old and val, and stores the result back to memory * at the same address. * These three operations are performed in one atomic transaction. * - * The supported cudf types for `atomicMin` are: + * The supported cudf types for `atomic_min` are: * int8_t, int16_t, int32_t, int64_t, float, double, * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us, * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms, * cudf::duration_us, cudf::duration_ns and bool - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. + * + * CUDA natively supports `int32_t`, `uint32_t`, `int64_t`, `uint64_t`. * Other types are implemented by `atomicCAS`. * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed + * @param address The address of old value in global or shared memory + * @param val The value to be computed * * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMin(T* address, T val) +__forceinline__ __device__ T atomic_min(T* address, T val) { - return cudf::genericAtomicOperation(address, val, cudf::DeviceMin{}); + return cudf::detail::genericAtomicOperation(address, val, cudf::DeviceMin{}); } /** - * @brief Overloads for `atomicMax` - * reads the `old` located at the `address` in global or shared memory, + * @brief Overloads for `atomic_max` + * + * Reads the `old` located at the `address` in global or shared memory, * computes the maximum of old and val, and stores the result back to memory * at the same address. * These three operations are performed in one atomic transaction. * - * The supported cudf types for `atomicMax` are: + * The supported cudf types for `atomic_max` are: * int8_t, int16_t, int32_t, int64_t, float, double, * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us, * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms, * cudf::duration_us, cudf::duration_ns and bool - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. + * + * CUDA natively supports `int32_t`, `uint32_t`, `int64_t`, `uint64_t`. * Other types are implemented by `atomicCAS`. * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed + * @param address The address of old value in global or shared memory + * @param val The value to be computed * * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMax(T* address, T val) +__forceinline__ __device__ T atomic_max(T* address, T val) { - return cudf::genericAtomicOperation(address, val, cudf::DeviceMax{}); + return cudf::detail::genericAtomicOperation(address, val, cudf::DeviceMax{}); } /** - * @brief Overloads for `atomicCAS` - * reads the `old` located at the `address` in global or shared memory, + * @brief Overloads for `atomic_cas` + * + * Reads the `old` located at the `address` in global or shared memory, * computes (`old` == `compare` ? `val` : `old`), * and stores the result back to memory at the same address. * These three operations are performed in one atomic transaction. * - * The supported cudf types for `atomicCAS` are: + * The supported cudf types for `atomic_cas` are: * int8_t, int16_t, int32_t, int64_t, float, double, * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us, * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms, * cudf::duration_us, cudf::duration_ns and bool - * Cuda natively supports `sint32`, `uint32`, `uint64`. + * CUDA natively supports `int32_t`, `uint32_t`, `uint64_t`. * Other types are implemented by `atomicCAS`. * - * @param[in] address The address of old value in global or shared memory - * @param[in] compare The value to be compared - * @param[in] val The value to be computed + * @param address The address of old value in global or shared memory + * @param compare The value to be compared + * @param val The value to be computed * * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicCAS(T* address, T compare, T val) +__forceinline__ __device__ T atomic_cas(T* address, T compare, T val) { return cudf::detail::typesAtomicCASImpl()(address, compare, val); } + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index 612b2d32b7d..b3c8882f541 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -144,7 +144,7 @@ avro_decode_row(schemadesc_s const* schema, case type_null: if (dataptr != nullptr && dst_row >= 0) { atomicAnd(static_cast(dataptr) + (dst_row >> 5), ~(1 << (dst_row & 0x1f))); - atomicAdd(&schema_g[i].count, 1); + atomicAdd(&schema_g[i].count, 1U); *skipped_row = false; } break; diff --git a/cpp/src/io/json/legacy/json_gpu.cu b/cpp/src/io/json/legacy/json_gpu.cu index 9beeecdd6fb..ff4845fcecb 100644 --- a/cpp/src/io/json/legacy/json_gpu.cu +++ b/cpp/src/io/json/legacy/json_gpu.cu @@ -497,7 +497,7 @@ CUDF_KERNEL void collect_keys_info_kernel(parse_options_view const options, for (auto field_range = advance(row_data_range.first); field_range.key_begin < row_data_range.second; field_range = advance(field_range.value_end)) { - auto const idx = atomicAdd(keys_cnt, 1); + auto const idx = atomicAdd(keys_cnt, 1ULL); if (keys_info.has_value()) { auto const len = field_range.key_end - field_range.key_begin; keys_info->column(0).element(idx) = field_range.key_begin - data.begin(); diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index c1cbcd0baca..cb8be380c5b 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -106,7 +106,7 @@ CUDF_KERNEL void count_and_set_positions(char const* data, // Process the data for (long i = 0; i < byteToProcess; i++) { if (raw[i] == key) { - auto const idx = atomicAdd(count, (cudf::size_type)1); + auto const idx = atomicAdd(count, static_cast(1)); setElement(positions, idx, did + offset + i, key); } } diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 8ea229368cc..014171f2b40 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -108,7 +108,9 @@ CUDF_KERNEL void replace_nulls_strings(cudf::column_device_view input, // Compute total valid count for this block and add it to global count uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count - if (threadIdx.x == 0) { atomicAdd(valid_counter, block_valid_count); } + if (threadIdx.x == 0) { + atomicAdd(valid_counter, static_cast(block_valid_count)); + } } template @@ -153,7 +155,9 @@ CUDF_KERNEL void replace_nulls(cudf::column_device_view input, uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count - if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); } + if (threadIdx.x == 0) { + atomicAdd(output_valid_count, static_cast(block_valid_count)); + } } } diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 184c30246c7..88d5d3a2375 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -168,7 +168,9 @@ CUDF_KERNEL void replace_strings_first_pass(cudf::column_device_view input, // Compute total valid count for this block and add it to global count uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count - if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); } + if (threadIdx.x == 0) { + atomicAdd(output_valid_count, static_cast(block_valid_count)); + } } /** @@ -295,7 +297,9 @@ CUDF_KERNEL void replace_kernel(cudf::column_device_view input, uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count - if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); } + if (threadIdx.x == 0) { + atomicAdd(output_valid_count, static_cast(block_valid_count)); + } } } diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index 0d846404ea2..ccf5ccae187 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -37,12 +37,12 @@ CUDF_KERNEL void gpu_atomic_test(T* result, T* data, size_t size) size_t step = blockDim.x * gridDim.x; for (; id < size; id += step) { - atomicAdd(&result[0], data[id]); - atomicMin(&result[1], data[id]); - atomicMax(&result[2], data[id]); - cudf::genericAtomicOperation(&result[3], data[id], cudf::DeviceSum{}); - cudf::genericAtomicOperation(&result[4], data[id], cudf::DeviceMin{}); - cudf::genericAtomicOperation(&result[5], data[id], cudf::DeviceMax{}); + cudf::detail::atomic_add(&result[0], data[id]); + cudf::detail::atomic_min(&result[1], data[id]); + cudf::detail::atomic_max(&result[2], data[id]); + cudf::detail::genericAtomicOperation(&result[3], data[id], cudf::DeviceSum{}); + cudf::detail::genericAtomicOperation(&result[4], data[id], cudf::DeviceMin{}); + cudf::detail::genericAtomicOperation(&result[5], data[id], cudf::DeviceMax{}); } } @@ -72,7 +72,7 @@ __device__ T atomic_op(T* addr, T const& value, BinaryOp op) assumed = old_value; T new_value = op(old_value, value); - old_value = atomicCAS(addr, assumed, new_value); + old_value = cudf::detail::atomic_cas(addr, assumed, new_value); } while (assumed != old_value); return old_value;