diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh
index b14541564dd..93401f01026 100644
--- a/cpp/benchmarks/join/generate_input_tables.cuh
+++ b/cpp/benchmarks/join/generate_input_tables.cuh
@@ -16,10 +16,11 @@

 #pragma once

-#include
+#include

 #include
 #include
+#include

 #include
 #include
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index f13166d5321..ecf2f610697 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -144,8 +144,8 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }

     using Target = target_type_t<Source, aggregation::MIN>;
-    atomicMin(&target.element<Target>(target_index),
-              static_cast<Target>(source.element<Source>(source_index)));
+    cudf::detail::atomic_min(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));

     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -170,8 +170,8 @@ struct update_target_element<
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;

-    atomicMin(&target.element<DeviceTarget>(target_index),
-              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    cudf::detail::atomic_min(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));

     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -193,8 +193,8 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }

     using Target = target_type_t<Source, aggregation::MAX>;
-    atomicMax(&target.element<Target>(target_index),
-              static_cast<Target>(source.element<Source>(source_index)));
+    cudf::detail::atomic_max(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));

     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -219,8 +219,8 @@ struct update_target_element<
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;

-    atomicMax(&target.element<DeviceTarget>(target_index),
-              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    cudf::detail::atomic_max(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));

     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -242,8 +242,8 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }

     using Target = target_type_t<Source, aggregation::SUM>;
-    atomicAdd(&target.element<Target>(target_index),
-              static_cast<Target>(source.element<Source>(source_index)));
+    cudf::detail::atomic_add(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));

     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -268,8 +268,8 @@ struct update_target_element<
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;

-    atomicAdd(&target.element<DeviceTarget>(target_index),
-              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    cudf::detail::atomic_add(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));

     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -368,7 +368,7 @@ struct update_target_element<
     using Target = target_type_t<Source, aggregation::SUM_OF_SQUARES>;
     auto value   = static_cast<Target>(source.element<Source>(source_index));
-    atomicAdd(&target.element<Target>(target_index), value * value);
+    cudf::detail::atomic_add(&target.element<Target>(target_index), value * value);
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
 };
@@ -387,8 +387,8 @@ struct update_target_element<
     using Target = target_type_t<Source, aggregation::PRODUCT>;
-    atomicMul(&target.element<Target>(target_index),
-              static_cast<Target>(source.element<Source>(source_index)));
+    cudf::detail::atomic_mul(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
 };
@@ -408,7 +408,7 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }

     using Target = target_type_t<Source, aggregation::COUNT_VALID>;
-    atomicAdd(&target.element<Target>(target_index), Target{1});
+    cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});

     // It is assumed the output for COUNT_VALID is initialized to be all valid
   }
@@ -427,7 +427,7 @@ struct update_target_element<
                              size_type source_index) const noexcept
   {
     using Target = target_type_t<Source, aggregation::COUNT_ALL>;
-    atomicAdd(&target.element<Target>(target_index), Target{1});
+    cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});

     // It is assumed the output for COUNT_ALL is initialized to be all valid
   }
@@ -449,10 +449,11 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }

     using Target = target_type_t<Source, aggregation::ARGMAX>;
-    auto old = atomicCAS(&target.element<Target>(target_index), ARGMAX_SENTINEL, source_index);
+    auto old = cudf::detail::atomic_cas(
+      &target.element<Target>(target_index), ARGMAX_SENTINEL, source_index);
     if (old != ARGMAX_SENTINEL) {
       while (source.element<Source>(source_index) > source.element<Source>(old)) {
-        old = atomicCAS(&target.element<Target>(target_index), old, source_index);
+        old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
       }
     }
@@ -476,10 +477,11 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }

     using Target = target_type_t<Source, aggregation::ARGMIN>;
-    auto old = atomicCAS(&target.element<Target>(target_index), ARGMIN_SENTINEL, source_index);
+    auto old = cudf::detail::atomic_cas(
+      &target.element<Target>(target_index), ARGMIN_SENTINEL, source_index);
     if (old != ARGMIN_SENTINEL) {
       while (source.element<Source>(source_index) < source.element<Source>(old)) {
-        old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
+        old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
       }
     }
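A note for reviewers: the ARGMAX/ARGMIN hunks above keep the claim-then-retry compare-and-swap loop unchanged and only reroute it through `cudf::detail::atomic_cas`. The standalone sketch below shows the same pattern with plain CUDA `atomicCAS`; it is illustration only, not part of the patch, and `argmax_kernel` and `SENTINEL` are hypothetical names standing in for the real kernels and `ARGMAX_SENTINEL`.

```cuda
#include <cstdio>

constexpr int SENTINEL = -1;  // plays the role of ARGMAX_SENTINEL

__global__ void argmax_kernel(float const* data, int n, int* best_idx)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) { return; }

  // First writer claims the empty slot; everyone else retries only while
  // holding a strictly larger value than the current winner.
  int old = atomicCAS(best_idx, SENTINEL, i);
  if (old != SENTINEL) {
    while (data[i] > data[old]) {
      old = atomicCAS(best_idx, old, i);
    }
  }
}

int main()
{
  float h_data[8] = {3.f, 9.f, 1.f, 7.f, 9.5f, 0.f, 2.f, 4.f};
  int h_idx       = SENTINEL;
  float* d_data;
  int* d_idx;
  cudaMalloc(&d_data, sizeof(h_data));
  cudaMalloc(&d_idx, sizeof(int));
  cudaMemcpy(d_data, h_data, sizeof(h_data), cudaMemcpyHostToDevice);
  cudaMemcpy(d_idx, &h_idx, sizeof(int), cudaMemcpyHostToDevice);
  argmax_kernel<<<1, 8>>>(d_data, 8, d_idx);
  cudaMemcpy(&h_idx, d_idx, sizeof(int), cudaMemcpyDeviceToHost);
  printf("argmax index: %d\n", h_idx);  // expect 4 (value 9.5)
  cudaFree(d_data);
  cudaFree(d_idx);
  return 0;
}
```

Note the loop terminates even when the CAS succeeds: the successful call returns the assumed index, the next iteration's CAS fails and returns our own index, and the comparison against ourselves exits the loop.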
diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh
index 6f23abc59a8..61c5f35d62a 100644
--- a/cpp/include/cudf/detail/utilities/device_atomics.cuh
+++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh
@@ -20,15 +20,15 @@
 * @brief overloads for CUDA atomic operations
 * @file device_atomics.cuh
 *
- * Provides the overloads for all of possible cudf's data types,
- * where cudf's data types are, int8_t, int16_t, int32_t, int64_t, float, double,
- * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us,
+ * Provides the overloads for all of cudf's data types, specifically int8_t,
+ * int16_t, int32_t, int64_t, float, double, cudf::timestamp_D,
+ * cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us,
 * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms,
- * cudf::duration_us, cudf::duration_ns and bool
- * where CUDA atomic operations are, `atomicAdd`, `atomicMin`, `atomicMax`,
- * `atomicCAS`.
- * Also provides `cudf::genericAtomicOperation` which performs atomic operation
- * with the given binary operator.
+ * cudf::duration_us, cudf::duration_ns and bool for the CUDA atomic operations
+ * `atomicAdd`, `atomicMin`, `atomicMax`, `atomicCAS`.
+ *
+ * Also provides `cudf::detail::genericAtomicOperation` which performs an
+ * atomic operation with the given binary operator.
 */

 #include
@@ -85,26 +85,22 @@ template <typename T, typename Op>
 struct genericAtomicOperationImpl<T, Op, 2> {
   __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op)
   {
-    using T_int = unsigned int;
-    bool is_32_align = (reinterpret_cast<size_t>(addr) & 2) == 0;
-    auto* address_uint32 =
-      reinterpret_cast<T_int*>(reinterpret_cast<size_t>(addr) - (is_32_align ? 0 : 2));
+    using T_int = unsigned short int;
+    static_assert(sizeof(T) == sizeof(T_int));

-    T_int old = *address_uint32;
+    T old_value = *addr;
     T_int assumed;
+    T_int ret;

     do {
-      assumed                 = old;
-      T const target_value    = (is_32_align) ? T(old & 0xffff) : T(old >> 16);
-      uint16_t updating_value = type_reinterpret<uint16_t, T>(op(target_value, update_value));
+      T_int const new_value = type_reinterpret<T_int, T>(op(old_value, update_value));

-      T_int const new_value = (is_32_align) ? (old & 0xffff'0000) | updating_value
-                                            : (old & 0xffff) | (T_int(updating_value) << 16);
-      old = atomicCAS(address_uint32, assumed, new_value);
-    } while (assumed != old);
+      assumed   = type_reinterpret<T_int, T>(old_value);
+      ret       = atomicCAS(reinterpret_cast<T_int*>(addr), assumed, new_value);
+      old_value = type_reinterpret<T, T_int>(ret);
+    } while (assumed != ret);

-    return (is_32_align) ? T(old & 0xffff) : T(old >> 16);
-    ;
+    return old_value;
   }
 };
@@ -114,6 +110,7 @@ struct genericAtomicOperationImpl<T, Op, 4> {
   __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op)
   {
     using T_int = unsigned int;
+    static_assert(sizeof(T) == sizeof(T_int));

     T old_value = *addr;
     T_int assumed;
@@ -125,7 +122,6 @@ struct genericAtomicOperationImpl<T, Op, 4> {
       assumed   = type_reinterpret<T_int, T>(old_value);
       ret       = atomicCAS(reinterpret_cast<T_int*>(addr), assumed, new_value);
       old_value = type_reinterpret<T, T_int>(ret);
-
     } while (assumed != ret);

     return old_value;
@@ -150,17 +146,17 @@ struct genericAtomicOperationImpl<T, Op, 8> {
       assumed   = type_reinterpret<T_int, T>(old_value);
       ret       = atomicCAS(reinterpret_cast<T_int*>(addr), assumed, new_value);
       old_value = type_reinterpret<T, T_int>(ret);
-
     } while (assumed != ret);

     return old_value;
   }
 };

-// -----------------------------------------------------------------------
-// specialized functions for operators
-// `atomicAdd` supports int32, float, double (signed int64 is not supported.)
-// `atomicMin`, `atomicMax` support int32_t, int64_t
+// Specialized functions for operators.
+
+// `atomicAdd` supports int32_t, uint32_t, uint64_t, float, double.
+// `atomicAdd` does not support int64_t.
+
 template <>
 struct genericAtomicOperationImpl<float, DeviceSum, 4> {
   using T = float;
@@ -188,9 +184,9 @@ struct genericAtomicOperationImpl<double, DeviceSum, 8> {
   }
 };

-// Cuda natively supports `unsigned long long int` for `atomicAdd`,
-// but doesn't supports `signed long long int`.
-// However, since the signed integer is represented as Two's complement,
+// CUDA natively supports `unsigned long long int` for `atomicAdd`,
+// but doesn't support `signed long long int`.
+// However, since the signed integer is represented as two's complement,
 // the fundamental arithmetic operations of addition are identical to
 // those for unsigned binary numbers.
 // Then, this computes as `unsigned long long int` with `atomicAdd`
@@ -207,6 +203,29 @@ struct genericAtomicOperationImpl<int64_t, DeviceSum, 8> {
   }
 };

+template <>
+struct genericAtomicOperationImpl<uint32_t, DeviceSum, 4> {
+  using T = uint32_t;
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op)
+  {
+    return atomicAdd(addr, update_value);
+  }
+};
+
+template <>
+struct genericAtomicOperationImpl<uint64_t, DeviceSum, 8> {
+  using T = uint64_t;
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op)
+  {
+    using T_int = unsigned long long int;
+    static_assert(sizeof(T) == sizeof(T_int));
+    T ret = atomicAdd(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
+    return ret;
+  }
+};
+
+// `atomicMin`, `atomicMax` support int32_t, int64_t, uint32_t, uint64_t.
+
 template <>
 struct genericAtomicOperationImpl<int32_t, DeviceMin, 4> {
   using T = int32_t;
@@ -217,11 +236,11 @@ struct genericAtomicOperationImpl<int32_t, DeviceMin, 4> {
 };

 template <>
-struct genericAtomicOperationImpl<int32_t, DeviceMax, 4> {
-  using T = int32_t;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op)
+struct genericAtomicOperationImpl<uint32_t, DeviceMin, 4> {
+  using T = uint32_t;
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op)
   {
-    return atomicMax(addr, update_value);
+    return atomicMin(addr, update_value);
   }
 };
@@ -237,6 +256,36 @@ struct genericAtomicOperationImpl<int64_t, DeviceMin, 8> {
   }
 };

+template <>
+struct genericAtomicOperationImpl<uint64_t, DeviceMin, 8> {
+  using T = uint64_t;
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op)
+  {
+    using T_int = unsigned long long int;
+    static_assert(sizeof(T) == sizeof(T_int));
+    T ret = atomicMin(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
+    return ret;
+  }
+};
+
+template <>
+struct genericAtomicOperationImpl<int32_t, DeviceMax, 4> {
+  using T = int32_t;
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op)
+  {
+    return atomicMax(addr, update_value);
+  }
+};
+
+template <>
+struct genericAtomicOperationImpl<uint32_t, DeviceMax, 4> {
+  using T = uint32_t;
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op)
+  {
+    return atomicMax(addr, update_value);
+  }
+};
+
 template <>
 struct genericAtomicOperationImpl<int64_t, DeviceMax, 8> {
   using T = int64_t;
@@ -248,6 +297,19 @@ struct genericAtomicOperationImpl<int64_t, DeviceMax, 8> {
     return ret;
   }
 };
+
+template <>
+struct genericAtomicOperationImpl<uint64_t, DeviceMax, 8> {
+  using T = uint64_t;
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op)
+  {
+    using T_int = unsigned long long int;
+    static_assert(sizeof(T) == sizeof(T_int));
+    T ret = atomicMax(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
+    return ret;
+  }
+};
+
 // -----------------------------------------------------------------------
 // the implementation of `typesAtomicCASImpl`
 template <typename T, size_t N = sizeof(T)>
@@ -289,28 +351,14 @@ template <typename T>
 struct typesAtomicCASImpl<T, 2> {
   __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value)
   {
-    using T_int = unsigned int;
-
-    bool is_32_align = (reinterpret_cast<size_t>(addr) & 2) == 0;
-    auto* address_uint32 =
-      reinterpret_cast<T_int*>(reinterpret_cast<size_t>(addr) - (is_32_align ? 0 : 2));
-
-    T_int old = *address_uint32;
-    T_int assumed;
-    T target_value;
-    uint16_t u_val = type_reinterpret<uint16_t, T>(update_value);
-
-    do {
-      assumed      = old;
-      target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16);
-      if (target_value != compare) break;
+    using T_int = unsigned short int;
+    static_assert(sizeof(T) == sizeof(T_int));

-      T_int new_value =
-        (is_32_align) ? (old & 0xffff'0000) | u_val : (old & 0xffff) | (T_int(u_val) << 16);
-      old = atomicCAS(address_uint32, assumed, new_value);
-    } while (assumed != old);
+    T_int ret = atomicCAS(reinterpret_cast<T_int*>(addr),
+                          type_reinterpret<T_int, T>(compare),
+                          type_reinterpret<T_int, T>(update_value));

-    return target_value;
+    return type_reinterpret<T, T_int>(ret);
   }
 };
@@ -319,6 +367,7 @@ struct typesAtomicCASImpl<T, 4> {
   __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value)
   {
     using T_int = unsigned int;
+    static_assert(sizeof(T) == sizeof(T_int));

     T_int ret = atomicCAS(reinterpret_cast<T_int*>(addr),
                           type_reinterpret<T_int, T>(compare),
@@ -328,7 +377,6 @@ struct typesAtomicCASImpl<T, 4> {
   }
 };

-// 8 bytes atomic operation
 template <typename T>
 struct typesAtomicCASImpl<T, 8> {
   __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value)
@@ -344,11 +392,10 @@ struct typesAtomicCASImpl<T, 8> {
   }
 };

-}  // namespace detail
-
 /**
- * @brief compute atomic binary operation
- * reads the `old` located at the `address` in global or shared memory,
+ * @brief Compute atomic binary operation
+ *
+ * Reads the `old` located at the `address` in global or shared memory,
 * computes 'BinaryOp'('old', 'update_value'),
 * and stores the result back to memory at the same address.
 * These three operations are performed in one atomic transaction.
@@ -356,9 +403,9 @@ struct typesAtomicCASImpl<T, 8> {
 * The supported cudf types for `genericAtomicOperation` are:
 * int8_t, int16_t, int32_t, int64_t, float, double
 *
- * @param[in] address The address of old value in global or shared memory
- * @param[in] val The value to be computed
- * @param[in] op  The binary operator used for compute
+ * @param address The address of old value in global or shared memory
+ * @param val The value to be computed
+ * @param op  The binary operator used for compute
 *
 * @returns The old value at `address`
 */
@@ -408,38 +455,38 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address,
   return T(fun(address, update_value, op));
 }

-}  // namespace cudf
-
 /**
- * @brief Overloads for `atomicAdd`
- * reads the `old` located at the `address` in global or shared memory,
+ * @brief Overloads for `atomic_add`
+ *
+ * Reads the `old` located at the `address` in global or shared memory,
 * computes (old + val), and stores the result back to memory at the same
 * address. These three operations are performed in one atomic transaction.
 *
- * The supported cudf types for `atomicAdd` are:
+ * The supported cudf types for `atomic_add` are:
 * int8_t, int16_t, int32_t, int64_t, float, double,
 * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms cudf::timestamp_us,
 * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms,
 * cudf::duration_us, cudf::duration_ns and bool
 *
- * Cuda natively supports `sint32`, `uint32`, `uint64`, `float`, `double`.
+ * CUDA natively supports `int32_t`, `uint32_t`, `uint64_t`, `float`, `double`.
 * (`double` is supported after Pascal).
 * Other types are implemented by `atomicCAS`.
 *
- * @param[in] address The address of old value in global or shared memory
- * @param[in] val The value to be added
+ * @param address The address of old value in global or shared memory
+ * @param val The value to be added
 *
 * @returns The old value at `address`
 */
 template <typename T>
-__forceinline__ __device__ T atomicAdd(T* address, T val)
+__forceinline__ __device__ T atomic_add(T* address, T val)
 {
-  return cudf::genericAtomicOperation(address, val, cudf::DeviceSum{});
+  return cudf::detail::genericAtomicOperation(address, val, cudf::DeviceSum{});
 }
@@ -448,92 +495,100 @@ __forceinline__ __device__ T atomicAdd(T* address, T val)
 /**
- * @brief Overloads for `atomicMul`
- * reads the `old` located at the `address` in global or shared memory,
+ * @brief Overloads for `atomic_mul`
+ *
+ * Reads the `old` located at the `address` in global or shared memory,
 * computes (old * val), and stores the result back to memory at the same
 * address. These three operations are performed in one atomic transaction.
 *
- * The supported cudf types for `atomicMul` are:
+ * The supported cudf types for `atomic_mul` are:
 * int8_t, int16_t, int32_t, int64_t, float, double, and bool
 *
 * All types are implemented by `atomicCAS`.
 *
- * @param[in] address The address of old value in global or shared memory
- * @param[in] val The value to be multiplied
+ * @param address The address of old value in global or shared memory
+ * @param val The value to be multiplied
 *
 * @returns The old value at `address`
 */
 template <typename T>
-__forceinline__ __device__ T atomicMul(T* address, T val)
+__forceinline__ __device__ T atomic_mul(T* address, T val)
 {
-  return cudf::genericAtomicOperation(address, val, cudf::DeviceProduct{});
+  return cudf::detail::genericAtomicOperation(address, val, cudf::DeviceProduct{});
 }

 /**
- * @brief Overloads for `atomicMin`
- * reads the `old` located at the `address` in global or shared memory,
+ * @brief Overloads for `atomic_min`
+ *
+ * Reads the `old` located at the `address` in global or shared memory,
 * computes the minimum of old and val, and stores the result back to memory
 * at the same address.
 * These three operations are performed in one atomic transaction.
 *
- * The supported cudf types for `atomicMin` are:
+ * The supported cudf types for `atomic_min` are:
 * int8_t, int16_t, int32_t, int64_t, float, double,
 * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us,
 * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms,
 * cudf::duration_us, cudf::duration_ns and bool
- * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`.
+ *
+ * CUDA natively supports `int32_t`, `uint32_t`, `int64_t`, `uint64_t`.
 * Other types are implemented by `atomicCAS`.
 *
- * @param[in] address The address of old value in global or shared memory
- * @param[in] val The value to be computed
+ * @param address The address of old value in global or shared memory
+ * @param val The value to be computed
 *
 * @returns The old value at `address`
 */
 template <typename T>
-__forceinline__ __device__ T atomicMin(T* address, T val)
+__forceinline__ __device__ T atomic_min(T* address, T val)
 {
-  return cudf::genericAtomicOperation(address, val, cudf::DeviceMin{});
+  return cudf::detail::genericAtomicOperation(address, val, cudf::DeviceMin{});
 }

 /**
- * @brief Overloads for `atomicMax`
- * reads the `old` located at the `address` in global or shared memory,
+ * @brief Overloads for `atomic_max`
+ *
+ * Reads the `old` located at the `address` in global or shared memory,
 * computes the maximum of old and val, and stores the result back to memory
 * at the same address.
 * These three operations are performed in one atomic transaction.
 *
- * The supported cudf types for `atomicMax` are:
+ * The supported cudf types for `atomic_max` are:
 * int8_t, int16_t, int32_t, int64_t, float, double,
 * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us,
 * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms,
 * cudf::duration_us, cudf::duration_ns and bool
- * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`.
+ *
+ * CUDA natively supports `int32_t`, `uint32_t`, `int64_t`, `uint64_t`.
 * Other types are implemented by `atomicCAS`.
 *
- * @param[in] address The address of old value in global or shared memory
- * @param[in] val The value to be computed
+ * @param address The address of old value in global or shared memory
+ * @param val The value to be computed
 *
 * @returns The old value at `address`
 */
 template <typename T>
-__forceinline__ __device__ T atomicMax(T* address, T val)
+__forceinline__ __device__ T atomic_max(T* address, T val)
 {
-  return cudf::genericAtomicOperation(address, val, cudf::DeviceMax{});
+  return cudf::detail::genericAtomicOperation(address, val, cudf::DeviceMax{});
 }

 /**
- * @brief Overloads for `atomicCAS`
- * reads the `old` located at the `address` in global or shared memory,
+ * @brief Overloads for `atomic_cas`
+ *
+ * Reads the `old` located at the `address` in global or shared memory,
 * computes (`old` == `compare` ? `val` : `old`),
 * and stores the result back to memory at the same address.
 * These three operations are performed in one atomic transaction.
 *
- * The supported cudf types for `atomicCAS` are:
+ * The supported cudf types for `atomic_cas` are:
 * int8_t, int16_t, int32_t, int64_t, float, double,
 * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us,
 * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms,
 * cudf::duration_us, cudf::duration_ns and bool
- * Cuda natively supports `sint32`, `uint32`, `uint64`.
+ * CUDA natively supports `int32_t`, `uint32_t`, `uint64_t`.
 * Other types are implemented by `atomicCAS`.
 *
- * @param[in] address The address of old value in global or shared memory
- * @param[in] compare The value to be compared
- * @param[in] val The value to be computed
+ * @param address The address of old value in global or shared memory
+ * @param compare The value to be compared
+ * @param val The value to be computed
 *
 * @returns The old value at `address`
 */
 template <typename T>
-__forceinline__ __device__ T atomicCAS(T* address, T compare, T val)
+__forceinline__ __device__ T atomic_cas(T* address, T compare, T val)
 {
   return cudf::detail::typesAtomicCASImpl<T>()(address, compare, val);
 }
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu
index 612b2d32b7d..b3c8882f541 100644
--- a/cpp/src/io/avro/avro_gpu.cu
+++ b/cpp/src/io/avro/avro_gpu.cu
@@ -144,7 +144,7 @@ avro_decode_row(schemadesc_s const* schema,
     case type_null:
       if (dataptr != nullptr && dst_row >= 0) {
         atomicAnd(static_cast<uint32_t*>(dataptr) + (dst_row >> 5), ~(1 << (dst_row & 0x1f)));
-        atomicAdd(&schema_g[i].count, 1);
+        atomicAdd(&schema_g[i].count, 1U);
         *skipped_row = false;
       }
       break;
diff --git a/cpp/src/io/json/legacy/json_gpu.cu b/cpp/src/io/json/legacy/json_gpu.cu
index 9beeecdd6fb..ff4845fcecb 100644
--- a/cpp/src/io/json/legacy/json_gpu.cu
+++ b/cpp/src/io/json/legacy/json_gpu.cu
@@ -497,7 +497,7 @@ CUDF_KERNEL void collect_keys_info_kernel(parse_options_view const options,
   for (auto field_range = advance(row_data_range.first);
        field_range.key_begin < row_data_range.second;
        field_range = advance(field_range.value_end)) {
-    auto const idx = atomicAdd(keys_cnt, 1);
+    auto const idx = atomicAdd(keys_cnt, 1ULL);
     if (keys_info.has_value()) {
       auto const len                              = field_range.key_end - field_range.key_begin;
       keys_info->column(0).element<uint64_t>(idx) = field_range.key_begin - data.begin();
diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu
index c1cbcd0baca..cb8be380c5b 100644
--- a/cpp/src/io/utilities/parsing_utils.cu
+++ b/cpp/src/io/utilities/parsing_utils.cu
@@ -106,7 +106,7 @@ CUDF_KERNEL void count_and_set_positions(char const* data,
   // Process the data
   for (long i = 0; i < byteToProcess; i++) {
     if (raw[i] == key) {
-      auto const idx = atomicAdd(count, (cudf::size_type)1);
+      auto const idx = atomicAdd(count, static_cast<cudf::size_type>(1));
       setElement(positions, idx, did + offset + i, key);
     }
   }
diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index 8ea229368cc..014171f2b40 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -108,7 +108,9 @@ CUDF_KERNEL void replace_nulls_strings(cudf::column_device_view input,
   // Compute total valid count for this block and add it to global count
   uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce<BLOCK_SIZE, 0>(valid_sum);
   // one thread computes and adds to output_valid_count
-  if (threadIdx.x == 0) { atomicAdd(valid_counter, block_valid_count); }
+  if (threadIdx.x == 0) {
+    atomicAdd(valid_counter, static_cast<cudf::size_type>(block_valid_count));
+  }
 }

 template <typename Type>
@@ -153,7 +155,9 @@ CUDF_KERNEL void replace_nulls(cudf::column_device_view input,
   uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce<BLOCK_SIZE, 0>(valid_sum);
   // one thread computes and adds to output_valid_count
-  if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); }
+  if (threadIdx.x == 0) {
+    atomicAdd(output_valid_count, static_cast<cudf::size_type>(block_valid_count));
+  }
 }

 }
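The valid-count hunks above, and the replace.cu hunks that follow, add explicit casts because these call sites now resolve to CUDA's built-in `atomicAdd` rather than the removed cudf overloads, and the built-in overload set is keyed on the pointee type. A minimal sketch of the pattern, assuming `int32_t` stands in for `cudf::size_type` and `count_kernel` is a hypothetical stand-in for the real kernels:

```cuda
#include <cstdint>
#include <cstdio>

__global__ void count_kernel(int32_t* counter)
{
  // Stands in for the single_lane_block_sum_reduce result in the patch.
  uint32_t block_valid_count = 1;

  // Without the cast, the uint32_t value reaches the (int*, int) overload only
  // through an implicit unsigned-to-signed conversion; the cast makes the
  // intended overload explicit.
  atomicAdd(counter, static_cast<int32_t>(block_valid_count));
}

int main()
{
  int32_t h_counter = 0;
  int32_t* d_counter;
  cudaMalloc(&d_counter, sizeof(int32_t));
  cudaMemcpy(d_counter, &h_counter, sizeof(int32_t), cudaMemcpyHostToDevice);
  count_kernel<<<4, 1>>>(d_counter);  // four blocks, one add each
  cudaMemcpy(&h_counter, d_counter, sizeof(int32_t), cudaMemcpyDeviceToHost);
  printf("count: %d\n", h_counter);  // expect 4
  cudaFree(d_counter);
  return 0;
}
```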
diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu
index 184c30246c7..88d5d3a2375 100644
--- a/cpp/src/replace/replace.cu
+++ b/cpp/src/replace/replace.cu
@@ -168,7 +168,9 @@ CUDF_KERNEL void replace_strings_first_pass(cudf::column_device_view input,
   // Compute total valid count for this block and add it to global count
   uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce<BLOCK_SIZE, 0>(valid_sum);
   // one thread computes and adds to output_valid_count
-  if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); }
+  if (threadIdx.x == 0) {
+    atomicAdd(output_valid_count, static_cast<cudf::size_type>(block_valid_count));
+  }
 }

 /**
@@ -295,7 +297,9 @@ CUDF_KERNEL void replace_kernel(cudf::column_device_view input,
   uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce<BLOCK_SIZE, 0>(valid_sum);
   // one thread computes and adds to output_valid_count
-  if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); }
+  if (threadIdx.x == 0) {
+    atomicAdd(output_valid_count, static_cast<cudf::size_type>(block_valid_count));
+  }
 }

 }
diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu
index 0d846404ea2..ccf5ccae187 100644
--- a/cpp/tests/device_atomics/device_atomics_test.cu
+++ b/cpp/tests/device_atomics/device_atomics_test.cu
@@ -37,12 +37,12 @@ CUDF_KERNEL void gpu_atomic_test(T* result, T* data, size_t size)
   size_t step = blockDim.x * gridDim.x;

   for (; id < size; id += step) {
-    atomicAdd(&result[0], data[id]);
-    atomicMin(&result[1], data[id]);
-    atomicMax(&result[2], data[id]);
-    cudf::genericAtomicOperation(&result[3], data[id], cudf::DeviceSum{});
-    cudf::genericAtomicOperation(&result[4], data[id], cudf::DeviceMin{});
-    cudf::genericAtomicOperation(&result[5], data[id], cudf::DeviceMax{});
+    cudf::detail::atomic_add(&result[0], data[id]);
+    cudf::detail::atomic_min(&result[1], data[id]);
+    cudf::detail::atomic_max(&result[2], data[id]);
+    cudf::detail::genericAtomicOperation(&result[3], data[id], cudf::DeviceSum{});
+    cudf::detail::genericAtomicOperation(&result[4], data[id], cudf::DeviceMin{});
+    cudf::detail::genericAtomicOperation(&result[5], data[id], cudf::DeviceMax{});
   }
 }
@@ -72,7 +72,7 @@ __device__ T atomic_op(T* addr, T const& value, BinaryOp op)
     assumed       = old_value;
     T new_value   = op(old_value, value);

-    old_value = atomicCAS(addr, assumed, new_value);
+    old_value = cudf::detail::atomic_cas(addr, assumed, new_value);
   } while (assumed != old_value);

   return old_value;
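The test's `atomic_op` helper above mirrors the CAS loop at the heart of `genericAtomicOperationImpl`: reinterpret the value's bits into the unsigned type `atomicCAS` accepts, apply the binary operator on the typed value, and retry until no other thread intervened. A self-contained sketch of that scheme in plain CUDA; `atomic_mul_double` is a hypothetical helper written for this example, not cudf API:

```cuda
#include <cstdio>

__device__ double atomic_mul_double(double* addr, double val)
{
  using T_int       = unsigned long long int;  // atomicCAS's native 64-bit type
  auto* addr_as_ull = reinterpret_cast<T_int*>(addr);

  T_int assumed;
  T_int old = *addr_as_ull;
  do {
    assumed                = old;
    double const new_value = __longlong_as_double(assumed) * val;  // the "op"
    old                    = atomicCAS(addr_as_ull, assumed, __double_as_longlong(new_value));
  } while (assumed != old);  // another thread won the race: retry with its value

  return __longlong_as_double(old);
}

__global__ void product_kernel(double* result) { atomic_mul_double(result, 2.0); }

int main()
{
  double h = 1.0;
  double* d;
  cudaMalloc(&d, sizeof(double));
  cudaMemcpy(d, &h, sizeof(double), cudaMemcpyHostToDevice);
  product_kernel<<<1, 10>>>(d);  // ten threads each multiply by 2
  cudaMemcpy(&h, d, sizeof(double), cudaMemcpyDeviceToHost);
  printf("%.0f\n", h);  // expect 1024
  cudaFree(d);
  return 0;
}
```

The same shape appears in the CUDA Programming Guide's classic double-precision `atomicAdd` emulation; the patch's 2-byte specializations simplify to a single native `atomicCAS` on `unsigned short int` instead of packing two halfwords into a 32-bit word.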