From 5cf4c164224805b542f5d8411034574858224e64 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 2 Feb 2024 21:37:52 -0600 Subject: [PATCH 1/6] Refactor atomicAdd to cudf::detail::atomic_add. --- .../cudf/detail/aggregation/aggregation.cuh | 16 +++---- cpp/include/cudf/detail/copy_if_else.cuh | 3 +- cpp/include/cudf/detail/copy_range.cuh | 3 +- .../cudf/detail/labeling/label_segments.cuh | 5 +- cpp/include/cudf/detail/null_mask.cuh | 3 +- .../cudf/detail/utilities/device_atomics.cuh | 47 +++++++++++++------ cpp/include/cudf/detail/valid_if.cuh | 5 +- cpp/src/bitmask/null_mask.cu | 3 +- cpp/src/copying/concatenate.cu | 5 +- cpp/src/groupby/sort/group_quantiles.cu | 5 +- cpp/src/groupby/sort/group_std.cu | 5 +- cpp/src/io/avro/avro_gpu.cu | 4 +- cpp/src/io/csv/csv_gpu.cu | 19 ++++---- cpp/src/io/json/legacy/json_gpu.cu | 29 ++++++------ cpp/src/io/orc/stripe_data.cu | 14 ++++-- cpp/src/io/parquet/chunk_dict.cu | 5 +- cpp/src/io/utilities/data_casting.cu | 17 +++---- cpp/src/io/utilities/parsing_utils.cu | 3 +- cpp/src/io/utilities/type_inference.cu | 21 +++++---- cpp/src/join/join_common_utils.cuh | 9 ++-- cpp/src/json/json_path.cu | 3 +- cpp/src/partitioning/partitioning.cu | 9 ++-- cpp/src/replace/nulls.cu | 9 +++- cpp/src/replace/replace.cu | 9 +++- cpp/src/rolling/detail/rolling.cuh | 3 +- cpp/src/stream_compaction/distinct_helpers.cu | 5 +- cpp/src/strings/copying/concatenate.cu | 3 +- cpp/src/strings/replace/multi.cu | 3 +- cpp/src/transform/row_conversion.cu | 3 +- .../device_atomics/device_atomics_test.cu | 2 +- 30 files changed, 165 insertions(+), 105 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index f13166d5321..968fc83e763 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -242,8 +242,8 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
     using Target = target_type_t<Source, aggregation::SUM>;
-    atomicAdd(&target.element<Target>(target_index),
-              static_cast<Target>(source.element<Source>(source_index)));
+    cudf::detail::atomic_add(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -268,8 +268,8 @@ struct update_target_element<
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    atomicAdd(&target.element<DeviceTarget>(target_index),
-              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    cudf::detail::atomic_add(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -368,7 +368,7 @@ struct update_target_element<
 
     using Target = target_type_t<Source, aggregation::SUM_OF_SQUARES>;
     auto value   = static_cast<Target>(source.element<Source>(source_index));
-    atomicAdd(&target.element<Target>(target_index), value * value);
+    cudf::detail::atomic_add(&target.element<Target>(target_index), value * value);
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
 };
@@ -408,7 +408,7 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
     using Target = target_type_t<Source, aggregation::COUNT_VALID>;
-    atomicAdd(&target.element<Target>(target_index), Target{1});
+    cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});
 
     // It is assumed the output for COUNT_VALID is initialized to be all valid
   }
@@ -427,7 +427,7 @@ struct update_target_element<
                              size_type source_index) const noexcept
   {
     using Target = target_type_t<Source, aggregation::COUNT_ALL>;
-    atomicAdd(&target.element<Target>(target_index), Target{1});
+    cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});
 
     // It is assumed the output for COUNT_ALL is initialized to be all valid
   }
diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh
index 6162fa5ecf1..65b453e0bd1 100644
--- a/cpp/include/cudf/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/detail/copy_if_else.cuh
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include <cudf/detail/utilities/device_atomics.cuh>
 
 #include 
 #include 
@@ -94,7 +95,7 @@ __launch_bounds__(block_size) CUDF_KERNEL
     // block_valid_count will only be valid on thread 0
     if (threadIdx.x == 0) {
       // using an atomic here because there are multiple blocks doing this work
-      atomicAdd(valid_count, block_valid_count);
+      cudf::detail::atomic_add(valid_count, block_valid_count);
     }
   }
 }
diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh
index 4bfdaa94c53..192a60cf9b3 100644
--- a/cpp/include/cudf/detail/copy_range.cuh
+++ b/cpp/include/cudf/detail/copy_range.cuh
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include <cudf/detail/utilities/device_atomics.cuh>
 #include 
 #include 
 #include 
@@ -100,7 +101,7 @@ CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin,
       auto block_null_change =
         cudf::detail::single_lane_block_sum_reduce(warp_null_change);
       if (threadIdx.x == 0) {  // if the first thread in a block
-        atomicAdd(null_count, block_null_change);
+        cudf::detail::atomic_add(null_count, block_null_change);
       }
     }
   }
diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh
index 9051230a272..405d111ab35 100644
--- a/cpp/include/cudf/detail/labeling/label_segments.cuh
+++ b/cpp/include/cudf/detail/labeling/label_segments.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -103,7 +104,7 @@ void label_segments(InputIterator offsets_begin, // In case we have repeated offsets (i.e., we have empty segments), this `atomicAdd` call will // make sure the label values corresponding to these empty segments will be skipped in the // output. - if (dst_idx < num_labels) { atomicAdd(&output[dst_idx], OutputType{1}); } + if (dst_idx < num_labels) { cudf::detail::atomic_add(&output[dst_idx], OutputType{1}); } }); thrust::inclusive_scan(rmm::exec_policy(stream), label_begin, label_end, label_begin); } diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index e57d85f2998..3113b991a79 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -107,7 +108,7 @@ CUDF_KERNEL void offset_bitmask_binop(Binop op, __shared__ typename BlockReduce::TempStorage temp_storage; size_type block_count = BlockReduce(temp_storage).Sum(thread_count); - if (threadIdx.x == 0) { atomicAdd(count_ptr, block_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(count_ptr, block_count); } } /** diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index c56e88f07a8..3a5108a3503 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -158,8 +158,8 @@ struct genericAtomicOperationImpl { // ----------------------------------------------------------------------- // specialized functions for operators -// `atomicAdd` supports int32, float, double (signed int64 is not supported.) -// `atomicMin`, `atomicMax` support int32_t, int64_t +// `atomicAdd` supports int32_t, uint32_t, uint64_t, float, double (signed int64_t is not +// supported.) 
+// `atomicMin`, `atomicMax` support int32_t, uint32_t, uint64_t, int64_t
 template <>
 struct genericAtomicOperationImpl<float, DeviceSum> {
   using T = float;
@@ -169,8 +169,6 @@ struct genericAtomicOperationImpl<float, DeviceSum> {
   }
 };
 
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
-// `atomicAdd(double)` is supported after cuda architecture 6.0
 template <>
 struct genericAtomicOperationImpl<double, DeviceSum> {
   using T = double;
@@ -179,7 +177,27 @@ struct genericAtomicOperationImpl<double, DeviceSum> {
     return atomicAdd(addr, update_value);
   }
 };
-#endif
+
+template <>
+struct genericAtomicOperationImpl<uint32_t, DeviceSum> {
+  using T = uint32_t;
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op)
+  {
+    return atomicAdd(addr, update_value);
+  }
+};
+
+template <>
+struct genericAtomicOperationImpl<uint64_t, DeviceSum> {
+  using T = uint64_t;
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op)
+  {
+    using T_int = unsigned long long int;
+    static_assert(sizeof(T) == sizeof(T_int));
+    T ret = atomicAdd(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
+    return ret;
+  }
+};
 
 template <>
 struct genericAtomicOperationImpl<int32_t, DeviceSum> {
@@ -190,9 +208,9 @@ struct genericAtomicOperationImpl<int32_t, DeviceSum> {
   }
 };
 
-// Cuda natively supports `unsigned long long int` for `atomicAdd`,
-// but doesn't supports `signed long long int`.
-// However, since the signed integer is represented as Two's complement,
+// CUDA natively supports `unsigned long long int` for `atomicAdd`,
+// but doesn't support `signed long long int`.
+// However, since the signed integer is represented as two's complement,
 // the fundamental arithmetic operations of addition are identical to
 // those for unsigned binary numbers.
 // Then, this computes as `unsigned long long int` with `atomicAdd`
@@ -410,15 +428,14 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address,
   return T(fun(address, update_value, op));
 }
 
-}  // namespace cudf
-
+namespace detail {
 /**
- * @brief Overloads for `atomicAdd`
+ * @brief Overloads for `atomic_add`
  * reads the `old` located at the `address` in global or shared memory,
  * computes (old + val), and stores the result back to memory at the same
  * address. These three operations are performed in one atomic transaction.
* - * The supported cudf types for `atomicAdd` are: + * The supported cudf types for `atomic_add` are: * int8_t, int16_t, int32_t, int64_t, float, double, * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms cudf::timestamp_us, * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms, @@ -434,10 +451,12 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address, * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicAdd(T* address, T val) +__forceinline__ __device__ T atomic_add(T* address, T val) { return cudf::genericAtomicOperation(address, val, cudf::DeviceSum{}); } +} // namespace detail +} // namespace cudf /** * @brief Overloads for `atomicMul` diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index d0073177445..453a072fece 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -65,7 +66,7 @@ CUDF_KERNEL void valid_if_kernel( } size_type block_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { atomicAdd(valid_count, block_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(valid_count, block_count); } } /** @@ -183,7 +184,7 @@ CUDF_KERNEL void valid_if_n_kernel(InputIterator1 begin1, auto block_valid_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { atomicAdd(valid_counts + mask_idx, block_valid_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(valid_counts + mask_idx, block_valid_count); } } } diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index bb320e4b81a..52f5b0108a4 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -305,7 +306,7 @@ CUDF_KERNEL void count_set_bits_kernel(bitmask_type const* bitmask, __shared__ typename BlockReduce::TempStorage temp_storage; size_type block_count{BlockReduce(temp_storage).Sum(thread_count)}; - if (threadIdx.x == 0) { atomicAdd(global_count, block_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(global_count, block_count); } } } // namespace diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index b1d850e0b27..e76dac38f05 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -148,7 +149,7 @@ CUDF_KERNEL void concatenate_masks_kernel(column_device_view const* views, using detail::single_lane_block_sum_reduce; auto const block_valid_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(out_valid_count, block_valid_count); } } } // namespace @@ -233,7 +234,7 @@ CUDF_KERNEL void fused_concatenate_kernel(column_device_view const* input_views, if (Nullable) { using detail::single_lane_block_sum_reduce; auto block_valid_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(out_valid_count, block_valid_count); } } } diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index a456d4b5964..51c03375bb2 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ 
b/cpp/src/groupby/sort/group_quantiles.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -72,7 +73,7 @@ struct calculate_quantile_fn { [d_result = d_result, segment_size, offset, this](size_type j) { if (segment_size == 0) { d_result.set_null(offset + j); - atomicAdd(this->null_count, 1); + cudf::detail::atomic_add(this->null_count, 1); } else { d_result.set_valid(offset + j); } diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index 30b6f67dffe..5b37e31f949 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -150,7 +151,7 @@ struct var_functor { // fact. (1) is more work than it's worth without benchmarking, and // this approach should outperform (2) unless large amounts of the // data is null. - atomicAdd(d_null_count, 1); + cudf::detail::atomic_add(d_null_count, 1); } else { d_result.set_valid(i); } diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index 59177a68ee7..50241c2b0eb 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -17,6 +17,8 @@ #include +#include + #include using cudf::device_span; @@ -145,7 +147,7 @@ avro_decode_row(schemadesc_s const* schema, case type_null: if (dataptr != nullptr && dst_row >= 0) { atomicAnd(static_cast(dataptr) + (dst_row >> 5), ~(1 << (dst_row & 0x1f))); - atomicAdd(&schema_g[i].count, 1); + cudf::detail::atomic_add(&schema_g[i].count, 1U); *skipped_row = false; } break; diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 8252cccbdb9..1653382e375 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -201,12 +202,12 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) // points to last character in the field auto const field_len = static_cast(next_delimiter - field_start); if (serialized_trie_contains(opts.trie_na, {field_start, field_len})) { - atomicAdd(&d_column_data[actual_col].null_count, 1); + cudf::detail::atomic_add(&d_column_data[actual_col].null_count, 1); } else if (serialized_trie_contains(opts.trie_true, {field_start, field_len}) || serialized_trie_contains(opts.trie_false, {field_start, field_len})) { - atomicAdd(&d_column_data[actual_col].bool_count, 1); + cudf::detail::atomic_add(&d_column_data[actual_col].bool_count, 1); } else if (cudf::io::is_infinity(field_start, next_delimiter)) { - atomicAdd(&d_column_data[actual_col].float_count, 1); + cudf::detail::atomic_add(&d_column_data[actual_col].float_count, 1); } else { long count_number = 0; long count_decimal = 0; @@ -261,9 +262,9 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) if (column_flags[col] & column_parse::as_datetime) { // PANDAS uses `object` dtype if the date is unparseable if (is_datetime(count_string, count_decimal, count_colon, count_dash, count_slash)) { - atomicAdd(&d_column_data[actual_col].datetime_count, 1); 
+ cudf::detail::atomic_add(&d_column_data[actual_col].datetime_count, 1); } else { - atomicAdd(&d_column_data[actual_col].string_count, 1); + cudf::detail::atomic_add(&d_column_data[actual_col].string_count, 1); } } else if (count_number == int_req_number_cnt) { auto const is_negative = (*trimmed_field_range.first == '-'); @@ -271,16 +272,16 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) trimmed_field_range.first + (is_negative || (*trimmed_field_range.first == '+')); cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( data_begin, data_begin + count_number, is_negative, d_column_data[actual_col]); - atomicAdd(ptr, 1); + cudf::detail::atomic_add(ptr, 1); } else if (is_floatingpoint(trimmed_field_len, count_number, count_decimal, count_thousands, count_dash + count_plus, count_exponent)) { - atomicAdd(&d_column_data[actual_col].float_count, 1); + cudf::detail::atomic_add(&d_column_data[actual_col].float_count, 1); } else { - atomicAdd(&d_column_data[actual_col].string_count, 1); + cudf::detail::atomic_add(&d_column_data[actual_col].string_count, 1); } } actual_col++; @@ -372,7 +373,7 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) column_flags[col] & column_parse::as_hexadecimal)) { // set the valid bitmap - all bits were set to 0 to start set_bit(valids[actual_col], rec_id); - atomicAdd(&valid_counts[actual_col], 1); + cudf::detail::atomic_add(&valid_counts[actual_col], 1); } } } else if (dtypes[actual_col].id() == cudf::type_id::STRING) { diff --git a/cpp/src/io/json/legacy/json_gpu.cu b/cpp/src/io/json/legacy/json_gpu.cu index 4d5293e12fd..713252cc39a 100644 --- a/cpp/src/io/json/legacy/json_gpu.cu +++ b/cpp/src/io/json/legacy/json_gpu.cu @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -282,7 +283,7 @@ CUDF_KERNEL void convert_data_to_columns_kernel(parse_options_view opts, // set the valid bitmap - all bits were set to 0 to start set_bit(valid_fields[desc.column], rec_id); - atomicAdd(&num_valid_fields[desc.column], 1); + cudf::detail::atomic_add(&num_valid_fields[desc.column], 1); } else { if (cudf::type_dispatcher(column_types[desc.column], ConvertFunctor{}, @@ -295,7 +296,7 @@ CUDF_KERNEL void convert_data_to_columns_kernel(parse_options_view opts, false)) { // set the valid bitmap - all bits were set to 0 to start set_bit(valid_fields[desc.column], rec_id); - atomicAdd(&num_valid_fields[desc.column], 1); + cudf::detail::atomic_add(&num_valid_fields[desc.column], 1); } } } else if (column_types[desc.column].id() == type_id::STRING) { @@ -349,16 +350,16 @@ CUDF_KERNEL void detect_data_types_kernel( // Checking if the field is empty/valid if (serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { // Increase the null count for array rows, where the null count is initialized to zero. - if (!are_rows_objects) { atomicAdd(&column_infos[desc.column].null_count, 1); } + if (!are_rows_objects) { cudf::detail::atomic_add(&column_infos[desc.column].null_count, 1); } continue; } else if (are_rows_objects) { // For files with object rows, null count is initialized to row count. The value is decreased // here for every valid field. 
- atomicAdd(&column_infos[desc.column].null_count, -1); + cudf::detail::atomic_add(&column_infos[desc.column].null_count, -1); } // Don't need counts to detect strings, any field in quotes is deduced to be a string if (desc.is_quoted) { - atomicAdd(&column_infos[desc.column].string_count, 1); + cudf::detail::atomic_add(&column_infos[desc.column].string_count, 1); continue; } @@ -405,21 +406,21 @@ CUDF_KERNEL void detect_data_types_kernel( if (maybe_hex) { --int_req_number_cnt; } if (serialized_trie_contains(opts.trie_true, {desc.value_begin, value_len}) || serialized_trie_contains(opts.trie_false, {desc.value_begin, value_len})) { - atomicAdd(&column_infos[desc.column].bool_count, 1); + cudf::detail::atomic_add(&column_infos[desc.column].bool_count, 1); } else if (digit_count == int_req_number_cnt) { bool is_negative = (*desc.value_begin == '-'); char const* data_begin = desc.value_begin + (is_negative || (*desc.value_begin == '+')); cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( data_begin, data_begin + digit_count, is_negative, column_infos[desc.column]); - atomicAdd(ptr, 1); + cudf::detail::atomic_add(ptr, 1); } else if (is_like_float( value_len, digit_count, decimal_count, dash_count + plus_count, exponent_count)) { - atomicAdd(&column_infos[desc.column].float_count, 1); + cudf::detail::atomic_add(&column_infos[desc.column].float_count, 1); } // A date-time field cannot have more than 3 non-special characters // A number field cannot have more than one decimal point else if (other_count > 3 || decimal_count > 1) { - atomicAdd(&column_infos[desc.column].string_count, 1); + cudf::detail::atomic_add(&column_infos[desc.column].string_count, 1); } else { // A date field can have either one or two '-' or '\'; A legal combination will only have one // of them To simplify the process of auto column detection, we are not covering all the @@ -427,20 +428,20 @@ CUDF_KERNEL void detect_data_types_kernel( if ((dash_count > 0 && dash_count <= 2 && slash_count == 0) || (dash_count == 0 && slash_count > 0 && slash_count <= 2)) { if (colon_count <= 2) { - atomicAdd(&column_infos[desc.column].datetime_count, 1); + cudf::detail::atomic_add(&column_infos[desc.column].datetime_count, 1); } else { - atomicAdd(&column_infos[desc.column].string_count, 1); + cudf::detail::atomic_add(&column_infos[desc.column].string_count, 1); } } else { // Default field type is string - atomicAdd(&column_infos[desc.column].string_count, 1); + cudf::detail::atomic_add(&column_infos[desc.column].string_count, 1); } } } if (!are_rows_objects) { // For array rows, mark missing fields as null for (; input_field_index < num_columns; ++input_field_index) - atomicAdd(&column_infos[input_field_index].null_count, 1); + cudf::detail::atomic_add(&column_infos[input_field_index].null_count, 1); } } @@ -498,7 +499,7 @@ CUDF_KERNEL void collect_keys_info_kernel(parse_options_view const options, for (auto field_range = advance(row_data_range.first); field_range.key_begin < row_data_range.second; field_range = advance(field_range.value_end)) { - auto const idx = atomicAdd(keys_cnt, 1); + auto const idx = cudf::detail::atomic_add(keys_cnt, 1ULL); if (keys_info.has_value()) { auto const len = field_range.key_end - field_range.key_begin; keys_info->column(0).element(idx) = field_range.key_begin - data.begin(); diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 14072d79172..09f2744e324 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -16,12 +16,15 @@ #include 
"orc_gpu.hpp" -#include #include -#include +#include +#include + #include +#include + namespace cudf { namespace io { namespace orc { @@ -1399,10 +1402,10 @@ CUDF_KERNEL void __launch_bounds__(block_size) // If we have an index, seek to the initial run and update row positions if (num_rowgroups > 0) { if (s->top.data.index.strm_offset[0] > s->chunk.strm_len[CI_DATA]) { - atomicAdd(error_count, 1); + cudf::detail::atomic_add(error_count, 1); } if (s->top.data.index.strm_offset[1] > s->chunk.strm_len[CI_DATA2]) { - atomicAdd(error_count, 1); + cudf::detail::atomic_add(error_count, 1); } uint32_t ofs0 = min(s->top.data.index.strm_offset[0], s->chunk.strm_len[CI_DATA]); uint32_t ofs1 = min(s->top.data.index.strm_offset[1], s->chunk.strm_len[CI_DATA2]); @@ -1823,7 +1826,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (num_rowgroups > 0) { row_groups[blockIdx.y][blockIdx.x].num_child_rows = s->num_child_rows; } - atomicAdd(&chunks[chunk_id].num_child_rows, s->num_child_rows); + cudf::detail::atomic_add(&chunks[chunk_id].num_child_rows, + static_cast(s->num_child_rows)); } } diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index a43c6d4cbb6..643394ed8a1 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -175,9 +176,9 @@ CUDF_KERNEL void __launch_bounds__(block_size) __syncthreads(); auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); if (t == 0) { - total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique); + total_num_dict_entries = cudf::detail::atomic_add(&chunk->num_dict_entries, num_unique); total_num_dict_entries += num_unique; - atomicAdd(&chunk->uniq_data_size, uniq_data_size); + cudf::detail::atomic_add(&chunk->uniq_data_size, uniq_data_size); } __syncthreads(); diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 8fd860d9492..71ff52becc2 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -437,14 +438,14 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, auto get_next_string = [&]() { if constexpr (is_warp) { size_type istring; - if (lane == 0) { istring = atomicAdd(str_counter, 1); } + if (lane == 0) { istring = cudf::detail::atomic_add(str_counter, 1); } return __shfl_sync(0xffffffff, istring, 0); } else { // Ensure lane 0 doesn't update istring before all threads have read the previous iteration's // istring value __syncthreads(); __shared__ size_type istring; - if (lane == 0) { istring = atomicAdd(str_counter, 1); } + if (lane == 0) { istring = cudf::detail::atomic_add(str_counter, 1); } __syncthreads(); return istring; } @@ -474,7 +475,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, if (is_null_literal && null_mask != nullptr) { if (lane == 0) { clear_bit(null_mask, istring); - atomicAdd(null_count_data, 1); + cudf::detail::atomic_add(null_count_data, 1); if (!d_chars) d_offsets[istring] = 0; } continue; // gride-stride return; @@ -618,7 +619,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, if (!d_chars && lane == 0) { if (null_mask != nullptr) { clear_bit(null_mask, istring); - atomicAdd(null_count_data, 1); + cudf::detail::atomic_add(null_count_data, 1); } last_offset = 0; d_offsets[istring] = 0; @@ -759,7 +760,7 @@ struct string_parse { options.trie_na, {in_begin, 
static_cast(num_in_chars)}); if (is_null_literal && null_mask != nullptr) { clear_bit(null_mask, idx); - atomicAdd(null_count_data, 1); + cudf::detail::atomic_add(null_count_data, 1); if (!d_chars) d_offsets[idx] = 0; return; } @@ -770,7 +771,7 @@ struct string_parse { if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { if (null_mask != nullptr) { clear_bit(null_mask, idx); - atomicAdd(null_count_data, 1); + cudf::detail::atomic_add(null_count_data, 1); } if (!d_chars) d_offsets[idx] = 0; } else { @@ -953,7 +954,7 @@ std::unique_ptr parse_data( if (is_null_literal) { col.set_null(row); - atomicAdd(null_count_data, 1); + cudf::detail::atomic_add(null_count_data, 1); return; } @@ -971,7 +972,7 @@ std::unique_ptr parse_data( false); if (not is_parsed) { col.set_null(row); - atomicAdd(null_count_data, 1); + cudf::detail::atomic_add(null_count_data, 1); } }); diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index d02ce99e6e5..5eed1e5ca04 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -106,7 +107,7 @@ CUDF_KERNEL void count_and_set_positions(char const* data, // Process the data for (long i = 0; i < byteToProcess; i++) { if (raw[i] == key) { - auto const idx = atomicAdd(count, (cudf::size_type)1); + auto const idx = cudf::detail::atomic_add(count, static_cast(1)); setElement(positions, idx, did + offset + i, key); } } diff --git a/cpp/src/io/utilities/type_inference.cu b/cpp/src/io/utilities/type_inference.cu index b446ad41946..eead98ca1c8 100644 --- a/cpp/src/io/utilities/type_inference.cu +++ b/cpp/src/io/utilities/type_inference.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -204,16 +205,16 @@ CUDF_KERNEL void infer_column_type_kernel(OptionsView options, auto const block_type_histogram = BlockReduce(temp_storage).Reduce(thread_type_histogram, custom_sum{}); if (threadIdx.x == 0) { - atomicAdd(&column_info->null_count, block_type_histogram.null_count); - atomicAdd(&column_info->float_count, block_type_histogram.float_count); - atomicAdd(&column_info->datetime_count, block_type_histogram.datetime_count); - atomicAdd(&column_info->string_count, block_type_histogram.string_count); - atomicAdd(&column_info->negative_small_int_count, - block_type_histogram.negative_small_int_count); - atomicAdd(&column_info->positive_small_int_count, - block_type_histogram.positive_small_int_count); - atomicAdd(&column_info->big_int_count, block_type_histogram.big_int_count); - atomicAdd(&column_info->bool_count, block_type_histogram.bool_count); + cudf::detail::atomic_add(&column_info->null_count, block_type_histogram.null_count); + cudf::detail::atomic_add(&column_info->float_count, block_type_histogram.float_count); + cudf::detail::atomic_add(&column_info->datetime_count, block_type_histogram.datetime_count); + cudf::detail::atomic_add(&column_info->string_count, block_type_histogram.string_count); + cudf::detail::atomic_add(&column_info->negative_small_int_count, + block_type_histogram.negative_small_int_count); + cudf::detail::atomic_add(&column_info->positive_small_int_count, + block_type_histogram.positive_small_int_count); + cudf::detail::atomic_add(&column_info->big_int_count, block_type_histogram.big_int_count); + cudf::detail::atomic_add(&column_info->bool_count, block_type_histogram.bool_count); } } diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index 
f3ce6de4598..c87aa590385 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -280,7 +281,7 @@ __inline__ __device__ void add_pair_to_cache(size_type const first, size_type* joined_shared_l, size_type* joined_shared_r) { - size_type my_current_idx{atomicAdd(current_idx_shared + warp_id, size_type(1))}; + size_type my_current_idx{cudf::detail::atomic_add(current_idx_shared + warp_id, size_type(1))}; // its guaranteed to fit into the shared cache joined_shared_l[my_current_idx] = first; @@ -303,7 +304,9 @@ __device__ void flush_output_cache(unsigned int const activemask, int num_threads = __popc(activemask); cudf::size_type output_offset = 0; - if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); } + if (0 == lane_id) { + output_offset = cudf::detail::atomic_add(current_idx, current_idx_shared[warp_id]); + } // No warp sync is necessary here because we are assuming that ShuffleIndex // is internally using post-CUDA 9.0 synchronization-safe primitives diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index 146b54c0d87..5e67841b328 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -954,7 +955,7 @@ __launch_bounds__(block_size) CUDF_KERNEL if (out_valid_count) { size_type block_valid_count = cudf::detail::single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { atomicAdd(out_valid_count.value(), block_valid_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(out_valid_count.value(), block_valid_count); } } } diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 8d8f1a71672..0828a0fc772 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -158,7 +159,7 @@ CUDF_KERNEL void compute_row_partition_numbers(row_hasher_t the_hasher, row_partition_numbers[row_number] = partition_number; row_partition_offset[row_number] = - atomicAdd(&(shared_partition_sizes[partition_number]), size_type(1)); + cudf::detail::atomic_add(&(shared_partition_sizes[partition_number]), size_type(1)); tid += stride; } @@ -171,7 +172,7 @@ CUDF_KERNEL void compute_row_partition_numbers(row_hasher_t the_hasher, size_type const block_partition_size = shared_partition_sizes[partition_number]; // Update global size of each partition - atomicAdd(&global_partition_sizes[partition_number], block_partition_size); + cudf::detail::atomic_add(&global_partition_sizes[partition_number], block_partition_size); // Record the size of this partition in this block size_type const write_location = partition_number * gridDim.x + blockIdx.x; @@ -229,7 +230,7 @@ CUDF_KERNEL void compute_row_output_locations(size_type* __restrict__ row_partit // Get output location based on partition number by incrementing the // corresponding partition offset for this block size_type const row_output_location = - atomicAdd(&(shared_partition_offsets[partition_number]), size_type(1)); + cudf::detail::atomic_add(&(shared_partition_offsets[partition_number]), 
size_type(1)); // Store the row's output location in-place row_partition_numbers[row_number] = row_output_location; @@ -708,7 +709,7 @@ struct dispatch_map_type { partition_map.end(), scatter_map.begin(), [offsets = histogram.data()] __device__(auto partition_number) { - return atomicAdd(&offsets[partition_number], 1); + return cudf::detail::atomic_add(&offsets[partition_number], 1); }); // Scatter the rows into their partitions diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 8ea229368cc..95177f53a72 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -108,7 +109,9 @@ CUDF_KERNEL void replace_nulls_strings(cudf::column_device_view input, // Compute total valid count for this block and add it to global count uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count - if (threadIdx.x == 0) { atomicAdd(valid_counter, block_valid_count); } + if (threadIdx.x == 0) { + cudf::detail::atomic_add(valid_counter, static_cast(block_valid_count)); + } } template @@ -153,7 +156,9 @@ CUDF_KERNEL void replace_nulls(cudf::column_device_view input, uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count - if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); } + if (threadIdx.x == 0) { + cudf::detail::atomic_add(output_valid_count, static_cast(block_valid_count)); + } } } diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 184c30246c7..1b0a7316ad2 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -168,7 +169,9 @@ CUDF_KERNEL void replace_strings_first_pass(cudf::column_device_view input, // Compute total valid count for this block and add it to global count uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count - if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); } + if (threadIdx.x == 0) { + cudf::detail::atomic_add(output_valid_count, static_cast(block_valid_count)); + } } /** @@ -295,7 +298,9 @@ CUDF_KERNEL void replace_kernel(cudf::column_device_view input, uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count - if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); } + if (threadIdx.x == 0) { + cudf::detail::atomic_add(output_valid_count, static_cast(block_valid_count)); + } } } diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index 20845a97c7e..d14d4e3e94e 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -1077,7 +1078,7 @@ __launch_bounds__(block_size) CUDF_KERNEL size_type block_valid_count = cudf::detail::single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(output_valid_count, block_valid_count); } } /** diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu index 8f36ec98f4a..34688e8655b 100644 --- 
a/cpp/src/stream_compaction/distinct_helpers.cu +++ b/cpp/src/stream_compaction/distinct_helpers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include "distinct_helpers.hpp" #include +#include namespace cudf::detail { @@ -53,7 +54,7 @@ struct reduce_fn : reduce_by_row_fn_base #include #include +#include #include #include #include @@ -167,7 +168,7 @@ CUDF_KERNEL void fused_concatenate_string_offset_kernel( if (Nullable) { using cudf::detail::single_lane_block_sum_reduce; auto block_valid_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); } + if (threadIdx.x == 0) { cudf::detail::atomic_add(out_valid_count, block_valid_count); } } } diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 3d0210d61b0..69fd407b6c2 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -330,7 +331,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in target_count, [d_string_indices, d_targets_offsets] __device__(size_type idx) { auto const str_idx = d_string_indices[idx] - 1; - atomicAdd(d_targets_offsets + str_idx, 1); + cudf::detail::atomic_add(d_targets_offsets + str_idx, 1); }); // finally, convert the counts into offsets thrust::exclusive_scan(rmm::exec_policy(stream), diff --git a/cpp/src/transform/row_conversion.cu b/cpp/src/transform/row_conversion.cu index b294369a90e..8c51bea3c88 100644 --- a/cpp/src/transform/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -250,7 +251,7 @@ build_string_row_offsets(table_view const& tbl, auto const col = element_idx / num_rows; auto const val = d_offsets_iterators[col][row + 1] - d_offsets_iterators[col][row]; - atomicAdd(&d_row_sizes[row], val); + cudf::detail::atomic_add(&d_row_sizes[row], val); }); // transform the row sizes to include fixed width size and alignment diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index 6e90d4462df..dc570efd7aa 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -37,7 +37,7 @@ CUDF_KERNEL void gpu_atomic_test(T* result, T* data, size_t size) size_t step = blockDim.x * gridDim.x; for (; id < size; id += step) { - atomicAdd(&result[0], data[id]); + cudf::detail::atomic_add(&result[0], data[id]); atomicMin(&result[1], data[id]); atomicMax(&result[2], data[id]); cudf::genericAtomicOperation(&result[3], data[id], cudf::DeviceSum{}); From 69dc774d546f1e29c99e9d84ae40060a7d55497d Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 2 Feb 2024 22:53:00 -0600 Subject: [PATCH 2/6] Refactor other atomic operators. 
---
 .../cudf/detail/aggregation/aggregation.cuh   | 30 ++++++-----
 .../cudf/detail/utilities/device_atomics.cuh  | 51 ++++++++++---------
 .../device_atomics/device_atomics_test.cu     |  6 +--
 3 files changed, 45 insertions(+), 42 deletions(-)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index 968fc83e763..ecf2f610697 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -144,8 +144,8 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
     using Target = target_type_t<Source, aggregation::MIN>;
-    atomicMin(&target.element<Target>(target_index),
-              static_cast<Target>(source.element<Source>(source_index)));
+    cudf::detail::atomic_min(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -170,8 +170,8 @@ struct update_target_element<
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    atomicMin(&target.element<DeviceTarget>(target_index),
-              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    cudf::detail::atomic_min(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -193,8 +193,8 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
     using Target = target_type_t<Source, aggregation::MAX>;
-    atomicMax(&target.element<Target>(target_index),
-              static_cast<Target>(source.element<Source>(source_index)));
+    cudf::detail::atomic_max(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -219,8 +219,8 @@ struct update_target_element<
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    atomicMax(&target.element<DeviceTarget>(target_index),
-              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    cudf::detail::atomic_max(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -387,8 +387,8 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
     using Target = target_type_t<Source, aggregation::PRODUCT>;
-    atomicMul(&target.element<Target>(target_index),
-              static_cast<Target>(source.element<Source>(source_index)));
+    cudf::detail::atomic_mul(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
 };
@@ -449,10 +449,11 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
     using Target = target_type_t<Source, aggregation::ARGMAX>;
-    auto old = atomicCAS(&target.element<Target>(target_index), ARGMAX_SENTINEL, source_index);
+    auto old = cudf::detail::atomic_cas(
+      &target.element<Target>(target_index), ARGMAX_SENTINEL, source_index);
     if (old != ARGMAX_SENTINEL) {
       while (source.element<Source>(source_index) > source.element<Source>(old)) {
-        old = atomicCAS(&target.element<Target>(target_index), old, source_index);
+        old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
       }
     }
 
@@ -476,10 +477,11 @@ struct update_target_element<
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
     using Target = target_type_t<Source, aggregation::ARGMIN>;
-    auto old = atomicCAS(&target.element<Target>(target_index), ARGMIN_SENTINEL, source_index);
+    auto old = cudf::detail::atomic_cas(
+      &target.element<Target>(target_index), ARGMIN_SENTINEL, source_index);
     if (old != ARGMIN_SENTINEL) {
      while (source.element<Source>(source_index)
< source.element(old)) { - old = atomicCAS(&target.element(target_index), old, source_index); + old = cudf::detail::atomic_cas(&target.element(target_index), old, source_index); } } diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index 3a5108a3503..93ad9e076d5 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -27,7 +27,8 @@ * cudf::duration_us, cudf::duration_ns and bool * where CUDA atomic operations are, `atomicAdd`, `atomicMin`, `atomicMax`, * `atomicCAS`. - * Also provides `cudf::genericAtomicOperation` which performs atomic operation + * + * Also provides `cudf::detail::genericAtomicOperation` which performs atomic operation * with the given binary operator. */ @@ -364,8 +365,6 @@ struct typesAtomicCASImpl { } }; -} // namespace detail - /** * @brief compute atomic binary operation * reads the `old` located at the `address` in global or shared memory, @@ -428,7 +427,6 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address, return T(fun(address, update_value, op)); } -namespace detail { /** * @brief Overloads for `atomic_add` * reads the `old` located at the `address` in global or shared memory, @@ -441,7 +439,7 @@ namespace detail { * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms, * cudf::duration_us, cudf::duration_ns and bool * - * Cuda natively supports `sint32`, `uint32`, `uint64`, `float`, `double. + * CUDA natively supports `int32_t`, `uint32_t`, `uint64_t`, `float`, `double. * (`double` is supported after Pascal). * Other types are implemented by `atomicCAS`. * @@ -453,13 +451,11 @@ namespace detail { template __forceinline__ __device__ T atomic_add(T* address, T val) { - return cudf::genericAtomicOperation(address, val, cudf::DeviceSum{}); + return cudf::detail::genericAtomicOperation(address, val, cudf::DeviceSum{}); } -} // namespace detail -} // namespace cudf /** - * @brief Overloads for `atomicMul` + * @brief Overloads for `atomic_mul` * reads the `old` located at the `address` in global or shared memory, * computes (old * val), and stores the result back to memory at the same * address. These three operations are performed in one atomic transaction. @@ -475,24 +471,25 @@ __forceinline__ __device__ T atomic_add(T* address, T val) * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMul(T* address, T val) +__forceinline__ __device__ T atomic_mul(T* address, T val) { - return cudf::genericAtomicOperation(address, val, cudf::DeviceProduct{}); + return cudf::detail::genericAtomicOperation(address, val, cudf::DeviceProduct{}); } /** - * @brief Overloads for `atomicMin` + * @brief Overloads for `atomic_min` * reads the `old` located at the `address` in global or shared memory, * computes the minimum of old and val, and stores the result back to memory * at the same address. * These three operations are performed in one atomic transaction. * - * The supported cudf types for `atomicMin` are: + * The supported cudf types for `atomic_min` are: * int8_t, int16_t, int32_t, int64_t, float, double, * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us, * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms, * cudf::duration_us, cudf::duration_ns and bool - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. + * + * CUDA natively supports `int32_t`, `uint32_t`, `int64_t`, `uint64_t`. 
* Other types are implemented by `atomicCAS`. * * @param[in] address The address of old value in global or shared memory @@ -501,24 +498,25 @@ __forceinline__ __device__ T atomicMul(T* address, T val) * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMin(T* address, T val) +__forceinline__ __device__ T atomic_min(T* address, T val) { - return cudf::genericAtomicOperation(address, val, cudf::DeviceMin{}); + return cudf::detail::genericAtomicOperation(address, val, cudf::DeviceMin{}); } /** - * @brief Overloads for `atomicMax` + * @brief Overloads for `atomic_max` * reads the `old` located at the `address` in global or shared memory, * computes the maximum of old and val, and stores the result back to memory * at the same address. * These three operations are performed in one atomic transaction. * - * The supported cudf types for `atomicMax` are: + * The supported cudf types for `atomic_max` are: * int8_t, int16_t, int32_t, int64_t, float, double, * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us, * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms, * cudf::duration_us, cudf::duration_ns and bool - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. + * + * CUDA natively supports `int32_t`, `uint32_t`, `int64_t`, `uint64_t`. * Other types are implemented by `atomicCAS`. * * @param[in] address The address of old value in global or shared memory @@ -527,24 +525,24 @@ __forceinline__ __device__ T atomicMin(T* address, T val) * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMax(T* address, T val) +__forceinline__ __device__ T atomic_max(T* address, T val) { - return cudf::genericAtomicOperation(address, val, cudf::DeviceMax{}); + return cudf::detail::genericAtomicOperation(address, val, cudf::DeviceMax{}); } /** - * @brief Overloads for `atomicCAS` + * @brief Overloads for `atomic_cas` * reads the `old` located at the `address` in global or shared memory, * computes (`old` == `compare` ? `val` : `old`), * and stores the result back to memory at the same address. * These three operations are performed in one atomic transaction. * - * The supported cudf types for `atomicCAS` are: + * The supported cudf types for `atomic_cas` are: * int8_t, int16_t, int32_t, int64_t, float, double, * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us, * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms, * cudf::duration_us, cudf::duration_ns and bool - * Cuda natively supports `sint32`, `uint32`, `uint64`. + * CUDA natively supports `int32_t`, `uint32_t`, `uint64_t`. * Other types are implemented by `atomicCAS`. 
* * @param[in] address The address of old value in global or shared memory @@ -554,7 +552,10 @@ __forceinline__ __device__ T atomicMax(T* address, T val) * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicCAS(T* address, T compare, T val) +__forceinline__ __device__ T atomic_cas(T* address, T compare, T val) { return cudf::detail::typesAtomicCASImpl()(address, compare, val); } + +} // namespace detail +} // namespace cudf diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index dc570efd7aa..316137068f2 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -40,9 +40,9 @@ CUDF_KERNEL void gpu_atomic_test(T* result, T* data, size_t size) cudf::detail::atomic_add(&result[0], data[id]); atomicMin(&result[1], data[id]); atomicMax(&result[2], data[id]); - cudf::genericAtomicOperation(&result[3], data[id], cudf::DeviceSum{}); - cudf::genericAtomicOperation(&result[4], data[id], cudf::DeviceMin{}); - cudf::genericAtomicOperation(&result[5], data[id], cudf::DeviceMax{}); + cudf::detail::genericAtomicOperation(&result[3], data[id], cudf::DeviceSum{}); + cudf::detail::genericAtomicOperation(&result[4], data[id], cudf::DeviceMin{}); + cudf::detail::genericAtomicOperation(&result[5], data[id], cudf::DeviceMax{}); } } From 25a112523719969ccfb5896e2e4ce97a3ad6cacc Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 2 Feb 2024 22:53:33 -0600 Subject: [PATCH 3/6] Refactor docs and clean up device_atomic.cuh. --- .../cudf/detail/utilities/device_atomics.cuh | 211 ++++++++++-------- 1 file changed, 122 insertions(+), 89 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index 93ad9e076d5..61c5f35d62a 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -20,16 +20,15 @@ * @brief overloads for CUDA atomic operations * @file device_atomics.cuh * - * Provides the overloads for all of possible cudf's data types, - * where cudf's data types are, int8_t, int16_t, int32_t, int64_t, float, double, - * cudf::timestamp_D, cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us, + * Provides the overloads for all of cudf's data types, specifically int8_t, + * int16_t, int32_t, int64_t, float, double, cudf::timestamp_D, + * cudf::timestamp_s, cudf::timestamp_ms, cudf::timestamp_us, * cudf::timestamp_ns, cudf::duration_D, cudf::duration_s, cudf::duration_ms, - * cudf::duration_us, cudf::duration_ns and bool - * where CUDA atomic operations are, `atomicAdd`, `atomicMin`, `atomicMax`, - * `atomicCAS`. + * cudf::duration_us, cudf::duration_ns and bool for the CUDA atomic operations + * `atomicAdd`, `atomicMin`, `atomicMax`, `atomicCAS`. * - * Also provides `cudf::detail::genericAtomicOperation` which performs atomic operation - * with the given binary operator. + * Also provides `cudf::detail::genericAtomicOperation` which performs an + * atomic operation with the given binary operator. */ #include @@ -38,6 +37,7 @@ #include #include #include + #include namespace cudf { @@ -85,26 +85,22 @@ template struct genericAtomicOperationImpl { __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) { - using T_int = unsigned int; - bool is_32_align = (reinterpret_cast(addr) & 2) == 0; - auto* address_uint32 = - reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 
0 : 2)); + using T_int = unsigned short int; + static_assert(sizeof(T) == sizeof(T_int)); - T_int old = *address_uint32; + T old_value = *addr; T_int assumed; + T_int ret; do { - assumed = old; - T const target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); - uint16_t updating_value = type_reinterpret(op(target_value, update_value)); + T_int const new_value = type_reinterpret(op(old_value, update_value)); - T_int const new_value = (is_32_align) ? (old & 0xffff'0000) | updating_value - : (old & 0xffff) | (T_int(updating_value) << 16); - old = atomicCAS(address_uint32, assumed, new_value); - } while (assumed != old); + assumed = type_reinterpret(old_value); + ret = atomicCAS(reinterpret_cast(addr), assumed, new_value); + old_value = type_reinterpret(ret); + } while (assumed != ret); - return (is_32_align) ? T(old & 0xffff) : T(old >> 16); - ; + return old_value; } }; @@ -114,6 +110,7 @@ struct genericAtomicOperationImpl { __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) { using T_int = unsigned int; + static_assert(sizeof(T) == sizeof(T_int)); T old_value = *addr; T_int assumed; @@ -125,7 +122,6 @@ struct genericAtomicOperationImpl { assumed = type_reinterpret(old_value); ret = atomicCAS(reinterpret_cast(addr), assumed, new_value); old_value = type_reinterpret(ret); - } while (assumed != ret); return old_value; @@ -150,17 +146,17 @@ struct genericAtomicOperationImpl { assumed = type_reinterpret(old_value); ret = atomicCAS(reinterpret_cast(addr), assumed, new_value); old_value = type_reinterpret(ret); - } while (assumed != ret); return old_value; } }; -// ----------------------------------------------------------------------- -// specialized functions for operators -// `atomicAdd` supports int32_t, uint32_t, uint64_t, float, double (signed int64_t is not -// supported.) `atomicMin`, `atomicMax` support int32_t, uint32_t, uint64_t, int64_t +// Specialized functions for operators. + +// `atomicAdd` supports int32_t, uint32_t, uint64_t, float, double. +// `atomicAdd` does not support int64_t. + template <> struct genericAtomicOperationImpl { using T = float; @@ -180,17 +176,24 @@ struct genericAtomicOperationImpl { }; template <> -struct genericAtomicOperationImpl { - using T = uint32_t; +struct genericAtomicOperationImpl { + using T = int32_t; __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) { return atomicAdd(addr, update_value); } }; +// CUDA natively supports `unsigned long long int` for `atomicAdd`, +// but doesn't support `signed long long int`. +// However, since the signed integer is represented as two's complement, +// the fundamental arithmetic operations of addition are identical to +// those for unsigned binary numbers. 
+// Then, this computes as `unsigned long long int` with `atomicAdd` +// @sa https://en.wikipedia.org/wiki/Two%27s_complement template <> -struct genericAtomicOperationImpl { - using T = uint64_t; +struct genericAtomicOperationImpl { + using T = int64_t; __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) { using T_int = unsigned long long int; @@ -201,24 +204,17 @@ struct genericAtomicOperationImpl { }; template <> -struct genericAtomicOperationImpl { - using T = int32_t; +struct genericAtomicOperationImpl { + using T = uint32_t; __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) { return atomicAdd(addr, update_value); } }; -// CUDA natively supports `unsigned long long int` for `atomicAdd`, -// but doesn't support `signed long long int`. -// However, since the signed integer is represented as two's complement, -// the fundamental arithmetic operations of addition are identical to -// those for unsigned binary numbers. -// Then, this computes as `unsigned long long int` with `atomicAdd` -// @sa https://en.wikipedia.org/wiki/Two%27s_complement template <> -struct genericAtomicOperationImpl { - using T = int64_t; +struct genericAtomicOperationImpl { + using T = uint64_t; __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) { using T_int = unsigned long long int; @@ -228,6 +224,8 @@ struct genericAtomicOperationImpl { } }; +// `atomicMin`, `atomicMax` support int32_t, int64_t, uint32_t, uint64_t. + template <> struct genericAtomicOperationImpl { using T = int32_t; @@ -238,11 +236,11 @@ struct genericAtomicOperationImpl { }; template <> -struct genericAtomicOperationImpl { - using T = int32_t; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) +struct genericAtomicOperationImpl { + using T = uint32_t; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op) { - return atomicMax(addr, update_value); + return atomicMin(addr, update_value); } }; @@ -258,6 +256,36 @@ struct genericAtomicOperationImpl { } }; +template <> +struct genericAtomicOperationImpl { + using T = uint64_t; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op) + { + using T_int = unsigned long long int; + static_assert(sizeof(T) == sizeof(T_int)); + T ret = atomicMin(reinterpret_cast(addr), type_reinterpret(update_value)); + return ret; + } +}; + +template <> +struct genericAtomicOperationImpl { + using T = int32_t; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) + { + return atomicMax(addr, update_value); + } +}; + +template <> +struct genericAtomicOperationImpl { + using T = uint32_t; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) + { + return atomicMax(addr, update_value); + } +}; + template <> struct genericAtomicOperationImpl { using T = int64_t; @@ -269,6 +297,19 @@ struct genericAtomicOperationImpl { return ret; } }; + +template <> +struct genericAtomicOperationImpl { + using T = uint64_t; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) + { + using T_int = unsigned long long int; + static_assert(sizeof(T) == sizeof(T_int)); + T ret = atomicMax(reinterpret_cast(addr), type_reinterpret(update_value)); + return ret; + } +}; + // ----------------------------------------------------------------------- // the implementation of `typesAtomicCASImpl` template @@ -310,28 +351,14 @@ template struct 
typesAtomicCASImpl { __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) { - using T_int = unsigned int; - - bool is_32_align = (reinterpret_cast(addr) & 2) == 0; - auto* address_uint32 = - reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); - - T_int old = *address_uint32; - T_int assumed; - T target_value; - uint16_t u_val = type_reinterpret(update_value); - - do { - assumed = old; - target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); - if (target_value != compare) break; + using T_int = unsigned short int; + static_assert(sizeof(T) == sizeof(T_int)); - T_int new_value = - (is_32_align) ? (old & 0xffff'0000) | u_val : (old & 0xffff) | (T_int(u_val) << 16); - old = atomicCAS(address_uint32, assumed, new_value); - } while (assumed != old); + T_int ret = atomicCAS(reinterpret_cast(addr), + type_reinterpret(compare), + type_reinterpret(update_value)); - return target_value; + return type_reinterpret(ret); } }; @@ -340,6 +367,7 @@ struct typesAtomicCASImpl { __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) { using T_int = unsigned int; + static_assert(sizeof(T) == sizeof(T_int)); T_int ret = atomicCAS(reinterpret_cast(addr), type_reinterpret(compare), @@ -349,7 +377,6 @@ struct typesAtomicCASImpl { } }; -// 8 bytes atomic operation template struct typesAtomicCASImpl { __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) @@ -366,8 +393,9 @@ struct typesAtomicCASImpl { }; /** - * @brief compute atomic binary operation - * reads the `old` located at the `address` in global or shared memory, + * @brief Compute atomic binary operation + * + * Reads the `old` located at the `address` in global or shared memory, * computes 'BinaryOp'('old', 'update_value'), * and stores the result back to memory at the same address. * These three operations are performed in one atomic transaction. @@ -375,9 +403,9 @@ struct typesAtomicCASImpl { * The supported cudf types for `genericAtomicOperation` are: * int8_t, int16_t, int32_t, int64_t, float, double * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * @param[in] op The binary operator used for compute + * @param address The address of old value in global or shared memory + * @param val The value to be computed + * @param op The binary operator used for compute * * @returns The old value at `address` */ @@ -429,7 +457,8 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address, /** * @brief Overloads for `atomic_add` - * reads the `old` located at the `address` in global or shared memory, + * + * Reads the `old` located at the `address` in global or shared memory, * computes (old + val), and stores the result back to memory at the same * address. These three operations are performed in one atomic transaction. * @@ -443,8 +472,8 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address, * (`double` is supported after Pascal). * Other types are implemented by `atomicCAS`. 
* - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be added + * @param address The address of old value in global or shared memory + * @param val The value to be added * * @returns The old value at `address` */ @@ -456,7 +485,8 @@ __forceinline__ __device__ T atomic_add(T* address, T val) /** * @brief Overloads for `atomic_mul` - * reads the `old` located at the `address` in global or shared memory, + * + * Reads the `old` located at the `address` in global or shared memory, * computes (old * val), and stores the result back to memory at the same * address. These three operations are performed in one atomic transaction. * @@ -465,8 +495,8 @@ __forceinline__ __device__ T atomic_add(T* address, T val) * * All types are implemented by `atomicCAS`. * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be multiplied + * @param address The address of old value in global or shared memory + * @param val The value to be multiplied * * @returns The old value at `address` */ @@ -478,7 +508,8 @@ __forceinline__ __device__ T atomic_mul(T* address, T val) /** * @brief Overloads for `atomic_min` - * reads the `old` located at the `address` in global or shared memory, + * + * Reads the `old` located at the `address` in global or shared memory, * computes the minimum of old and val, and stores the result back to memory * at the same address. * These three operations are performed in one atomic transaction. @@ -492,8 +523,8 @@ __forceinline__ __device__ T atomic_mul(T* address, T val) * CUDA natively supports `int32_t`, `uint32_t`, `int64_t`, `uint64_t`. * Other types are implemented by `atomicCAS`. * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed + * @param address The address of old value in global or shared memory + * @param val The value to be computed * * @returns The old value at `address` */ @@ -505,7 +536,8 @@ __forceinline__ __device__ T atomic_min(T* address, T val) /** * @brief Overloads for `atomic_max` - * reads the `old` located at the `address` in global or shared memory, + * + * Reads the `old` located at the `address` in global or shared memory, * computes the maximum of old and val, and stores the result back to memory * at the same address. * These three operations are performed in one atomic transaction. @@ -519,8 +551,8 @@ __forceinline__ __device__ T atomic_min(T* address, T val) * CUDA natively supports `int32_t`, `uint32_t`, `int64_t`, `uint64_t`. * Other types are implemented by `atomicCAS`. * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed + * @param address The address of old value in global or shared memory + * @param val The value to be computed * * @returns The old value at `address` */ @@ -532,7 +564,8 @@ __forceinline__ __device__ T atomic_max(T* address, T val) /** * @brief Overloads for `atomic_cas` - * reads the `old` located at the `address` in global or shared memory, + * + * Reads the `old` located at the `address` in global or shared memory, * computes (`old` == `compare` ? `val` : `old`), * and stores the result back to memory at the same address. * These three operations are performed in one atomic transaction. @@ -545,9 +578,9 @@ __forceinline__ __device__ T atomic_max(T* address, T val) * CUDA natively supports `int32_t`, `uint32_t`, `uint64_t`. * Other types are implemented by `atomicCAS`. 
* - * @param[in] address The address of old value in global or shared memory - * @param[in] compare The value to be compared - * @param[in] val The value to be computed + * @param address The address of old value in global or shared memory + * @param compare The value to be compared + * @param val The value to be computed * * @returns The old value at `address` */ From 5ecf52f50c3f33d986bcac12e0589df4a740d45b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sat, 3 Feb 2024 20:38:06 -0600 Subject: [PATCH 4/6] Revert atomic_add in most places, update tests. --- cpp/benchmarks/join/generate_input_tables.cuh | 1 - cpp/include/cudf/detail/copy_if_else.cuh | 3 +- cpp/include/cudf/detail/copy_range.cuh | 3 +- .../cudf/detail/labeling/label_segments.cuh | 3 +- cpp/include/cudf/detail/null_mask.cuh | 3 +- cpp/include/cudf/detail/valid_if.cuh | 5 ++-- cpp/src/bitmask/null_mask.cu | 3 +- cpp/src/copying/concatenate.cu | 5 ++-- cpp/src/groupby/sort/group_quantiles.cu | 3 +- cpp/src/groupby/sort/group_std.cu | 3 +- cpp/src/io/avro/avro_gpu.cu | 4 +-- cpp/src/io/csv/csv_gpu.cu | 19 ++++++------ cpp/src/io/json/legacy/json_gpu.cu | 29 +++++++++---------- cpp/src/io/orc/stripe_data.cu | 8 ++--- cpp/src/io/parquet/chunk_dict.cu | 5 ++-- cpp/src/io/utilities/data_casting.cu | 17 +++++------ cpp/src/io/utilities/parsing_utils.cu | 3 +- cpp/src/io/utilities/type_inference.cu | 21 +++++++------- cpp/src/join/join_common_utils.cuh | 7 ++--- cpp/src/json/json_path.cu | 3 +- cpp/src/partitioning/partitioning.cu | 9 +++--- cpp/src/replace/nulls.cu | 5 ++-- cpp/src/replace/replace.cu | 5 ++-- cpp/src/rolling/detail/rolling.cuh | 3 +- cpp/src/stream_compaction/distinct_helpers.cu | 3 +- cpp/src/strings/copying/concatenate.cu | 3 +- cpp/src/strings/replace/multi.cu | 3 +- cpp/src/transform/row_conversion.cu | 3 +- .../device_atomics/device_atomics_test.cu | 6 ++-- 29 files changed, 78 insertions(+), 110 deletions(-) diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index b14541564dd..d9c428f86af 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -16,7 +16,6 @@ #pragma once -#include #include #include diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 65b453e0bd1..6162fa5ecf1 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -95,7 +94,7 @@ __launch_bounds__(block_size) CUDF_KERNEL // block_valid_count will only be valid on thread 0 if (threadIdx.x == 0) { // using an atomic here because there are multiple blocks doing this work - cudf::detail::atomic_add(valid_count, block_valid_count); + atomicAdd(valid_count, block_valid_count); } } } diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index 192a60cf9b3..4bfdaa94c53 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -101,7 +100,7 @@ CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin, auto block_null_change = cudf::detail::single_lane_block_sum_reduce(warp_null_change); if (threadIdx.x == 0) { // if the first thread in a block - cudf::detail::atomic_add(null_count, block_null_change); + atomicAdd(null_count, block_null_change); } } } diff --git 
a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 405d111ab35..f522abef0b1 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -104,7 +103,7 @@ void label_segments(InputIterator offsets_begin, // In case we have repeated offsets (i.e., we have empty segments), this `atomicAdd` call will // make sure the label values corresponding to these empty segments will be skipped in the // output. - if (dst_idx < num_labels) { cudf::detail::atomic_add(&output[dst_idx], OutputType{1}); } + if (dst_idx < num_labels) { atomicAdd(&output[dst_idx], OutputType{1}); } }); thrust::inclusive_scan(rmm::exec_policy(stream), label_begin, label_end, label_begin); } diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 3113b991a79..e57d85f2998 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -17,7 +17,6 @@ #include #include -#include #include #include #include @@ -108,7 +107,7 @@ CUDF_KERNEL void offset_bitmask_binop(Binop op, __shared__ typename BlockReduce::TempStorage temp_storage; size_type block_count = BlockReduce(temp_storage).Sum(thread_count); - if (threadIdx.x == 0) { cudf::detail::atomic_add(count_ptr, block_count); } + if (threadIdx.x == 0) { atomicAdd(count_ptr, block_count); } } /** diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index 453a072fece..d0073177445 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -66,7 +65,7 @@ CUDF_KERNEL void valid_if_kernel( } size_type block_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { cudf::detail::atomic_add(valid_count, block_count); } + if (threadIdx.x == 0) { atomicAdd(valid_count, block_count); } } /** @@ -184,7 +183,7 @@ CUDF_KERNEL void valid_if_n_kernel(InputIterator1 begin1, auto block_valid_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { cudf::detail::atomic_add(valid_counts + mask_idx, block_valid_count); } + if (threadIdx.x == 0) { atomicAdd(valid_counts + mask_idx, block_valid_count); } } } diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 52f5b0108a4..bb320e4b81a 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -306,7 +305,7 @@ CUDF_KERNEL void count_set_bits_kernel(bitmask_type const* bitmask, __shared__ typename BlockReduce::TempStorage temp_storage; size_type block_count{BlockReduce(temp_storage).Sum(thread_count)}; - if (threadIdx.x == 0) { cudf::detail::atomic_add(global_count, block_count); } + if (threadIdx.x == 0) { atomicAdd(global_count, block_count); } } } // namespace diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index e76dac38f05..b1d850e0b27 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -149,7 +148,7 @@ CUDF_KERNEL void concatenate_masks_kernel(column_device_view const* views, using detail::single_lane_block_sum_reduce; auto const block_valid_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 
0) { cudf::detail::atomic_add(out_valid_count, block_valid_count); } + if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); } } } // namespace @@ -234,7 +233,7 @@ CUDF_KERNEL void fused_concatenate_kernel(column_device_view const* input_views, if (Nullable) { using detail::single_lane_block_sum_reduce; auto block_valid_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { cudf::detail::atomic_add(out_valid_count, block_valid_count); } + if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); } } } diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index 51c03375bb2..35185c283f4 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -73,7 +72,7 @@ struct calculate_quantile_fn { [d_result = d_result, segment_size, offset, this](size_type j) { if (segment_size == 0) { d_result.set_null(offset + j); - cudf::detail::atomic_add(this->null_count, 1); + atomicAdd(this->null_count, 1); } else { d_result.set_valid(offset + j); } diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index 5b37e31f949..49ad34838e3 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -151,7 +150,7 @@ struct var_functor { // fact. (1) is more work than it's worth without benchmarking, and // this approach should outperform (2) unless large amounts of the // data is null. - cudf::detail::atomic_add(d_null_count, 1); + atomicAdd(d_null_count, 1); } else { d_result.set_valid(i); } diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index 50241c2b0eb..0f5952b0330 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -17,8 +17,6 @@ #include -#include - #include using cudf::device_span; @@ -147,7 +145,7 @@ avro_decode_row(schemadesc_s const* schema, case type_null: if (dataptr != nullptr && dst_row >= 0) { atomicAnd(static_cast(dataptr) + (dst_row >> 5), ~(1 << (dst_row & 0x1f))); - cudf::detail::atomic_add(&schema_g[i].count, 1U); + atomicAdd(&schema_g[i].count, 1U); *skipped_row = false; } break; diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 1653382e375..8252cccbdb9 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -21,7 +21,6 @@ #include #include -#include #include #include #include @@ -202,12 +201,12 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) // points to last character in the field auto const field_len = static_cast(next_delimiter - field_start); if (serialized_trie_contains(opts.trie_na, {field_start, field_len})) { - cudf::detail::atomic_add(&d_column_data[actual_col].null_count, 1); + atomicAdd(&d_column_data[actual_col].null_count, 1); } else if (serialized_trie_contains(opts.trie_true, {field_start, field_len}) || serialized_trie_contains(opts.trie_false, {field_start, field_len})) { - cudf::detail::atomic_add(&d_column_data[actual_col].bool_count, 1); + atomicAdd(&d_column_data[actual_col].bool_count, 1); } else if (cudf::io::is_infinity(field_start, next_delimiter)) { - cudf::detail::atomic_add(&d_column_data[actual_col].float_count, 1); + atomicAdd(&d_column_data[actual_col].float_count, 1); } else { long count_number = 0; long count_decimal = 0; @@ -262,9 +261,9 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) if 
(column_flags[col] & column_parse::as_datetime) { // PANDAS uses `object` dtype if the date is unparseable if (is_datetime(count_string, count_decimal, count_colon, count_dash, count_slash)) { - cudf::detail::atomic_add(&d_column_data[actual_col].datetime_count, 1); + atomicAdd(&d_column_data[actual_col].datetime_count, 1); } else { - cudf::detail::atomic_add(&d_column_data[actual_col].string_count, 1); + atomicAdd(&d_column_data[actual_col].string_count, 1); } } else if (count_number == int_req_number_cnt) { auto const is_negative = (*trimmed_field_range.first == '-'); @@ -272,16 +271,16 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) trimmed_field_range.first + (is_negative || (*trimmed_field_range.first == '+')); cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( data_begin, data_begin + count_number, is_negative, d_column_data[actual_col]); - cudf::detail::atomic_add(ptr, 1); + atomicAdd(ptr, 1); } else if (is_floatingpoint(trimmed_field_len, count_number, count_decimal, count_thousands, count_dash + count_plus, count_exponent)) { - cudf::detail::atomic_add(&d_column_data[actual_col].float_count, 1); + atomicAdd(&d_column_data[actual_col].float_count, 1); } else { - cudf::detail::atomic_add(&d_column_data[actual_col].string_count, 1); + atomicAdd(&d_column_data[actual_col].string_count, 1); } } actual_col++; @@ -373,7 +372,7 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) column_flags[col] & column_parse::as_hexadecimal)) { // set the valid bitmap - all bits were set to 0 to start set_bit(valids[actual_col], rec_id); - cudf::detail::atomic_add(&valid_counts[actual_col], 1); + atomicAdd(&valid_counts[actual_col], 1); } } } else if (dtypes[actual_col].id() == cudf::type_id::STRING) { diff --git a/cpp/src/io/json/legacy/json_gpu.cu b/cpp/src/io/json/legacy/json_gpu.cu index 713252cc39a..b196643e7e7 100644 --- a/cpp/src/io/json/legacy/json_gpu.cu +++ b/cpp/src/io/json/legacy/json_gpu.cu @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -283,7 +282,7 @@ CUDF_KERNEL void convert_data_to_columns_kernel(parse_options_view opts, // set the valid bitmap - all bits were set to 0 to start set_bit(valid_fields[desc.column], rec_id); - cudf::detail::atomic_add(&num_valid_fields[desc.column], 1); + atomicAdd(&num_valid_fields[desc.column], 1); } else { if (cudf::type_dispatcher(column_types[desc.column], ConvertFunctor{}, @@ -296,7 +295,7 @@ CUDF_KERNEL void convert_data_to_columns_kernel(parse_options_view opts, false)) { // set the valid bitmap - all bits were set to 0 to start set_bit(valid_fields[desc.column], rec_id); - cudf::detail::atomic_add(&num_valid_fields[desc.column], 1); + atomicAdd(&num_valid_fields[desc.column], 1); } } } else if (column_types[desc.column].id() == type_id::STRING) { @@ -350,16 +349,16 @@ CUDF_KERNEL void detect_data_types_kernel( // Checking if the field is empty/valid if (serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { // Increase the null count for array rows, where the null count is initialized to zero. - if (!are_rows_objects) { cudf::detail::atomic_add(&column_infos[desc.column].null_count, 1); } + if (!are_rows_objects) { atomicAdd(&column_infos[desc.column].null_count, 1); } continue; } else if (are_rows_objects) { // For files with object rows, null count is initialized to row count. The value is decreased // here for every valid field. 
- cudf::detail::atomic_add(&column_infos[desc.column].null_count, -1); + atomicAdd(&column_infos[desc.column].null_count, -1); } // Don't need counts to detect strings, any field in quotes is deduced to be a string if (desc.is_quoted) { - cudf::detail::atomic_add(&column_infos[desc.column].string_count, 1); + atomicAdd(&column_infos[desc.column].string_count, 1); continue; } @@ -406,21 +405,21 @@ CUDF_KERNEL void detect_data_types_kernel( if (maybe_hex) { --int_req_number_cnt; } if (serialized_trie_contains(opts.trie_true, {desc.value_begin, value_len}) || serialized_trie_contains(opts.trie_false, {desc.value_begin, value_len})) { - cudf::detail::atomic_add(&column_infos[desc.column].bool_count, 1); + atomicAdd(&column_infos[desc.column].bool_count, 1); } else if (digit_count == int_req_number_cnt) { bool is_negative = (*desc.value_begin == '-'); char const* data_begin = desc.value_begin + (is_negative || (*desc.value_begin == '+')); cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( data_begin, data_begin + digit_count, is_negative, column_infos[desc.column]); - cudf::detail::atomic_add(ptr, 1); + atomicAdd(ptr, 1); } else if (is_like_float( value_len, digit_count, decimal_count, dash_count + plus_count, exponent_count)) { - cudf::detail::atomic_add(&column_infos[desc.column].float_count, 1); + atomicAdd(&column_infos[desc.column].float_count, 1); } // A date-time field cannot have more than 3 non-special characters // A number field cannot have more than one decimal point else if (other_count > 3 || decimal_count > 1) { - cudf::detail::atomic_add(&column_infos[desc.column].string_count, 1); + atomicAdd(&column_infos[desc.column].string_count, 1); } else { // A date field can have either one or two '-' or '\'; A legal combination will only have one // of them To simplify the process of auto column detection, we are not covering all the @@ -428,20 +427,20 @@ CUDF_KERNEL void detect_data_types_kernel( if ((dash_count > 0 && dash_count <= 2 && slash_count == 0) || (dash_count == 0 && slash_count > 0 && slash_count <= 2)) { if (colon_count <= 2) { - cudf::detail::atomic_add(&column_infos[desc.column].datetime_count, 1); + atomicAdd(&column_infos[desc.column].datetime_count, 1); } else { - cudf::detail::atomic_add(&column_infos[desc.column].string_count, 1); + atomicAdd(&column_infos[desc.column].string_count, 1); } } else { // Default field type is string - cudf::detail::atomic_add(&column_infos[desc.column].string_count, 1); + atomicAdd(&column_infos[desc.column].string_count, 1); } } } if (!are_rows_objects) { // For array rows, mark missing fields as null for (; input_field_index < num_columns; ++input_field_index) - cudf::detail::atomic_add(&column_infos[input_field_index].null_count, 1); + atomicAdd(&column_infos[input_field_index].null_count, 1); } } @@ -499,7 +498,7 @@ CUDF_KERNEL void collect_keys_info_kernel(parse_options_view const options, for (auto field_range = advance(row_data_range.first); field_range.key_begin < row_data_range.second; field_range = advance(field_range.value_end)) { - auto const idx = cudf::detail::atomic_add(keys_cnt, 1ULL); + auto const idx = atomicAdd(keys_cnt, 1ULL); if (keys_info.has_value()) { auto const len = field_range.key_end - field_range.key_begin; keys_info->column(0).element(idx) = field_range.key_begin - data.begin(); diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 09f2744e324..966af14560a 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -18,7 +18,6 @@ #include 
-#include #include #include @@ -1402,10 +1401,10 @@ CUDF_KERNEL void __launch_bounds__(block_size) // If we have an index, seek to the initial run and update row positions if (num_rowgroups > 0) { if (s->top.data.index.strm_offset[0] > s->chunk.strm_len[CI_DATA]) { - cudf::detail::atomic_add(error_count, 1); + atomicAdd(error_count, 1); } if (s->top.data.index.strm_offset[1] > s->chunk.strm_len[CI_DATA2]) { - cudf::detail::atomic_add(error_count, 1); + atomicAdd(error_count, 1); } uint32_t ofs0 = min(s->top.data.index.strm_offset[0], s->chunk.strm_len[CI_DATA]); uint32_t ofs1 = min(s->top.data.index.strm_offset[1], s->chunk.strm_len[CI_DATA2]); @@ -1826,8 +1825,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (num_rowgroups > 0) { row_groups[blockIdx.y][blockIdx.x].num_child_rows = s->num_child_rows; } - cudf::detail::atomic_add(&chunks[chunk_id].num_child_rows, - static_cast(s->num_child_rows)); + atomicAdd(&chunks[chunk_id].num_child_rows, static_cast(s->num_child_rows)); } } diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 643394ed8a1..a43c6d4cbb6 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -176,9 +175,9 @@ CUDF_KERNEL void __launch_bounds__(block_size) __syncthreads(); auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); if (t == 0) { - total_num_dict_entries = cudf::detail::atomic_add(&chunk->num_dict_entries, num_unique); + total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique); total_num_dict_entries += num_unique; - cudf::detail::atomic_add(&chunk->uniq_data_size, uniq_data_size); + atomicAdd(&chunk->uniq_data_size, uniq_data_size); } __syncthreads(); diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 71ff52becc2..8fd860d9492 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -438,14 +437,14 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, auto get_next_string = [&]() { if constexpr (is_warp) { size_type istring; - if (lane == 0) { istring = cudf::detail::atomic_add(str_counter, 1); } + if (lane == 0) { istring = atomicAdd(str_counter, 1); } return __shfl_sync(0xffffffff, istring, 0); } else { // Ensure lane 0 doesn't update istring before all threads have read the previous iteration's // istring value __syncthreads(); __shared__ size_type istring; - if (lane == 0) { istring = cudf::detail::atomic_add(str_counter, 1); } + if (lane == 0) { istring = atomicAdd(str_counter, 1); } __syncthreads(); return istring; } @@ -475,7 +474,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, if (is_null_literal && null_mask != nullptr) { if (lane == 0) { clear_bit(null_mask, istring); - cudf::detail::atomic_add(null_count_data, 1); + atomicAdd(null_count_data, 1); if (!d_chars) d_offsets[istring] = 0; } continue; // gride-stride return; @@ -619,7 +618,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, if (!d_chars && lane == 0) { if (null_mask != nullptr) { clear_bit(null_mask, istring); - cudf::detail::atomic_add(null_count_data, 1); + atomicAdd(null_count_data, 1); } last_offset = 0; d_offsets[istring] = 0; @@ -760,7 +759,7 @@ struct string_parse { options.trie_na, {in_begin, static_cast(num_in_chars)}); if (is_null_literal && null_mask != nullptr) { clear_bit(null_mask, idx); - 
cudf::detail::atomic_add(null_count_data, 1); + atomicAdd(null_count_data, 1); if (!d_chars) d_offsets[idx] = 0; return; } @@ -771,7 +770,7 @@ struct string_parse { if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { if (null_mask != nullptr) { clear_bit(null_mask, idx); - cudf::detail::atomic_add(null_count_data, 1); + atomicAdd(null_count_data, 1); } if (!d_chars) d_offsets[idx] = 0; } else { @@ -954,7 +953,7 @@ std::unique_ptr parse_data( if (is_null_literal) { col.set_null(row); - cudf::detail::atomic_add(null_count_data, 1); + atomicAdd(null_count_data, 1); return; } @@ -972,7 +971,7 @@ std::unique_ptr parse_data( false); if (not is_parsed) { col.set_null(row); - cudf::detail::atomic_add(null_count_data, 1); + atomicAdd(null_count_data, 1); } }); diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index 5eed1e5ca04..ddd094b09e5 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -107,7 +106,7 @@ CUDF_KERNEL void count_and_set_positions(char const* data, // Process the data for (long i = 0; i < byteToProcess; i++) { if (raw[i] == key) { - auto const idx = cudf::detail::atomic_add(count, static_cast(1)); + auto const idx = atomicAdd(count, static_cast(1)); setElement(positions, idx, did + offset + i, key); } } diff --git a/cpp/src/io/utilities/type_inference.cu b/cpp/src/io/utilities/type_inference.cu index eead98ca1c8..b446ad41946 100644 --- a/cpp/src/io/utilities/type_inference.cu +++ b/cpp/src/io/utilities/type_inference.cu @@ -19,7 +19,6 @@ #include #include -#include #include #include @@ -205,16 +204,16 @@ CUDF_KERNEL void infer_column_type_kernel(OptionsView options, auto const block_type_histogram = BlockReduce(temp_storage).Reduce(thread_type_histogram, custom_sum{}); if (threadIdx.x == 0) { - cudf::detail::atomic_add(&column_info->null_count, block_type_histogram.null_count); - cudf::detail::atomic_add(&column_info->float_count, block_type_histogram.float_count); - cudf::detail::atomic_add(&column_info->datetime_count, block_type_histogram.datetime_count); - cudf::detail::atomic_add(&column_info->string_count, block_type_histogram.string_count); - cudf::detail::atomic_add(&column_info->negative_small_int_count, - block_type_histogram.negative_small_int_count); - cudf::detail::atomic_add(&column_info->positive_small_int_count, - block_type_histogram.positive_small_int_count); - cudf::detail::atomic_add(&column_info->big_int_count, block_type_histogram.big_int_count); - cudf::detail::atomic_add(&column_info->bool_count, block_type_histogram.bool_count); + atomicAdd(&column_info->null_count, block_type_histogram.null_count); + atomicAdd(&column_info->float_count, block_type_histogram.float_count); + atomicAdd(&column_info->datetime_count, block_type_histogram.datetime_count); + atomicAdd(&column_info->string_count, block_type_histogram.string_count); + atomicAdd(&column_info->negative_small_int_count, + block_type_histogram.negative_small_int_count); + atomicAdd(&column_info->positive_small_int_count, + block_type_histogram.positive_small_int_count); + atomicAdd(&column_info->big_int_count, block_type_histogram.big_int_count); + atomicAdd(&column_info->bool_count, block_type_histogram.bool_count); } } diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index c87aa590385..71b60e1477e 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -20,7 
+20,6 @@ #include #include #include -#include #include #include @@ -281,7 +280,7 @@ __inline__ __device__ void add_pair_to_cache(size_type const first, size_type* joined_shared_l, size_type* joined_shared_r) { - size_type my_current_idx{cudf::detail::atomic_add(current_idx_shared + warp_id, size_type(1))}; + size_type my_current_idx{atomicAdd(current_idx_shared + warp_id, size_type(1))}; // its guaranteed to fit into the shared cache joined_shared_l[my_current_idx] = first; @@ -304,9 +303,7 @@ __device__ void flush_output_cache(unsigned int const activemask, int num_threads = __popc(activemask); cudf::size_type output_offset = 0; - if (0 == lane_id) { - output_offset = cudf::detail::atomic_add(current_idx, current_idx_shared[warp_id]); - } + if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); } // No warp sync is necessary here because we are assuming that ShuffleIndex // is internally using post-CUDA 9.0 synchronization-safe primitives diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index 5e67841b328..146b54c0d87 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -955,7 +954,7 @@ __launch_bounds__(block_size) CUDF_KERNEL if (out_valid_count) { size_type block_valid_count = cudf::detail::single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { cudf::detail::atomic_add(out_valid_count.value(), block_valid_count); } + if (threadIdx.x == 0) { atomicAdd(out_valid_count.value(), block_valid_count); } } } diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 0828a0fc772..8d8f1a71672 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -159,7 +158,7 @@ CUDF_KERNEL void compute_row_partition_numbers(row_hasher_t the_hasher, row_partition_numbers[row_number] = partition_number; row_partition_offset[row_number] = - cudf::detail::atomic_add(&(shared_partition_sizes[partition_number]), size_type(1)); + atomicAdd(&(shared_partition_sizes[partition_number]), size_type(1)); tid += stride; } @@ -172,7 +171,7 @@ CUDF_KERNEL void compute_row_partition_numbers(row_hasher_t the_hasher, size_type const block_partition_size = shared_partition_sizes[partition_number]; // Update global size of each partition - cudf::detail::atomic_add(&global_partition_sizes[partition_number], block_partition_size); + atomicAdd(&global_partition_sizes[partition_number], block_partition_size); // Record the size of this partition in this block size_type const write_location = partition_number * gridDim.x + blockIdx.x; @@ -230,7 +229,7 @@ CUDF_KERNEL void compute_row_output_locations(size_type* __restrict__ row_partit // Get output location based on partition number by incrementing the // corresponding partition offset for this block size_type const row_output_location = - cudf::detail::atomic_add(&(shared_partition_offsets[partition_number]), size_type(1)); + atomicAdd(&(shared_partition_offsets[partition_number]), size_type(1)); // Store the row's output location in-place row_partition_numbers[row_number] = row_output_location; @@ -709,7 +708,7 @@ struct dispatch_map_type { partition_map.end(), scatter_map.begin(), [offsets = histogram.data()] __device__(auto partition_number) { - return cudf::detail::atomic_add(&offsets[partition_number], 1); + return atomicAdd(&offsets[partition_number], 
1); }); // Scatter the rows into their partitions diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 95177f53a72..014171f2b40 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -110,7 +109,7 @@ CUDF_KERNEL void replace_nulls_strings(cudf::column_device_view input, uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count if (threadIdx.x == 0) { - cudf::detail::atomic_add(valid_counter, static_cast(block_valid_count)); + atomicAdd(valid_counter, static_cast(block_valid_count)); } } @@ -157,7 +156,7 @@ CUDF_KERNEL void replace_nulls(cudf::column_device_view input, cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count if (threadIdx.x == 0) { - cudf::detail::atomic_add(output_valid_count, static_cast(block_valid_count)); + atomicAdd(output_valid_count, static_cast(block_valid_count)); } } } diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 1b0a7316ad2..88d5d3a2375 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -170,7 +169,7 @@ CUDF_KERNEL void replace_strings_first_pass(cudf::column_device_view input, uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count if (threadIdx.x == 0) { - cudf::detail::atomic_add(output_valid_count, static_cast(block_valid_count)); + atomicAdd(output_valid_count, static_cast(block_valid_count)); } } @@ -299,7 +298,7 @@ CUDF_KERNEL void replace_kernel(cudf::column_device_view input, cudf::detail::single_lane_block_sum_reduce(valid_sum); // one thread computes and adds to output_valid_count if (threadIdx.x == 0) { - cudf::detail::atomic_add(output_valid_count, static_cast(block_valid_count)); + atomicAdd(output_valid_count, static_cast(block_valid_count)); } } } diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index d14d4e3e94e..20845a97c7e 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -1078,7 +1077,7 @@ __launch_bounds__(block_size) CUDF_KERNEL size_type block_valid_count = cudf::detail::single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { cudf::detail::atomic_add(output_valid_count, block_valid_count); } + if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); } } /** diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu index 34688e8655b..c3127aa2d20 100644 --- a/cpp/src/stream_compaction/distinct_helpers.cu +++ b/cpp/src/stream_compaction/distinct_helpers.cu @@ -17,7 +17,6 @@ #include "distinct_helpers.hpp" #include -#include namespace cudf::detail { @@ -54,7 +53,7 @@ struct reduce_fn : reduce_by_row_fn_base #include #include -#include #include #include #include @@ -168,7 +167,7 @@ CUDF_KERNEL void fused_concatenate_string_offset_kernel( if (Nullable) { using cudf::detail::single_lane_block_sum_reduce; auto block_valid_count = single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { cudf::detail::atomic_add(out_valid_count, block_valid_count); } + if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); } } } diff 
--git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 69fd407b6c2..3d0210d61b0 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -331,7 +330,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in target_count, [d_string_indices, d_targets_offsets] __device__(size_type idx) { auto const str_idx = d_string_indices[idx] - 1; - cudf::detail::atomic_add(d_targets_offsets + str_idx, 1); + atomicAdd(d_targets_offsets + str_idx, 1); }); // finally, convert the counts into offsets thrust::exclusive_scan(rmm::exec_policy(stream), diff --git a/cpp/src/transform/row_conversion.cu b/cpp/src/transform/row_conversion.cu index 8c51bea3c88..b294369a90e 100644 --- a/cpp/src/transform/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -251,7 +250,7 @@ build_string_row_offsets(table_view const& tbl, auto const col = element_idx / num_rows; auto const val = d_offsets_iterators[col][row + 1] - d_offsets_iterators[col][row]; - cudf::detail::atomic_add(&d_row_sizes[row], val); + atomicAdd(&d_row_sizes[row], val); }); // transform the row sizes to include fixed width size and alignment diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index 316137068f2..f215b4fce24 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -38,8 +38,8 @@ CUDF_KERNEL void gpu_atomic_test(T* result, T* data, size_t size) for (; id < size; id += step) { cudf::detail::atomic_add(&result[0], data[id]); - atomicMin(&result[1], data[id]); - atomicMax(&result[2], data[id]); + cudf::detail::atomic_min(&result[1], data[id]); + cudf::detail::atomic_max(&result[2], data[id]); cudf::detail::genericAtomicOperation(&result[3], data[id], cudf::DeviceSum{}); cudf::detail::genericAtomicOperation(&result[4], data[id], cudf::DeviceMin{}); cudf::detail::genericAtomicOperation(&result[5], data[id], cudf::DeviceMax{}); @@ -72,7 +72,7 @@ __device__ T atomic_op(T* addr, T const& value, BinaryOp op) assumed = old_value; T new_value = op(old_value, value); - old_value = atomicCAS(addr, assumed, new_value); + old_value = cudf::detail::atomic_cas(addr, assumed, new_value); } while (assumed != old_value); return old_value; From 0d46d99ba2725bda2e9840b54dd2bff6cd72769b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 8 Mar 2024 08:53:57 -0800 Subject: [PATCH 5/6] Fix copyright. --- cpp/include/cudf/detail/labeling/label_segments.cuh | 2 +- cpp/src/groupby/sort/group_std.cu | 2 +- cpp/src/stream_compaction/distinct_helpers.cu | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index f522abef0b1..9051230a272 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index 49ad34838e3..30b6f67dffe 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu index c3127aa2d20..8f36ec98f4a 100644 --- a/cpp/src/stream_compaction/distinct_helpers.cu +++ b/cpp/src/stream_compaction/distinct_helpers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 282a3415fd6ff77d66362d0ba678c0bdd9d1f1f0 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 8 Mar 2024 17:20:49 -0800 Subject: [PATCH 6/6] Add missing headers that were implicitly included before this PR. --- cpp/benchmarks/join/generate_input_tables.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index d9c428f86af..93401f01026 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -16,9 +16,11 @@ #pragma once +#include #include #include +#include #include #include
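
The doc comments reworked in PATCH 3/6 describe the fallback used for types without a native CUDA atomic: read the old value, apply the binary operator, and publish the result with `atomicCAS`, retrying whenever another thread intervened in between. The minimal, self-contained CUDA sketch below illustrates that CAS-loop pattern for a 4-byte float add together with a toy call site. It is not part of the patch series; the `cas_loop_add` helper, the `accumulate` kernel, and the launch configuration are illustrative assumptions rather than libcudf code (inside libcudf the equivalent work is done by the `genericAtomicOperationImpl` specializations and the `cudf::detail::atomic_add` overload refactored above).

// Illustrative sketch only; not part of the patch series. It mirrors the
// 4-byte CAS-loop fallback described in device_atomics.cuh and compiles on
// its own with nvcc. Names and launch configuration are hypothetical.
#include <cstdio>
#include <vector>

#include <cuda_runtime.h>

// Read the current bits, compute old + val, and try to swap the result in;
// retry whenever another thread modified the location first. Returns the
// previous value, matching atomicAdd semantics.
__device__ float cas_loop_add(float* addr, float val)
{
  auto* addr_as_uint    = reinterpret_cast<unsigned int*>(addr);
  unsigned int old_bits = *addr_as_uint;
  unsigned int assumed;
  do {
    assumed             = old_bits;
    float const updated = __uint_as_float(assumed) + val;
    old_bits            = atomicCAS(addr_as_uint, assumed, __float_as_uint(updated));
  } while (assumed != old_bits);
  return __uint_as_float(old_bits);
}

// Toy kernel: every thread folds one element into a single running total.
__global__ void accumulate(float* total, float const* data, int n)
{
  int const i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { cas_loop_add(total, data[i]); }
}

int main()
{
  constexpr int n = 1024;
  std::vector<float> h_data(n, 1.0f);

  float* d_total = nullptr;
  float* d_data  = nullptr;
  cudaMalloc(&d_total, sizeof(float));
  cudaMalloc(&d_data, n * sizeof(float));
  cudaMemset(d_total, 0, sizeof(float));
  cudaMemcpy(d_data, h_data.data(), n * sizeof(float), cudaMemcpyHostToDevice);

  accumulate<<<(n + 255) / 256, 256>>>(d_total, d_data, n);

  float h_total = 0.0f;
  cudaMemcpy(&h_total, d_total, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("sum = %f (expected %d)\n", h_total, n);

  cudaFree(d_data);
  cudaFree(d_total);
  return 0;
}

Compared with a plain `atomicAdd`, the CAS loop generates extra memory traffic under contention, which is why the specializations in device_atomics.cuh prefer the native `atomicAdd`, `atomicMin`, and `atomicMax` instructions whenever CUDA provides one for the type and fall back to `atomicCAS` only for the remaining types.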