diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh
index 84e607a9f28..b14541564dd 100644
--- a/cpp/benchmarks/join/generate_input_tables.cuh
+++ b/cpp/benchmarks/join/generate_input_tables.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,7 +31,7 @@
 
 #include <cassert>
 
-__global__ static void init_curand(curandState* state, int const nstates)
+CUDF_KERNEL void init_curand(curandState* state, int const nstates)
 {
   int ithread = threadIdx.x + blockIdx.x * blockDim.x;
 
@@ -39,11 +39,11 @@ __global__ static void init_curand(curandState* state, int const nstates)
 }
 
 template <typename key_type, typename size_type>
-__global__ static void init_build_tbl(key_type* const build_tbl,
-                                      size_type const build_tbl_size,
-                                      int const multiplicity,
-                                      curandState* state,
-                                      int const num_states)
+CUDF_KERNEL void init_build_tbl(key_type* const build_tbl,
+                                size_type const build_tbl_size,
+                                int const multiplicity,
+                                curandState* state,
+                                int const num_states)
 {
   auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x;
   auto const stride    = blockDim.x * gridDim.x;
@@ -61,14 +61,14 @@ __global__ static void init_build_tbl(key_type* const build_tbl,
 }
 
 template <typename key_type, typename size_type>
-__global__ void init_probe_tbl(key_type* const probe_tbl,
-                               size_type const probe_tbl_size,
-                               size_type const build_tbl_size,
-                               key_type const rand_max,
-                               double const selectivity,
-                               int const multiplicity,
-                               curandState* state,
-                               int const num_states)
+CUDF_KERNEL void init_probe_tbl(key_type* const probe_tbl,
+                                size_type const probe_tbl_size,
+                                size_type const build_tbl_size,
+                                key_type const rand_max,
+                                double const selectivity,
+                                int const multiplicity,
+                                curandState* state,
+                                int const num_states)
 {
   auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x;
   auto const stride    = blockDim.x * gridDim.x;
diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
index 3f985cffb1f..161328ae088 100644
--- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
+++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -58,7 +58,7 @@ constexpr int block_size = 256;
 
 // This is for NO_DISPATCHING
 template <FunctorType functor_type, class T>
-__global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_type n_cols)
+CUDF_KERNEL void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_type n_cols)
 {
   using F               = Functor<T, functor_type>;
   cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x;
@@ -72,7 +72,7 @@ __global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_
 
 // This is for HOST_DISPATCHING
 template <FunctorType functor_type, class T>
-__global__ void host_dispatching_kernel(cudf::mutable_column_device_view source_column)
+CUDF_KERNEL void host_dispatching_kernel(cudf::mutable_column_device_view source_column)
 {
   using F = Functor<T, functor_type>;
   T* A    = source_column.data<T>();
@@ -124,7 +124,7 @@ struct RowHandle {
 
 // This is for DEVICE_DISPATCHING
 template <FunctorType functor_type>
-__global__ void device_dispatching_kernel(cudf::mutable_table_device_view source)
+CUDF_KERNEL void device_dispatching_kernel(cudf::mutable_table_device_view source)
 {
   cudf::size_type const n_rows = source.num_rows();
   cudf::size_type index        = threadIdx.x + blockIdx.x * blockDim.x;
diff --git a/cpp/examples/strings/custom_optimized.cu b/cpp/examples/strings/custom_optimized.cu
index 36521871ad8..522093bc647 100644
--- a/cpp/examples/strings/custom_optimized.cu
+++ b/cpp/examples/strings/custom_optimized.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -37,9 +37,9 @@
  * @param d_visibilities Column of visibilities
  * @param d_sizes Output sizes for each row
  */
-__global__ void sizes_kernel(cudf::column_device_view const d_names,
-                             cudf::column_device_view const d_visibilities,
-                             cudf::size_type* d_sizes)
+__global__ static void sizes_kernel(cudf::column_device_view const d_names,
+                                    cudf::column_device_view const d_visibilities,
+                                    cudf::size_type* d_sizes)
 {
   // The row index is resolved from the CUDA thread/block objects
   auto index = threadIdx.x + blockIdx.x * blockDim.x;
@@ -74,10 +74,10 @@ __global__ void sizes_kernel(cudf::column_device_view const d_names,
  * @param d_offsets Byte offset in `d_chars` for each row
  * @param d_chars Output memory for all rows
  */
-__global__ void redact_kernel(cudf::column_device_view const d_names,
-                              cudf::column_device_view const d_visibilities,
-                              cudf::size_type const* d_offsets,
-                              char* d_chars)
+__global__ static void redact_kernel(cudf::column_device_view const d_names,
+                                     cudf::column_device_view const d_visibilities,
+                                     cudf::size_type const* d_offsets,
+                                     char* d_chars)
 {
   // The row index is resolved from the CUDA thread/block objects
   auto index = threadIdx.x + blockIdx.x * blockDim.x;
diff --git a/cpp/examples/strings/custom_prealloc.cu b/cpp/examples/strings/custom_prealloc.cu
index 0af4c47e947..0ef1bf2465d 100644
--- a/cpp/examples/strings/custom_prealloc.cu
+++ b/cpp/examples/strings/custom_prealloc.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -37,12 +37,12 @@
  * @param d_offsets Byte offset in `d_chars` for each row
  * @param d_output Output array of string_view objects
  */
-__global__ void redact_kernel(cudf::column_device_view const d_names,
-                              cudf::column_device_view const d_visibilities,
-                              cudf::string_view redaction,
-                              char* working_memory,
-                              cudf::size_type const* d_offsets,
-                              cudf::string_view* d_output)
+__global__ static void redact_kernel(cudf::column_device_view const d_names,
+                                     cudf::column_device_view const d_visibilities,
+                                     cudf::string_view redaction,
+                                     char* working_memory,
+                                     cudf::size_type const* d_offsets,
+                                     cudf::string_view* d_output)
 {
   // The row index is resolved from the CUDA thread/block objects
   auto index = threadIdx.x + blockIdx.x * blockDim.x;
diff --git a/cpp/examples/strings/custom_with_malloc.cu b/cpp/examples/strings/custom_with_malloc.cu
index 32f7bf7cbd0..e02fb52cd76 100644
--- a/cpp/examples/strings/custom_with_malloc.cu
+++ b/cpp/examples/strings/custom_with_malloc.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -64,10 +64,10 @@ void set_malloc_heap_size(size_t heap_size = 1073741824)  // 1GB
  * @param redaction Redacted string replacement
  * @param d_output Output array of string_view objects
  */
-__global__ void redact_kernel(cudf::column_device_view const d_names,
-                              cudf::column_device_view const d_visibilities,
-                              cudf::string_view redaction,
-                              cudf::string_view* d_output)
+__global__ static void redact_kernel(cudf::column_device_view const d_names,
+                                     cudf::column_device_view const d_visibilities,
+                                     cudf::string_view redaction,
+                                     cudf::string_view* d_output)
 {
   // The row index is resolved from the CUDA thread/block objects
   auto index = threadIdx.x + blockIdx.x * blockDim.x;
@@ -107,7 +107,9 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
  * @param redaction Redacted string replacement (not to be freed)
  * @param d_output Output array of string_view objects to free
  */
-__global__ void free_kernel(cudf::string_view redaction, cudf::string_view* d_output, int count)
+__global__ static void free_kernel(cudf::string_view redaction,
+                                   cudf::string_view* d_output,
+                                   int count)
 {
   auto index = threadIdx.x + blockIdx.x * blockDim.x;
   if (index >= count) return;
diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index ebe7e052b6d..1d051ea32ff 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -52,10 +52,10 @@ namespace detail {
 
 // Compute the count of elements that pass the mask within each block
 template <typename Filter, int block_size>
-__global__ void compute_block_counts(cudf::size_type* __restrict__ block_counts,
-                                     cudf::size_type size,
-                                     cudf::size_type per_thread,
-                                     Filter filter)
+CUDF_KERNEL void compute_block_counts(cudf::size_type* __restrict__ block_counts,
+                                      cudf::size_type size,
+                                      cudf::size_type per_thread,
+                                      Filter filter)
 {
   int tid   = threadIdx.x + per_thread * block_size * blockIdx.x;
   int count = 0;
@@ -96,7 +96,7 @@ __device__ cudf::size_type block_scan_mask(bool mask_true, cudf::size_type& bloc
 //
 // Note: `filter` is not run on indices larger than the input column size
 template <typename T, typename Filter, int block_size, bool has_validity>
-__launch_bounds__(block_size) __global__
+__launch_bounds__(block_size) CUDF_KERNEL
   void scatter_kernel(cudf::mutable_column_device_view output_view,
                       cudf::size_type* output_null_count,
                       cudf::column_device_view input_view,
diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh
index 04ad1f20196..6162fa5ecf1 100644
--- a/cpp/include/cudf/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/detail/copy_if_else.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -37,7 +37,7 @@ template <size_type block_size,
           typename RightIter,
           typename Filter,
           bool has_validity>
-__launch_bounds__(block_size) __global__
+__launch_bounds__(block_size) CUDF_KERNEL
   void copy_if_else_kernel(LeftIter lhs,
                            RightIter rhs,
                            Filter filter,
diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh
index 16e4e7a1297..4bfdaa94c53 100644
--- a/cpp/include/cudf/detail/copy_range.cuh
+++ b/cpp/include/cudf/detail/copy_range.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -40,12 +40,12 @@
 
 template <typename SourceValueIterator,
           typename SourceValidityIterator,
          bool has_validity>
-__global__ void copy_range_kernel(SourceValueIterator source_value_begin,
-                                  SourceValidityIterator source_validity_begin,
-                                  cudf::mutable_column_device_view target,
-                                  cudf::size_type target_begin,
-                                  cudf::size_type target_end,
-                                  cudf::size_type* __restrict__ const null_count)
+CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin,
+                                   SourceValidityIterator source_validity_begin,
+                                   cudf::mutable_column_device_view target,
+                                   cudf::size_type target_begin,
+                                   cudf::size_type target_end,
+                                   cudf::size_type* __restrict__ const null_count)
 {
   using cudf::detail::warp_size;
diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh
index ae05d4c6954..e57d85f2998 100644
--- a/cpp/include/cudf/detail/null_mask.cuh
+++ b/cpp/include/cudf/detail/null_mask.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -61,12 +61,12 @@ namespace detail {
  * @param count_ptr Pointer to counter of set bits
  */
 template <int block_size, typename Binop>
-__global__ void offset_bitmask_binop(Binop op,
-                                     device_span<bitmask_type> destination,
-                                     device_span<bitmask_type const* const> source,
-                                     device_span<size_type const> source_begin_bits,
-                                     size_type source_size_bits,
-                                     size_type* count_ptr)
+CUDF_KERNEL void offset_bitmask_binop(Binop op,
+                                      device_span<bitmask_type> destination,
+                                      device_span<bitmask_type const* const> source,
+                                      device_span<size_type const> source_begin_bits,
+                                      size_type source_size_bits,
+                                      size_type* count_ptr)
 {
   auto const tid = threadIdx.x + blockIdx.x * blockDim.x;
 
@@ -214,11 +214,11 @@ enum class count_bits_policy : bool {
  * in each range is updated.
  */
 template <typename OffsetIterator, typename OutputIterator>
-__global__ void subtract_set_bits_range_boundaries_kernel(bitmask_type const* bitmask,
-                                                          size_type num_ranges,
-                                                          OffsetIterator first_bit_indices,
-                                                          OffsetIterator last_bit_indices,
-                                                          OutputIterator null_counts)
+CUDF_KERNEL void subtract_set_bits_range_boundaries_kernel(bitmask_type const* bitmask,
+                                                           size_type num_ranges,
+                                                           OffsetIterator first_bit_indices,
+                                                           OffsetIterator last_bit_indices,
+                                                           OutputIterator null_counts)
 {
   constexpr size_type const word_size_in_bits{detail::size_in_bits<bitmask_type>()};
diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh
index 264302df0e9..86c85ca8d06 100644
--- a/cpp/include/cudf/detail/utilities/cuda.cuh
+++ b/cpp/include/cudf/detail/utilities/cuda.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -211,7 +211,7 @@ __device__ inline T round_up_pow2(T number_to_round, T modulus)
 }
 
 template <typename F>
-__global__ void single_thread_kernel(F f)
+CUDF_KERNEL void single_thread_kernel(F f)
 {
   f();
 }
diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index f3f95dad017..d0073177445 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -44,7 +44,7 @@ namespace detail {
  * @param[out] valid_count The count of set bits in the output bitmask
 */
 template <size_type block_size, typename InputIterator, typename Predicate>
-__global__ void valid_if_kernel(
+CUDF_KERNEL void valid_if_kernel(
   bitmask_type* output, InputIterator begin, size_type size, Predicate p, size_type* valid_count)
 {
   constexpr size_type leader_lane{0};
@@ -151,13 +151,13 @@ template <typename InputIterator1,
           typename InputIterator2,
           typename BinaryPredicate,
           int32_t block_size>
-__global__ void valid_if_n_kernel(InputIterator1 begin1,
-                                  InputIterator2 begin2,
-                                  BinaryPredicate p,
-                                  bitmask_type* masks[],
-                                  size_type mask_count,
-                                  size_type mask_num_bits,
-                                  size_type* valid_counts)
+CUDF_KERNEL void valid_if_n_kernel(InputIterator1 begin1,
+                                   InputIterator2 begin2,
+                                   BinaryPredicate p,
+                                   bitmask_type* masks[],
+                                   size_type mask_count,
+                                   size_type mask_num_bits,
+                                   size_type* valid_counts)
 {
   for (size_type mask_idx = 0; mask_idx < mask_count; mask_idx++) {
     auto const mask = masks[mask_idx];
diff --git a/cpp/include/cudf/hashing/detail/helper_functions.cuh b/cpp/include/cudf/hashing/detail/helper_functions.cuh
index cd58ec5f57d..3489fdeccee 100644
--- a/cpp/include/cudf/hashing/detail/helper_functions.cuh
+++ b/cpp/include/cudf/hashing/detail/helper_functions.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2017-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -130,10 +130,10 @@ __forceinline__ __device__ void store_pair_vectorized(pair_type* __restrict__ co
 }
 
 template <typename value_type, typename size_type, typename key_type, typename elem_type>
-__global__ void init_hashtbl(value_type* __restrict__ const hashtbl_values,
-                             size_type const n,
-                             key_type const key_val,
-                             elem_type const elem_val)
+CUDF_KERNEL void init_hashtbl(value_type* __restrict__ const hashtbl_values,
+                              size_type const n,
+                              key_type const key_val,
+                              elem_type const elem_val)
 {
   size_type const idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < n) {
diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index e681373e6e0..f7d2ebebe9a 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -78,11 +78,11 @@ __forceinline__ __device__ uint4 load_uint4(char const* ptr)
  * @param total_out_strings Number of output strings to be gathered.
 */
 template <typename StringIterator, typename MapIterator>
-__global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
-                                                char* out_chars,
-                                                cudf::detail::input_offsetalator const out_offsets,
-                                                MapIterator string_indices,
-                                                size_type total_out_strings)
+CUDF_KERNEL void gather_chars_fn_string_parallel(StringIterator strings_begin,
+                                                 char* out_chars,
+                                                 cudf::detail::input_offsetalator const out_offsets,
+                                                 MapIterator string_indices,
+                                                 size_type total_out_strings)
 {
   constexpr size_t out_datatype_size = sizeof(uint4);
   constexpr size_t in_datatype_size  = sizeof(uint);
@@ -160,11 +160,11 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
  * @param total_out_strings Number of output strings to be gathered.
 */
 template <int strings_per_threadblock, typename StringIterator, typename MapIterator>
-__global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
-                                              char* out_chars,
-                                              cudf::detail::input_offsetalator const out_offsets,
-                                              MapIterator string_indices,
-                                              size_type total_out_strings)
+CUDF_KERNEL void gather_chars_fn_char_parallel(StringIterator strings_begin,
+                                               char* out_chars,
+                                               cudf::detail::input_offsetalator const out_offsets,
+                                               MapIterator string_indices,
+                                               size_type total_out_strings)
 {
   __shared__ int64_t out_offsets_threadblock[strings_per_threadblock + 1];
 
diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp
index addab160b6e..7e70bcb4395 100644
--- a/cpp/include/cudf/types.hpp
+++ b/cpp/include/cudf/types.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,8 +18,10 @@
 
 #ifdef __CUDACC__
 #define CUDF_HOST_DEVICE __host__ __device__
+#define CUDF_KERNEL __global__ static
 #else
 #define CUDF_HOST_DEVICE
+#define CUDF_KERNEL static
 #endif
 
 #include <cstddef>
diff --git a/cpp/include/cudf_test/print_utilities.cuh b/cpp/include/cudf_test/print_utilities.cuh
index 37ffcd401fc..ae6c8cef029 100644
--- a/cpp/include/cudf_test/print_utilities.cuh
+++ b/cpp/include/cudf_test/print_utilities.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -103,7 +103,7 @@ CUDF_HOST_DEVICE void print_values(int32_t width, char delimiter, T arg, Ts... a
 }
 
 template <typename... Ts>
-__global__ void print_array_kernel(std::size_t count, int32_t width, char delimiter, Ts... args)
+CUDF_KERNEL void print_array_kernel(std::size_t count, int32_t width, char delimiter, Ts... args)
 {
   if (threadIdx.x == 0 && blockIdx.x == 0) {
     for (std::size_t i = 0; i < count; i++) {
diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index 9a50eb0d0ec..d605c877d3f 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -237,7 +237,7 @@ struct binary_op_double_device_dispatcher {
  * @param f Functor object to call for each element.
 */
 template <class Functor>
-__global__ void for_each_kernel(cudf::size_type size, Functor f)
+CUDF_KERNEL void for_each_kernel(cudf::size_type size, Functor f)
 {
   int tid    = threadIdx.x;
   int blkid  = blockIdx.x;
diff --git a/cpp/src/binaryop/jit/kernel.cu b/cpp/src/binaryop/jit/kernel.cu
index c9cc61a4f34..39735a43474 100644
--- a/cpp/src/binaryop/jit/kernel.cu
+++ b/cpp/src/binaryop/jit/kernel.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Copyright 2018-2019 BlazingDB, Inc.
  * Copyright 2018 Christian Noboa Mardini <christian@blazingdb.com>
@@ -43,10 +43,10 @@ struct UserDefinedOp {
 };
 
 template <typename TypeOut, typename TypeLhs, typename TypeRhs, typename TypeOpe>
-__global__ void kernel_v_v(cudf::size_type size,
-                           TypeOut* out_data,
-                           TypeLhs* lhs_data,
-                           TypeRhs* rhs_data)
+CUDF_KERNEL void kernel_v_v(cudf::size_type size,
+                            TypeOut* out_data,
+                            TypeLhs* lhs_data,
+                            TypeRhs* rhs_data)
 {
   int tid    = threadIdx.x;
   int blkid  = blockIdx.x;
@@ -62,15 +62,15 @@ __global__ void kernel_v_v(cudf::size_type size,
 }
 
 template <typename TypeOut, typename TypeLhs, typename TypeRhs, typename TypeOpe>
-__global__ void kernel_v_v_with_validity(cudf::size_type size,
-                                         TypeOut* out_data,
-                                         TypeLhs* lhs_data,
-                                         TypeRhs* rhs_data,
-                                         cudf::bitmask_type* output_mask,
-                                         cudf::bitmask_type const* lhs_mask,
-                                         cudf::size_type lhs_offset,
-                                         cudf::bitmask_type const* rhs_mask,
-                                         cudf::size_type rhs_offset)
+CUDF_KERNEL void kernel_v_v_with_validity(cudf::size_type size,
+                                          TypeOut* out_data,
+                                          TypeLhs* lhs_data,
+                                          TypeRhs* rhs_data,
+                                          cudf::bitmask_type* output_mask,
+                                          cudf::bitmask_type const* lhs_mask,
+                                          cudf::size_type lhs_offset,
+                                          cudf::bitmask_type const* rhs_mask,
+                                          cudf::size_type rhs_offset)
 {
   int tid    = threadIdx.x;
   int blkid  = blockIdx.x;
diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu
index 1a1cbb17d15..bb320e4b81a 100644
--- a/cpp/src/bitmask/null_mask.cu
+++ b/cpp/src/bitmask/null_mask.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -98,11 +98,11 @@ rmm::device_buffer create_null_mask(size_type size,
 }
 
 namespace {
-__global__ void set_null_mask_kernel(bitmask_type* __restrict__ destination,
-                                     size_type begin_bit,
-                                     size_type end_bit,
-                                     bool valid,
-                                     size_type number_of_mask_words)
+CUDF_KERNEL void set_null_mask_kernel(bitmask_type* __restrict__ destination,
+                                      size_type begin_bit,
+                                      size_type end_bit,
+                                      bool valid,
+                                      size_type number_of_mask_words)
 {
   auto x                            = destination + word_index(begin_bit);
   thread_index_type const last_word = word_index(end_bit) - word_index(begin_bit);
@@ -190,11 +190,11 @@ namespace {
  * @param number_of_mask_words The number of `cudf::bitmask_type` words to copy
 */
 // TODO: Also make binops test that uses offset in column_view
-__global__ void copy_offset_bitmask(bitmask_type* __restrict__ destination,
-                                    bitmask_type const* __restrict__ source,
-                                    size_type source_begin_bit,
-                                    size_type source_end_bit,
-                                    size_type number_of_mask_words)
+CUDF_KERNEL void copy_offset_bitmask(bitmask_type* __restrict__ destination,
+                                     bitmask_type const* __restrict__ source,
+                                     size_type source_begin_bit,
+                                     size_type source_end_bit,
+                                     size_type number_of_mask_words)
 {
   auto const stride = cudf::detail::grid_1d::grid_stride();
   for (thread_index_type destination_word_index = grid_1d::global_thread_id();
@@ -260,10 +260,10 @@ namespace {
  * @param[out] global_count The number of non-zero bits in the specified range
 */
 template <size_type block_size>
-__global__ void count_set_bits_kernel(bitmask_type const* bitmask,
-                                      size_type first_bit_index,
-                                      size_type last_bit_index,
-                                      size_type* global_count)
+CUDF_KERNEL void count_set_bits_kernel(bitmask_type const* bitmask,
+                                       size_type first_bit_index,
+                                       size_type last_bit_index,
+                                       size_type* global_count)
 {
   constexpr auto const word_size{detail::size_in_bits<bitmask_type>()};
 
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index ddf39e21685..b1d850e0b27 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -111,12 +111,12 @@ auto create_device_views(host_span<column_view const> views, rmm::cuda_stream_vi
  * @param out_valid_count To hold the total number of valid bits set
 */
 template <size_type block_size>
-__global__ void concatenate_masks_kernel(column_device_view const* views,
-                                         size_t const* output_offsets,
-                                         size_type number_of_views,
-                                         bitmask_type* dest_mask,
-                                         size_type number_of_mask_bits,
-                                         size_type* out_valid_count)
+CUDF_KERNEL void concatenate_masks_kernel(column_device_view const* views,
+                                          size_t const* output_offsets,
+                                          size_type number_of_views,
+                                          bitmask_type* dest_mask,
+                                          size_type number_of_mask_bits,
+                                          size_type* out_valid_count)
 {
   auto tidx         = cudf::detail::grid_1d::global_thread_id();
   auto const stride = cudf::detail::grid_1d::grid_stride();
@@ -187,11 +187,11 @@ size_type concatenate_masks(host_span<column_view const> views,
 namespace {
 
 template <typename T, size_type block_size, bool Nullable>
-__global__ void fused_concatenate_kernel(column_device_view const* input_views,
-                                         size_t const* input_offsets,
-                                         size_type num_input_views,
-                                         mutable_column_device_view output_view,
-                                         size_type* out_valid_count)
+CUDF_KERNEL void fused_concatenate_kernel(column_device_view const* input_views,
+                                          size_t const* input_offsets,
+                                          size_type num_input_views,
+                                          mutable_column_device_view output_view,
+                                          size_type* out_valid_count)
 {
   auto const output_size = output_view.size();
   auto* output_data      = output_view.data<T>();
diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index dd4af236ecf..855c2fb242c 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -280,9 +280,9 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
  * @param buf_info Information on the range of values to be copied for each destination buffer
 */
 template <typename IndexToDstBuf>
-__global__ void copy_partitions(IndexToDstBuf index_to_buffer,
-                                uint8_t const** src_bufs,
-                                dst_buf_info* buf_info)
+CUDF_KERNEL void copy_partitions(IndexToDstBuf index_to_buffer,
+                                 uint8_t const** src_bufs,
+                                 dst_buf_info* buf_info)
 {
   auto const buf_index     = blockIdx.x;
   auto const src_buf_index = buf_info[buf_index].src_buf_index;
diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index 8f326184012..517435503ee 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -50,9 +50,9 @@ namespace detail {
 namespace {
 
 template <typename MapIterator>
-__global__ void marking_bitmask_kernel(mutable_column_device_view destination,
-                                       MapIterator scatter_map,
-                                       size_type num_scatter_rows)
+CUDF_KERNEL void marking_bitmask_kernel(mutable_column_device_view destination,
+                                        MapIterator scatter_map,
+                                        size_type num_scatter_rows)
 {
   auto row          = cudf::detail::grid_1d::global_thread_id();
   auto const stride = cudf::detail::grid_1d::grid_stride();
diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu
index 365f6d6875c..59177a68ee7 100644
--- a/cpp/src/io/avro/avro_gpu.cu
+++ b/cpp/src/io/avro/avro_gpu.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -324,7 +324,7 @@ avro_decode_row(schemadesc_s const* schema,
  * @param[in] min_row_size Minimum size in bytes of a row
 */
 // blockDim {32,num_warps,1}
-__global__ void __launch_bounds__(num_warps * 32, 2)
+CUDF_KERNEL void __launch_bounds__(num_warps * 32, 2)
   gpuDecodeAvroColumnData(device_span<block_desc_s const> blocks,
                           schemadesc_s* schema_g,
                           device_span<string_index_pair> global_dictionary,
diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu
index 8bafd054bdb..9c936fefd6c 100644
--- a/cpp/src/io/comp/debrotli.cu
+++ b/cpp/src/io/comp/debrotli.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -1911,7 +1911,7 @@ static __device__ void ProcessCommands(debrotli_state_s* s, brotli_dictionary_s
  * @param scratch_size Size of scratch heap space (smaller sizes may result in serialization between
  * blocks)
 */
-__global__ void __launch_bounds__(block_size, 2)
+CUDF_KERNEL void __launch_bounds__(block_size, 2)
   gpu_debrotli_kernel(device_span<device_span<uint8_t const> const> inputs,
                       device_span<device_span<uint8_t> const> outputs,
                       device_span<compression_result> results,
diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu
index 8993815e560..cd50545afbd 100644
--- a/cpp/src/io/comp/gpuinflate.cu
+++ b/cpp/src/io/comp/gpuinflate.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -1024,7 +1024,7 @@ __device__ int parse_gzip_header(uint8_t const* src, size_t src_size)
  * @param parse_hdr If nonzero, indicates that the compressed bitstream includes a GZIP header
 */
 template <int block_size>
-__global__ void __launch_bounds__(block_size)
+CUDF_KERNEL void __launch_bounds__(block_size)
   inflate_kernel(device_span<device_span<uint8_t const> const> inputs,
                  device_span<device_span<uint8_t> const> outputs,
                  device_span<compression_result> results,
@@ -1152,7 +1152,7 @@ __global__ void __launch_bounds__(block_size)
  *
  * @param inputs Source and destination information per block
 */
-__global__ void __launch_bounds__(1024)
+CUDF_KERNEL void __launch_bounds__(1024)
   copy_uncompressed_kernel(device_span<device_span<uint8_t const> const> inputs,
                            device_span<device_span<uint8_t> const> outputs)
 {
diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu
index 0428f4edaf2..a45e8b2083b 100644
--- a/cpp/src/io/comp/snap.cu
+++ b/cpp/src/io/comp/snap.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -257,7 +257,7 @@ static __device__ uint32_t Match60(uint8_t const* src1,
  * @param[out] outputs Compression status per block
  * @param[in] count Number of blocks to compress
 */
-__global__ void __launch_bounds__(128)
+CUDF_KERNEL void __launch_bounds__(128)
   snap_kernel(device_span<device_span<uint8_t const> const> inputs,
               device_span<device_span<uint8_t> const> outputs,
               device_span<compression_result> results)
diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu
index 504a2fe377c..46555a97e9c 100644
--- a/cpp/src/io/comp/unsnap.cu
+++ b/cpp/src/io/comp/unsnap.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -628,7 +628,7 @@ __device__ void snappy_process_symbols(unsnap_state_s* s, int t, Storage& temp_s
  * @param[out] outputs Decompression status per block
 */
 template <int block_size>
-__global__ void __launch_bounds__(block_size)
+CUDF_KERNEL void __launch_bounds__(block_size)
   unsnap_kernel(device_span<device_span<uint8_t const> const> inputs,
                 device_span<device_span<uint8_t> const> outputs,
                 device_span<compression_result> results)
diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu
index 248e17669bc..8252cccbdb9 100644
--- a/cpp/src/io/csv/csv_gpu.cu
+++ b/cpp/src/io/csv/csv_gpu.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -168,7 +168,7 @@ __device__ __inline__ bool is_floatingpoint(long len,
  * @param row_offsets The start the CSV data of interest
  * @param d_column_data The count for each column data type
 */
-__global__ void __launch_bounds__(csvparse_block_dim)
+CUDF_KERNEL void __launch_bounds__(csvparse_block_dim)
   data_type_detection(parse_options_view const opts,
                       device_span<char const> csv_text,
                       device_span<column_parse::flags const> const column_flags,
@@ -305,7 +305,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
  * @param[out] valids The bitmaps indicating whether column fields are valid
  * @param[out] valid_counts The number of valid fields in each column
 */
-__global__ void __launch_bounds__(csvparse_block_dim)
+CUDF_KERNEL void __launch_bounds__(csvparse_block_dim)
   convert_csv_to_cudf(cudf::io::parse_options_view options,
                       device_span<char const> data,
                       device_span<column_parse::flags const> column_flags,
@@ -622,7 +622,7 @@ static inline __device__ rowctx32_t rowctx_inverse_merge_transform(uint64_t ctxt
  * @param escapechar Delimiter escape character
  * @param commentchar Comment line character (skip rows starting with this character)
 */
-__global__ void __launch_bounds__(rowofs_block_dim)
+CUDF_KERNEL void __launch_bounds__(rowofs_block_dim)
   gather_row_offsets_gpu(uint64_t* row_ctx,
                          device_span<uint64_t> offsets_out,
                          device_span<char const> const data,
diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh
index f867a95a864..9bb087e788d 100644
--- a/cpp/src/io/fst/agent_dfa.cuh
+++ b/cpp/src/io/fst/agent_dfa.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -493,7 +493,7 @@ template <typename AgentDFAPolicy,
           typename TransducedIndexOutItT,
           typename TransducedCountOutItT,
           typename OffsetT>
-__launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__
+__launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL
   void SimulateDFAKernel(DfaT dfa,
                          SymbolItT d_chars,
                          OffsetT const num_chars,
diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh
index a5c1a4f4f5c..be63ec6539f 100644
--- a/cpp/src/io/fst/dispatch_dfa.cuh
+++ b/cpp/src/io/fst/dispatch_dfa.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -67,7 +67,7 @@ struct DeviceFSMPolicy {
  * @return
 */
 template <typename TileState>
-__global__ void initialization_pass_kernel(TileState items_state, uint32_t num_tiles)
+CUDF_KERNEL void initialization_pass_kernel(TileState items_state, uint32_t num_tiles)
 {
   items_state.InitializeStatus(num_tiles);
 }
diff --git a/cpp/src/io/json/legacy/json_gpu.cu b/cpp/src/io/json/legacy/json_gpu.cu
index b358cc2071b..4d5293e12fd 100644
--- a/cpp/src/io/json/legacy/json_gpu.cu
+++ b/cpp/src/io/json/legacy/json_gpu.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -245,14 +245,14 @@ __device__ std::pair<char const*, char const*> get_row_data_range(
  * @param[out] valid_fields The bitmaps indicating whether column fields are valid
  * @param[out] num_valid_fields The numbers of valid fields in columns
 */
-__global__ void convert_data_to_columns_kernel(parse_options_view opts,
-                                               device_span<char const> const data,
-                                               device_span<uint64_t const> const row_offsets,
-                                               device_span<data_type const> const column_types,
-                                               col_map_type col_map,
-                                               device_span<void* const> const output_columns,
-                                               device_span<bitmask_type* const> const valid_fields,
-                                               device_span<size_type> const num_valid_fields)
+CUDF_KERNEL void convert_data_to_columns_kernel(parse_options_view opts,
+                                                device_span<char const> const data,
+                                                device_span<uint64_t const> const row_offsets,
+                                                device_span<data_type const> const column_types,
+                                                col_map_type col_map,
+                                                device_span<void* const> const output_columns,
+                                                device_span<bitmask_type* const> const valid_fields,
+                                                device_span<size_type> const num_valid_fields)
 {
   auto const rec_id = grid_1d::global_thread_id();
   if (rec_id >= row_offsets.size()) return;
@@ -321,7 +321,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts,
  * @param[in] num_columns The number of columns of input data
  * @param[out] column_infos The count for each column data type
 */
-__global__ void detect_data_types_kernel(
+CUDF_KERNEL void detect_data_types_kernel(
   parse_options_view const opts,
   device_span<char const> const data,
   device_span<uint64_t const> const row_offsets,
@@ -481,11 +481,11 @@ __device__ key_value_range get_next_key_value_range(char const* begin,
  * @param[out] keys_cnt Number of keys found in the file
  * @param[out] keys_info optional, information (offset, length, hash) for each found key
 */
-__global__ void collect_keys_info_kernel(parse_options_view const options,
-                                         device_span<char const> const data,
-                                         device_span<uint64_t const> const row_offsets,
-                                         unsigned long long int* keys_cnt,
-                                         thrust::optional<mutable_table_device_view> keys_info)
+CUDF_KERNEL void collect_keys_info_kernel(parse_options_view const options,
+                                          device_span<char const> const data,
+                                          device_span<uint64_t const> const row_offsets,
+                                          unsigned long long int* keys_cnt,
+                                          thrust::optional<mutable_table_device_view> keys_info)
 {
   auto const rec_id = grid_1d::global_thread_id();
   if (rec_id >= row_offsets.size()) return;
diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu
index 1d2262a1ccc..5971482f80c 100644
--- a/cpp/src/io/orc/dict_enc.cu
+++ b/cpp/src/io/orc/dict_enc.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,10 +27,10 @@ namespace cudf::io::orc::gpu {
 /**
  * @brief Counts the number of characters in each rowgroup of each string column.
 */
-__global__ void rowgroup_char_counts_kernel(device_2dspan<size_type> char_counts,
-                                            device_span<orc_column_device_view const> orc_columns,
-                                            device_2dspan<rowgroup_rows const> rowgroup_bounds,
-                                            device_span<uint32_t const> str_col_indexes)
+CUDF_KERNEL void rowgroup_char_counts_kernel(device_2dspan<size_type> char_counts,
+                                             device_span<orc_column_device_view const> orc_columns,
+                                             device_2dspan<rowgroup_rows const> rowgroup_bounds,
+                                             device_span<uint32_t const> str_col_indexes)
 {
   // Index of the column in the `str_col_indexes` array
   auto const str_col_idx = blockIdx.y;
@@ -75,7 +75,7 @@ void rowgroup_char_counts(device_2dspan<size_type> counts,
 }
 
 template <int block_size>
-__global__ void __launch_bounds__(block_size)
+CUDF_KERNEL void __launch_bounds__(block_size)
   initialize_dictionary_hash_maps_kernel(device_span<stripe_dictionary> dictionaries)
 {
   auto const dict_map = dictionaries[blockIdx.x].map_slots;
@@ -107,7 +107,7 @@ struct hash_functor {
 };
 
 template <int block_size>
-__global__ void __launch_bounds__(block_size)
+CUDF_KERNEL void __launch_bounds__(block_size)
   populate_dictionary_hash_maps_kernel(device_2dspan<stripe_dictionary> dictionaries,
                                        device_span<orc_column_device_view const> columns)
 {
@@ -162,7 +162,7 @@ __global__ void __launch_bounds__(block_size)
 }
 
 template <int block_size>
-__global__ void __launch_bounds__(block_size)
+CUDF_KERNEL void __launch_bounds__(block_size)
   collect_map_entries_kernel(device_2dspan<stripe_dictionary> dictionaries)
 {
   auto const col_idx = blockIdx.x;
@@ -196,7 +196,7 @@ __global__ void __launch_bounds__(block_size)
 }
 
 template <int block_size>
-__global__ void __launch_bounds__(block_size)
+CUDF_KERNEL void __launch_bounds__(block_size)
   get_dictionary_indices_kernel(device_2dspan<stripe_dictionary> dictionaries,
                                 device_span<orc_column_device_view const> columns)
 {
diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu
index 7436e22e125..c4bcc11fe80 100644
--- a/cpp/src/io/orc/stats_enc.cu
+++ b/cpp/src/io/orc/stats_enc.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -35,7 +35,7 @@ constexpr unsigned int init_threads_per_group = 32;
 constexpr unsigned int init_groups_per_block  = 4;
 constexpr unsigned int init_threads_per_block = init_threads_per_group * init_groups_per_block;
 
-__global__ void __launch_bounds__(init_threads_per_block)
+CUDF_KERNEL void __launch_bounds__(init_threads_per_block)
   gpu_init_statistics_groups(statistics_group* groups,
                              stats_column_desc const* cols,
                              device_2dspan<rowgroup_rows const> rowgroup_bounds)
@@ -73,7 +73,7 @@ constexpr unsigned int pb_fldlen_common =
   pb_fld_hdrlen + (pb_fld_hdrlen + pb_fldlen_int64) + 2 * pb_fld_hdrlen;
 
 template <unsigned int block_size>
-__global__ void __launch_bounds__(block_size, 1)
+CUDF_KERNEL void __launch_bounds__(block_size, 1)
   gpu_init_statistics_buffersize(statistics_merge_group* groups,
                                  statistics_chunk const* chunks,
                                  uint32_t statistics_count)
@@ -249,7 +249,7 @@ constexpr unsigned int encode_chunks_per_block = 4;
 constexpr unsigned int encode_threads_per_block =
   encode_threads_per_chunk * encode_chunks_per_block;
 
-__global__ void __launch_bounds__(encode_threads_per_block)
+CUDF_KERNEL void __launch_bounds__(encode_threads_per_block)
   gpu_encode_statistics(uint8_t* blob_bfr,
                         statistics_merge_group* groups,
                         statistics_chunk const* chunks,
diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index 0b249bbdafe..14072d79172 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -1082,7 +1082,7 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs,
 */
 // blockDim {block_size,1,1}
 template <int block_size>
-__global__ void __launch_bounds__(block_size)
+CUDF_KERNEL void __launch_bounds__(block_size)
   gpuDecodeNullsAndStringDictionaries(ColumnDesc* chunks,
                                       DictionaryEntry* global_dictionary,
                                       uint32_t num_columns,
@@ -1358,7 +1358,7 @@ static const __device__ __constant__ uint32_t kTimestampNanoScale[8] = {
 */
 // blockDim {block_size,1,1}
 template <int block_size>
-__global__ void __launch_bounds__(block_size)
+CUDF_KERNEL void __launch_bounds__(block_size)
   gpuDecodeOrcColumnData(ColumnDesc* chunks,
                          DictionaryEntry* global_dictionary,
                          table_device_view tz_table,
diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu
index b99826e070e..b7dd0ea9ec3 100644
--- a/cpp/src/io/orc/stripe_enc.cu
+++ b/cpp/src/io/orc/stripe_enc.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -723,7 +723,7 @@ static __device__ void encode_null_mask(orcenc_state_s* s,
 */
 // blockDim {`encode_block_size`,1,1}
 template <int block_size>
-__global__ void __launch_bounds__(block_size)
+CUDF_KERNEL void __launch_bounds__(block_size)
   gpuEncodeOrcColumnData(device_2dspan<EncChunk const> chunks,
                          device_2dspan<encoder_chunk_streams> streams)
 {
@@ -1008,7 +1008,7 @@ __global__ void __launch_bounds__(block_size)
 */
 // blockDim {512,1,1}
 template <int block_size>
-__global__ void __launch_bounds__(block_size)
+CUDF_KERNEL void __launch_bounds__(block_size)
   gpuEncodeStringDictionaries(stripe_dictionary const* stripes,
                               device_span<orc_column_device_view const> columns,
                               device_2dspan<EncChunk const> chunks,
@@ -1091,7 +1091,7 @@ __global__ void __launch_bounds__(block_size)
  * @param[in,out] streams List of encoder chunk streams [column][rowgroup]
 */
 // blockDim {compact_streams_block_size,1,1}
-__global__ void __launch_bounds__(compact_streams_block_size)
+CUDF_KERNEL void __launch_bounds__(compact_streams_block_size)
   gpuCompactOrcDataStreams(device_2dspan<StripeStream> strm_desc,
                            device_2dspan<encoder_chunk_streams> streams)
 {
@@ -1136,7 +1136,7 @@ __global__ void __launch_bounds__(compact_streams_block_size)
  * @param[in] comp_block_align Required alignment for compressed blocks
 */
 // blockDim {256,1,1}
-__global__ void __launch_bounds__(256)
+CUDF_KERNEL void __launch_bounds__(256)
   gpuInitCompressionBlocks(device_2dspan<StripeStream> strm_desc,
                            device_2dspan<encoder_chunk_streams> streams,  // const?
                            device_span<device_span<uint8_t>> inputs,
@@ -1191,7 +1191,7 @@ __global__ void __launch_bounds__(256)
  * @param[in] max_comp_blk_size Max size of any block after compression
 */
 // blockDim {1024,1,1}
-__global__ void __launch_bounds__(1024)
+CUDF_KERNEL void __launch_bounds__(1024)
   gpuCompactCompressedBlocks(device_2dspan<StripeStream> strm_desc,
                              device_span<device_span<uint8_t const> const> inputs,
                              device_span<device_span<uint8_t> const> outputs,
@@ -1274,8 +1274,8 @@ struct decimal_column_element_sizes {
 // Converts sizes of individual decimal elements to offsets within each row group
 // Conversion is done in-place
 template <int block_size>
-__global__ void decimal_sizes_to_offsets_kernel(device_2dspan<rowgroup_rows const> rg_bounds,
-                                                device_span<uint32_t> sizes)
+CUDF_KERNEL void decimal_sizes_to_offsets_kernel(device_2dspan<rowgroup_rows const> rg_bounds,
+                                                 device_span<uint32_t> sizes)
 {
   using block_scan = cub::BlockScan<uint32_t, block_size>;
   __shared__ typename block_scan::TempStorage scan_storage;
diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu
index b31a4a081d1..327b9557176 100644
--- a/cpp/src/io/orc/stripe_init.cu
+++ b/cpp/src/io/orc/stripe_init.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -41,7 +41,7 @@ struct compressed_stream_s {
 };
 
 // blockDim {128,1,1}
-__global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeData(
+CUDF_KERNEL void __launch_bounds__(128, 8) gpuParseCompressedStripeData(
   CompressedStreamInfo* strm_info, int32_t num_streams, uint32_t block_size, uint32_t log2maxcr)
 {
   __shared__ compressed_stream_s strm_g[4];
@@ -138,7 +138,7 @@ __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeData(
 }
 
 // blockDim {128,1,1}
-__global__ void __launch_bounds__(128, 8)
+CUDF_KERNEL void __launch_bounds__(128, 8)
   gpuPostDecompressionReassemble(CompressedStreamInfo* strm_info, int32_t num_streams)
 {
   __shared__ compressed_stream_s strm_g[4];
@@ -442,14 +442,14 @@ static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s* s,
  * value
 */
 // blockDim {128,1,1}
-__global__ void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_groups,
-                                                                CompressedStreamInfo* strm_info,
-                                                                ColumnDesc* chunks,
-                                                                uint32_t num_columns,
-                                                                uint32_t num_stripes,
-                                                                uint32_t num_rowgroups,
-                                                                uint32_t rowidx_stride,
-                                                                bool use_base_stride)
+CUDF_KERNEL void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_groups,
+                                                                 CompressedStreamInfo* strm_info,
+                                                                 ColumnDesc* chunks,
+                                                                 uint32_t num_columns,
+                                                                 uint32_t num_stripes,
+                                                                 uint32_t num_rowgroups,
+                                                                 uint32_t rowidx_stride,
+                                                                 bool use_base_stride)
 {
   __shared__ __align__(16) rowindex_state_s state_g;
   rowindex_state_s* const s = &state_g;
@@ -513,7 +513,7 @@ __global__ void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_gr
 }
 
 template <int block_size>
-__global__ void __launch_bounds__(block_size)
+CUDF_KERNEL void __launch_bounds__(block_size)
   gpu_reduce_pushdown_masks(device_span<orc_column_device_view const> orc_columns,
                             device_2dspan<rowgroup_rows const> rowgroup_bounds,
                             device_2dspan<cudf::size_type> set_counts)
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 42eea37e695..e9cd48f6a58 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -357,10 +357,10 @@ struct string_length_functor {
   statistics_merge_group const* stripe_stat_merge;
 };
 
-__global__ void copy_string_data(char* string_pool,
-                                 size_type* offsets,
-                                 statistics_chunk* chunks,
-                                 statistics_merge_group const* groups)
+CUDF_KERNEL void copy_string_data(char* string_pool,
+                                  size_type* offsets,
+                                  statistics_chunk* chunks,
+                                  statistics_merge_group const* groups)
 {
   auto const idx = blockIdx.x / 2;
   if (groups[idx].stats_dtype == dtype_string) {
diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 53ff31ab0a7..a43c6d4cbb6 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,7 +31,7 @@ constexpr int DEFAULT_BLOCK_SIZE = 256;
 }
 
 template <int block_size>
-__global__ void __launch_bounds__(block_size)
+CUDF_KERNEL void __launch_bounds__(block_size)
   initialize_chunk_hash_maps_kernel(device_span<EncColumnChunk> chunks)
 {
   auto const chunk = chunks[blockIdx.x];
@@ -98,7 +98,7 @@ struct map_find_fn {
 };
 
 template <int block_size>
-__global__ void __launch_bounds__(block_size)
+CUDF_KERNEL void __launch_bounds__(block_size)
   populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan<PageFragment const> frags)
 {
   auto col_idx = blockIdx.y;
@@ -189,7 +189,7 @@ __global__ void __launch_bounds__(block_size)
 }
 
 template <int block_size>
-__global__ void __launch_bounds__(block_size)
+CUDF_KERNEL void __launch_bounds__(block_size)
   collect_map_entries_kernel(device_span<EncColumnChunk> chunks)
 {
   auto& chunk = chunks[blockIdx.x];
@@ -223,7 +223,7 @@ __global__ void __launch_bounds__(block_size)
 }
 
 template <int block_size>
-__global__ void __launch_bounds__(block_size)
+CUDF_KERNEL void __launch_bounds__(block_size)
   get_dictionary_indices_kernel(cudf::detail::device_2dspan<PageFragment const> frags)
 {
   auto col_idx = blockIdx.y;
diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu
index afe9a76a6d0..2d000600028 100644
--- a/cpp/src/io/parquet/decode_preprocess.cu
+++ b/cpp/src/io/parquet/decode_preprocess.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -207,7 +207,7 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s,
  * (PageInfo::str_bytes) as part of the pass
 */
 template <int lvl_buf_size, typename level_t>
-__global__ void __launch_bounds__(preprocess_block_size)
+CUDF_KERNEL void __launch_bounds__(preprocess_block_size)
   gpuComputePageSizes(PageInfo* pages,
                       device_span<ColumnChunkDesc const> chunks,
                       size_t min_row,
diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
index d39edd70fcd..8d220e6fa96 100644
--- a/cpp/src/io/parquet/page_data.cu
+++ b/cpp/src/io/parquet/page_data.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -421,7 +421,7 @@ static __device__ void gpuOutputGeneric(
  * @param error_code Error code to set if an error is encountered
 */
 template <int lvl_buf_size, typename level_t>
-__global__ void __launch_bounds__(decode_block_size)
+CUDF_KERNEL void __launch_bounds__(decode_block_size)
   gpuDecodePageData(PageInfo* pages,
                     device_span<ColumnChunkDesc const> chunks,
                     size_t min_row,
diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu
index 44ec0e1e027..d0557446f14 100644
--- a/cpp/src/io/parquet/page_delta_decode.cu
+++ b/cpp/src/io/parquet/page_delta_decode.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -305,7 +305,7 @@ struct delta_byte_array_decoder {
 // with V2 page headers; see https://www.mail-archive.com/dev@parquet.apache.org/msg11826.html).
 // this kernel only needs 96 threads (3 warps)(for now).
 template <typename level_t>
-__global__ void __launch_bounds__(96)
+CUDF_KERNEL void __launch_bounds__(96)
   gpuDecodeDeltaBinary(PageInfo* pages,
                        device_span<ColumnChunkDesc const> chunks,
                        size_t min_row,
@@ -430,7 +430,7 @@ __global__ void __launch_bounds__(96)
 // suffixes are not encoded in the header, we're going to have to first do a quick pass through them
 // to find the start/end of each structure.
 template <typename level_t>
-__global__ void __launch_bounds__(decode_block_size)
+CUDF_KERNEL void __launch_bounds__(decode_block_size)
   gpuDecodeDeltaByteArray(PageInfo* pages,
                           device_span<ColumnChunkDesc const> chunks,
                           size_t min_row,
@@ -587,7 +587,7 @@ __global__ void __launch_bounds__(decode_block_size)
 // Decode page data that is DELTA_LENGTH_BYTE_ARRAY packed. This encoding consists of a
 // DELTA_BINARY_PACKED array of string lengths, followed by the string data.
 template <typename level_t>
-__global__ void __launch_bounds__(decode_block_size)
+CUDF_KERNEL void __launch_bounds__(decode_block_size)
   gpuDecodeDeltaLengthByteArray(PageInfo* pages,
                                 device_span<ColumnChunkDesc const> chunks,
                                 size_t min_row,
diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 8e1c0682ffd..c48c870807a 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -385,7 +385,7 @@ __device__ uint8_t const* delta_encode(page_enc_state_s<0>* s, uint64_t* buffer,
 // blockDim {512,1,1}
 template <int block_size>
-__global__ void __launch_bounds__(block_size)
+CUDF_KERNEL void __launch_bounds__(block_size)
   gpuInitRowGroupFragments(device_2dspan<PageFragment> frag,
                            device_span<parquet_column_device_view const> col_desc,
                            device_span<partition_info const> partitions,
@@ -422,7 +422,7 @@ __global__ void __launch_bounds__(block_size)
 // blockDim {512,1,1}
 template <int block_size>
-__global__ void __launch_bounds__(block_size)
+CUDF_KERNEL void __launch_bounds__(block_size)
   gpuCalculatePageFragments(device_span<PageFragment> frag,
                             device_span<size_type const> column_frag_sizes)
 {
@@ -449,7 +449,7 @@ __global__ void __launch_bounds__(block_size)
 }
 
 // blockDim {128,1,1}
-__global__ void __launch_bounds__(128)
+CUDF_KERNEL void __launch_bounds__(128)
   gpuInitFragmentStats(device_span<statistics_group> groups,
                        device_span<PageFragment const> fragments)
 {
@@ -510,7 +510,7 @@ __device__ size_t delta_data_len(Type physical_type,
 }
 
 // blockDim {128,1,1}
-__global__ void __launch_bounds__(128)
+CUDF_KERNEL void __launch_bounds__(128)
   gpuInitPages(device_2dspan<EncColumnChunk> chunks,
                device_span<EncPage> pages,
                device_span<size_type> page_sizes,
@@ -1244,9 +1244,10 @@ __device__ auto julian_days_with_time(int64_t v)
 // the level data is encoded.
 // blockDim(128, 1, 1)
 template <int block_size>
-__global__ void __launch_bounds__(block_size, 8) gpuEncodePageLevels(device_span<EncPage> pages,
-                                                                     bool write_v2_headers,
-                                                                     encode_kernel_mask kernel_mask)
+CUDF_KERNEL void __launch_bounds__(block_size, 8)
+  gpuEncodePageLevels(device_span<EncPage> pages,
+                      bool write_v2_headers,
+                      encode_kernel_mask kernel_mask)
 {
   __shared__ __align__(8) rle_page_enc_state_s state_g;
 
@@ -1504,7 +1505,7 @@ __device__ void finish_page_encode(state_buf* s,
 // PLAIN page data encoder
 // blockDim(128, 1, 1)
 template <int block_size>
-__global__ void __launch_bounds__(block_size, 8)
+CUDF_KERNEL void __launch_bounds__(block_size, 8)
   gpuEncodePages(device_span<EncPage> pages,
                  device_span<device_span<uint8_t const>> comp_in,
                  device_span<device_span<uint8_t>> comp_out,
@@ -1739,7 +1740,7 @@ __global__ void __launch_bounds__(block_size, 8)
 // DICTIONARY page data encoder
 // blockDim(128, 1, 1)
 template <int block_size>
-__global__ void __launch_bounds__(block_size, 8)
+CUDF_KERNEL void __launch_bounds__(block_size, 8)
   gpuEncodeDictPages(device_span<EncPage> pages,
                      device_span<device_span<uint8_t const>> comp_in,
                      device_span<device_span<uint8_t>> comp_out,
@@ -1871,7 +1872,7 @@ __global__ void __launch_bounds__(block_size, 8)
 // DELTA_BINARY_PACKED page data encoder
 // blockDim(128, 1, 1)
 template <int block_size>
-__global__ void __launch_bounds__(block_size, 8)
+CUDF_KERNEL void __launch_bounds__(block_size, 8)
   gpuEncodeDeltaBinaryPages(device_span<EncPage> pages,
                             device_span<device_span<uint8_t const>> comp_in,
                             device_span<device_span<uint8_t>> comp_out,
@@ -1975,7 +1976,7 @@ __global__ void __launch_bounds__(block_size, 8)
 // DELTA_LENGTH_BYTE_ARRAY page data encoder
 // blockDim(128, 1, 1)
 template <int block_size>
-__global__ void __launch_bounds__(block_size, 8)
+CUDF_KERNEL void __launch_bounds__(block_size, 8)
   gpuEncodeDeltaLengthByteArrayPages(device_span<EncPage> pages,
                                      device_span<device_span<uint8_t const>> comp_in,
                                      device_span<device_span<uint8_t>> comp_out,
@@ -2105,7 +2106,7 @@ constexpr int decide_compression_block_size =
   decide_compression_warps_in_block * cudf::detail::warp_size;
 
 // blockDim(decide_compression_block_size, 1, 1)
-__global__ void __launch_bounds__(decide_compression_block_size)
+CUDF_KERNEL void __launch_bounds__(decide_compression_block_size)
   gpuDecideCompression(device_span<EncColumnChunk> chunks)
 {
   __shared__ __align__(8) EncColumnChunk ck_g[decide_compression_warps_in_block];
@@ -2566,7 +2567,7 @@ __device__ uint8_t* EncodeStatistics(uint8_t* start,
 }
 
 // blockDim(128, 1, 1)
-__global__ void __launch_bounds__(128)
+CUDF_KERNEL void __launch_bounds__(128)
   gpuEncodePageHeaders(device_span<EncPage> pages,
                        device_span<compression_result const> comp_results,
                        device_span<statistics_chunk const> page_stats,
@@ -2661,7 +2662,7 @@ __global__ void __launch_bounds__(128)
 }
 
 // blockDim(1024, 1, 1)
-__global__ void __launch_bounds__(1024)
+CUDF_KERNEL void __launch_bounds__(1024)
   gpuGatherPages(device_span<EncColumnChunk> chunks, device_span<EncPage const> pages)
 {
   __shared__ __align__(8) EncColumnChunk ck_g;
@@ -2839,7 +2840,7 @@ struct mask_tform {
 } // namespace
 
 // blockDim(1, 1, 1)
-__global__ void __launch_bounds__(1)
+CUDF_KERNEL void __launch_bounds__(1)
   gpuEncodeColumnIndexes(device_span<EncColumnChunk> chunks,
                          device_span<statistics_chunk const> column_stats,
                          int32_t column_index_truncate_length)
diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu
index 36157f725e3..e61eb2cdc30 100644
--- a/cpp/src/io/parquet/page_hdr.cu
+++ b/cpp/src/io/parquet/page_hdr.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -348,9 +348,9 @@ struct gpuParsePageHeader { * @param[in] num_chunks Number of column chunks */ // blockDim {128,1,1} -__global__ void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chunks, - int32_t num_chunks, - kernel_error::pointer error_code) +CUDF_KERNEL void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chunks, + int32_t num_chunks, + kernel_error::pointer error_code) { using cudf::detail::warp_size; gpuParsePageHeader parse_page_header; @@ -480,7 +480,7 @@ __global__ void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chu * @param[in] num_chunks Number of column chunks */ // blockDim {128,1,1} -__global__ void __launch_bounds__(128) +CUDF_KERNEL void __launch_bounds__(128) gpuBuildStringDictionaryIndex(ColumnChunkDesc* chunks, int32_t num_chunks) { __shared__ ColumnChunkDesc chunk_g[4]; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index d559f93f45b..37a8cabc182 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -584,7 +584,7 @@ __device__ thrust::pair totalDeltaByteArraySize(uint8_t const* d * @tparam level_t Type used to store decoded repetition and definition levels */ template -__global__ void __launch_bounds__(preprocess_block_size) gpuComputeStringPageBounds( +CUDF_KERNEL void __launch_bounds__(preprocess_block_size) gpuComputeStringPageBounds( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; @@ -653,7 +653,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputeStringPageBou * @param min_rows crop all rows below min_row * @param num_rows Maximum number of rows to read */ -__global__ void __launch_bounds__(delta_preproc_block_size) gpuComputeDeltaPageStringSizes( +CUDF_KERNEL void __launch_bounds__(delta_preproc_block_size) gpuComputeDeltaPageStringSizes( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; @@ -725,7 +725,7 @@ __global__ void __launch_bounds__(delta_preproc_block_size) gpuComputeDeltaPageS * @param min_rows crop all rows below min_row * @param num_rows Maximum number of rows to read */ -__global__ void __launch_bounds__(delta_length_block_size) gpuComputeDeltaLengthPageStringSizes( +CUDF_KERNEL void __launch_bounds__(delta_length_block_size) gpuComputeDeltaLengthPageStringSizes( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { using cudf::detail::warp_size; @@ -820,7 +820,7 @@ __global__ void __launch_bounds__(delta_length_block_size) gpuComputeDeltaLength * @param min_rows crop all rows below min_row * @param num_rows Maximum number of rows to read */ -__global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( +CUDF_KERNEL void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; @@ -912,7 +912,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz * @tparam level_t Type used to store decoded repetition and definition levels */ template -__global__ void __launch_bounds__(decode_block_size) +CUDF_KERNEL 
void __launch_bounds__(decode_block_size) gpuDecodeStringPageData(PageInfo* pages, device_span chunks, size_t min_row, diff --git a/cpp/src/io/statistics/column_statistics.cuh b/cpp/src/io/statistics/column_statistics.cuh index f71fb95949f..db0d56ac321 100644 --- a/cpp/src/io/statistics/column_statistics.cuh +++ b/cpp/src/io/statistics/column_statistics.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -289,7 +289,7 @@ __device__ void cooperative_load(T& destination, T const* source = nullptr) * @tparam IO File format for which statistics calculation is being done */ template -__global__ void __launch_bounds__(block_size, 1) +CUDF_KERNEL void __launch_bounds__(block_size, 1) gpu_calculate_group_statistics(statistics_chunk* chunks, statistics_group const* groups, bool const int96_timestamps) @@ -368,7 +368,7 @@ void calculate_group_statistics(statistics_chunk* chunks, * @tparam IO File format for which statistics calculation is being done */ template -__global__ void __launch_bounds__(block_size, 1) +CUDF_KERNEL void __launch_bounds__(block_size, 1) gpu_merge_group_statistics(statistics_chunk* chunks_out, statistics_chunk const* chunks_in, statistics_merge_group const* groups) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 443ca0f5fe7..2194ee1aaa1 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -138,7 +138,7 @@ using byte_offset = int64_t; // it begins in. From there, each thread can then take deterministic action. In this case, the // deterministic action is counting and outputting delimiter offsets when a delimiter is found. -__global__ void multibyte_split_init_kernel( +CUDF_KERNEL void multibyte_split_init_kernel( cudf::size_type base_tile_idx, cudf::size_type num_tiles, cudf::io::text::detail::scan_tile_state_view tile_multistates, @@ -154,7 +154,7 @@ __global__ void multibyte_split_init_kernel( } } -__global__ __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( +CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( cudf::size_type base_tile_idx, byte_offset base_input_offset, output_offset base_output_offset, @@ -231,7 +231,7 @@ __global__ __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( } } -__global__ __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( +CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( cudf::size_type base_tile_idx, byte_offset base_input_offset, output_offset base_output_offset, diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 9e5c5c76392..9545811a542 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
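Three attribute orderings coexist in these hunks: gpuDecodeStringPageData above uses `CUDF_KERNEL void __launch_bounds__(...)`, multibyte_split_kernel uses `CUDF_KERNEL __launch_bounds__(...) void`, and get_json_object_kernel below uses `__launch_bounds__(...) CUDF_KERNEL void`. NVCC accepts all three, since __launch_bounds__ may sit on either side of the return type. A toy illustration (hypothetical kernels, assuming the macro sketched earlier):

// all three orderings compile to the same thing
CUDF_KERNEL void __launch_bounds__(256) order_a(int* out) { out[threadIdx.x] = 1; }
CUDF_KERNEL __launch_bounds__(256) void order_b(int* out) { out[threadIdx.x] = 2; }
__launch_bounds__(256) CUDF_KERNEL void order_c(int* out) { out[threadIdx.x] = 3; }
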
@@ -420,14 +420,14 @@ struct bitfield_block { * @param d_chars Character array to store the characters of strings */ template -__global__ void parse_fn_string_parallel(str_tuple_it str_tuples, - size_type total_out_strings, - size_type* str_counter, - bitmask_type* null_mask, - size_type* null_count_data, - cudf::io::parse_options_view const options, - size_type* d_offsets, - char* d_chars) +CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, + size_type total_out_strings, + size_type* str_counter, + bitmask_type* null_mask, + size_type* null_count_data, + cudf::io::parse_options_view const options, + size_type* d_offsets, + char* d_chars) { constexpr auto BLOCK_SIZE = is_warp ? cudf::detail::warp_size : cudf::detail::warp_size * num_warps; diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index 06b86f33c85..d02ce99e6e5 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -87,12 +87,12 @@ __device__ __forceinline__ void setElement(void*, cudf::size_type, T const&, V c * @param[out] positions Array containing the output positions */ template -__global__ void count_and_set_positions(char const* data, - uint64_t size, - uint64_t offset, - char const key, - cudf::size_type* count, - T* positions) +CUDF_KERNEL void count_and_set_positions(char const* data, + uint64_t size, + uint64_t offset, + char const key, + cudf::size_type* count, + T* positions) { // thread IDs range per block, so also need the block id auto const tid = cudf::detail::grid_1d::global_thread_id(); diff --git a/cpp/src/io/utilities/type_inference.cu b/cpp/src/io/utilities/type_inference.cu index 79a5c8f1c4c..b446ad41946 100644 --- a/cpp/src/io/utilities/type_inference.cu +++ b/cpp/src/io/utilities/type_inference.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -112,11 +112,11 @@ __device__ __inline__ bool is_like_float(std::size_t len, * @param[out] column_info Histogram of column type counters */ template -__global__ void infer_column_type_kernel(OptionsView options, - device_span data, - ColumnStringIter offset_length_begin, - std::size_t size, - cudf::io::column_type_histogram* column_info) +CUDF_KERNEL void infer_column_type_kernel(OptionsView options, + device_span data, + ColumnStringIter offset_length_begin, + std::size_t size, + cudf::io::column_type_histogram* column_info) { auto thread_type_histogram = cudf::io::column_type_histogram{}; diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh index f665aba698f..02ce27a36ba 100644 --- a/cpp/src/join/conditional_join_kernels.cuh +++ b/cpp/src/join/conditional_join_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
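The comment retained in count_and_set_positions ("thread IDs range per block, so also need the block id") refers to the grid-stride idiom these kernels share: compute a global thread id, then stride by the total thread count so any launch geometry covers the whole input. A self-contained sketch of the same idiom (hypothetical kernel; a bare atomic counter stands in for the count/positions outputs):

CUDF_KERNEL void count_key(char const* data, uint64_t size, char key, unsigned long long* count)
{
  auto const tid    = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  auto const stride = static_cast<uint64_t>(blockDim.x) * gridDim.x;
  // each thread visits tid, tid + stride, tid + 2*stride, ...
  for (auto i = tid; i < size; i += stride) {
    if (data[i] == key) { atomicAdd(count, 1ull); }
  }
}
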
@@ -48,7 +48,7 @@ namespace detail { * @param[out] output_size The resulting output size */ template -__global__ void compute_conditional_join_output_size( +CUDF_KERNEL void compute_conditional_join_output_size( table_device_view left_table, table_device_view right_table, join_kind join_type, @@ -138,15 +138,15 @@ __global__ void compute_conditional_join_output_size( * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. */ template -__global__ void conditional_join(table_device_view left_table, - table_device_view right_table, - join_kind join_type, - cudf::size_type* join_output_l, - cudf::size_type* join_output_r, - cudf::size_type* current_idx, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const max_size, - bool const swap_tables) +CUDF_KERNEL void conditional_join(table_device_view left_table, + table_device_view right_table, + join_kind join_type, + cudf::size_type* join_output_l, + cudf::size_type* join_output_r, + cudf::size_type* current_idx, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const max_size, + bool const swap_tables) { constexpr int num_warps = block_size / detail::warp_size; __shared__ cudf::size_type current_idx_shared[num_warps]; diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index efe575e14de..22bbbff967a 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,8 +36,10 @@ namespace detail { namespace cg = cooperative_groups; +#pragma GCC diagnostic ignored "-Wattributes" + template -__launch_bounds__(block_size) __global__ +__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ void mixed_join(table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/join/mixed_join_kernels.cuh b/cpp/src/join/mixed_join_kernels.cuh index 2cd4d0c3b38..1d36a246f02 100644 --- a/cpp/src/join/mixed_join_kernels.cuh +++ b/cpp/src/join/mixed_join_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,6 +57,7 @@ namespace detail { * left/right tables to determine which is the build table and which is the * probe table has already happened on the host. */ + template __global__ void compute_mixed_join_output_size( table_device_view left_table, diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index e31e35ff788..bde75395371 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
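mixed_join (and its siblings in the next hunks) is the one kernel family that keeps plain __global__ rather than moving to CUDF_KERNEL, presumably because these template kernels are declared in mixed_join_kernels.cuh and defined in a separate header, so internal linkage would break the split. Instead the PR applies hidden visibility directly, and the new pragma silences the -Wattributes warning GCC emits for visibility attributes on templates. A minimal sketch of the pattern (hypothetical kernel, not the cudf one):

#pragma GCC diagnostic ignored "-Wattributes"

// hidden visibility keeps the instantiations out of the DSO's export table
// without giving the kernel internal linkage
template <int block_size>
__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__
  void hidden_template_kernel(int* out)
{
  extern __shared__ int smem[];  // extern shared memory, as in mixed_join
  smem[threadIdx.x] = threadIdx.x;
  __syncthreads();
  out[blockIdx.x * blockDim.x + threadIdx.x] = smem[threadIdx.x];
}
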
@@ -31,8 +31,10 @@ namespace detail { namespace cg = cooperative_groups; +#pragma GCC diagnostic ignored "-Wattributes" + template -__launch_bounds__(block_size) __global__ +__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ void mixed_join_semi(table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index ef377dadc4b..dc3adfa87d6 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ namespace detail { namespace cg = cooperative_groups; template -__launch_bounds__(block_size) __global__ void compute_mixed_join_output_size( +__launch_bounds__(block_size) CUDF_KERNEL void compute_mixed_join_output_size( table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/join/mixed_join_size_kernels_semi.cu b/cpp/src/join/mixed_join_size_kernels_semi.cu index fd7bf0234e9..31da6677aef 100644 --- a/cpp/src/join/mixed_join_size_kernels_semi.cu +++ b/cpp/src/join/mixed_join_size_kernels_semi.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,20 +31,23 @@ namespace detail { namespace cg = cooperative_groups; +#pragma GCC diagnostic ignored "-Wattributes" + template -__launch_bounds__(block_size) __global__ void compute_mixed_join_output_size_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::semi_map_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row) +__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ + void compute_mixed_join_output_size_semi( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::semi_map_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row) { // The (required) extern storage of the shared memory array leads to // conflicting declarations between different templates. The easiest diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index c01357c96ca..6794838c70f 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
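One caveat worth noting: the bare `#pragma GCC diagnostic ignored` added in these files has no matching push/pop, so the suppression stays in effect for the remainder of each translation unit. If it ever needs to be scoped to just the kernel, the usual form would be (hypothetical kernel):

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wattributes"
__attribute__((visibility("hidden"))) __global__ void scoped_kernel(int* out)
{
  out[threadIdx.x] = threadIdx.x;
}
#pragma GCC diagnostic pop
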
@@ -900,7 +900,7 @@ __device__ thrust::pair get_json_object_single( * @param options Options controlling behavior */ template -__launch_bounds__(block_size) __global__ +__launch_bounds__(block_size) CUDF_KERNEL void get_json_object_kernel(column_device_view col, path_operator const* const commands, size_type* output_offsets, diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 0d30230de28..073a2a6b97e 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -122,7 +122,7 @@ using index_type = detail::index_type; * to be copied to the output. Length must be equal to `num_destination_rows` */ template -__global__ void materialize_merged_bitmask_kernel( +CUDF_KERNEL void materialize_merged_bitmask_kernel( column_device_view left_dcol, column_device_view right_dcol, bitmask_type* out_validity, diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 7b6676346c2..8d8f1a71672 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -122,14 +122,14 @@ class bitwise_partitioner { * @param[out] global_partition_sizes The number of rows in each partition. */ template -__global__ void compute_row_partition_numbers(row_hasher_t the_hasher, - size_type const num_rows, - size_type const num_partitions, - partitioner_type const the_partitioner, - size_type* __restrict__ row_partition_numbers, - size_type* __restrict__ row_partition_offset, - size_type* __restrict__ block_partition_sizes, - size_type* __restrict__ global_partition_sizes) +CUDF_KERNEL void compute_row_partition_numbers(row_hasher_t the_hasher, + size_type const num_rows, + size_type const num_partitions, + partitioner_type const the_partitioner, + size_type* __restrict__ row_partition_numbers, + size_type* __restrict__ row_partition_offset, + size_type* __restrict__ block_partition_sizes, + size_type* __restrict__ global_partition_sizes) { // Accumulate histogram of the size of each partition in shared memory extern __shared__ size_type shared_partition_sizes[]; @@ -197,10 +197,10 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher, {block0 partition(num_partitions-1) offset, block1 partition(num_partitions -1) offset, ...} } */ -__global__ void compute_row_output_locations(size_type* __restrict__ row_partition_numbers, - size_type const num_rows, - size_type const num_partitions, - size_type* __restrict__ block_partition_offsets) +CUDF_KERNEL void compute_row_output_locations(size_type* __restrict__ row_partition_numbers, + size_type const num_rows, + size_type const num_partitions, + size_type* __restrict__ block_partition_offsets) { // Shared array that holds the offset of this blocks partitions in // global memory @@ -255,14 +255,14 @@ __global__ void compute_row_output_locations(size_type* __restrict__ row_partiti * @param[in] scanned_block_partition_sizes The scan of block_partition_sizes */ template -__global__ void copy_block_partitions(InputIter input_iter, - DataType* __restrict__ output_buf, - size_type const 
num_rows, - size_type const num_partitions, - size_type const* __restrict__ row_partition_numbers, - size_type const* __restrict__ row_partition_offset, - size_type const* __restrict__ block_partition_sizes, - size_type const* __restrict__ scanned_block_partition_sizes) +CUDF_KERNEL void copy_block_partitions(InputIter input_iter, + DataType* __restrict__ output_buf, + size_type const num_rows, + size_type const num_partitions, + size_type const* __restrict__ row_partition_numbers, + size_type const* __restrict__ row_partition_offset, + size_type const* __restrict__ block_partition_sizes, + size_type const* __restrict__ scanned_block_partition_sizes) { extern __shared__ char shared_memory[]; auto block_output = reinterpret_cast(shared_memory); diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 4764ac4d87a..c8ac19e01cc 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,13 +68,13 @@ struct make_centroid { // kernel for computing percentiles on input tdigest (mean, weight) centroid data. template -__global__ void compute_percentiles_kernel(device_span tdigest_offsets, - column_device_view percentiles, - CentroidIter centroids_, - double const* min_, - double const* max_, - double const* cumulative_weight_, - double* output) +CUDF_KERNEL void compute_percentiles_kernel(device_span tdigest_offsets, + column_device_view percentiles, + CentroidIter centroids_, + double const* min_, + double const* max_, + double const* cumulative_weight_, + double* output) { auto const tid = cudf::detail::grid_1d::global_thread_id(); diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index 450996a43d2..fc56d17d73b 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -370,15 +370,15 @@ std::unique_ptr to_tdigest_scalar(std::unique_ptr&& tdigest, */ template -__global__ void generate_cluster_limits_kernel(int delta, - size_type num_groups, - NearestWeightFunc nearest_weight, - GroupInfo group_info, - CumulativeWeight cumulative_weight, - double* group_cluster_wl, - size_type* group_num_clusters, - size_type const* group_cluster_offsets, - bool has_nulls) +CUDF_KERNEL void generate_cluster_limits_kernel(int delta, + size_type num_groups, + NearestWeightFunc nearest_weight, + GroupInfo group_info, + CumulativeWeight cumulative_weight, + double* group_cluster_wl, + size_type* group_num_clusters, + size_type const* group_cluster_offsets, + bool has_nulls) { int const tid = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 2eb624d3f05..bd3e75e2e80 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,12 +57,12 @@ namespace { // anonymous static constexpr int BLOCK_SIZE = 256; template -__global__ void replace_nulls_strings(cudf::column_device_view input, - cudf::column_device_view replacement, - cudf::bitmask_type* output_valid, - cudf::size_type* offsets, - char* chars, - cudf::size_type* valid_counter) +CUDF_KERNEL void replace_nulls_strings(cudf::column_device_view input, + cudf::column_device_view replacement, + cudf::bitmask_type* output_valid, + cudf::size_type* offsets, + char* chars, + cudf::size_type* valid_counter) { cudf::size_type nrows = input.size(); auto i = cudf::detail::grid_1d::global_thread_id(); @@ -112,10 +112,10 @@ __global__ void replace_nulls_strings(cudf::column_device_view input, } template -__global__ void replace_nulls(cudf::column_device_view input, - cudf::column_device_view replacement, - cudf::mutable_column_device_view output, - cudf::size_type* output_valid_count) +CUDF_KERNEL void replace_nulls(cudf::column_device_view input, + cudf::column_device_view replacement, + cudf::mutable_column_device_view output, + cudf::size_type* output_valid_count) { cudf::size_type nrows = input.size(); auto i = cudf::detail::grid_1d::global_thread_id(); diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 9341929de44..7cad2fb10d3 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -17,7 +17,7 @@ * limitations under the License. */ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -118,13 +118,13 @@ __device__ int get_new_string_value(cudf::size_type idx, * @param output_valid_count The output valid count */ template -__global__ void replace_strings_first_pass(cudf::column_device_view input, - cudf::column_device_view values_to_replace, - cudf::column_device_view replacement, - cudf::mutable_column_device_view offsets, - cudf::mutable_column_device_view indices, - cudf::bitmask_type* output_valid, - cudf::size_type* __restrict__ output_valid_count) +CUDF_KERNEL void replace_strings_first_pass(cudf::column_device_view input, + cudf::column_device_view values_to_replace, + cudf::column_device_view replacement, + cudf::mutable_column_device_view offsets, + cudf::mutable_column_device_view indices, + cudf::bitmask_type* output_valid, + cudf::size_type* __restrict__ output_valid_count) { cudf::size_type nrows = input.size(); auto tid = cudf::detail::grid_1d::global_thread_id(); @@ -184,11 +184,11 @@ __global__ void replace_strings_first_pass(cudf::column_device_view input, * @param indices Temporary column used to store the replacement indices. 
*/ template -__global__ void replace_strings_second_pass(cudf::column_device_view input, - cudf::column_device_view replacement, - cudf::mutable_column_device_view offsets, - cudf::mutable_column_device_view strings, - cudf::mutable_column_device_view indices) +CUDF_KERNEL void replace_strings_second_pass(cudf::column_device_view input, + cudf::column_device_view replacement, + cudf::mutable_column_device_view offsets, + cudf::mutable_column_device_view strings, + cudf::mutable_column_device_view indices) { cudf::size_type nrows = input.size(); auto tid = cudf::detail::grid_1d::global_thread_id(); @@ -245,12 +245,12 @@ __global__ void replace_strings_second_pass(cudf::column_device_view input, * @param[in] replacement_valid Valid mask associated with d_replacement_values */ template -__global__ void replace_kernel(cudf::column_device_view input, - cudf::mutable_column_device_view output, - cudf::size_type* __restrict__ output_valid_count, - cudf::size_type nrows, - cudf::column_device_view values_to_replace, - cudf::column_device_view replacement) +CUDF_KERNEL void replace_kernel(cudf::column_device_view input, + cudf::mutable_column_device_view output, + cudf::size_type* __restrict__ output_valid_count, + cudf::size_type nrows, + cudf::column_device_view values_to_replace, + cudf::column_device_view replacement) { T* __restrict__ output_data = output.data(); diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index 0648ef3d30f..20845a97c7e 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1022,7 +1022,7 @@ template -__launch_bounds__(block_size) __global__ +__launch_bounds__(block_size) CUDF_KERNEL void gpu_rolling(column_device_view input, column_device_view default_outputs, mutable_column_device_view output, diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu index 06b224c39ad..2c753965c1c 100644 --- a/cpp/src/rolling/jit/kernel.cu +++ b/cpp/src/rolling/jit/kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
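gpu_rolling above pairs the grid-stride loop with a per-row window aggregation. A heavily simplified sketch of the shape of that computation (hypothetical rolling sum; assumes cudf's convention that the preceding count includes the current row, and writes a per-row validity flag instead of a packed bitmask):

CUDF_KERNEL void rolling_sum(int const* in, int* out, bool* out_valid, int nrows,
                             int preceding, int following, int min_periods)
{
  int i            = blockIdx.x * blockDim.x + threadIdx.x;
  int const stride = blockDim.x * gridDim.x;
  for (; i < nrows; i += stride) {
    int const start = max(0, i - preceding + 1);
    int const end   = min(nrows, i + following + 1);  // exclusive
    int sum         = 0;
    for (int j = start; j < end; ++j) { sum += in[j]; }
    out[i]       = sum;
    out_valid[i] = (end - start) >= min_periods;  // enough observations in window?
  }
}
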
@@ -41,15 +41,15 @@ template -__global__ void gpu_rolling_new(cudf::size_type nrows, - InType const* const __restrict__ in_col, - cudf::bitmask_type const* const __restrict__ in_col_valid, - OutType* __restrict__ out_col, - cudf::bitmask_type* __restrict__ out_col_valid, - cudf::size_type* __restrict__ output_valid_count, - PrecedingWindowType preceding_window_begin, - FollowingWindowType following_window_begin, - cudf::size_type min_periods) +CUDF_KERNEL void gpu_rolling_new(cudf::size_type nrows, + InType const* const __restrict__ in_col, + cudf::bitmask_type const* const __restrict__ in_col_valid, + OutType* __restrict__ out_col, + cudf::bitmask_type* __restrict__ out_col_valid, + cudf::size_type* __restrict__ output_valid_count, + PrecedingWindowType preceding_window_begin, + FollowingWindowType following_window_begin, + cudf::size_type min_periods) { cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x; cudf::thread_index_type const stride = blockDim.x * gridDim.x; diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index de51356845c..b7d35e0a25c 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -110,8 +110,8 @@ std::unique_ptr counts_fn(strings_column_view const& strings, * @param d_strings Column with strings to count * @param d_lengths Results of the counts per string */ -__global__ void count_characters_parallel_fn(column_device_view const d_strings, - size_type* d_lengths) +CUDF_KERNEL void count_characters_parallel_fn(column_device_view const d_strings, + size_type* d_lengths) { auto const idx = cudf::detail::grid_1d::global_thread_id(); using warp_reduce = cub::WarpReduce; diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 511acc38d75..b16eb318b39 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -197,8 +197,8 @@ __forceinline__ __device__ char escaped_sequence_to_byte(char const* const ptr) * @param[out] out_counts Number of characters in each decode URL. */ template -__global__ void url_decode_char_counter(column_device_view const in_strings, - size_type* const out_counts) +CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings, + size_type* const out_counts) { constexpr int halo_size = 2; __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size]; @@ -280,9 +280,9 @@ __global__ void url_decode_char_counter(column_device_view const in_strings, * @param[in] out_offsets Offset value of each string associated with `out_chars`. 
*/ template -__global__ void url_decode_char_replacer(column_device_view const in_strings, - char* const out_chars, - size_type const* const out_offsets) +CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings, + char* const out_chars, + size_type const* const out_offsets) { constexpr int halo_size = 2; __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size * 2]; diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 26cd4fff09b..9701335d85e 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -112,14 +112,14 @@ auto create_strings_device_views(host_span views, rmm::cuda_s } template -__global__ void fused_concatenate_string_offset_kernel(column_device_view const* input_views, - size_t const* input_offsets, - size_t const* partition_offsets, - size_type const num_input_views, - size_type const output_size, - int32_t* output_data, - bitmask_type* output_mask, - size_type* out_valid_count) +CUDF_KERNEL void fused_concatenate_string_offset_kernel(column_device_view const* input_views, + size_t const* input_offsets, + size_t const* partition_offsets, + size_type const num_input_views, + size_type const output_size, + int32_t* output_data, + bitmask_type* output_mask, + size_type* out_valid_count) { cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x; size_type warp_valid_count = 0; @@ -169,11 +169,11 @@ __global__ void fused_concatenate_string_offset_kernel(column_device_view const* } } -__global__ void fused_concatenate_string_chars_kernel(column_device_view const* input_views, - size_t const* partition_offsets, - size_type const num_input_views, - size_type const output_size, - char* output_data) +CUDF_KERNEL void fused_concatenate_string_chars_kernel(column_device_view const* input_views, + size_t const* partition_offsets, + size_type const num_input_views, + size_type const output_size, + char* output_data) { cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/cpp/src/strings/regex/utilities.cuh b/cpp/src/strings/regex/utilities.cuh index 23b53062bf3..bc8f5d68a4b 100644 --- a/cpp/src/strings/regex/utilities.cuh +++ b/cpp/src/strings/regex/utilities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
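count_characters_parallel_fn above is the warp-per-string pattern: 32 threads walk one string with a stride of the warp width, then combine partial results with cub::WarpReduce. A self-contained sketch (hypothetical kernel; assumes 256-thread blocks and an offsets array with num_strings + 1 entries):

#include <cub/cub.cuh>

CUDF_KERNEL void warp_count_chars(char const* chars, int const* offsets, int num_strings, int* d_lengths)
{
  auto const tid     = blockIdx.x * blockDim.x + threadIdx.x;
  auto const str_idx = static_cast<int>(tid / 32);  // one warp per string
  auto const lane    = static_cast<int>(tid % 32);
  if (str_idx >= num_strings) { return; }  // whole warp exits together

  int count = 0;
  for (int i = offsets[str_idx] + lane; i < offsets[str_idx + 1]; i += 32) {
    count += (chars[i] & 0xC0) != 0x80;  // count UTF-8 lead bytes only
  }

  using warp_reduce = cub::WarpReduce<int>;
  __shared__ typename warp_reduce::TempStorage temp_storage[256 / 32];
  int const total = warp_reduce(temp_storage[threadIdx.x / 32]).Sum(count);
  if (lane == 0) { d_lengths[str_idx] = total; }
}
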
@@ -37,7 +37,7 @@ namespace detail { constexpr auto regex_launch_kernel_block_size = 256; template -__global__ void for_each_kernel(ForEachFunction fn, reprog_device const d_prog, size_type size) +CUDF_KERNEL void for_each_kernel(ForEachFunction fn, reprog_device const d_prog, size_type size) { extern __shared__ u_char shmem[]; if (threadIdx.x == 0) { d_prog.store(shmem); } @@ -71,10 +71,10 @@ void launch_for_each_kernel(ForEachFunction fn, } template -__global__ void transform_kernel(TransformFunction fn, - reprog_device const d_prog, - OutputType* d_output, - size_type size) +CUDF_KERNEL void transform_kernel(TransformFunction fn, + reprog_device const d_prog, + OutputType* d_output, + size_type size) { extern __shared__ u_char shmem[]; if (threadIdx.x == 0) { d_prog.store(shmem); } diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index d35f512e0f7..7d262522137 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -115,11 +115,11 @@ struct empty_target_fn { * @brief String per warp function for find/rfind */ template -__global__ void finder_warp_parallel_fn(column_device_view const d_strings, - TargetIterator const d_targets, - size_type const start, - size_type const stop, - size_type* d_results) +CUDF_KERNEL void finder_warp_parallel_fn(column_device_view const d_strings, + TargetIterator const d_targets, + size_type const start, + size_type const stop, + size_type* d_results) { size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); @@ -346,9 +346,9 @@ namespace { * @param d_target String to search for in each row of `d_strings` * @param d_results Indicates which rows contain `d_target` */ -__global__ void contains_warp_parallel_fn(column_device_view const d_strings, - string_view const d_target, - bool* d_results) +CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, + string_view const d_target, + bool* d_results) { size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); using warp_reduce = cub::WarpReduce; diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu index 2d53faf548e..2a6674b213b 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cu +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
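for_each_kernel above stages the compiled regex program into dynamic shared memory once per block before any thread executes it. The general trick, sketched with a plain lookup table in place of reprog_device (hypothetical kernel; here the staging is cooperative rather than done by thread 0 alone):

CUDF_KERNEL void lookup_kernel(int const* d_table, int table_size, int* d_out, int n)
{
  extern __shared__ int s_table[];
  // pull the table on-chip so every access after the barrier hits shared memory
  for (int i = threadIdx.x; i < table_size; i += blockDim.x) { s_table[i] = d_table[i]; }
  __syncthreads();

  int const idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) { d_out[idx] = s_table[idx % table_size]; }
}

// launched with the table size as the dynamic shared-memory argument, e.g.:
//   lookup_kernel<<<num_blocks, block_size, table_size * sizeof(int), stream>>>(d_table, table_size, d_out, n);
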
@@ -122,11 +122,11 @@ struct bpe_unpairable_offsets_fn { * @param d_rerank_data Working memory to hold locations where reranking is required */ template -__global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, - MapRefType const d_map, - int8_t* d_spaces_data, // working memory - cudf::size_type* d_ranks_data, // more working memory - int8_t* d_rerank_data // and one more working memory +CUDF_KERNEL void bpe_parallel_fn(cudf::column_device_view const d_strings, + MapRefType const d_map, + int8_t* d_spaces_data, // working memory + cudf::size_type* d_ranks_data, // more working memory + int8_t* d_rerank_data // and one more working memory ) { // string per block @@ -291,9 +291,9 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, * @param d_spaces_data Output the location where separator will be inserted * @param d_sizes Output sizes of each row */ -__global__ void bpe_finalize(cudf::column_device_view const d_strings, - int8_t* d_spaces_data, // where separators are inserted - cudf::size_type* d_sizes // output sizes of encoded strings +CUDF_KERNEL void bpe_finalize(cudf::column_device_view const d_strings, + int8_t* d_spaces_data, // where separators are inserted + cudf::size_type* d_sizes // output sizes of encoded strings ) { // string per block diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 4e0a538ffe9..dcb59166cec 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,10 +62,10 @@ template < typename HashFunction, typename hash_value_type = std:: conditional_t, uint32_t, uint64_t>> -__global__ void minhash_kernel(cudf::column_device_view const d_strings, - cudf::device_span seeds, - cudf::size_type width, - hash_value_type* d_hashes) +CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, + cudf::device_span seeds, + cudf::size_type width, + hash_value_type* d_hashes) { auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); if (idx >= (static_cast(d_strings.size()) * diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 34eb95bea5c..c83bc2e318f 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -204,13 +204,13 @@ extract_code_points_from_utf8(unsigned char const* strings, * @param[out] code_points The resulting code point values from normalization. * @param[out] chars_per_thread Output number of code point values per string. 
*/ -__global__ void kernel_data_normalizer(unsigned char const* strings, - size_t const total_bytes, - uint32_t const* cp_metadata, - uint64_t const* aux_table, - bool const do_lower_case, - uint32_t* code_points, - uint32_t* chars_per_thread) +CUDF_KERNEL void kernel_data_normalizer(unsigned char const* strings, + size_t const total_bytes, + uint32_t const* cp_metadata, + uint64_t const* aux_table, + bool const do_lower_case, + uint32_t* code_points, + uint32_t* chars_per_thread) { constexpr uint32_t init_val = (1 << FILTER_BIT); uint32_t replacement_code_points[MAX_NEW_CHARS] = {init_val, init_val, init_val}; diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index 1a3084a257f..1d3c1ccb7e4 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,7 +56,7 @@ namespace { * @param[out] attn_mask Identifies valid token id entries * @param[out] metadata Additional data per row */ -__global__ void kernel_compute_tensor_metadata( +CUDF_KERNEL void kernel_compute_tensor_metadata( // input uint32_t const* token_ids, cudf::size_type const* offsets, diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index 3b912017320..d2804af5f8b 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -75,12 +75,12 @@ namespace { * @param[out] tokens_per_word An array of size `num_code_points` which hold the number of * tokens. This kernel just sets all the values to 0. */ -__global__ void init_data_and_mark_word_start_and_ends(uint32_t const* code_points, - uint32_t* start_word_indices, - uint32_t* end_word_indices, - size_t num_code_points, - uint32_t* token_ids, - uint8_t* tokens_per_word) +CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_points, + uint32_t* start_word_indices, + uint32_t* end_word_indices, + size_t num_code_points, + uint32_t* token_ids, + uint8_t* tokens_per_word) { cudf::thread_index_type char_for_thread = static_cast(blockDim.x) * static_cast(blockIdx.x) + @@ -131,11 +131,11 @@ __global__ void init_data_and_mark_word_start_and_ends(uint32_t const* code_poin * written to indicate this. * @param num_strings The total number of strings to be processed. 
*/ -__global__ void mark_string_start_and_ends(uint32_t const* code_points, - cudf::size_type const* strings_offsets, - uint32_t* start_word_indices, - uint32_t* end_word_indices, - uint32_t num_strings) +CUDF_KERNEL void mark_string_start_and_ends(uint32_t const* code_points, + cudf::size_type const* strings_offsets, + uint32_t* start_word_indices, + uint32_t* end_word_indices, + uint32_t num_strings) { cudf::thread_index_type idx = static_cast(blockDim.x) * static_cast(blockIdx.x) + @@ -319,20 +319,20 @@ struct mark_special_tokens { * @param outer_hash_b_param: The b parameter for the outer hash * @param num_outer_bins: The number of bins for the outer hash */ -__global__ void kernel_wordpiece_tokenizer(uint32_t const* code_points, - uint64_t const* hash_table, - uint64_t const* bin_coefficients, - uint16_t const* bin_offsets, - uint16_t unk_token_id, - uint32_t outer_hash_a_param, - uint32_t outer_hash_b_param, - uint16_t num_outer_bins, - uint32_t const* word_starts, - uint32_t const* word_ends, - uint32_t max_word_length, - uint32_t total_words, - uint32_t* token_ids, - uint8_t* tokens_per_word) +CUDF_KERNEL void kernel_wordpiece_tokenizer(uint32_t const* code_points, + uint64_t const* hash_table, + uint64_t const* bin_coefficients, + uint16_t const* bin_offsets, + uint16_t unk_token_id, + uint32_t outer_hash_a_param, + uint32_t outer_hash_b_param, + uint16_t num_outer_bins, + uint32_t const* word_starts, + uint32_t const* word_ends, + uint32_t max_word_length, + uint32_t total_words, + uint32_t* token_ids, + uint8_t* tokens_per_word) { cudf::thread_index_type word_to_tokenize = static_cast(blockDim.x) * static_cast(blockIdx.x) + diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index 511f1995374..683c6f7f735 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -214,10 +214,10 @@ struct mark_delimiters_fn { } }; -__global__ void token_counts_fn(cudf::column_device_view const d_strings, - cudf::string_view const d_delimiter, - cudf::size_type* d_counts, - int8_t* d_results) +CUDF_KERNEL void token_counts_fn(cudf::column_device_view const d_strings, + cudf::string_view const d_delimiter, + cudf::size_type* d_counts, + int8_t* d_results) { // string per warp auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu index 224dd93b048..eaf47adec10 100644 --- a/cpp/src/transform/compute_column.cu +++ b/cpp/src/transform/compute_column.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ namespace detail { * @param output_column The destination for the results of evaluating the expression. 
*/ template -__launch_bounds__(max_block_size) __global__ +__launch_bounds__(max_block_size) CUDF_KERNEL void compute_column_kernel(table_device_view const table, ast::detail::expression_device_view device_expression_data, mutable_column_device_view output_column) diff --git a/cpp/src/transform/jit/kernel.cu b/cpp/src/transform/jit/kernel.cu index 0170cc50c6f..1e913ecb5bb 100644 --- a/cpp/src/transform/jit/kernel.cu +++ b/cpp/src/transform/jit/kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,7 +35,7 @@ namespace transformation { namespace jit { template -__global__ void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data) +CUDF_KERNEL void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data) { // cannot use global_thread_id utility due to a JIT build issue by including // the `cudf/detail/utilities/cuda.cuh` header diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index b151b44565d..a91dc8fbbc6 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -398,10 +398,10 @@ __device__ size_type row_size_functor::operator()(column_device_vie * @param output Output span of size (# rows) where per-row bit sizes are stored * @param max_branch_depth Maximum depth of the span stack needed per-thread */ -__global__ void compute_row_sizes(device_span cols, - device_span info, - device_span output, - size_type max_branch_depth) +CUDF_KERNEL void compute_row_sizes(device_span cols, + device_span info, + device_span output, + size_type max_branch_depth) { extern __shared__ row_span thread_branch_stacks[]; int const tid = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/cpp/src/transform/row_conversion.cu b/cpp/src/transform/row_conversion.cu index a1c5827e5da..361c507a1cc 100644 --- a/cpp/src/transform/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -314,14 +314,14 @@ struct fixed_width_row_offset_functor { * @param output_nm array of pointers to the output null masks * @param input_data pointing to the incoming row data */ -__global__ void copy_from_rows_fixed_width_optimized(const size_type num_rows, - const size_type num_columns, - const size_type row_size, - const size_type* input_offset_in_row, - const size_type* num_bytes, - int8_t** output_data, - bitmask_type** output_nm, - const int8_t* input_data) +CUDF_KERNEL void copy_from_rows_fixed_width_optimized(const size_type num_rows, + const size_type num_columns, + const size_type row_size, + const size_type* input_offset_in_row, + const size_type* num_bytes, + int8_t** output_data, + bitmask_type** output_nm, + const int8_t* input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. 
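The "two passes" the copy_from_rows comment describes are a staging pattern: coalesced loads fill shared memory in row order, then the scatter to column buffers happens on-chip where the strided addresses cost nothing. A stripped-down sketch (hypothetical kernel; pretends every column is one byte wide so the addressing stays readable):

#include <cstddef>
#include <cstdint>

CUDF_KERNEL void rows_to_columns(int8_t const* row_data, int row_size, int num_rows, int8_t** col_data)
{
  extern __shared__ int8_t stage[];
  int const row = blockIdx.x;  // one row per block, for simplicity
  if (row >= num_rows) { return; }

  // pass 1: contiguous, coalesced copy of the row into shared memory
  for (int b = threadIdx.x; b < row_size; b += blockDim.x) {
    stage[b] = row_data[static_cast<size_t>(row) * row_size + b];
  }
  __syncthreads();

  // pass 2: scatter the staged bytes to their column buffers
  for (int c = threadIdx.x; c < row_size; c += blockDim.x) {
    col_data[c][row] = stage[c];
  }
}
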
@@ -433,15 +433,15 @@ __global__ void copy_from_rows_fixed_width_optimized(const size_type num_rows, } } -__global__ void copy_to_rows_fixed_width_optimized(const size_type start_row, - const size_type num_rows, - const size_type num_columns, - const size_type row_size, - const size_type* output_offset_in_row, - const size_type* num_bytes, - const int8_t** input_data, - const bitmask_type** input_nm, - int8_t* output_data) +CUDF_KERNEL void copy_to_rows_fixed_width_optimized(const size_type start_row, + const size_type num_rows, + const size_type num_columns, + const size_type row_size, + const size_type* output_offset_in_row, + const size_type* num_bytes, + const int8_t** input_data, + const bitmask_type** input_nm, + int8_t* output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -588,16 +588,16 @@ __global__ void copy_to_rows_fixed_width_optimized(const size_type start_row, * */ template -__global__ void copy_to_rows(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_tile, - device_span tile_infos, - const int8_t** input_data, - const size_type* col_sizes, - const size_type* col_offsets, - RowOffsetFunctor row_offsets, - size_type const* batch_row_boundaries, - int8_t** output_data) +CUDF_KERNEL void copy_to_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + device_span tile_infos, + const int8_t** input_data, + const size_type* col_sizes, + const size_type* col_offsets, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -731,15 +731,15 @@ __global__ void copy_to_rows(const size_type num_rows, * */ template -__global__ void copy_validity_to_rows(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_tile, - RowOffsetFunctor row_offsets, - size_type const* batch_row_boundaries, - int8_t** output_data, - const size_type validity_offset, - device_span tile_infos, - const bitmask_type** input_nm) +CUDF_KERNEL void copy_validity_to_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data, + const size_type validity_offset, + device_span tile_infos, + const bitmask_type** input_nm) { extern __shared__ int8_t shared_data[]; @@ -851,15 +851,15 @@ __global__ void copy_validity_to_rows(const size_type num_rows, * */ template -__global__ void copy_strings_to_rows(size_type const num_rows, - size_type const num_variable_columns, - int8_t const** variable_input_data, - size_type const* variable_col_output_offsets, - size_type const** variable_col_offsets, - size_type fixed_width_row_size, - RowOffsetFunctor row_offsets, - size_type const batch_row_offset, - int8_t* output_data) +CUDF_KERNEL void copy_strings_to_rows(size_type const num_rows, + size_type const num_variable_columns, + int8_t const** variable_input_data, + size_type const* variable_col_output_offsets, + size_type const** variable_col_offsets, + size_type fixed_width_row_size, + RowOffsetFunctor row_offsets, + size_type const batch_row_offset, + int8_t* output_data) { // Each block will take a group of rows controlled by NUM_STRING_ROWS_PER_BLOCK_TO_ROWS. Each warp // will copy a row at a time. 
The base thread will first go through column data and fill out @@ -920,16 +920,16 @@ __global__ void copy_strings_to_rows(size_type const num_rows, * */ template -__global__ void copy_from_rows(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_tile, - RowOffsetFunctor row_offsets, - size_type const* batch_row_boundaries, - int8_t** output_data, - const size_type* col_sizes, - const size_type* col_offsets, - device_span tile_infos, - const int8_t* input_data) +CUDF_KERNEL void copy_from_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data, + const size_type* col_sizes, + const size_type* col_offsets, + device_span tile_infos, + const int8_t* input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -1042,15 +1042,15 @@ __global__ void copy_from_rows(const size_type num_rows, * */ template -__global__ void copy_validity_from_rows(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_tile, - RowOffsetFunctor row_offsets, - size_type const* batch_row_boundaries, - bitmask_type** output_nm, - const size_type validity_offset, - device_span tile_infos, - const int8_t* input_data) +CUDF_KERNEL void copy_validity_from_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + bitmask_type** output_nm, + const size_type validity_offset, + device_span tile_infos, + const int8_t* input_data) { extern __shared__ int8_t shared[]; @@ -1175,14 +1175,14 @@ __global__ void copy_validity_from_rows(const size_type num_rows, * @param num_string_columns number of string columns in the table */ template -__global__ void copy_strings_from_rows(RowOffsetFunctor row_offsets, - int32_t** string_row_offsets, - int32_t** string_lengths, - size_type** string_column_offsets, - char** string_col_data, - int8_t const* row_data, - size_type const num_rows, - size_type const num_string_columns) +CUDF_KERNEL void copy_strings_from_rows(RowOffsetFunctor row_offsets, + int32_t** string_row_offsets, + int32_t** string_lengths, + size_type** string_column_offsets, + char** string_col_data, + int8_t const* row_data, + size_type const num_rows, + size_type const num_string_columns) { // Each warp takes a tile, which is a single column and up to ROWS_PER_BLOCK rows. A tile will not // wrap around the bottom of the table. The warp will copy the strings for each row in the tile. diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index 24195362d92..f2662e244c8 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
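copy_validity_from_rows and its to_rows twin above both revolve around turning per-row validity into packed 32-bit bitmask words. The primitive underneath is a warp ballot; a minimal sketch (hypothetical kernel; assumes the block size is a multiple of the 32-thread warp):

#include <cstdint>

CUDF_KERNEL void bytes_to_bitmask(int8_t const* valid_bytes, int num_rows, unsigned int* out_mask)
{
  int const i  = blockIdx.x * blockDim.x + threadIdx.x;
  bool const v = (i < num_rows) && (valid_bytes[i] != 0);
  // every lane votes; lane k's vote lands in bit k of the word
  unsigned int const word = __ballot_sync(0xffffffffu, v);
  if ((i % 32) == 0 && i < num_rows) { out_mask[i / 32] = word; }
}
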
@@ -29,7 +29,7 @@ #include template -__global__ void gpu_atomic_test(T* result, T* data, size_t size) +CUDF_KERNEL void gpu_atomic_test(T* result, T* data, size_t size) { size_t id = blockIdx.x * blockDim.x + threadIdx.x; size_t step = blockDim.x * gridDim.x; @@ -77,7 +77,7 @@ __device__ T atomic_op(T* addr, T const& value, BinaryOp op) } template -__global__ void gpu_atomicCAS_test(T* result, T* data, size_t size) +CUDF_KERNEL void gpu_atomicCAS_test(T* result, T* data, size_t size) { size_t id = blockIdx.x * blockDim.x + threadIdx.x; size_t step = blockDim.x * gridDim.x; diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index 5b842322681..63b33b6bcab 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,7 +39,7 @@ TEST(StreamCheck, success) { EXPECT_NO_THROW(CUDF_CHECK_CUDA(0)); } namespace { // Some silly kernel that will cause an error -void __global__ test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; } +static void __global__ test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; } } // namespace // In a release build and without explicit synchronization, CUDF_CHECK_CUDA may @@ -69,7 +69,7 @@ TEST(StreamCheck, CatchFailedKernel) EXPECT_THROW(CUDF_CHECK_CUDA(stream.value()), cudf::cuda_error); } -__global__ void kernel() { asm("trap;"); } +CUDF_KERNEL void kernel() { asm("trap;"); } TEST(DeathTest, CudaFatalError) { @@ -87,9 +87,9 @@ TEST(DeathTest, CudaFatalError) #ifndef NDEBUG -__global__ void assert_false_kernel() { cudf_assert(false && "this kernel should die"); } +CUDF_KERNEL void assert_false_kernel() { cudf_assert(false && "this kernel should die"); } -__global__ void assert_true_kernel() { cudf_assert(true && "this kernel should live"); } +CUDF_KERNEL void assert_true_kernel() { cudf_assert(true && "this kernel should live"); } TEST(DebugAssertDeathTest, cudf_assert_false) { diff --git a/cpp/tests/identify_stream_usage/test_default_stream_identification.cu b/cpp/tests/identify_stream_usage/test_default_stream_identification.cu index 28bb47af40d..11124382299 100644 --- a/cpp/tests/identify_stream_usage/test_default_stream_identification.cu +++ b/cpp/tests/identify_stream_usage/test_default_stream_identification.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #include -__global__ void kernel() { printf("The kernel ran!\n"); } +CUDF_KERNEL void kernel() { printf("The kernel ran!\n"); } void test_cudaLaunchKernel() { diff --git a/cpp/tests/scalar/scalar_device_view_test.cu b/cpp/tests/scalar/scalar_device_view_test.cu index 8d0e54f024f..8444716bccd 100644 --- a/cpp/tests/scalar/scalar_device_view_test.cu +++ b/cpp/tests/scalar/scalar_device_view_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
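The trap kernel above produces what the error-handling tests treat as a fatal error: it poisons the CUDA context, so the failure is sticky and surfaces at the next synchronization point rather than at launch. A gtest-free sketch of observing it (hypothetical helper):

CUDF_KERNEL void trap_kernel() { asm("trap;"); }

bool stream_has_failed(cudaStream_t stream)
{
  trap_kernel<<<1, 1, 0, stream>>>();
  // the launch itself usually reports success; the trap is observed here
  return cudaStreamSynchronize(stream) != cudaSuccess;
}
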
@@ -35,14 +35,14 @@ struct TypedScalarDeviceViewTest : public cudf::test::BaseFixture {};
 TYPED_TEST_SUITE(TypedScalarDeviceViewTest, cudf::test::FixedWidthTypesWithoutFixedPoint);
 
 template <typename ScalarDeviceViewType>
-__global__ void test_set_value(ScalarDeviceViewType s, ScalarDeviceViewType s1)
+CUDF_KERNEL void test_set_value(ScalarDeviceViewType s, ScalarDeviceViewType s1)
 {
   s1.set_value(s.value());
   s1.set_valid(true);
 }
 
 template <typename ScalarDeviceViewType>
-__global__ void test_value(ScalarDeviceViewType s, ScalarDeviceViewType s1, bool* result)
+CUDF_KERNEL void test_value(ScalarDeviceViewType s, ScalarDeviceViewType s1, bool* result)
 {
   *result = (s.value() == s1.value());
 }
@@ -73,7 +73,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, Value)
 }
 
 template <typename ScalarDeviceViewType>
-__global__ void test_null(ScalarDeviceViewType s, bool* result)
+CUDF_KERNEL void test_null(ScalarDeviceViewType s, bool* result)
 {
   *result = s.is_valid();
 }
@@ -92,7 +92,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, ConstructNull)
 }
 
 template <typename ScalarDeviceViewType>
-__global__ void test_setnull(ScalarDeviceViewType s)
+CUDF_KERNEL void test_setnull(ScalarDeviceViewType s)
 {
   s.set_valid(false);
 }
@@ -113,10 +113,10 @@ TYPED_TEST(TypedScalarDeviceViewTest, SetNull)
 
 struct StringScalarDeviceViewTest : public cudf::test::BaseFixture {};
 
-__global__ void test_string_value(cudf::string_scalar_device_view s,
-                                  char const* value,
-                                  cudf::size_type size,
-                                  bool* result)
+CUDF_KERNEL void test_string_value(cudf::string_scalar_device_view s,
+                                   char const* value,
+                                   cudf::size_type size,
+                                   bool* result)
 {
   *result = (s.value() == cudf::string_view(value, size));
 }
diff --git a/cpp/tests/streams/pool_test.cu b/cpp/tests/streams/pool_test.cu
index 0f92e1c0c2b..52debe24fe8 100644
--- a/cpp/tests/streams/pool_test.cu
+++ b/cpp/tests/streams/pool_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,7 +22,7 @@
 
 class StreamPoolTest : public cudf::test::BaseFixture {};
 
-__global__ void do_nothing_kernel() {}
+CUDF_KERNEL void do_nothing_kernel() {}
 
 TEST_F(StreamPoolTest, ForkStreams)
 {
diff --git a/cpp/tests/types/type_dispatcher_test.cu b/cpp/tests/types/type_dispatcher_test.cu
index 7b2384d38cb..e59ec590bdf 100644
--- a/cpp/tests/types/type_dispatcher_test.cu
+++ b/cpp/tests/types/type_dispatcher_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -58,7 +58,7 @@ struct verify_dispatched_type {
   }
 };
 
-__global__ void dispatch_test_kernel(cudf::type_id id, bool* d_result)
+CUDF_KERNEL void dispatch_test_kernel(cudf::type_id id, bool* d_result)
 {
   if (0 == threadIdx.x + blockIdx.x * blockDim.x)
     *d_result = cudf::type_dispatcher(cudf::data_type{id}, verify_dispatched_type{}, id);
@@ -118,7 +118,7 @@ struct verify_double_dispatched_type {
   }
 };
 
-__global__ void double_dispatch_test_kernel(cudf::type_id id1, cudf::type_id id2, bool* d_result)
+CUDF_KERNEL void double_dispatch_test_kernel(cudf::type_id id1, cudf::type_id id2, bool* d_result)
 {
   if (0 == threadIdx.x + blockIdx.x * blockDim.x)
     *d_result = cudf::double_type_dispatcher(
diff --git a/cpp/tests/utilities_tests/span_tests.cu b/cpp/tests/utilities_tests/span_tests.cu
index 881a237de72..731839acb14 100644
--- a/cpp/tests/utilities_tests/span_tests.cu
+++ b/cpp/tests/utilities_tests/span_tests.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -246,7 +246,7 @@ TEST(SpanTest, CanConstructFromDeviceContainers)
   (void)device_span(d_uvector_c);
 }
 
-__global__ void simple_device_kernel(device_span<bool> result) { result[0] = true; }
+CUDF_KERNEL void simple_device_kernel(device_span<bool> result) { result[0] = true; }
 
 TEST(SpanTest, CanUseDeviceSpan)
 {
@@ -276,7 +276,7 @@ TEST(MdSpanTest, CanDetermineEmptiness)
   EXPECT_TRUE(device_2dspan{no_columns_vector}.is_empty());
 }
 
-__global__ void readwrite_kernel(device_2dspan<int> result)
+CUDF_KERNEL void readwrite_kernel(device_2dspan<int> result)
 {
   if (result[5][6] == 5) {
     result[5][6] *= 6;
@@ -435,7 +435,7 @@ TEST(HostDeviceSpanTest, CanSendToDevice)
   EXPECT_EQ(std::string(d_message), hello_world_message);
 }
 
-__global__ void simple_device_char_kernel(device_span<char> result)
+CUDF_KERNEL void simple_device_char_kernel(device_span<char> result)
 {
   char const* str = "world hello";
   for (int offset = 0; offset < result.size(); ++offset) {
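Taken together, the changes in this diff replace bare __global__ kernel definitions with the CUDF_KERNEL macro (or add static directly, as in the strings examples), so every such kernel gets internal linkage from one central definition. A sketch of what such a macro can look like, assuming it expands to static __global__; the authoritative definition lives in cudf's headers and may differ:

    // Assumed definition for illustration only.
    #ifndef CUDF_KERNEL
    #define CUDF_KERNEL static __global__
    #endif

    CUDF_KERNEL void do_nothing_kernel() {}
    // After preprocessing: static __global__ void do_nothing_kernel() {}
    // Internal linkage keeps identically named test kernels (e.g. `kernel`,
    // which appears in several test files above) from colliding at link time.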