diff --git a/src/main/cpp/faultinj/faultinj.cu b/src/main/cpp/faultinj/faultinj.cu
index 13065a81ed..fcb4b3a12d 100644
--- a/src/main/cpp/faultinj/faultinj.cu
+++ b/src/main/cpp/faultinj/faultinj.cu
@@ -136,12 +136,12 @@ CUptiResult cuptiInitialize(void)
   return status;
 }
 
-__global__ void faultInjectorKernelAssert(void)
+__global__ static void faultInjectorKernelAssert(void)
 {
   assert(0 && "faultInjectorKernelAssert triggered");
 }
 
-__global__ void faultInjectorKernelTrap(void) { asm("trap;"); }
+__global__ static void faultInjectorKernelTrap(void) { asm volatile("trap;"); }  // never elided
 
 boost::optional<boost::property_tree::ptree&> lookupConfig(
   boost::optional<boost::property_tree::ptree&> domainConfigs,
diff --git a/src/main/cpp/src/bloom_filter.cu b/src/main/cpp/src/bloom_filter.cu
index 5dfdd582ef..da4e3c5cb9 100644
--- a/src/main/cpp/src/bloom_filter.cu
+++ b/src/main/cpp/src/bloom_filter.cu
@@ -60,10 +60,10 @@ __device__ inline std::pair<cudf::size_type, cudf::bitmask_type> gpu_get_hash_ma
 }
 
 template <bool nullable>
-__global__ void gpu_bloom_filter_put(cudf::bitmask_type* const bloom_filter,
-                                     cudf::size_type bloom_filter_bits,
-                                     cudf::column_device_view input,
-                                     cudf::size_type num_hashes)
+CUDF_KERNEL void gpu_bloom_filter_put(cudf::bitmask_type* const bloom_filter,
+                                      cudf::size_type bloom_filter_bits,
+                                      cudf::column_device_view input,
+                                      cudf::size_type num_hashes)
 {
   size_t const tid = threadIdx.x + blockIdx.x * blockDim.x;
   if (tid >= input.size()) { return; }
diff --git a/src/main/cpp/src/cast_string.cu b/src/main/cpp/src/cast_string.cu
index bfbbc3777d..4267daae37 100644
--- a/src/main/cpp/src/cast_string.cu
+++ b/src/main/cpp/src/cast_string.cu
@@ -156,14 +156,14 @@ process_value(bool first_value, T current_val, T const new_digit, bool adding)
  * @param ansi_mode true if ansi mode is required, which is more strict and throws
  */
 template <typename T>
-void __global__ string_to_integer_kernel(T* out,
-                                         bitmask_type* validity,
-                                         const char* const chars,
-                                         size_type const* offsets,
-                                         bitmask_type const* incoming_null_mask,
-                                         size_type num_rows,
-                                         bool ansi_mode,
-                                         bool strip)
+void CUDF_KERNEL string_to_integer_kernel(T* out,
+                                          bitmask_type* validity,
+                                          const char* const chars,
+                                          size_type const* offsets,
+                                          bitmask_type const* incoming_null_mask,
+                                          size_type num_rows,
+                                          bool ansi_mode,
+                                          bool strip)
 {
   auto const group = cooperative_groups::this_thread_block();
   auto const warp  = cooperative_groups::tiled_partition<cudf::detail::warp_size>(group);
@@ -386,18 +386,17 @@ __device__ thrust::optional<thrust::tuple<bool, int, int>> validate_and_exponent
  * @param scale scale of desired decimals
  * @param precision precision of desired decimals
  * @param ansi_mode true if ansi mode is required, which is more strict and throws
- * @return __global__
  */
 template <typename T>
-__global__ void string_to_decimal_kernel(T* out,
-                                         bitmask_type* validity,
-                                         const char* const chars,
-                                         size_type const* offsets,
-                                         bitmask_type const* incoming_null_mask,
-                                         size_type num_rows,
-                                         int32_t scale,
-                                         int32_t precision,
-                                         bool strip)
+CUDF_KERNEL void string_to_decimal_kernel(T* out,
+                                          bitmask_type* validity,
+                                          const char* const chars,
+                                          size_type const* offsets,
+                                          bitmask_type const* incoming_null_mask,
+                                          size_type num_rows,
+                                          int32_t scale,
+                                          int32_t precision,
+                                          bool strip)
 {
   auto const group = cooperative_groups::this_thread_block();
   auto const warp  = cooperative_groups::tiled_partition<cudf::detail::warp_size>(group);
diff --git a/src/main/cpp/src/cast_string_to_float.cu b/src/main/cpp/src/cast_string_to_float.cu
index cd7de88110..c19a2a10fe 100644
--- a/src/main/cpp/src/cast_string_to_float.cu
+++ b/src/main/cpp/src/cast_string_to_float.cu
@@ -618,14 +618,14 @@ class string_to_float {
 };
 
 template <typename T, size_type block_size>
-__global__ void string_to_float_kernel(T* out,
-                                       bitmask_type* validity,
-                                       int32_t* ansi_except,
-                                       size_type* valid_count,
-                                       const char* const chars,
-                                       size_type const* offsets,
-                                       bitmask_type const* incoming_null_mask,
-                                       size_type const num_rows)
+CUDF_KERNEL void string_to_float_kernel(T* out,
+                                        bitmask_type* validity,
+                                        int32_t* ansi_except,
+                                        size_type* valid_count,
+                                        const char* const chars,
+                                        size_type const* offsets,
+                                        bitmask_type const* incoming_null_mask,
+                                        size_type const num_rows)
 {
   size_type const tid = threadIdx.x + (blockDim.x * blockIdx.x);
   size_type const row = tid / 32;
diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu
index 0e57366358..f0a78f4f52 100644
--- a/src/main/cpp/src/parse_uri.cu
+++ b/src/main/cpp/src/parse_uri.cu
@@ -770,13 +770,13 @@ uri_parts __device__ validate_uri(const char* str,
  * @param out_offsets Offsets to the start of the chunks
  * @param out_validity Bitmask of validity data, updated in function
  */
-__global__ void parse_uri_char_counter(column_device_view const in_strings,
-                                       URI_chunks chunk,
-                                       char const* const base_ptr,
-                                       size_type* const out_lengths,
-                                       size_type* const out_offsets,
-                                       bitmask_type* out_validity,
-                                       thrust::optional<column_device_view const> query_match)
+CUDF_KERNEL void parse_uri_char_counter(column_device_view const in_strings,
+                                        URI_chunks chunk,
+                                        char const* const base_ptr,
+                                        size_type* const out_lengths,
+                                        size_type* const out_offsets,
+                                        bitmask_type* out_validity,
+                                        thrust::optional<column_device_view const> query_match)
 {
   // thread per row
   auto const tid = cudf::detail::grid_1d::global_thread_id();
@@ -850,11 +850,11 @@ __global__ void parse_uri_char_counter(column_device_view const in_strings,
  * @param offsets Offset value of each string associated with `out_chars`
  * @param out_chars Character buffer for the output string column
  */
-__global__ void parse_uri(column_device_view const in_strings,
-                          char const* const base_ptr,
-                          size_type const* const src_offsets,
-                          size_type const* const offsets,
-                          char* const out_chars)
+CUDF_KERNEL void parse_uri(column_device_view const in_strings,
+                           char const* const base_ptr,
+                           size_type const* const src_offsets,
+                           size_type const* const offsets,
+                           char* const out_chars)
 {
   auto const tid = cudf::detail::grid_1d::global_thread_id();