Ensure that all CUDA kernels in cudf have hidden visibility. #14726

Merged
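This PR replaces bare `__global__` annotations on cudf's internal kernels with the `CUDF_KERNEL` macro, and spells the change out as `__global__ static` in the standalone examples. The macro's definition is not part of the hunks shown below; the following is only a sketch of a plausible expansion, inferred from the `__global__ static` spelling used in the examples, showing how marking a kernel `static` keeps its symbol out of the shared library's exported symbol table:

```cuda
// Sketch only: a plausible stand-in for CUDF_KERNEL, not the verbatim cudf
// definition (which lives in a header outside this diff). A `static` kernel has
// internal linkage, so its symbol stays local to its translation unit and cannot
// collide with same-named kernels in other libraries loaded into the process.
#include <cstdio>

#ifndef CUDF_KERNEL
#define CUDF_KERNEL static __global__  // hypothetical expansion for illustration
#endif

CUDF_KERNEL void demo_kernel(int const n)
{
  auto const tid = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
  if (tid < n) { printf("thread %d\n", tid); }
}

int main()
{
  demo_kernel<<<1, 4>>>(4);
  cudaDeviceSynchronize();
  return 0;
}
```

Whether the real macro uses `static`, a visibility attribute, or both is a detail of the cudf headers; the hunks below only show the call sites being switched over.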
30 changes: 15 additions & 15 deletions cpp/benchmarks/join/generate_input_tables.cuh
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -31,19 +31,19 @@

#include <cassert>

__global__ static void init_curand(curandState* state, int const nstates)
CUDF_KERNEL void init_curand(curandState* state, int const nstates)
{
int ithread = threadIdx.x + blockIdx.x * blockDim.x;

if (ithread < nstates) { curand_init(1234ULL, ithread, 0, state + ithread); }
}

template <typename key_type, typename size_type>
__global__ static void init_build_tbl(key_type* const build_tbl,
size_type const build_tbl_size,
int const multiplicity,
curandState* state,
int const num_states)
CUDF_KERNEL void init_build_tbl(key_type* const build_tbl,
size_type const build_tbl_size,
int const multiplicity,
curandState* state,
int const num_states)
{
auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x;
auto const stride = blockDim.x * gridDim.x;
@@ -61,14 +61,14 @@ __global__ static void init_build_tbl,
}

template <typename key_type, typename size_type>
__global__ void init_probe_tbl(key_type* const probe_tbl,
size_type const probe_tbl_size,
size_type const build_tbl_size,
key_type const rand_max,
double const selectivity,
int const multiplicity,
curandState* state,
int const num_states)
CUDF_KERNEL void init_probe_tbl(key_type* const probe_tbl,
size_type const probe_tbl_size,
size_type const build_tbl_size,
key_type const rand_max,
double const selectivity,
int const multiplicity,
curandState* state,
int const num_states)
{
auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x;
auto const stride = blockDim.x * gridDim.x;
8 changes: 4 additions & 4 deletions cpp/benchmarks/type_dispatcher/type_dispatcher.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -58,7 +58,7 @@ constexpr int block_size = 256;

// This is for NO_DISPATCHING
template <FunctorType functor_type, class T>
__global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_type n_cols)
CUDF_KERNEL void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_type n_cols)
{
using F = Functor<T, functor_type>;
cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x;
@@ -72,7 +72,7 @@ __global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_

// This is for HOST_DISPATCHING
template <FunctorType functor_type, class T>
__global__ void host_dispatching_kernel(cudf::mutable_column_device_view source_column)
CUDF_KERNEL void host_dispatching_kernel(cudf::mutable_column_device_view source_column)
{
using F = Functor<T, functor_type>;
T* A = source_column.data<T>();
@@ -124,7 +124,7 @@ struct RowHandle {

// This is for DEVICE_DISPATCHING
template <FunctorType functor_type>
__global__ void device_dispatching_kernel(cudf::mutable_table_device_view source)
CUDF_KERNEL void device_dispatching_kernel(cudf::mutable_table_device_view source)
{
cudf::size_type const n_rows = source.num_rows();
cudf::size_type index = threadIdx.x + blockIdx.x * blockDim.x;
16 changes: 8 additions & 8 deletions cpp/examples/strings/custom_optimized.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -37,9 +37,9 @@
* @param d_visibilities Column of visibilities
* @param d_sizes Output sizes for each row
*/
__global__ void sizes_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::size_type* d_sizes)
__global__ static void sizes_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::size_type* d_sizes)
{
// The row index is resolved from the CUDA thread/block objects
auto index = threadIdx.x + blockIdx.x * blockDim.x;
@@ -74,10 +74,10 @@ __global__ void sizes_kernel(cudf::column_device_view const d_names,
* @param d_offsets Byte offset in `d_chars` for each row
* @param d_chars Output memory for all rows
*/
__global__ void redact_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::size_type const* d_offsets,
char* d_chars)
__global__ static void redact_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::size_type const* d_offsets,
char* d_chars)
{
// The row index is resolved from the CUDA thread/block objects
auto index = threadIdx.x + blockIdx.x * blockDim.x;
14 changes: 7 additions & 7 deletions cpp/examples/strings/custom_prealloc.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -37,12 +37,12 @@
* @param d_offsets Byte offset in `d_chars` for each row
* @param d_output Output array of string_view objects
*/
__global__ void redact_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::string_view redaction,
char* working_memory,
cudf::size_type const* d_offsets,
cudf::string_view* d_output)
__global__ static void redact_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::string_view redaction,
char* working_memory,
cudf::size_type const* d_offsets,
cudf::string_view* d_output)
{
// The row index is resolved from the CUDA thread/block objects
auto index = threadIdx.x + blockIdx.x * blockDim.x;
14 changes: 8 additions & 6 deletions cpp/examples/strings/custom_with_malloc.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -64,10 +64,10 @@ void set_malloc_heap_size(size_t heap_size = 1073741824) // 1GB
* @param redaction Redacted string replacement
* @param d_output Output array of string_view objects
*/
__global__ void redact_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::string_view redaction,
cudf::string_view* d_output)
__global__ static void redact_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::string_view redaction,
cudf::string_view* d_output)
{
// The row index is resolved from the CUDA thread/block objects
auto index = threadIdx.x + blockIdx.x * blockDim.x;
@@ -107,7 +107,7 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
* @param redaction Redacted string replacement (not to be freed)
* @param d_output Output array of string_view objects to free
*/
__global__ void free_kernel(cudf::string_view redaction, cudf::string_view* d_output, int count)
__global__ static void free_kernel(cudf::string_view redaction,
cudf::string_view* d_output,
int count)
{
auto index = threadIdx.x + blockIdx.x * blockDim.x;
if (index >= count) return;
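The three string examples above write `__global__ static` directly instead of `CUDF_KERNEL`, presumably because they build as self-contained programs that do not pull in cudf-internal macros (an assumption; the diff does not say). A minimal sketch of the linkage distinction they rely on, with illustrative kernel names:

```cuda
// Illustrative only: contrasting external vs. internal linkage for kernels.
__global__ void exported_kernel() {}        // external linkage: visible to the linker across translation units
__global__ static void hidden_kernel() {}   // internal linkage: confined to this .cu file

int main()
{
  exported_kernel<<<1, 1>>>();
  hidden_kernel<<<1, 1>>>();
  return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}
```

Inspecting the compiled object with `nm -C` typically shows `hidden_kernel`'s host-side stub as a local symbol, which is the practical effect the PR title is after: the kernel is not visible outside the binary that defines it.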
12 changes: 6 additions & 6 deletions cpp/include/cudf/detail/copy_if.cuh
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -52,10 +52,10 @@ namespace detail {

// Compute the count of elements that pass the mask within each block
template <typename Filter, int block_size>
__global__ void compute_block_counts(cudf::size_type* __restrict__ block_counts,
cudf::size_type size,
cudf::size_type per_thread,
Filter filter)
CUDF_KERNEL void compute_block_counts(cudf::size_type* __restrict__ block_counts,
cudf::size_type size,
cudf::size_type per_thread,
Filter filter)
{
int tid = threadIdx.x + per_thread * block_size * blockIdx.x;
int count = 0;
@@ -96,7 +96,7 @@ __device__ cudf::size_type block_scan_mask(bool mask_true, cudf::size_type& bloc
//
// Note: `filter` is not run on indices larger than the input column size
template <typename T, typename Filter, int block_size, bool has_validity>
__launch_bounds__(block_size) __global__
__launch_bounds__(block_size) CUDF_KERNEL
void scatter_kernel(cudf::mutable_column_device_view output_view,
cudf::size_type* output_null_count,
cudf::column_device_view input_view,
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/copy_if_else.cuh
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -37,7 +37,7 @@ template <size_type block_size,
typename RightIter,
typename Filter,
bool has_nulls>
__launch_bounds__(block_size) __global__
__launch_bounds__(block_size) CUDF_KERNEL
void copy_if_else_kernel(LeftIter lhs,
RightIter rhs,
Filter filter,
14 changes: 7 additions & 7 deletions cpp/include/cudf/detail/copy_range.cuh
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -40,12 +40,12 @@ template <cudf::size_type block_size,
typename SourceValidityIterator,
typename T,
bool has_validity>
__global__ void copy_range_kernel(SourceValueIterator source_value_begin,
SourceValidityIterator source_validity_begin,
cudf::mutable_column_device_view target,
cudf::size_type target_begin,
cudf::size_type target_end,
cudf::size_type* __restrict__ const null_count)
CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin,
SourceValidityIterator source_validity_begin,
cudf::mutable_column_device_view target,
cudf::size_type target_begin,
cudf::size_type target_end,
cudf::size_type* __restrict__ const null_count)
{
using cudf::detail::warp_size;

24 changes: 12 additions & 12 deletions cpp/include/cudf/detail/null_mask.cuh
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -61,12 +61,12 @@ namespace detail {
* @param count_ptr Pointer to counter of set bits
*/
template <int block_size, typename Binop>
__global__ void offset_bitmask_binop(Binop op,
device_span<bitmask_type> destination,
device_span<bitmask_type const* const> source,
device_span<size_type const> source_begin_bits,
size_type source_size_bits,
size_type* count_ptr)
CUDF_KERNEL void offset_bitmask_binop(Binop op,
device_span<bitmask_type> destination,
device_span<bitmask_type const* const> source,
device_span<size_type const> source_begin_bits,
size_type source_size_bits,
size_type* count_ptr)
{
auto const tid = threadIdx.x + blockIdx.x * blockDim.x;

@@ -214,11 +214,11 @@ enum class count_bits_policy : bool {
* in each range is updated.
*/
template <typename OffsetIterator, typename OutputIterator>
__global__ void subtract_set_bits_range_boundaries_kernel(bitmask_type const* bitmask,
size_type num_ranges,
OffsetIterator first_bit_indices,
OffsetIterator last_bit_indices,
OutputIterator null_counts)
CUDF_KERNEL void subtract_set_bits_range_boundaries_kernel(bitmask_type const* bitmask,
size_type num_ranges,
OffsetIterator first_bit_indices,
OffsetIterator last_bit_indices,
OutputIterator null_counts)
{
constexpr size_type const word_size_in_bits{detail::size_in_bits<bitmask_type>()};

4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/utilities/cuda.cuh
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -211,7 +211,7 @@ __device__ inline T round_up_pow2(T number_to_round, T modulus)
}

template <class F>
__global__ void single_thread_kernel(F f)
CUDF_KERNEL void single_thread_kernel(F f)
{
f();
}
18 changes: 9 additions & 9 deletions cpp/include/cudf/detail/valid_if.cuh
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -44,7 +44,7 @@ namespace detail {
* @param[out] valid_count The count of set bits in the output bitmask
*/
template <size_type block_size, typename InputIterator, typename Predicate>
__global__ void valid_if_kernel(
CUDF_KERNEL void valid_if_kernel(
bitmask_type* output, InputIterator begin, size_type size, Predicate p, size_type* valid_count)
{
constexpr size_type leader_lane{0};
@@ -151,13 +151,13 @@ template <typename InputIterator1,
typename InputIterator2,
typename BinaryPredicate,
int32_t block_size>
__global__ void valid_if_n_kernel(InputIterator1 begin1,
InputIterator2 begin2,
BinaryPredicate p,
bitmask_type* masks[],
size_type mask_count,
size_type mask_num_bits,
size_type* valid_counts)
CUDF_KERNEL void valid_if_n_kernel(InputIterator1 begin1,
InputIterator2 begin2,
BinaryPredicate p,
bitmask_type* masks[],
size_type mask_count,
size_type mask_num_bits,
size_type* valid_counts)
{
for (size_type mask_idx = 0; mask_idx < mask_count; mask_idx++) {
auto const mask = masks[mask_idx];
10 changes: 5 additions & 5 deletions cpp/include/cudf/hashing/detail/helper_functions.cuh
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017-2023, NVIDIA CORPORATION.
* Copyright (c) 2017-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -130,10 +130,10 @@ __forceinline__ __device__ void store_pair_vectorized(pair_type* __restrict__ co
}

template <typename value_type, typename size_type, typename key_type, typename elem_type>
__global__ void init_hashtbl(value_type* __restrict__ const hashtbl_values,
size_type const n,
key_type const key_val,
elem_type const elem_val)
CUDF_KERNEL void init_hashtbl(value_type* __restrict__ const hashtbl_values,
size_type const n,
key_type const key_val,
elem_type const elem_val)
{
size_type const idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {