diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index 84e607a9f28..b14541564dd 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,7 +31,7 @@ #include -__global__ static void init_curand(curandState* state, int const nstates) +CUDF_KERNEL void init_curand(curandState* state, int const nstates) { int ithread = threadIdx.x + blockIdx.x * blockDim.x; @@ -39,11 +39,11 @@ __global__ static void init_curand(curandState* state, int const nstates) } template -__global__ static void init_build_tbl(key_type* const build_tbl, - size_type const build_tbl_size, - int const multiplicity, - curandState* state, - int const num_states) +CUDF_KERNEL void init_build_tbl(key_type* const build_tbl, + size_type const build_tbl_size, + int const multiplicity, + curandState* state, + int const num_states) { auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x; auto const stride = blockDim.x * gridDim.x; @@ -61,14 +61,14 @@ __global__ static void init_build_tbl(key_type* const build_tbl, } template -__global__ void init_probe_tbl(key_type* const probe_tbl, - size_type const probe_tbl_size, - size_type const build_tbl_size, - key_type const rand_max, - double const selectivity, - int const multiplicity, - curandState* state, - int const num_states) +CUDF_KERNEL void init_probe_tbl(key_type* const probe_tbl, + size_type const probe_tbl_size, + size_type const build_tbl_size, + key_type const rand_max, + double const selectivity, + int const multiplicity, + curandState* state, + int const num_states) { auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x; auto const stride = blockDim.x * gridDim.x; diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu index 3f985cffb1f..161328ae088 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,7 +58,7 @@ constexpr int block_size = 256; // This is for NO_DISPATCHING template -__global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_type n_cols) +CUDF_KERNEL void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_type n_cols) { using F = Functor; cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x; @@ -72,7 +72,7 @@ __global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_ // This is for HOST_DISPATCHING template -__global__ void host_dispatching_kernel(cudf::mutable_column_device_view source_column) +CUDF_KERNEL void host_dispatching_kernel(cudf::mutable_column_device_view source_column) { using F = Functor; T* A = source_column.data(); @@ -124,7 +124,7 @@ struct RowHandle { // This is for DEVICE_DISPATCHING template -__global__ void device_dispatching_kernel(cudf::mutable_table_device_view source) +CUDF_KERNEL void device_dispatching_kernel(cudf::mutable_table_device_view source) { cudf::size_type const n_rows = source.num_rows(); cudf::size_type index = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/cpp/examples/strings/custom_optimized.cu b/cpp/examples/strings/custom_optimized.cu index 36521871ad8..522093bc647 100644 --- a/cpp/examples/strings/custom_optimized.cu +++ b/cpp/examples/strings/custom_optimized.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,9 +37,9 @@ * @param d_visibilities Column of visibilities * @param d_sizes Output sizes for each row */ -__global__ void sizes_kernel(cudf::column_device_view const d_names, - cudf::column_device_view const d_visibilities, - cudf::size_type* d_sizes) +__global__ static void sizes_kernel(cudf::column_device_view const d_names, + cudf::column_device_view const d_visibilities, + cudf::size_type* d_sizes) { // The row index is resolved from the CUDA thread/block objects auto index = threadIdx.x + blockIdx.x * blockDim.x; @@ -74,10 +74,10 @@ __global__ void sizes_kernel(cudf::column_device_view const d_names, * @param d_offsets Byte offset in `d_chars` for each row * @param d_chars Output memory for all rows */ -__global__ void redact_kernel(cudf::column_device_view const d_names, - cudf::column_device_view const d_visibilities, - cudf::size_type const* d_offsets, - char* d_chars) +__global__ static void redact_kernel(cudf::column_device_view const d_names, + cudf::column_device_view const d_visibilities, + cudf::size_type const* d_offsets, + char* d_chars) { // The row index is resolved from the CUDA thread/block objects auto index = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/cpp/examples/strings/custom_prealloc.cu b/cpp/examples/strings/custom_prealloc.cu index 5088ebd6168..93194899fe1 100644 --- a/cpp/examples/strings/custom_prealloc.cu +++ b/cpp/examples/strings/custom_prealloc.cu @@ -37,12 +37,12 @@ * @param d_offsets Byte offset in `d_chars` for each row * @param d_output Output array of string_view objects */ -__global__ void redact_kernel(cudf::column_device_view const d_names, - cudf::column_device_view const d_visibilities, - cudf::string_view redaction, - char* working_memory, - cudf::size_type const* d_offsets, - cudf::string_view* d_output) +__global__ static void redact_kernel(cudf::column_device_view const d_names, + cudf::column_device_view const d_visibilities, + cudf::string_view redaction, + char* working_memory, + cudf::size_type const* d_offsets, + cudf::string_view* d_output) { // The row index is resolved from the CUDA thread/block objects auto index = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/cpp/examples/strings/custom_with_malloc.cu b/cpp/examples/strings/custom_with_malloc.cu index 32f7bf7cbd0..e02fb52cd76 100644 --- a/cpp/examples/strings/custom_with_malloc.cu +++ b/cpp/examples/strings/custom_with_malloc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -64,10 +64,10 @@ void set_malloc_heap_size(size_t heap_size = 1073741824) // 1GB * @param redaction Redacted string replacement * @param d_output Output array of string_view objects */ -__global__ void redact_kernel(cudf::column_device_view const d_names, - cudf::column_device_view const d_visibilities, - cudf::string_view redaction, - cudf::string_view* d_output) +__global__ static void redact_kernel(cudf::column_device_view const d_names, + cudf::column_device_view const d_visibilities, + cudf::string_view redaction, + cudf::string_view* d_output) { // The row index is resolved from the CUDA thread/block objects auto index = threadIdx.x + blockIdx.x * blockDim.x; @@ -107,7 +107,9 @@ __global__ void redact_kernel(cudf::column_device_view const d_names, * @param redaction Redacted string replacement (not to be freed) * @param d_output Output array of string_view objects to free */ -__global__ void free_kernel(cudf::string_view redaction, cudf::string_view* d_output, int count) +__global__ static void free_kernel(cudf::string_view redaction, + cudf::string_view* d_output, + int count) { auto index = threadIdx.x + blockIdx.x * blockDim.x; if (index >= count) return; diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index ebe7e052b6d..1d051ea32ff 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -52,10 +52,10 @@ namespace detail { // Compute the count of elements that pass the mask within each block template -__global__ void compute_block_counts(cudf::size_type* __restrict__ block_counts, - cudf::size_type size, - cudf::size_type per_thread, - Filter filter) +CUDF_KERNEL void compute_block_counts(cudf::size_type* __restrict__ block_counts, + cudf::size_type size, + cudf::size_type per_thread, + Filter filter) { int tid = threadIdx.x + per_thread * block_size * blockIdx.x; int count = 0; @@ -96,7 +96,7 @@ __device__ cudf::size_type block_scan_mask(bool mask_true, cudf::size_type& bloc // // Note: `filter` is not run on indices larger than the input column size template -__launch_bounds__(block_size) __global__ +__launch_bounds__(block_size) CUDF_KERNEL void scatter_kernel(cudf::mutable_column_device_view output_view, cudf::size_type* output_null_count, cudf::column_device_view input_view, diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 04ad1f20196..6162fa5ecf1 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,7 +37,7 @@ template -__launch_bounds__(block_size) __global__ +__launch_bounds__(block_size) CUDF_KERNEL void copy_if_else_kernel(LeftIter lhs, RightIter rhs, Filter filter, diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index 16e4e7a1297..4bfdaa94c53 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,12 +40,12 @@ template -__global__ void copy_range_kernel(SourceValueIterator source_value_begin, - SourceValidityIterator source_validity_begin, - cudf::mutable_column_device_view target, - cudf::size_type target_begin, - cudf::size_type target_end, - cudf::size_type* __restrict__ const null_count) +CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin, + SourceValidityIterator source_validity_begin, + cudf::mutable_column_device_view target, + cudf::size_type target_begin, + cudf::size_type target_end, + cudf::size_type* __restrict__ const null_count) { using cudf::detail::warp_size; diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index ae05d4c6954..e57d85f2998 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,12 +61,12 @@ namespace detail { * @param count_ptr Pointer to counter of set bits */ template -__global__ void offset_bitmask_binop(Binop op, - device_span destination, - device_span source, - device_span source_begin_bits, - size_type source_size_bits, - size_type* count_ptr) +CUDF_KERNEL void offset_bitmask_binop(Binop op, + device_span destination, + device_span source, + device_span source_begin_bits, + size_type source_size_bits, + size_type* count_ptr) { auto const tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -214,11 +214,11 @@ enum class count_bits_policy : bool { * in each range is updated. */ template -__global__ void subtract_set_bits_range_boundaries_kernel(bitmask_type const* bitmask, - size_type num_ranges, - OffsetIterator first_bit_indices, - OffsetIterator last_bit_indices, - OutputIterator null_counts) +CUDF_KERNEL void subtract_set_bits_range_boundaries_kernel(bitmask_type const* bitmask, + size_type num_ranges, + OffsetIterator first_bit_indices, + OffsetIterator last_bit_indices, + OutputIterator null_counts) { constexpr size_type const word_size_in_bits{detail::size_in_bits()}; diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index 264302df0e9..86c85ca8d06 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -211,7 +211,7 @@ __device__ inline T round_up_pow2(T number_to_round, T modulus) } template -__global__ void single_thread_kernel(F f) +CUDF_KERNEL void single_thread_kernel(F f) { f(); } diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index f3f95dad017..d0073177445 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,7 +44,7 @@ namespace detail { * @param[out] valid_count The count of set bits in the output bitmask */ template -__global__ void valid_if_kernel( +CUDF_KERNEL void valid_if_kernel( bitmask_type* output, InputIterator begin, size_type size, Predicate p, size_type* valid_count) { constexpr size_type leader_lane{0}; @@ -151,13 +151,13 @@ template -__global__ void valid_if_n_kernel(InputIterator1 begin1, - InputIterator2 begin2, - BinaryPredicate p, - bitmask_type* masks[], - size_type mask_count, - size_type mask_num_bits, - size_type* valid_counts) +CUDF_KERNEL void valid_if_n_kernel(InputIterator1 begin1, + InputIterator2 begin2, + BinaryPredicate p, + bitmask_type* masks[], + size_type mask_count, + size_type mask_num_bits, + size_type* valid_counts) { for (size_type mask_idx = 0; mask_idx < mask_count; mask_idx++) { auto const mask = masks[mask_idx]; diff --git a/cpp/include/cudf/hashing/detail/helper_functions.cuh b/cpp/include/cudf/hashing/detail/helper_functions.cuh index cd58ec5f57d..3489fdeccee 100644 --- a/cpp/include/cudf/hashing/detail/helper_functions.cuh +++ b/cpp/include/cudf/hashing/detail/helper_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023, NVIDIA CORPORATION. + * Copyright (c) 2017-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -130,10 +130,10 @@ __forceinline__ __device__ void store_pair_vectorized(pair_type* __restrict__ co } template -__global__ void init_hashtbl(value_type* __restrict__ const hashtbl_values, - size_type const n, - key_type const key_val, - elem_type const elem_val) +CUDF_KERNEL void init_hashtbl(value_type* __restrict__ const hashtbl_values, + size_type const n, + key_type const key_val, + elem_type const elem_val) { size_type const idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index e681373e6e0..f7d2ebebe9a 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -78,11 +78,11 @@ __forceinline__ __device__ uint4 load_uint4(char const* ptr) * @param total_out_strings Number of output strings to be gathered. */ template -__global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, - char* out_chars, - cudf::detail::input_offsetalator const out_offsets, - MapIterator string_indices, - size_type total_out_strings) +CUDF_KERNEL void gather_chars_fn_string_parallel(StringIterator strings_begin, + char* out_chars, + cudf::detail::input_offsetalator const out_offsets, + MapIterator string_indices, + size_type total_out_strings) { constexpr size_t out_datatype_size = sizeof(uint4); constexpr size_t in_datatype_size = sizeof(uint); @@ -160,11 +160,11 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, * @param total_out_strings Number of output strings to be gathered. */ template -__global__ void gather_chars_fn_char_parallel(StringIterator strings_begin, - char* out_chars, - cudf::detail::input_offsetalator const out_offsets, - MapIterator string_indices, - size_type total_out_strings) +CUDF_KERNEL void gather_chars_fn_char_parallel(StringIterator strings_begin, + char* out_chars, + cudf::detail::input_offsetalator const out_offsets, + MapIterator string_indices, + size_type total_out_strings) { __shared__ int64_t out_offsets_threadblock[strings_per_threadblock + 1]; diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index addab160b6e..86750ea4ca8 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,23 @@ #pragma once #ifdef __CUDACC__ +/** + * @brief Indicates that the function or method is usable on host and device + */ #define CUDF_HOST_DEVICE __host__ __device__ +/** + * @brief Indicates that the function is a CUDA kernel + */ +#define CUDF_KERNEL __global__ static #else +/** + * @brief Indicates that the function or method is usable on host and device + */ #define CUDF_HOST_DEVICE +/** + * @brief Indicates that the function is a CUDA kernel + */ +#define CUDF_KERNEL static #endif #include diff --git a/cpp/include/cudf_test/print_utilities.cuh b/cpp/include/cudf_test/print_utilities.cuh index 37ffcd401fc..ae6c8cef029 100644 --- a/cpp/include/cudf_test/print_utilities.cuh +++ b/cpp/include/cudf_test/print_utilities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -103,7 +103,7 @@ CUDF_HOST_DEVICE void print_values(int32_t width, char delimiter, T arg, Ts... a } template -__global__ void print_array_kernel(std::size_t count, int32_t width, char delimiter, Ts... args) +CUDF_KERNEL void print_array_kernel(std::size_t count, int32_t width, char delimiter, Ts... args) { if (threadIdx.x == 0 && blockIdx.x == 0) { for (std::size_t i = 0; i < count; i++) { diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index 9a50eb0d0ec..d605c877d3f 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -237,7 +237,7 @@ struct binary_op_double_device_dispatcher { * @param f Functor object to call for each element. */ template -__global__ void for_each_kernel(cudf::size_type size, Functor f) +CUDF_KERNEL void for_each_kernel(cudf::size_type size, Functor f) { int tid = threadIdx.x; int blkid = blockIdx.x; diff --git a/cpp/src/binaryop/jit/kernel.cu b/cpp/src/binaryop/jit/kernel.cu index c9cc61a4f34..39735a43474 100644 --- a/cpp/src/binaryop/jit/kernel.cu +++ b/cpp/src/binaryop/jit/kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. * Copyright 2018 Christian Noboa Mardini @@ -43,10 +43,10 @@ struct UserDefinedOp { }; template -__global__ void kernel_v_v(cudf::size_type size, - TypeOut* out_data, - TypeLhs* lhs_data, - TypeRhs* rhs_data) +CUDF_KERNEL void kernel_v_v(cudf::size_type size, + TypeOut* out_data, + TypeLhs* lhs_data, + TypeRhs* rhs_data) { int tid = threadIdx.x; int blkid = blockIdx.x; @@ -62,15 +62,15 @@ __global__ void kernel_v_v(cudf::size_type size, } template -__global__ void kernel_v_v_with_validity(cudf::size_type size, - TypeOut* out_data, - TypeLhs* lhs_data, - TypeRhs* rhs_data, - cudf::bitmask_type* output_mask, - cudf::bitmask_type const* lhs_mask, - cudf::size_type lhs_offset, - cudf::bitmask_type const* rhs_mask, - cudf::size_type rhs_offset) +CUDF_KERNEL void kernel_v_v_with_validity(cudf::size_type size, + TypeOut* out_data, + TypeLhs* lhs_data, + TypeRhs* rhs_data, + cudf::bitmask_type* output_mask, + cudf::bitmask_type const* lhs_mask, + cudf::size_type lhs_offset, + cudf::bitmask_type const* rhs_mask, + cudf::size_type rhs_offset) { int tid = threadIdx.x; int blkid = blockIdx.x; diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 1a1cbb17d15..bb320e4b81a 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -98,11 +98,11 @@ rmm::device_buffer create_null_mask(size_type size, } namespace { -__global__ void set_null_mask_kernel(bitmask_type* __restrict__ destination, - size_type begin_bit, - size_type end_bit, - bool valid, - size_type number_of_mask_words) +CUDF_KERNEL void set_null_mask_kernel(bitmask_type* __restrict__ destination, + size_type begin_bit, + size_type end_bit, + bool valid, + size_type number_of_mask_words) { auto x = destination + word_index(begin_bit); thread_index_type const last_word = word_index(end_bit) - word_index(begin_bit); @@ -190,11 +190,11 @@ namespace { * @param number_of_mask_words The number of `cudf::bitmask_type` words to copy */ // TODO: Also make binops test that uses offset in column_view -__global__ void copy_offset_bitmask(bitmask_type* __restrict__ destination, - bitmask_type const* __restrict__ source, - size_type source_begin_bit, - size_type source_end_bit, - size_type number_of_mask_words) +CUDF_KERNEL void copy_offset_bitmask(bitmask_type* __restrict__ destination, + bitmask_type const* __restrict__ source, + size_type source_begin_bit, + size_type source_end_bit, + size_type number_of_mask_words) { auto const stride = cudf::detail::grid_1d::grid_stride(); for (thread_index_type destination_word_index = grid_1d::global_thread_id(); @@ -260,10 +260,10 @@ namespace { * @param[out] global_count The number of non-zero bits in the specified range */ template -__global__ void count_set_bits_kernel(bitmask_type const* bitmask, - size_type first_bit_index, - size_type last_bit_index, - size_type* global_count) +CUDF_KERNEL void count_set_bits_kernel(bitmask_type const* bitmask, + size_type first_bit_index, + size_type last_bit_index, + size_type* global_count) { constexpr auto const word_size{detail::size_in_bits()}; diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index ddf39e21685..b1d850e0b27 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -111,12 +111,12 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi * @param out_valid_count To hold the total number of valid bits set */ template -__global__ void concatenate_masks_kernel(column_device_view const* views, - size_t const* output_offsets, - size_type number_of_views, - bitmask_type* dest_mask, - size_type number_of_mask_bits, - size_type* out_valid_count) +CUDF_KERNEL void concatenate_masks_kernel(column_device_view const* views, + size_t const* output_offsets, + size_type number_of_views, + bitmask_type* dest_mask, + size_type number_of_mask_bits, + size_type* out_valid_count) { auto tidx = cudf::detail::grid_1d::global_thread_id(); auto const stride = cudf::detail::grid_1d::grid_stride(); @@ -187,11 +187,11 @@ size_type concatenate_masks(host_span views, namespace { template -__global__ void fused_concatenate_kernel(column_device_view const* input_views, - size_t const* input_offsets, - size_type num_input_views, - mutable_column_device_view output_view, - size_type* out_valid_count) +CUDF_KERNEL void fused_concatenate_kernel(column_device_view const* input_views, + size_t const* input_offsets, + size_type num_input_views, + mutable_column_device_view output_view, + size_type* out_valid_count) { auto const output_size = output_view.size(); auto* output_data = output_view.data(); diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 54d0aa10353..d711f40605a 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -280,9 +280,9 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst, * @param buf_info Information on the range of values to be copied for each destination buffer */ template -__global__ void copy_partitions(IndexToDstBuf index_to_buffer, - uint8_t const** src_bufs, - dst_buf_info* buf_info) +CUDF_KERNEL void copy_partitions(IndexToDstBuf index_to_buffer, + uint8_t const** src_bufs, + dst_buf_info* buf_info) { auto const buf_index = blockIdx.x; auto const src_buf_index = buf_info[buf_index].src_buf_index; diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 8f326184012..517435503ee 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -50,9 +50,9 @@ namespace detail { namespace { template -__global__ void marking_bitmask_kernel(mutable_column_device_view destination, - MapIterator scatter_map, - size_type num_scatter_rows) +CUDF_KERNEL void marking_bitmask_kernel(mutable_column_device_view destination, + MapIterator scatter_map, + size_type num_scatter_rows) { auto row = cudf::detail::grid_1d::global_thread_id(); auto const stride = cudf::detail::grid_1d::grid_stride(); diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index 365f6d6875c..59177a68ee7 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -324,7 +324,7 @@ avro_decode_row(schemadesc_s const* schema, * @param[in] min_row_size Minimum size in bytes of a row */ // blockDim {32,num_warps,1} -__global__ void __launch_bounds__(num_warps * 32, 2) +CUDF_KERNEL void __launch_bounds__(num_warps * 32, 2) gpuDecodeAvroColumnData(device_span blocks, schemadesc_s* schema_g, device_span global_dictionary, diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu index 8bafd054bdb..9c936fefd6c 100644 --- a/cpp/src/io/comp/debrotli.cu +++ b/cpp/src/io/comp/debrotli.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1911,7 +1911,7 @@ static __device__ void ProcessCommands(debrotli_state_s* s, brotli_dictionary_s * @param scratch_size Size of scratch heap space (smaller sizes may result in serialization between * blocks) */ -__global__ void __launch_bounds__(block_size, 2) +CUDF_KERNEL void __launch_bounds__(block_size, 2) gpu_debrotli_kernel(device_span const> inputs, device_span const> outputs, device_span results, diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index 8993815e560..cd50545afbd 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1024,7 +1024,7 @@ __device__ int parse_gzip_header(uint8_t const* src, size_t src_size) * @param parse_hdr If nonzero, indicates that the compressed bitstream includes a GZIP header */ template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) inflate_kernel(device_span const> inputs, device_span const> outputs, device_span results, @@ -1152,7 +1152,7 @@ __global__ void __launch_bounds__(block_size) * * @param inputs Source and destination information per block */ -__global__ void __launch_bounds__(1024) +CUDF_KERNEL void __launch_bounds__(1024) copy_uncompressed_kernel(device_span const> inputs, device_span const> outputs) { diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu index 0428f4edaf2..a45e8b2083b 100644 --- a/cpp/src/io/comp/snap.cu +++ b/cpp/src/io/comp/snap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -257,7 +257,7 @@ static __device__ uint32_t Match60(uint8_t const* src1, * @param[out] outputs Compression status per block * @param[in] count Number of blocks to compress */ -__global__ void __launch_bounds__(128) +CUDF_KERNEL void __launch_bounds__(128) snap_kernel(device_span const> inputs, device_span const> outputs, device_span results) diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index 504a2fe377c..46555a97e9c 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -628,7 +628,7 @@ __device__ void snappy_process_symbols(unsnap_state_s* s, int t, Storage& temp_s * @param[out] outputs Decompression status per block */ template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) unsnap_kernel(device_span const> inputs, device_span const> outputs, device_span results) diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 248e17669bc..8252cccbdb9 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -168,7 +168,7 @@ __device__ __inline__ bool is_floatingpoint(long len, * @param row_offsets The start the CSV data of interest * @param d_column_data The count for each column data type */ -__global__ void __launch_bounds__(csvparse_block_dim) +CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) data_type_detection(parse_options_view const opts, device_span csv_text, device_span const column_flags, @@ -305,7 +305,7 @@ __global__ void __launch_bounds__(csvparse_block_dim) * @param[out] valids The bitmaps indicating whether column fields are valid * @param[out] valid_counts The number of valid fields in each column */ -__global__ void __launch_bounds__(csvparse_block_dim) +CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) convert_csv_to_cudf(cudf::io::parse_options_view options, device_span data, device_span column_flags, @@ -622,7 +622,7 @@ static inline __device__ rowctx32_t rowctx_inverse_merge_transform(uint64_t ctxt * @param escapechar Delimiter escape character * @param commentchar Comment line character (skip rows starting with this character) */ -__global__ void __launch_bounds__(rowofs_block_dim) +CUDF_KERNEL void __launch_bounds__(rowofs_block_dim) gather_row_offsets_gpu(uint64_t* row_ctx, device_span offsets_out, device_span const data, diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index f867a95a864..9bb087e788d 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -493,7 +493,7 @@ template -__launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ +__launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL void SimulateDFAKernel(DfaT dfa, SymbolItT d_chars, OffsetT const num_chars, diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh index a5c1a4f4f5c..be63ec6539f 100644 --- a/cpp/src/io/fst/dispatch_dfa.cuh +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,7 +67,7 @@ struct DeviceFSMPolicy { * @return */ template -__global__ void initialization_pass_kernel(TileState items_state, uint32_t num_tiles) +CUDF_KERNEL void initialization_pass_kernel(TileState items_state, uint32_t num_tiles) { items_state.InitializeStatus(num_tiles); } diff --git a/cpp/src/io/json/legacy/json_gpu.cu b/cpp/src/io/json/legacy/json_gpu.cu index b358cc2071b..4d5293e12fd 100644 --- a/cpp/src/io/json/legacy/json_gpu.cu +++ b/cpp/src/io/json/legacy/json_gpu.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -245,14 +245,14 @@ __device__ std::pair get_row_data_range( * @param[out] valid_fields The bitmaps indicating whether column fields are valid * @param[out] num_valid_fields The numbers of valid fields in columns */ -__global__ void convert_data_to_columns_kernel(parse_options_view opts, - device_span const data, - device_span const row_offsets, - device_span const column_types, - col_map_type col_map, - device_span const output_columns, - device_span const valid_fields, - device_span const num_valid_fields) +CUDF_KERNEL void convert_data_to_columns_kernel(parse_options_view opts, + device_span const data, + device_span const row_offsets, + device_span const column_types, + col_map_type col_map, + device_span const output_columns, + device_span const valid_fields, + device_span const num_valid_fields) { auto const rec_id = grid_1d::global_thread_id(); if (rec_id >= row_offsets.size()) return; @@ -321,7 +321,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, * @param[in] num_columns The number of columns of input data * @param[out] column_infos The count for each column data type */ -__global__ void detect_data_types_kernel( +CUDF_KERNEL void detect_data_types_kernel( parse_options_view const opts, device_span const data, device_span const row_offsets, @@ -481,11 +481,11 @@ __device__ key_value_range get_next_key_value_range(char const* begin, * @param[out] keys_cnt Number of keys found in the file * @param[out] keys_info optional, information (offset, length, hash) for each found key */ -__global__ void collect_keys_info_kernel(parse_options_view const options, - device_span const data, - device_span const row_offsets, - unsigned long long int* keys_cnt, - thrust::optional keys_info) +CUDF_KERNEL void collect_keys_info_kernel(parse_options_view const options, + device_span const data, + device_span const row_offsets, + unsigned long long int* keys_cnt, + thrust::optional keys_info) { auto const rec_id = grid_1d::global_thread_id(); if (rec_id >= row_offsets.size()) return; diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 1d2262a1ccc..5971482f80c 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,10 +27,10 @@ namespace cudf::io::orc::gpu { /** * @brief Counts the number of characters in each rowgroup of each string column. */ -__global__ void rowgroup_char_counts_kernel(device_2dspan char_counts, - device_span orc_columns, - device_2dspan rowgroup_bounds, - device_span str_col_indexes) +CUDF_KERNEL void rowgroup_char_counts_kernel(device_2dspan char_counts, + device_span orc_columns, + device_2dspan rowgroup_bounds, + device_span str_col_indexes) { // Index of the column in the `str_col_indexes` array auto const str_col_idx = blockIdx.y; @@ -75,7 +75,7 @@ void rowgroup_char_counts(device_2dspan counts, } template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) initialize_dictionary_hash_maps_kernel(device_span dictionaries) { auto const dict_map = dictionaries[blockIdx.x].map_slots; @@ -107,7 +107,7 @@ struct hash_functor { }; template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) populate_dictionary_hash_maps_kernel(device_2dspan dictionaries, device_span columns) { @@ -162,7 +162,7 @@ __global__ void __launch_bounds__(block_size) } template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) collect_map_entries_kernel(device_2dspan dictionaries) { auto const col_idx = blockIdx.x; @@ -196,7 +196,7 @@ __global__ void __launch_bounds__(block_size) } template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) get_dictionary_indices_kernel(device_2dspan dictionaries, device_span columns) { diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 1afc0200bfa..31159ae0341 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -35,7 +35,7 @@ constexpr unsigned int init_threads_per_group = 32; constexpr unsigned int init_groups_per_block = 4; constexpr unsigned int init_threads_per_block = init_threads_per_group * init_groups_per_block; -__global__ void __launch_bounds__(init_threads_per_block) +CUDF_KERNEL void __launch_bounds__(init_threads_per_block) gpu_init_statistics_groups(statistics_group* groups, stats_column_desc const* cols, device_2dspan rowgroup_bounds) @@ -73,7 +73,7 @@ constexpr unsigned int pb_fldlen_common = pb_fld_hdrlen + (pb_fld_hdrlen + pb_fldlen_int64) + 2 * pb_fld_hdrlen; template -__global__ void __launch_bounds__(block_size, 1) +CUDF_KERNEL void __launch_bounds__(block_size, 1) gpu_init_statistics_buffersize(statistics_merge_group* groups, statistics_chunk const* chunks, uint32_t statistics_count) @@ -249,7 +249,7 @@ constexpr unsigned int encode_chunks_per_block = 4; constexpr unsigned int encode_threads_per_block = encode_threads_per_chunk * encode_chunks_per_block; -__global__ void __launch_bounds__(encode_threads_per_block) +CUDF_KERNEL void __launch_bounds__(encode_threads_per_block) gpu_encode_statistics(uint8_t* blob_bfr, statistics_merge_group* groups, statistics_chunk const* chunks, diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 0b249bbdafe..14072d79172 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1082,7 +1082,7 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs, */ // blockDim {block_size,1,1} template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) gpuDecodeNullsAndStringDictionaries(ColumnDesc* chunks, DictionaryEntry* global_dictionary, uint32_t num_columns, @@ -1358,7 +1358,7 @@ static const __device__ __constant__ uint32_t kTimestampNanoScale[8] = { */ // blockDim {block_size,1,1} template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) gpuDecodeOrcColumnData(ColumnDesc* chunks, DictionaryEntry* global_dictionary, table_device_view tz_table, diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index b99826e070e..b7dd0ea9ec3 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -723,7 +723,7 @@ static __device__ void encode_null_mask(orcenc_state_s* s, */ // blockDim {`encode_block_size`,1,1} template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) gpuEncodeOrcColumnData(device_2dspan chunks, device_2dspan streams) { @@ -1008,7 +1008,7 @@ __global__ void __launch_bounds__(block_size) */ // blockDim {512,1,1} template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) gpuEncodeStringDictionaries(stripe_dictionary const* stripes, device_span columns, device_2dspan chunks, @@ -1091,7 +1091,7 @@ __global__ void __launch_bounds__(block_size) * @param[in,out] streams List of encoder chunk streams [column][rowgroup] */ // blockDim {compact_streams_block_size,1,1} -__global__ void __launch_bounds__(compact_streams_block_size) +CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) gpuCompactOrcDataStreams(device_2dspan strm_desc, device_2dspan streams) { @@ -1136,7 +1136,7 @@ __global__ void __launch_bounds__(compact_streams_block_size) * @param[in] comp_block_align Required alignment for compressed blocks */ // blockDim {256,1,1} -__global__ void __launch_bounds__(256) +CUDF_KERNEL void __launch_bounds__(256) gpuInitCompressionBlocks(device_2dspan strm_desc, device_2dspan streams, // const? device_span> inputs, @@ -1191,7 +1191,7 @@ __global__ void __launch_bounds__(256) * @param[in] max_comp_blk_size Max size of any block after compression */ // blockDim {1024,1,1} -__global__ void __launch_bounds__(1024) +CUDF_KERNEL void __launch_bounds__(1024) gpuCompactCompressedBlocks(device_2dspan strm_desc, device_span const> inputs, device_span const> outputs, @@ -1274,8 +1274,8 @@ struct decimal_column_element_sizes { // Converts sizes of individual decimal elements to offsets within each row group // Conversion is done in-place template -__global__ void decimal_sizes_to_offsets_kernel(device_2dspan rg_bounds, - device_span sizes) +CUDF_KERNEL void decimal_sizes_to_offsets_kernel(device_2dspan rg_bounds, + device_span sizes) { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index b31a4a081d1..327b9557176 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,7 +41,7 @@ struct compressed_stream_s { }; // blockDim {128,1,1} -__global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeData( +CUDF_KERNEL void __launch_bounds__(128, 8) gpuParseCompressedStripeData( CompressedStreamInfo* strm_info, int32_t num_streams, uint32_t block_size, uint32_t log2maxcr) { __shared__ compressed_stream_s strm_g[4]; @@ -138,7 +138,7 @@ __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeData( } // blockDim {128,1,1} -__global__ void __launch_bounds__(128, 8) +CUDF_KERNEL void __launch_bounds__(128, 8) gpuPostDecompressionReassemble(CompressedStreamInfo* strm_info, int32_t num_streams) { __shared__ compressed_stream_s strm_g[4]; @@ -442,14 +442,14 @@ static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s* s, * value */ // blockDim {128,1,1} -__global__ void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_groups, - CompressedStreamInfo* strm_info, - ColumnDesc* chunks, - uint32_t num_columns, - uint32_t num_stripes, - uint32_t num_rowgroups, - uint32_t rowidx_stride, - bool use_base_stride) +CUDF_KERNEL void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_groups, + CompressedStreamInfo* strm_info, + ColumnDesc* chunks, + uint32_t num_columns, + uint32_t num_stripes, + uint32_t num_rowgroups, + uint32_t rowidx_stride, + bool use_base_stride) { __shared__ __align__(16) rowindex_state_s state_g; rowindex_state_s* const s = &state_g; @@ -513,7 +513,7 @@ __global__ void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_gr } template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) gpu_reduce_pushdown_masks(device_span orc_columns, device_2dspan rowgroup_bounds, device_2dspan set_counts) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index cef4915e0c9..edc40391bfa 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -357,10 +357,10 @@ struct string_length_functor { statistics_merge_group const* stripe_stat_merge; }; -__global__ void copy_string_data(char* string_pool, - size_type* offsets, - statistics_chunk* chunks, - statistics_merge_group const* groups) +CUDF_KERNEL void copy_string_data(char* string_pool, + size_type* offsets, + statistics_chunk* chunks, + statistics_merge_group const* groups) { auto const idx = blockIdx.x / 2; if (groups[idx].stats_dtype == dtype_string) { diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 53ff31ab0a7..a43c6d4cbb6 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,7 +31,7 @@ constexpr int DEFAULT_BLOCK_SIZE = 256; } template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) initialize_chunk_hash_maps_kernel(device_span chunks) { auto const chunk = chunks[blockIdx.x]; @@ -98,7 +98,7 @@ struct map_find_fn { }; template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; @@ -189,7 +189,7 @@ __global__ void __launch_bounds__(block_size) } template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) collect_map_entries_kernel(device_span chunks) { auto& chunk = chunks[blockIdx.x]; @@ -223,7 +223,7 @@ __global__ void __launch_bounds__(block_size) } template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index afe9a76a6d0..2d000600028 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -207,7 +207,7 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, * (PageInfo::str_bytes) as part of the pass */ template -__global__ void __launch_bounds__(preprocess_block_size) +CUDF_KERNEL void __launch_bounds__(preprocess_block_size) gpuComputePageSizes(PageInfo* pages, device_span chunks, size_t min_row, diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index d39edd70fcd..8d220e6fa96 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -421,7 +421,7 @@ static __device__ void gpuOutputGeneric( * @param error_code Error code to set if an error is encountered */ template -__global__ void __launch_bounds__(decode_block_size) +CUDF_KERNEL void __launch_bounds__(decode_block_size) gpuDecodePageData(PageInfo* pages, device_span chunks, size_t min_row, diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 44ec0e1e027..d0557446f14 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -305,7 +305,7 @@ struct delta_byte_array_decoder { // with V2 page headers; see https://www.mail-archive.com/dev@parquet.apache.org/msg11826.html). // this kernel only needs 96 threads (3 warps)(for now). template -__global__ void __launch_bounds__(96) +CUDF_KERNEL void __launch_bounds__(96) gpuDecodeDeltaBinary(PageInfo* pages, device_span chunks, size_t min_row, @@ -430,7 +430,7 @@ __global__ void __launch_bounds__(96) // suffixes are not encoded in the header, we're going to have to first do a quick pass through them // to find the start/end of each structure. template -__global__ void __launch_bounds__(decode_block_size) +CUDF_KERNEL void __launch_bounds__(decode_block_size) gpuDecodeDeltaByteArray(PageInfo* pages, device_span chunks, size_t min_row, @@ -587,7 +587,7 @@ __global__ void __launch_bounds__(decode_block_size) // Decode page data that is DELTA_LENGTH_BYTE_ARRAY packed. This encoding consists of a // DELTA_BINARY_PACKED array of string lengths, followed by the string data. template -__global__ void __launch_bounds__(decode_block_size) +CUDF_KERNEL void __launch_bounds__(decode_block_size) gpuDecodeDeltaLengthByteArray(PageInfo* pages, device_span chunks, size_t min_row, diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index e16551024d1..12af5888d2f 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -385,7 +385,7 @@ __device__ uint8_t const* delta_encode(page_enc_state_s<0>* s, uint64_t* buffer, // blockDim {512,1,1} template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) gpuInitRowGroupFragments(device_2dspan frag, device_span col_desc, device_span partitions, @@ -422,7 +422,7 @@ __global__ void __launch_bounds__(block_size) // blockDim {512,1,1} template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) gpuCalculatePageFragments(device_span frag, device_span column_frag_sizes) { @@ -449,7 +449,7 @@ __global__ void __launch_bounds__(block_size) } // blockDim {128,1,1} -__global__ void __launch_bounds__(128) +CUDF_KERNEL void __launch_bounds__(128) gpuInitFragmentStats(device_span groups, device_span fragments) { @@ -510,7 +510,7 @@ __device__ size_t delta_data_len(Type physical_type, } // blockDim {128,1,1} -__global__ void __launch_bounds__(128) +CUDF_KERNEL void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, device_span pages, device_span page_sizes, @@ -1244,9 +1244,10 @@ __device__ auto julian_days_with_time(int64_t v) // the level data is encoded. // blockDim(128, 1, 1) template -__global__ void __launch_bounds__(block_size, 8) gpuEncodePageLevels(device_span pages, - bool write_v2_headers, - encode_kernel_mask kernel_mask) +CUDF_KERNEL void __launch_bounds__(block_size, 8) + gpuEncodePageLevels(device_span pages, + bool write_v2_headers, + encode_kernel_mask kernel_mask) { __shared__ __align__(8) rle_page_enc_state_s state_g; @@ -1504,7 +1505,7 @@ __device__ void finish_page_encode(state_buf* s, // PLAIN page data encoder // blockDim(128, 1, 1) template -__global__ void __launch_bounds__(block_size, 8) +CUDF_KERNEL void __launch_bounds__(block_size, 8) gpuEncodePages(device_span pages, device_span> comp_in, device_span> comp_out, @@ -1739,7 +1740,7 @@ __global__ void __launch_bounds__(block_size, 8) // DICTIONARY page data encoder // blockDim(128, 1, 1) template -__global__ void __launch_bounds__(block_size, 8) +CUDF_KERNEL void __launch_bounds__(block_size, 8) gpuEncodeDictPages(device_span pages, device_span> comp_in, device_span> comp_out, @@ -1871,7 +1872,7 @@ __global__ void __launch_bounds__(block_size, 8) // DELTA_BINARY_PACKED page data encoder // blockDim(128, 1, 1) template -__global__ void __launch_bounds__(block_size, 8) +CUDF_KERNEL void __launch_bounds__(block_size, 8) gpuEncodeDeltaBinaryPages(device_span pages, device_span> comp_in, device_span> comp_out, @@ -1975,7 +1976,7 @@ __global__ void __launch_bounds__(block_size, 8) // DELTA_LENGTH_BYTE_ARRAY page data encoder // blockDim(128, 1, 1) template -__global__ void __launch_bounds__(block_size, 8) +CUDF_KERNEL void __launch_bounds__(block_size, 8) gpuEncodeDeltaLengthByteArrayPages(device_span pages, device_span> comp_in, device_span> comp_out, @@ -2105,7 +2106,7 @@ constexpr int decide_compression_block_size = decide_compression_warps_in_block * cudf::detail::warp_size; // blockDim(decide_compression_block_size, 1, 1) -__global__ void __launch_bounds__(decide_compression_block_size) +CUDF_KERNEL void __launch_bounds__(decide_compression_block_size) gpuDecideCompression(device_span chunks) { __shared__ __align__(8) EncColumnChunk ck_g[decide_compression_warps_in_block]; @@ -2575,7 +2576,7 @@ __device__ uint8_t* EncodeStatistics(uint8_t* start, } // blockDim(128, 1, 1) -__global__ void __launch_bounds__(128) +CUDF_KERNEL void __launch_bounds__(128) gpuEncodePageHeaders(device_span pages, device_span comp_results, device_span page_stats, @@ -2670,7 +2671,7 @@ __global__ void __launch_bounds__(128) } // blockDim(1024, 1, 1) -__global__ void __launch_bounds__(1024) +CUDF_KERNEL void __launch_bounds__(1024) gpuGatherPages(device_span chunks, device_span pages) { __shared__ __align__(8) EncColumnChunk ck_g; @@ -2848,7 +2849,7 @@ struct mask_tform { } // namespace // blockDim(1, 1, 1) -__global__ void __launch_bounds__(1) +CUDF_KERNEL void __launch_bounds__(1) gpuEncodeColumnIndexes(device_span chunks, device_span column_stats, int32_t column_index_truncate_length) diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index cc3f584422d..4be4f45497d 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -348,9 +348,9 @@ struct gpuParsePageHeader { * @param[in] num_chunks Number of column chunks */ // blockDim {128,1,1} -__global__ void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chunks, - int32_t num_chunks, - kernel_error::pointer error_code) +CUDF_KERNEL void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chunks, + int32_t num_chunks, + kernel_error::pointer error_code) { using cudf::detail::warp_size; gpuParsePageHeader parse_page_header; @@ -480,7 +480,7 @@ __global__ void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chu * @param[in] num_chunks Number of column chunks */ // blockDim {128,1,1} -__global__ void __launch_bounds__(128) +CUDF_KERNEL void __launch_bounds__(128) gpuBuildStringDictionaryIndex(ColumnChunkDesc* chunks, int32_t num_chunks) { __shared__ ColumnChunkDesc chunk_g[4]; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index d559f93f45b..37a8cabc182 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -584,7 +584,7 @@ __device__ thrust::pair totalDeltaByteArraySize(uint8_t const* d * @tparam level_t Type used to store decoded repetition and definition levels */ template -__global__ void __launch_bounds__(preprocess_block_size) gpuComputeStringPageBounds( +CUDF_KERNEL void __launch_bounds__(preprocess_block_size) gpuComputeStringPageBounds( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; @@ -653,7 +653,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputeStringPageBou * @param min_rows crop all rows below min_row * @param num_rows Maximum number of rows to read */ -__global__ void __launch_bounds__(delta_preproc_block_size) gpuComputeDeltaPageStringSizes( +CUDF_KERNEL void __launch_bounds__(delta_preproc_block_size) gpuComputeDeltaPageStringSizes( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; @@ -725,7 +725,7 @@ __global__ void __launch_bounds__(delta_preproc_block_size) gpuComputeDeltaPageS * @param min_rows crop all rows below min_row * @param num_rows Maximum number of rows to read */ -__global__ void __launch_bounds__(delta_length_block_size) gpuComputeDeltaLengthPageStringSizes( +CUDF_KERNEL void __launch_bounds__(delta_length_block_size) gpuComputeDeltaLengthPageStringSizes( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { using cudf::detail::warp_size; @@ -820,7 +820,7 @@ __global__ void __launch_bounds__(delta_length_block_size) gpuComputeDeltaLength * @param min_rows crop all rows below min_row * @param num_rows Maximum number of rows to read */ -__global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( +CUDF_KERNEL void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; @@ -912,7 +912,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz * @tparam level_t Type used to store decoded repetition and definition levels */ template -__global__ void __launch_bounds__(decode_block_size) +CUDF_KERNEL void __launch_bounds__(decode_block_size) gpuDecodeStringPageData(PageInfo* pages, device_span chunks, size_t min_row, diff --git a/cpp/src/io/statistics/column_statistics.cuh b/cpp/src/io/statistics/column_statistics.cuh index f71fb95949f..db0d56ac321 100644 --- a/cpp/src/io/statistics/column_statistics.cuh +++ b/cpp/src/io/statistics/column_statistics.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -289,7 +289,7 @@ __device__ void cooperative_load(T& destination, T const* source = nullptr) * @tparam IO File format for which statistics calculation is being done */ template -__global__ void __launch_bounds__(block_size, 1) +CUDF_KERNEL void __launch_bounds__(block_size, 1) gpu_calculate_group_statistics(statistics_chunk* chunks, statistics_group const* groups, bool const int96_timestamps) @@ -368,7 +368,7 @@ void calculate_group_statistics(statistics_chunk* chunks, * @tparam IO File format for which statistics calculation is being done */ template -__global__ void __launch_bounds__(block_size, 1) +CUDF_KERNEL void __launch_bounds__(block_size, 1) gpu_merge_group_statistics(statistics_chunk* chunks_out, statistics_chunk const* chunks_in, statistics_merge_group const* groups) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 443ca0f5fe7..2194ee1aaa1 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -138,7 +138,7 @@ using byte_offset = int64_t; // it begins in. From there, each thread can then take deterministic action. In this case, the // deterministic action is counting and outputting delimiter offsets when a delimiter is found. -__global__ void multibyte_split_init_kernel( +CUDF_KERNEL void multibyte_split_init_kernel( cudf::size_type base_tile_idx, cudf::size_type num_tiles, cudf::io::text::detail::scan_tile_state_view tile_multistates, @@ -154,7 +154,7 @@ __global__ void multibyte_split_init_kernel( } } -__global__ __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( +CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( cudf::size_type base_tile_idx, byte_offset base_input_offset, output_offset base_output_offset, @@ -231,7 +231,7 @@ __global__ __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( } } -__global__ __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( +CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( cudf::size_type base_tile_idx, byte_offset base_input_offset, output_offset base_output_offset, diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 9e5c5c76392..9545811a542 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -420,14 +420,14 @@ struct bitfield_block { * @param d_chars Character array to store the characters of strings */ template -__global__ void parse_fn_string_parallel(str_tuple_it str_tuples, - size_type total_out_strings, - size_type* str_counter, - bitmask_type* null_mask, - size_type* null_count_data, - cudf::io::parse_options_view const options, - size_type* d_offsets, - char* d_chars) +CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, + size_type total_out_strings, + size_type* str_counter, + bitmask_type* null_mask, + size_type* null_count_data, + cudf::io::parse_options_view const options, + size_type* d_offsets, + char* d_chars) { constexpr auto BLOCK_SIZE = is_warp ? cudf::detail::warp_size : cudf::detail::warp_size * num_warps; diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index 06b86f33c85..d02ce99e6e5 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -87,12 +87,12 @@ __device__ __forceinline__ void setElement(void*, cudf::size_type, T const&, V c * @param[out] positions Array containing the output positions */ template -__global__ void count_and_set_positions(char const* data, - uint64_t size, - uint64_t offset, - char const key, - cudf::size_type* count, - T* positions) +CUDF_KERNEL void count_and_set_positions(char const* data, + uint64_t size, + uint64_t offset, + char const key, + cudf::size_type* count, + T* positions) { // thread IDs range per block, so also need the block id auto const tid = cudf::detail::grid_1d::global_thread_id(); diff --git a/cpp/src/io/utilities/type_inference.cu b/cpp/src/io/utilities/type_inference.cu index 79a5c8f1c4c..b446ad41946 100644 --- a/cpp/src/io/utilities/type_inference.cu +++ b/cpp/src/io/utilities/type_inference.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -112,11 +112,11 @@ __device__ __inline__ bool is_like_float(std::size_t len, * @param[out] column_info Histogram of column type counters */ template -__global__ void infer_column_type_kernel(OptionsView options, - device_span data, - ColumnStringIter offset_length_begin, - std::size_t size, - cudf::io::column_type_histogram* column_info) +CUDF_KERNEL void infer_column_type_kernel(OptionsView options, + device_span data, + ColumnStringIter offset_length_begin, + std::size_t size, + cudf::io::column_type_histogram* column_info) { auto thread_type_histogram = cudf::io::column_type_histogram{}; diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh index f665aba698f..02ce27a36ba 100644 --- a/cpp/src/join/conditional_join_kernels.cuh +++ b/cpp/src/join/conditional_join_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,7 +48,7 @@ namespace detail { * @param[out] output_size The resulting output size */ template -__global__ void compute_conditional_join_output_size( +CUDF_KERNEL void compute_conditional_join_output_size( table_device_view left_table, table_device_view right_table, join_kind join_type, @@ -138,15 +138,15 @@ __global__ void compute_conditional_join_output_size( * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. */ template -__global__ void conditional_join(table_device_view left_table, - table_device_view right_table, - join_kind join_type, - cudf::size_type* join_output_l, - cudf::size_type* join_output_r, - cudf::size_type* current_idx, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const max_size, - bool const swap_tables) +CUDF_KERNEL void conditional_join(table_device_view left_table, + table_device_view right_table, + join_kind join_type, + cudf::size_type* join_output_l, + cudf::size_type* join_output_r, + cudf::size_type* current_idx, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const max_size, + bool const swap_tables) { constexpr int num_warps = block_size / detail::warp_size; __shared__ cudf::size_type current_idx_shared[num_warps]; diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index efe575e14de..22bbbff967a 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,8 +36,10 @@ namespace detail { namespace cg = cooperative_groups; +#pragma GCC diagnostic ignored "-Wattributes" + template -__launch_bounds__(block_size) __global__ +__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ void mixed_join(table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/join/mixed_join_kernels.cuh b/cpp/src/join/mixed_join_kernels.cuh index 2cd4d0c3b38..1d36a246f02 100644 --- a/cpp/src/join/mixed_join_kernels.cuh +++ b/cpp/src/join/mixed_join_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,6 +57,7 @@ namespace detail { * left/right tables to determine which is the build table and which is the * probe table has already happened on the host. */ + template __global__ void compute_mixed_join_output_size( table_device_view left_table, diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index e31e35ff788..bde75395371 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,8 +31,10 @@ namespace detail { namespace cg = cooperative_groups; +#pragma GCC diagnostic ignored "-Wattributes" + template -__launch_bounds__(block_size) __global__ +__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ void mixed_join_semi(table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index ef377dadc4b..3bd7bfd7c9a 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,20 +33,23 @@ namespace cudf { namespace detail { namespace cg = cooperative_groups; +#pragma GCC diagnostic ignored "-Wattributes" + template -__launch_bounds__(block_size) __global__ void compute_mixed_join_output_size( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row) +__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ + void compute_mixed_join_output_size( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row) { // The (required) extern storage of the shared memory array leads to // conflicting declarations between different templates. The easiest diff --git a/cpp/src/join/mixed_join_size_kernels_semi.cu b/cpp/src/join/mixed_join_size_kernels_semi.cu index fd7bf0234e9..31da6677aef 100644 --- a/cpp/src/join/mixed_join_size_kernels_semi.cu +++ b/cpp/src/join/mixed_join_size_kernels_semi.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,20 +31,23 @@ namespace detail { namespace cg = cooperative_groups; +#pragma GCC diagnostic ignored "-Wattributes" + template -__launch_bounds__(block_size) __global__ void compute_mixed_join_output_size_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::semi_map_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row) +__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ + void compute_mixed_join_output_size_semi( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::semi_map_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row) { // The (required) extern storage of the shared memory array leads to // conflicting declarations between different templates. The easiest diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index c01357c96ca..6794838c70f 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -900,7 +900,7 @@ __device__ thrust::pair get_json_object_single( * @param options Options controlling behavior */ template -__launch_bounds__(block_size) __global__ +__launch_bounds__(block_size) CUDF_KERNEL void get_json_object_kernel(column_device_view col, path_operator const* const commands, size_type* output_offsets, diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 0d30230de28..073a2a6b97e 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -122,7 +122,7 @@ using index_type = detail::index_type; * to be copied to the output. Length must be equal to `num_destination_rows` */ template -__global__ void materialize_merged_bitmask_kernel( +CUDF_KERNEL void materialize_merged_bitmask_kernel( column_device_view left_dcol, column_device_view right_dcol, bitmask_type* out_validity, diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 7b6676346c2..8d8f1a71672 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -122,14 +122,14 @@ class bitwise_partitioner { * @param[out] global_partition_sizes The number of rows in each partition. */ template -__global__ void compute_row_partition_numbers(row_hasher_t the_hasher, - size_type const num_rows, - size_type const num_partitions, - partitioner_type const the_partitioner, - size_type* __restrict__ row_partition_numbers, - size_type* __restrict__ row_partition_offset, - size_type* __restrict__ block_partition_sizes, - size_type* __restrict__ global_partition_sizes) +CUDF_KERNEL void compute_row_partition_numbers(row_hasher_t the_hasher, + size_type const num_rows, + size_type const num_partitions, + partitioner_type const the_partitioner, + size_type* __restrict__ row_partition_numbers, + size_type* __restrict__ row_partition_offset, + size_type* __restrict__ block_partition_sizes, + size_type* __restrict__ global_partition_sizes) { // Accumulate histogram of the size of each partition in shared memory extern __shared__ size_type shared_partition_sizes[]; @@ -197,10 +197,10 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher, {block0 partition(num_partitions-1) offset, block1 partition(num_partitions -1) offset, ...} } */ -__global__ void compute_row_output_locations(size_type* __restrict__ row_partition_numbers, - size_type const num_rows, - size_type const num_partitions, - size_type* __restrict__ block_partition_offsets) +CUDF_KERNEL void compute_row_output_locations(size_type* __restrict__ row_partition_numbers, + size_type const num_rows, + size_type const num_partitions, + size_type* __restrict__ block_partition_offsets) { // Shared array that holds the offset of this blocks partitions in // global memory @@ -255,14 +255,14 @@ __global__ void compute_row_output_locations(size_type* __restrict__ row_partiti * @param[in] scanned_block_partition_sizes The scan of block_partition_sizes */ template -__global__ void copy_block_partitions(InputIter input_iter, - DataType* __restrict__ output_buf, - size_type const num_rows, - size_type const num_partitions, - size_type const* __restrict__ row_partition_numbers, - size_type const* __restrict__ row_partition_offset, - size_type const* __restrict__ block_partition_sizes, - size_type const* __restrict__ scanned_block_partition_sizes) +CUDF_KERNEL void copy_block_partitions(InputIter input_iter, + DataType* __restrict__ output_buf, + size_type const num_rows, + size_type const num_partitions, + size_type const* __restrict__ row_partition_numbers, + size_type const* __restrict__ row_partition_offset, + size_type const* __restrict__ block_partition_sizes, + size_type const* __restrict__ scanned_block_partition_sizes) { extern __shared__ char shared_memory[]; auto block_output = reinterpret_cast(shared_memory); diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 4764ac4d87a..c8ac19e01cc 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,13 +68,13 @@ struct make_centroid { // kernel for computing percentiles on input tdigest (mean, weight) centroid data. template -__global__ void compute_percentiles_kernel(device_span tdigest_offsets, - column_device_view percentiles, - CentroidIter centroids_, - double const* min_, - double const* max_, - double const* cumulative_weight_, - double* output) +CUDF_KERNEL void compute_percentiles_kernel(device_span tdigest_offsets, + column_device_view percentiles, + CentroidIter centroids_, + double const* min_, + double const* max_, + double const* cumulative_weight_, + double* output) { auto const tid = cudf::detail::grid_1d::global_thread_id(); diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index 450996a43d2..fc56d17d73b 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -370,15 +370,15 @@ std::unique_ptr to_tdigest_scalar(std::unique_ptr&& tdigest, */ template -__global__ void generate_cluster_limits_kernel(int delta, - size_type num_groups, - NearestWeightFunc nearest_weight, - GroupInfo group_info, - CumulativeWeight cumulative_weight, - double* group_cluster_wl, - size_type* group_num_clusters, - size_type const* group_cluster_offsets, - bool has_nulls) +CUDF_KERNEL void generate_cluster_limits_kernel(int delta, + size_type num_groups, + NearestWeightFunc nearest_weight, + GroupInfo group_info, + CumulativeWeight cumulative_weight, + double* group_cluster_wl, + size_type* group_num_clusters, + size_type const* group_cluster_offsets, + bool has_nulls) { int const tid = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 2eb624d3f05..bd3e75e2e80 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,12 +57,12 @@ namespace { // anonymous static constexpr int BLOCK_SIZE = 256; template -__global__ void replace_nulls_strings(cudf::column_device_view input, - cudf::column_device_view replacement, - cudf::bitmask_type* output_valid, - cudf::size_type* offsets, - char* chars, - cudf::size_type* valid_counter) +CUDF_KERNEL void replace_nulls_strings(cudf::column_device_view input, + cudf::column_device_view replacement, + cudf::bitmask_type* output_valid, + cudf::size_type* offsets, + char* chars, + cudf::size_type* valid_counter) { cudf::size_type nrows = input.size(); auto i = cudf::detail::grid_1d::global_thread_id(); @@ -112,10 +112,10 @@ __global__ void replace_nulls_strings(cudf::column_device_view input, } template -__global__ void replace_nulls(cudf::column_device_view input, - cudf::column_device_view replacement, - cudf::mutable_column_device_view output, - cudf::size_type* output_valid_count) +CUDF_KERNEL void replace_nulls(cudf::column_device_view input, + cudf::column_device_view replacement, + cudf::mutable_column_device_view output, + cudf::size_type* output_valid_count) { cudf::size_type nrows = input.size(); auto i = cudf::detail::grid_1d::global_thread_id(); diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 9341929de44..7cad2fb10d3 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -17,7 +17,7 @@ * limitations under the License. */ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -118,13 +118,13 @@ __device__ int get_new_string_value(cudf::size_type idx, * @param output_valid_count The output valid count */ template -__global__ void replace_strings_first_pass(cudf::column_device_view input, - cudf::column_device_view values_to_replace, - cudf::column_device_view replacement, - cudf::mutable_column_device_view offsets, - cudf::mutable_column_device_view indices, - cudf::bitmask_type* output_valid, - cudf::size_type* __restrict__ output_valid_count) +CUDF_KERNEL void replace_strings_first_pass(cudf::column_device_view input, + cudf::column_device_view values_to_replace, + cudf::column_device_view replacement, + cudf::mutable_column_device_view offsets, + cudf::mutable_column_device_view indices, + cudf::bitmask_type* output_valid, + cudf::size_type* __restrict__ output_valid_count) { cudf::size_type nrows = input.size(); auto tid = cudf::detail::grid_1d::global_thread_id(); @@ -184,11 +184,11 @@ __global__ void replace_strings_first_pass(cudf::column_device_view input, * @param indices Temporary column used to store the replacement indices. */ template -__global__ void replace_strings_second_pass(cudf::column_device_view input, - cudf::column_device_view replacement, - cudf::mutable_column_device_view offsets, - cudf::mutable_column_device_view strings, - cudf::mutable_column_device_view indices) +CUDF_KERNEL void replace_strings_second_pass(cudf::column_device_view input, + cudf::column_device_view replacement, + cudf::mutable_column_device_view offsets, + cudf::mutable_column_device_view strings, + cudf::mutable_column_device_view indices) { cudf::size_type nrows = input.size(); auto tid = cudf::detail::grid_1d::global_thread_id(); @@ -245,12 +245,12 @@ __global__ void replace_strings_second_pass(cudf::column_device_view input, * @param[in] replacement_valid Valid mask associated with d_replacement_values */ template -__global__ void replace_kernel(cudf::column_device_view input, - cudf::mutable_column_device_view output, - cudf::size_type* __restrict__ output_valid_count, - cudf::size_type nrows, - cudf::column_device_view values_to_replace, - cudf::column_device_view replacement) +CUDF_KERNEL void replace_kernel(cudf::column_device_view input, + cudf::mutable_column_device_view output, + cudf::size_type* __restrict__ output_valid_count, + cudf::size_type nrows, + cudf::column_device_view values_to_replace, + cudf::column_device_view replacement) { T* __restrict__ output_data = output.data(); diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index 0648ef3d30f..20845a97c7e 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1022,7 +1022,7 @@ template -__launch_bounds__(block_size) __global__ +__launch_bounds__(block_size) CUDF_KERNEL void gpu_rolling(column_device_view input, column_device_view default_outputs, mutable_column_device_view output, diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu index 06b224c39ad..2c753965c1c 100644 --- a/cpp/src/rolling/jit/kernel.cu +++ b/cpp/src/rolling/jit/kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,15 +41,15 @@ template -__global__ void gpu_rolling_new(cudf::size_type nrows, - InType const* const __restrict__ in_col, - cudf::bitmask_type const* const __restrict__ in_col_valid, - OutType* __restrict__ out_col, - cudf::bitmask_type* __restrict__ out_col_valid, - cudf::size_type* __restrict__ output_valid_count, - PrecedingWindowType preceding_window_begin, - FollowingWindowType following_window_begin, - cudf::size_type min_periods) +CUDF_KERNEL void gpu_rolling_new(cudf::size_type nrows, + InType const* const __restrict__ in_col, + cudf::bitmask_type const* const __restrict__ in_col_valid, + OutType* __restrict__ out_col, + cudf::bitmask_type* __restrict__ out_col_valid, + cudf::size_type* __restrict__ output_valid_count, + PrecedingWindowType preceding_window_begin, + FollowingWindowType following_window_begin, + cudf::size_type min_periods) { cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x; cudf::thread_index_type const stride = blockDim.x * gridDim.x; diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index 00e49f9d97e..2856c077fb2 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -110,8 +110,8 @@ std::unique_ptr counts_fn(strings_column_view const& strings, * @param d_strings Column with strings to count * @param d_lengths Results of the counts per string */ -__global__ void count_characters_parallel_fn(column_device_view const d_strings, - size_type* d_lengths) +CUDF_KERNEL void count_characters_parallel_fn(column_device_view const d_strings, + size_type* d_lengths) { auto const idx = cudf::detail::grid_1d::global_thread_id(); using warp_reduce = cub::WarpReduce; diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 511acc38d75..b16eb318b39 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -197,8 +197,8 @@ __forceinline__ __device__ char escaped_sequence_to_byte(char const* const ptr) * @param[out] out_counts Number of characters in each decode URL. */ template -__global__ void url_decode_char_counter(column_device_view const in_strings, - size_type* const out_counts) +CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings, + size_type* const out_counts) { constexpr int halo_size = 2; __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size]; @@ -280,9 +280,9 @@ __global__ void url_decode_char_counter(column_device_view const in_strings, * @param[in] out_offsets Offset value of each string associated with `out_chars`. */ template -__global__ void url_decode_char_replacer(column_device_view const in_strings, - char* const out_chars, - size_type const* const out_offsets) +CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings, + char* const out_chars, + size_type const* const out_offsets) { constexpr int halo_size = 2; __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size * 2]; diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 2d9b06183e2..8cabd0dc75f 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -112,7 +112,7 @@ auto create_strings_device_views(host_span views, rmm::cuda_s } template -__global__ void fused_concatenate_string_offset_kernel( +CUDF_KERNEL void fused_concatenate_string_offset_kernel( column_device_view const* input_views, size_t const* input_offsets, size_t const* partition_offsets, @@ -171,11 +171,11 @@ __global__ void fused_concatenate_string_offset_kernel( } } -__global__ void fused_concatenate_string_chars_kernel(column_device_view const* input_views, - size_t const* partition_offsets, - size_type const num_input_views, - size_type const output_size, - char* output_data) +CUDF_KERNEL void fused_concatenate_string_chars_kernel(column_device_view const* input_views, + size_t const* partition_offsets, + size_type const num_input_views, + size_type const output_size, + char* output_data) { cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/cpp/src/strings/regex/utilities.cuh b/cpp/src/strings/regex/utilities.cuh index 23b53062bf3..bc8f5d68a4b 100644 --- a/cpp/src/strings/regex/utilities.cuh +++ b/cpp/src/strings/regex/utilities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,7 +37,7 @@ namespace detail { constexpr auto regex_launch_kernel_block_size = 256; template -__global__ void for_each_kernel(ForEachFunction fn, reprog_device const d_prog, size_type size) +CUDF_KERNEL void for_each_kernel(ForEachFunction fn, reprog_device const d_prog, size_type size) { extern __shared__ u_char shmem[]; if (threadIdx.x == 0) { d_prog.store(shmem); } @@ -71,10 +71,10 @@ void launch_for_each_kernel(ForEachFunction fn, } template -__global__ void transform_kernel(TransformFunction fn, - reprog_device const d_prog, - OutputType* d_output, - size_type size) +CUDF_KERNEL void transform_kernel(TransformFunction fn, + reprog_device const d_prog, + OutputType* d_output, + size_type size) { extern __shared__ u_char shmem[]; if (threadIdx.x == 0) { d_prog.store(shmem); } diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 4ba1359c469..78343d58626 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -115,11 +115,11 @@ struct empty_target_fn { * @brief String per warp function for find/rfind */ template -__global__ void finder_warp_parallel_fn(column_device_view const d_strings, - TargetIterator const d_targets, - size_type const start, - size_type const stop, - size_type* d_results) +CUDF_KERNEL void finder_warp_parallel_fn(column_device_view const d_strings, + TargetIterator const d_targets, + size_type const start, + size_type const stop, + size_type* d_results) { size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); @@ -346,9 +346,9 @@ namespace { * @param d_target String to search for in each row of `d_strings` * @param d_results Indicates which rows contain `d_target` */ -__global__ void contains_warp_parallel_fn(column_device_view const d_strings, - string_view const d_target, - bool* d_results) +CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, + string_view const d_target, + bool* d_results) { size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); using warp_reduce = cub::WarpReduce; diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu index a697df913d3..1f125636208 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cu +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -122,11 +122,11 @@ struct bpe_unpairable_offsets_fn { * @param d_rerank_data Working memory to hold locations where reranking is required */ template -__global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, - MapRefType const d_map, - int8_t* d_spaces_data, // working memory - cudf::size_type* d_ranks_data, // more working memory - int8_t* d_rerank_data // and one more working memory +CUDF_KERNEL void bpe_parallel_fn(cudf::column_device_view const d_strings, + MapRefType const d_map, + int8_t* d_spaces_data, // working memory + cudf::size_type* d_ranks_data, // more working memory + int8_t* d_rerank_data // and one more working memory ) { // string per block @@ -291,9 +291,9 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, * @param d_spaces_data Output the location where separator will be inserted * @param d_sizes Output sizes of each row */ -__global__ void bpe_finalize(cudf::column_device_view const d_strings, - int8_t* d_spaces_data, // where separators are inserted - cudf::size_type* d_sizes // output sizes of encoded strings +CUDF_KERNEL void bpe_finalize(cudf::column_device_view const d_strings, + int8_t* d_spaces_data, // where separators are inserted + cudf::size_type* d_sizes // output sizes of encoded strings ) { // string per block diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 4e0a538ffe9..dcb59166cec 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,10 +62,10 @@ template < typename HashFunction, typename hash_value_type = std:: conditional_t, uint32_t, uint64_t>> -__global__ void minhash_kernel(cudf::column_device_view const d_strings, - cudf::device_span seeds, - cudf::size_type width, - hash_value_type* d_hashes) +CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, + cudf::device_span seeds, + cudf::size_type width, + hash_value_type* d_hashes) { auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); if (idx >= (static_cast(d_strings.size()) * diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 34eb95bea5c..c83bc2e318f 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -204,13 +204,13 @@ extract_code_points_from_utf8(unsigned char const* strings, * @param[out] code_points The resulting code point values from normalization. * @param[out] chars_per_thread Output number of code point values per string. */ -__global__ void kernel_data_normalizer(unsigned char const* strings, - size_t const total_bytes, - uint32_t const* cp_metadata, - uint64_t const* aux_table, - bool const do_lower_case, - uint32_t* code_points, - uint32_t* chars_per_thread) +CUDF_KERNEL void kernel_data_normalizer(unsigned char const* strings, + size_t const total_bytes, + uint32_t const* cp_metadata, + uint64_t const* aux_table, + bool const do_lower_case, + uint32_t* code_points, + uint32_t* chars_per_thread) { constexpr uint32_t init_val = (1 << FILTER_BIT); uint32_t replacement_code_points[MAX_NEW_CHARS] = {init_val, init_val, init_val}; diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index a35d69e2145..c9592e5cc48 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -56,7 +56,7 @@ namespace { * @param[out] attn_mask Identifies valid token id entries * @param[out] metadata Additional data per row */ -__global__ void kernel_compute_tensor_metadata( +CUDF_KERNEL void kernel_compute_tensor_metadata( // input uint32_t const* token_ids, cudf::size_type const* offsets, diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index 3b912017320..d2804af5f8b 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -75,12 +75,12 @@ namespace { * @param[out] tokens_per_word An array of size `num_code_points` which hold the number of * tokens. This kernel just sets all the values to 0. */ -__global__ void init_data_and_mark_word_start_and_ends(uint32_t const* code_points, - uint32_t* start_word_indices, - uint32_t* end_word_indices, - size_t num_code_points, - uint32_t* token_ids, - uint8_t* tokens_per_word) +CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_points, + uint32_t* start_word_indices, + uint32_t* end_word_indices, + size_t num_code_points, + uint32_t* token_ids, + uint8_t* tokens_per_word) { cudf::thread_index_type char_for_thread = static_cast(blockDim.x) * static_cast(blockIdx.x) + @@ -131,11 +131,11 @@ __global__ void init_data_and_mark_word_start_and_ends(uint32_t const* code_poin * written to indicate this. * @param num_strings The total number of strings to be processed. */ -__global__ void mark_string_start_and_ends(uint32_t const* code_points, - cudf::size_type const* strings_offsets, - uint32_t* start_word_indices, - uint32_t* end_word_indices, - uint32_t num_strings) +CUDF_KERNEL void mark_string_start_and_ends(uint32_t const* code_points, + cudf::size_type const* strings_offsets, + uint32_t* start_word_indices, + uint32_t* end_word_indices, + uint32_t num_strings) { cudf::thread_index_type idx = static_cast(blockDim.x) * static_cast(blockIdx.x) + @@ -319,20 +319,20 @@ struct mark_special_tokens { * @param outer_hash_b_param: The b parameter for the outer hash * @param num_outer_bins: The number of bins for the outer hash */ -__global__ void kernel_wordpiece_tokenizer(uint32_t const* code_points, - uint64_t const* hash_table, - uint64_t const* bin_coefficients, - uint16_t const* bin_offsets, - uint16_t unk_token_id, - uint32_t outer_hash_a_param, - uint32_t outer_hash_b_param, - uint16_t num_outer_bins, - uint32_t const* word_starts, - uint32_t const* word_ends, - uint32_t max_word_length, - uint32_t total_words, - uint32_t* token_ids, - uint8_t* tokens_per_word) +CUDF_KERNEL void kernel_wordpiece_tokenizer(uint32_t const* code_points, + uint64_t const* hash_table, + uint64_t const* bin_coefficients, + uint16_t const* bin_offsets, + uint16_t unk_token_id, + uint32_t outer_hash_a_param, + uint32_t outer_hash_b_param, + uint16_t num_outer_bins, + uint32_t const* word_starts, + uint32_t const* word_ends, + uint32_t max_word_length, + uint32_t total_words, + uint32_t* token_ids, + uint8_t* tokens_per_word) { cudf::thread_index_type word_to_tokenize = static_cast(blockDim.x) * static_cast(blockIdx.x) + diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index 91f4c304590..a9e8d4d9a24 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -214,10 +214,10 @@ struct mark_delimiters_fn { } }; -__global__ void token_counts_fn(cudf::column_device_view const d_strings, - cudf::string_view const d_delimiter, - cudf::size_type* d_counts, - int8_t* d_results) +CUDF_KERNEL void token_counts_fn(cudf::column_device_view const d_strings, + cudf::string_view const d_delimiter, + cudf::size_type* d_counts, + int8_t* d_results) { // string per warp auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu index 224dd93b048..eaf47adec10 100644 --- a/cpp/src/transform/compute_column.cu +++ b/cpp/src/transform/compute_column.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ namespace detail { * @param output_column The destination for the results of evaluating the expression. */ template -__launch_bounds__(max_block_size) __global__ +__launch_bounds__(max_block_size) CUDF_KERNEL void compute_column_kernel(table_device_view const table, ast::detail::expression_device_view device_expression_data, mutable_column_device_view output_column) diff --git a/cpp/src/transform/jit/kernel.cu b/cpp/src/transform/jit/kernel.cu index 0170cc50c6f..1e913ecb5bb 100644 --- a/cpp/src/transform/jit/kernel.cu +++ b/cpp/src/transform/jit/kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,7 +35,7 @@ namespace transformation { namespace jit { template -__global__ void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data) +CUDF_KERNEL void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data) { // cannot use global_thread_id utility due to a JIT build issue by including // the `cudf/detail/utilities/cuda.cuh` header diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index b151b44565d..a91dc8fbbc6 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -398,10 +398,10 @@ __device__ size_type row_size_functor::operator()(column_device_vie * @param output Output span of size (# rows) where per-row bit sizes are stored * @param max_branch_depth Maximum depth of the span stack needed per-thread */ -__global__ void compute_row_sizes(device_span cols, - device_span info, - device_span output, - size_type max_branch_depth) +CUDF_KERNEL void compute_row_sizes(device_span cols, + device_span info, + device_span output, + size_type max_branch_depth) { extern __shared__ row_span thread_branch_stacks[]; int const tid = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/cpp/src/transform/row_conversion.cu b/cpp/src/transform/row_conversion.cu index b797e495480..ef12fbeae52 100644 --- a/cpp/src/transform/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -314,14 +314,14 @@ struct fixed_width_row_offset_functor { * @param output_nm array of pointers to the output null masks * @param input_data pointing to the incoming row data */ -__global__ void copy_from_rows_fixed_width_optimized(const size_type num_rows, - const size_type num_columns, - const size_type row_size, - const size_type* input_offset_in_row, - const size_type* num_bytes, - int8_t** output_data, - bitmask_type** output_nm, - const int8_t* input_data) +CUDF_KERNEL void copy_from_rows_fixed_width_optimized(const size_type num_rows, + const size_type num_columns, + const size_type row_size, + const size_type* input_offset_in_row, + const size_type* num_bytes, + int8_t** output_data, + bitmask_type** output_nm, + const int8_t* input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -433,15 +433,15 @@ __global__ void copy_from_rows_fixed_width_optimized(const size_type num_rows, } } -__global__ void copy_to_rows_fixed_width_optimized(const size_type start_row, - const size_type num_rows, - const size_type num_columns, - const size_type row_size, - const size_type* output_offset_in_row, - const size_type* num_bytes, - const int8_t** input_data, - const bitmask_type** input_nm, - int8_t* output_data) +CUDF_KERNEL void copy_to_rows_fixed_width_optimized(const size_type start_row, + const size_type num_rows, + const size_type num_columns, + const size_type row_size, + const size_type* output_offset_in_row, + const size_type* num_bytes, + const int8_t** input_data, + const bitmask_type** input_nm, + int8_t* output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -588,16 +588,16 @@ __global__ void copy_to_rows_fixed_width_optimized(const size_type start_row, * */ template -__global__ void copy_to_rows(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_tile, - device_span tile_infos, - const int8_t** input_data, - const size_type* col_sizes, - const size_type* col_offsets, - RowOffsetFunctor row_offsets, - size_type const* batch_row_boundaries, - int8_t** output_data) +CUDF_KERNEL void copy_to_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + device_span tile_infos, + const int8_t** input_data, + const size_type* col_sizes, + const size_type* col_offsets, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -731,15 +731,15 @@ __global__ void copy_to_rows(const size_type num_rows, * */ template -__global__ void copy_validity_to_rows(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_tile, - RowOffsetFunctor row_offsets, - size_type const* batch_row_boundaries, - int8_t** output_data, - const size_type validity_offset, - device_span tile_infos, - const bitmask_type** input_nm) +CUDF_KERNEL void copy_validity_to_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data, + const size_type validity_offset, + device_span tile_infos, + const bitmask_type** input_nm) { extern __shared__ int8_t shared_data[]; @@ -851,15 +851,15 @@ __global__ void copy_validity_to_rows(const size_type num_rows, * */ template -__global__ void copy_strings_to_rows(size_type const num_rows, - size_type const num_variable_columns, - int8_t const** variable_input_data, - size_type const* variable_col_output_offsets, - size_type const** variable_col_offsets, - size_type fixed_width_row_size, - RowOffsetFunctor row_offsets, - size_type const batch_row_offset, - int8_t* output_data) +CUDF_KERNEL void copy_strings_to_rows(size_type const num_rows, + size_type const num_variable_columns, + int8_t const** variable_input_data, + size_type const* variable_col_output_offsets, + size_type const** variable_col_offsets, + size_type fixed_width_row_size, + RowOffsetFunctor row_offsets, + size_type const batch_row_offset, + int8_t* output_data) { // Each block will take a group of rows controlled by NUM_STRING_ROWS_PER_BLOCK_TO_ROWS. Each warp // will copy a row at a time. The base thread will first go through column data and fill out @@ -920,16 +920,16 @@ __global__ void copy_strings_to_rows(size_type const num_rows, * */ template -__global__ void copy_from_rows(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_tile, - RowOffsetFunctor row_offsets, - size_type const* batch_row_boundaries, - int8_t** output_data, - const size_type* col_sizes, - const size_type* col_offsets, - device_span tile_infos, - const int8_t* input_data) +CUDF_KERNEL void copy_from_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data, + const size_type* col_sizes, + const size_type* col_offsets, + device_span tile_infos, + const int8_t* input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -1042,15 +1042,15 @@ __global__ void copy_from_rows(const size_type num_rows, * */ template -__global__ void copy_validity_from_rows(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_tile, - RowOffsetFunctor row_offsets, - size_type const* batch_row_boundaries, - bitmask_type** output_nm, - const size_type validity_offset, - device_span tile_infos, - const int8_t* input_data) +CUDF_KERNEL void copy_validity_from_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + bitmask_type** output_nm, + const size_type validity_offset, + device_span tile_infos, + const int8_t* input_data) { extern __shared__ int8_t shared[]; @@ -1175,14 +1175,14 @@ __global__ void copy_validity_from_rows(const size_type num_rows, * @param num_string_columns number of string columns in the table */ template -__global__ void copy_strings_from_rows(RowOffsetFunctor row_offsets, - int32_t** string_row_offsets, - int32_t** string_lengths, - size_type** string_column_offsets, - char** string_col_data, - int8_t const* row_data, - size_type const num_rows, - size_type const num_string_columns) +CUDF_KERNEL void copy_strings_from_rows(RowOffsetFunctor row_offsets, + int32_t** string_row_offsets, + int32_t** string_lengths, + size_type** string_column_offsets, + char** string_col_data, + int8_t const* row_data, + size_type const num_rows, + size_type const num_string_columns) { // Each warp takes a tile, which is a single column and up to ROWS_PER_BLOCK rows. A tile will not // wrap around the bottom of the table. The warp will copy the strings for each row in the tile. diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index f0c69ea6bfb..6e90d4462df 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -31,7 +31,7 @@ #include template -__global__ void gpu_atomic_test(T* result, T* data, size_t size) +CUDF_KERNEL void gpu_atomic_test(T* result, T* data, size_t size) { size_t id = blockIdx.x * blockDim.x + threadIdx.x; size_t step = blockDim.x * gridDim.x; @@ -79,7 +79,7 @@ __device__ T atomic_op(T* addr, T const& value, BinaryOp op) } template -__global__ void gpu_atomicCAS_test(T* result, T* data, size_t size) +CUDF_KERNEL void gpu_atomicCAS_test(T* result, T* data, size_t size) { size_t id = blockIdx.x * blockDim.x + threadIdx.x; size_t step = blockDim.x * gridDim.x; diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index 6bb1afda2a8..5cb2d729f3d 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -40,7 +40,7 @@ TEST(StreamCheck, success) { EXPECT_NO_THROW(CUDF_CHECK_CUDA(0)); } namespace { // Some silly kernel that will cause an error -void __global__ test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; } +CUDF_KERNEL void test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; } } // namespace // In a release build and without explicit synchronization, CUDF_CHECK_CUDA may @@ -70,7 +70,7 @@ TEST(StreamCheck, CatchFailedKernel) EXPECT_THROW(CUDF_CHECK_CUDA(stream.value()), cudf::cuda_error); } -__global__ void kernel() { asm("trap;"); } +CUDF_KERNEL void kernel() { asm("trap;"); } TEST(DeathTest, CudaFatalError) { @@ -88,9 +88,9 @@ TEST(DeathTest, CudaFatalError) #ifndef NDEBUG -__global__ void assert_false_kernel() { cudf_assert(false && "this kernel should die"); } +CUDF_KERNEL void assert_false_kernel() { cudf_assert(false && "this kernel should die"); } -__global__ void assert_true_kernel() { cudf_assert(true && "this kernel should live"); } +CUDF_KERNEL void assert_true_kernel() { cudf_assert(true && "this kernel should live"); } TEST(DebugAssertDeathTest, cudf_assert_false) { diff --git a/cpp/tests/identify_stream_usage/test_default_stream_identification.cu b/cpp/tests/identify_stream_usage/test_default_stream_identification.cu index 28bb47af40d..268c7b37c81 100644 --- a/cpp/tests/identify_stream_usage/test_default_stream_identification.cu +++ b/cpp/tests/identify_stream_usage/test_default_stream_identification.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #include -__global__ void kernel() { printf("The kernel ran!\n"); } +__global__ static void kernel() { printf("The kernel ran!\n"); } void test_cudaLaunchKernel() { diff --git a/cpp/tests/scalar/scalar_device_view_test.cu b/cpp/tests/scalar/scalar_device_view_test.cu index 8d0e54f024f..8444716bccd 100644 --- a/cpp/tests/scalar/scalar_device_view_test.cu +++ b/cpp/tests/scalar/scalar_device_view_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,14 +35,14 @@ struct TypedScalarDeviceViewTest : public cudf::test::BaseFixture {}; TYPED_TEST_SUITE(TypedScalarDeviceViewTest, cudf::test::FixedWidthTypesWithoutFixedPoint); template -__global__ void test_set_value(ScalarDeviceViewType s, ScalarDeviceViewType s1) +CUDF_KERNEL void test_set_value(ScalarDeviceViewType s, ScalarDeviceViewType s1) { s1.set_value(s.value()); s1.set_valid(true); } template -__global__ void test_value(ScalarDeviceViewType s, ScalarDeviceViewType s1, bool* result) +CUDF_KERNEL void test_value(ScalarDeviceViewType s, ScalarDeviceViewType s1, bool* result) { *result = (s.value() == s1.value()); } @@ -73,7 +73,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, Value) } template -__global__ void test_null(ScalarDeviceViewType s, bool* result) +CUDF_KERNEL void test_null(ScalarDeviceViewType s, bool* result) { *result = s.is_valid(); } @@ -92,7 +92,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, ConstructNull) } template -__global__ void test_setnull(ScalarDeviceViewType s) +CUDF_KERNEL void test_setnull(ScalarDeviceViewType s) { s.set_valid(false); } @@ -113,10 +113,10 @@ TYPED_TEST(TypedScalarDeviceViewTest, SetNull) struct StringScalarDeviceViewTest : public cudf::test::BaseFixture {}; -__global__ void test_string_value(cudf::string_scalar_device_view s, - char const* value, - cudf::size_type size, - bool* result) +CUDF_KERNEL void test_string_value(cudf::string_scalar_device_view s, + char const* value, + cudf::size_type size, + bool* result) { *result = (s.value() == cudf::string_view(value, size)); } diff --git a/cpp/tests/streams/pool_test.cu b/cpp/tests/streams/pool_test.cu index 0f92e1c0c2b..52debe24fe8 100644 --- a/cpp/tests/streams/pool_test.cu +++ b/cpp/tests/streams/pool_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,7 +22,7 @@ class StreamPoolTest : public cudf::test::BaseFixture {}; -__global__ void do_nothing_kernel() {} +CUDF_KERNEL void do_nothing_kernel() {} TEST_F(StreamPoolTest, ForkStreams) { diff --git a/cpp/tests/types/type_dispatcher_test.cu b/cpp/tests/types/type_dispatcher_test.cu index d7df8f03ec1..0b26330d323 100644 --- a/cpp/tests/types/type_dispatcher_test.cu +++ b/cpp/tests/types/type_dispatcher_test.cu @@ -59,7 +59,7 @@ struct verify_dispatched_type { } }; -__global__ void dispatch_test_kernel(cudf::type_id id, bool* d_result) +CUDF_KERNEL void dispatch_test_kernel(cudf::type_id id, bool* d_result) { if (0 == threadIdx.x + blockIdx.x * blockDim.x) *d_result = cudf::type_dispatcher(cudf::data_type{id}, verify_dispatched_type{}, id); @@ -119,7 +119,7 @@ struct verify_double_dispatched_type { } }; -__global__ void double_dispatch_test_kernel(cudf::type_id id1, cudf::type_id id2, bool* d_result) +CUDF_KERNEL void double_dispatch_test_kernel(cudf::type_id id1, cudf::type_id id2, bool* d_result) { if (0 == threadIdx.x + blockIdx.x * blockDim.x) *d_result = cudf::double_type_dispatcher( diff --git a/cpp/tests/utilities_tests/span_tests.cu b/cpp/tests/utilities_tests/span_tests.cu index 870528d306c..2075c67a18a 100644 --- a/cpp/tests/utilities_tests/span_tests.cu +++ b/cpp/tests/utilities_tests/span_tests.cu @@ -247,7 +247,7 @@ TEST(SpanTest, CanConstructFromDeviceContainers) (void)device_span(d_uvector_c); } -__global__ void simple_device_kernel(device_span result) { result[0] = true; } +CUDF_KERNEL void simple_device_kernel(device_span result) { result[0] = true; } TEST(SpanTest, CanUseDeviceSpan) { @@ -277,7 +277,7 @@ TEST(MdSpanTest, CanDetermineEmptiness) EXPECT_TRUE(device_2dspan{no_columns_vector}.is_empty()); } -__global__ void readwrite_kernel(device_2dspan result) +CUDF_KERNEL void readwrite_kernel(device_2dspan result) { if (result[5][6] == 5) { result[5][6] *= 6; @@ -436,7 +436,7 @@ TEST(HostDeviceSpanTest, CanSendToDevice) EXPECT_EQ(std::string(d_message), hello_world_message); } -__global__ void simple_device_char_kernel(device_span result) +CUDF_KERNEL void simple_device_char_kernel(device_span result) { char const* str = "world hello"; for (int offset = 0; offset < result.size(); ++offset) { diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index bfe5f5007fe..bd57db6b620 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import os from typing import Any, Callable, Dict @@ -17,10 +17,7 @@ import rmm -from cudf._lib.strings_udf import ( - column_from_udf_string_array, - column_to_string_view_array, -) +from cudf._lib import strings_udf from cudf.api.types import is_scalar from cudf.core.column.column import as_column from cudf.core.dtypes import dtype @@ -63,7 +60,10 @@ precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) launch_arg_getters: Dict[Any, Any] = {} -_PTX_FILE = _get_ptx_file(os.path.dirname(__file__), "shim_") +_PTX_FILE = _get_ptx_file( + os.path.join(os.path.dirname(strings_udf.__file__), "..", "core", "udf"), + "shim_", +) @_cudf_nvtx_annotate @@ -319,7 +319,7 @@ def _return_arr_from_dtype(dtype, size): def _post_process_output_col(col, retty): if retty == _cudf_str_dtype: - return column_from_udf_string_array(col) + return strings_udf.column_from_udf_string_array(col) return as_column(col, retty) @@ -361,7 +361,7 @@ def set_malloc_heap_size(size=None): def column_to_string_view_array_init_heap(col): # lazily allocate heap only when a string needs to be returned - return column_to_string_view_array(col) + return strings_udf.column_to_string_view_array(col) class UDFError(RuntimeError): diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py index b5fb841873c..0257a068324 100644 --- a/python/cudf/cudf/utils/_numba.py +++ b/python/cudf/cudf/utils/_numba.py @@ -4,14 +4,27 @@ import os import sys +from functools import lru_cache + from numba import config as numba_config from pynvjitlink.patch import ( patch_numba_linker as patch_numba_linker_pynvjitlink, ) -CC_60_PTX_FILE = os.path.join( - os.path.dirname(__file__), "../core/udf/shim_60.ptx" -) +# Use an lru_cache with a single value to allow a delayed import of +# strings_udf. This is the easiest way to break an otherwise circular import +# loop of _lib.*->cudautils->_numba->_lib.strings_udf +@lru_cache +def _get_cc_60_ptx_file(): + from cudf._lib import strings_udf + + return os.path.join( + os.path.dirname(strings_udf.__file__), + "..", + "core", + "udf", + "shim_60.ptx", + ) def _get_best_ptx_file(archs, max_compute_capability): @@ -108,7 +121,9 @@ def _setup_numba(): versions = safe_get_versions() if versions != NO_DRIVER: driver_version, runtime_version = versions - ptx_toolkit_version = _get_cuda_version_from_ptx_file(CC_60_PTX_FILE) + ptx_toolkit_version = _get_cuda_version_from_ptx_file( + _get_cc_60_ptx_file() + ) # MVC is required whenever any PTX is newer than the driver # This could be the shipped PTX file or the PTX emitted by diff --git a/python/cudf/udf_cpp/CMakeLists.txt b/python/cudf/udf_cpp/CMakeLists.txt index 7d6dc84b322..57b52559f00 100644 --- a/python/cudf/udf_cpp/CMakeLists.txt +++ b/python/cudf/udf_cpp/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -55,30 +55,6 @@ target_compile_options( target_link_libraries(cudf_strings_udf PUBLIC cudf::cudf) install(TARGETS cudf_strings_udf DESTINATION ./cudf/_lib/) -# This function will copy the generated PTX file from its generator-specific location in the build -# tree into a specified location in the build tree from which we can install it. -function(copy_ptx_to_location target destination new_name) - set(cmake_generated_file - "${CMAKE_CURRENT_BINARY_DIR}/cmake/cp_${target}_$>_ptx.cmake" - ) - file( - GENERATE - OUTPUT "${cmake_generated_file}" - CONTENT - " -set(ptx_path \"$\") -file(MAKE_DIRECTORY \"${destination}\") -file(COPY_FILE \${ptx_path} \"${destination}/${new_name}\")" - ) - - add_custom_target( - ${target}_cp_ptx ALL - COMMAND ${CMAKE_COMMAND} -P "${cmake_generated_file}" - DEPENDS $ - COMMENT "Copying PTX files to '${destination}'" - ) -endfunction() - # Create the shim library for each architecture. set(SHIM_CUDA_FLAGS --expt-relaxed-constexpr -rdc=true) @@ -104,10 +80,9 @@ foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) target_compile_options(${tgt} PRIVATE "$<$:${SHIM_CUDA_FLAGS}>") target_link_libraries(${tgt} PUBLIC cudf::cudf) - copy_ptx_to_location(${tgt} "${CMAKE_CURRENT_BINARY_DIR}/../udf" ${tgt}.ptx) install( FILES $ - DESTINATION ./cudf/core/udf/ + DESTINATION cudf/core/udf/ RENAME ${tgt}.ptx ) endforeach()