Skip to content

Commit

Permalink
Ensure that all CUDA kernels in cudf have hidden visibility. (#14726)
Browse files Browse the repository at this point in the history
To correct potential issues when using a static cuda runtime, we mark all kernels with internal linkage via the `static` keyword or `hidden` visibility.

Note: This doesn't fix dependencies, but focuses just on the CUDA kernels in cudf directly.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: #14726
  • Loading branch information
robertmaynard authored Jan 17, 2024
1 parent c7acdaa commit 6abef4a
Show file tree
Hide file tree
Showing 85 changed files with 627 additions and 599 deletions.
30 changes: 15 additions & 15 deletions cpp/benchmarks/join/generate_input_tables.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -31,19 +31,19 @@

#include <cassert>

__global__ static void init_curand(curandState* state, int const nstates)
CUDF_KERNEL void init_curand(curandState* state, int const nstates)
{
int ithread = threadIdx.x + blockIdx.x * blockDim.x;

if (ithread < nstates) { curand_init(1234ULL, ithread, 0, state + ithread); }
}

template <typename key_type, typename size_type>
__global__ static void init_build_tbl(key_type* const build_tbl,
size_type const build_tbl_size,
int const multiplicity,
curandState* state,
int const num_states)
CUDF_KERNEL void init_build_tbl(key_type* const build_tbl,
size_type const build_tbl_size,
int const multiplicity,
curandState* state,
int const num_states)
{
auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x;
auto const stride = blockDim.x * gridDim.x;
Expand All @@ -61,14 +61,14 @@ __global__ static void init_build_tbl(key_type* const build_tbl,
}

template <typename key_type, typename size_type>
__global__ void init_probe_tbl(key_type* const probe_tbl,
size_type const probe_tbl_size,
size_type const build_tbl_size,
key_type const rand_max,
double const selectivity,
int const multiplicity,
curandState* state,
int const num_states)
CUDF_KERNEL void init_probe_tbl(key_type* const probe_tbl,
size_type const probe_tbl_size,
size_type const build_tbl_size,
key_type const rand_max,
double const selectivity,
int const multiplicity,
curandState* state,
int const num_states)
{
auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x;
auto const stride = blockDim.x * gridDim.x;
Expand Down
8 changes: 4 additions & 4 deletions cpp/benchmarks/type_dispatcher/type_dispatcher.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -58,7 +58,7 @@ constexpr int block_size = 256;

// This is for NO_DISPATCHING
template <FunctorType functor_type, class T>
__global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_type n_cols)
CUDF_KERNEL void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_type n_cols)
{
using F = Functor<T, functor_type>;
cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x;
Expand All @@ -72,7 +72,7 @@ __global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_

// This is for HOST_DISPATCHING
template <FunctorType functor_type, class T>
__global__ void host_dispatching_kernel(cudf::mutable_column_device_view source_column)
CUDF_KERNEL void host_dispatching_kernel(cudf::mutable_column_device_view source_column)
{
using F = Functor<T, functor_type>;
T* A = source_column.data<T>();
Expand Down Expand Up @@ -124,7 +124,7 @@ struct RowHandle {

// This is for DEVICE_DISPATCHING
template <FunctorType functor_type>
__global__ void device_dispatching_kernel(cudf::mutable_table_device_view source)
CUDF_KERNEL void device_dispatching_kernel(cudf::mutable_table_device_view source)
{
cudf::size_type const n_rows = source.num_rows();
cudf::size_type index = threadIdx.x + blockIdx.x * blockDim.x;
Expand Down
16 changes: 8 additions & 8 deletions cpp/examples/strings/custom_optimized.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -37,9 +37,9 @@
* @param d_visibilities Column of visibilities
* @param d_sizes Output sizes for each row
*/
__global__ void sizes_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::size_type* d_sizes)
__global__ static void sizes_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::size_type* d_sizes)
{
// The row index is resolved from the CUDA thread/block objects
auto index = threadIdx.x + blockIdx.x * blockDim.x;
Expand Down Expand Up @@ -74,10 +74,10 @@ __global__ void sizes_kernel(cudf::column_device_view const d_names,
* @param d_offsets Byte offset in `d_chars` for each row
* @param d_chars Output memory for all rows
*/
__global__ void redact_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::size_type const* d_offsets,
char* d_chars)
__global__ static void redact_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::size_type const* d_offsets,
char* d_chars)
{
// The row index is resolved from the CUDA thread/block objects
auto index = threadIdx.x + blockIdx.x * blockDim.x;
Expand Down
12 changes: 6 additions & 6 deletions cpp/examples/strings/custom_prealloc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@
* @param d_offsets Byte offset in `d_chars` for each row
* @param d_output Output array of string_view objects
*/
__global__ void redact_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::string_view redaction,
char* working_memory,
cudf::size_type const* d_offsets,
cudf::string_view* d_output)
__global__ static void redact_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::string_view redaction,
char* working_memory,
cudf::size_type const* d_offsets,
cudf::string_view* d_output)
{
// The row index is resolved from the CUDA thread/block objects
auto index = threadIdx.x + blockIdx.x * blockDim.x;
Expand Down
14 changes: 8 additions & 6 deletions cpp/examples/strings/custom_with_malloc.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -64,10 +64,10 @@ void set_malloc_heap_size(size_t heap_size = 1073741824) // 1GB
* @param redaction Redacted string replacement
* @param d_output Output array of string_view objects
*/
__global__ void redact_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::string_view redaction,
cudf::string_view* d_output)
__global__ static void redact_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::string_view redaction,
cudf::string_view* d_output)
{
// The row index is resolved from the CUDA thread/block objects
auto index = threadIdx.x + blockIdx.x * blockDim.x;
Expand Down Expand Up @@ -107,7 +107,9 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
* @param redaction Redacted string replacement (not to be freed)
* @param d_output Output array of string_view objects to free
*/
__global__ void free_kernel(cudf::string_view redaction, cudf::string_view* d_output, int count)
__global__ static void free_kernel(cudf::string_view redaction,
cudf::string_view* d_output,
int count)
{
auto index = threadIdx.x + blockIdx.x * blockDim.x;
if (index >= count) return;
Expand Down
12 changes: 6 additions & 6 deletions cpp/include/cudf/detail/copy_if.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -52,10 +52,10 @@ namespace detail {

// Compute the count of elements that pass the mask within each block
template <typename Filter, int block_size>
__global__ void compute_block_counts(cudf::size_type* __restrict__ block_counts,
cudf::size_type size,
cudf::size_type per_thread,
Filter filter)
CUDF_KERNEL void compute_block_counts(cudf::size_type* __restrict__ block_counts,
cudf::size_type size,
cudf::size_type per_thread,
Filter filter)
{
int tid = threadIdx.x + per_thread * block_size * blockIdx.x;
int count = 0;
Expand Down Expand Up @@ -96,7 +96,7 @@ __device__ cudf::size_type block_scan_mask(bool mask_true, cudf::size_type& bloc
//
// Note: `filter` is not run on indices larger than the input column size
template <typename T, typename Filter, int block_size, bool has_validity>
__launch_bounds__(block_size) __global__
__launch_bounds__(block_size) CUDF_KERNEL
void scatter_kernel(cudf::mutable_column_device_view output_view,
cudf::size_type* output_null_count,
cudf::column_device_view input_view,
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/copy_if_else.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -37,7 +37,7 @@ template <size_type block_size,
typename RightIter,
typename Filter,
bool has_nulls>
__launch_bounds__(block_size) __global__
__launch_bounds__(block_size) CUDF_KERNEL
void copy_if_else_kernel(LeftIter lhs,
RightIter rhs,
Filter filter,
Expand Down
14 changes: 7 additions & 7 deletions cpp/include/cudf/detail/copy_range.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -40,12 +40,12 @@ template <cudf::size_type block_size,
typename SourceValidityIterator,
typename T,
bool has_validity>
__global__ void copy_range_kernel(SourceValueIterator source_value_begin,
SourceValidityIterator source_validity_begin,
cudf::mutable_column_device_view target,
cudf::size_type target_begin,
cudf::size_type target_end,
cudf::size_type* __restrict__ const null_count)
CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin,
SourceValidityIterator source_validity_begin,
cudf::mutable_column_device_view target,
cudf::size_type target_begin,
cudf::size_type target_end,
cudf::size_type* __restrict__ const null_count)
{
using cudf::detail::warp_size;

Expand Down
24 changes: 12 additions & 12 deletions cpp/include/cudf/detail/null_mask.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -61,12 +61,12 @@ namespace detail {
* @param count_ptr Pointer to counter of set bits
*/
template <int block_size, typename Binop>
__global__ void offset_bitmask_binop(Binop op,
device_span<bitmask_type> destination,
device_span<bitmask_type const* const> source,
device_span<size_type const> source_begin_bits,
size_type source_size_bits,
size_type* count_ptr)
CUDF_KERNEL void offset_bitmask_binop(Binop op,
device_span<bitmask_type> destination,
device_span<bitmask_type const* const> source,
device_span<size_type const> source_begin_bits,
size_type source_size_bits,
size_type* count_ptr)
{
auto const tid = threadIdx.x + blockIdx.x * blockDim.x;

Expand Down Expand Up @@ -214,11 +214,11 @@ enum class count_bits_policy : bool {
* in each range is updated.
*/
template <typename OffsetIterator, typename OutputIterator>
__global__ void subtract_set_bits_range_boundaries_kernel(bitmask_type const* bitmask,
size_type num_ranges,
OffsetIterator first_bit_indices,
OffsetIterator last_bit_indices,
OutputIterator null_counts)
CUDF_KERNEL void subtract_set_bits_range_boundaries_kernel(bitmask_type const* bitmask,
size_type num_ranges,
OffsetIterator first_bit_indices,
OffsetIterator last_bit_indices,
OutputIterator null_counts)
{
constexpr size_type const word_size_in_bits{detail::size_in_bits<bitmask_type>()};

Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/utilities/cuda.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -211,7 +211,7 @@ __device__ inline T round_up_pow2(T number_to_round, T modulus)
}

template <class F>
__global__ void single_thread_kernel(F f)
CUDF_KERNEL void single_thread_kernel(F f)
{
f();
}
Expand Down
18 changes: 9 additions & 9 deletions cpp/include/cudf/detail/valid_if.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -44,7 +44,7 @@ namespace detail {
* @param[out] valid_count The count of set bits in the output bitmask
*/
template <size_type block_size, typename InputIterator, typename Predicate>
__global__ void valid_if_kernel(
CUDF_KERNEL void valid_if_kernel(
bitmask_type* output, InputIterator begin, size_type size, Predicate p, size_type* valid_count)
{
constexpr size_type leader_lane{0};
Expand Down Expand Up @@ -151,13 +151,13 @@ template <typename InputIterator1,
typename InputIterator2,
typename BinaryPredicate,
int32_t block_size>
__global__ void valid_if_n_kernel(InputIterator1 begin1,
InputIterator2 begin2,
BinaryPredicate p,
bitmask_type* masks[],
size_type mask_count,
size_type mask_num_bits,
size_type* valid_counts)
CUDF_KERNEL void valid_if_n_kernel(InputIterator1 begin1,
InputIterator2 begin2,
BinaryPredicate p,
bitmask_type* masks[],
size_type mask_count,
size_type mask_num_bits,
size_type* valid_counts)
{
for (size_type mask_idx = 0; mask_idx < mask_count; mask_idx++) {
auto const mask = masks[mask_idx];
Expand Down
10 changes: 5 additions & 5 deletions cpp/include/cudf/hashing/detail/helper_functions.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017-2023, NVIDIA CORPORATION.
* Copyright (c) 2017-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -130,10 +130,10 @@ __forceinline__ __device__ void store_pair_vectorized(pair_type* __restrict__ co
}

template <typename value_type, typename size_type, typename key_type, typename elem_type>
__global__ void init_hashtbl(value_type* __restrict__ const hashtbl_values,
size_type const n,
key_type const key_val,
elem_type const elem_val)
CUDF_KERNEL void init_hashtbl(value_type* __restrict__ const hashtbl_values,
size_type const n,
key_type const key_val,
elem_type const elem_val)
{
size_type const idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
Expand Down
Loading

0 comments on commit 6abef4a

Please sign in to comment.