Use grid_stride for stride computations. #13996

Merged · 6 commits · Sep 1, 2023
23 changes: 23 additions & 0 deletions cpp/include/cudf/detail/utilities/cuda.cuh
@@ -92,6 +92,29 @@ class grid_1d {
{
return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x);
}

+  /**
+   * @brief Returns the stride of a 1D grid.
+   *
+   * The returned stride is the total number of threads in the grid.
+   *
+   * @param num_threads_per_block The number of threads per block
+   * @param num_blocks_per_grid The number of blocks in the grid
+   * @return thread_index_type The grid stride
+   */
+  static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block,
+                                                 thread_index_type num_blocks_per_grid)
+  {
+    return num_threads_per_block * num_blocks_per_grid;
+  }
+
+  /**
+   * @brief Returns the stride of the current 1D grid.
+   *
+   * @return thread_index_type The number of threads in the grid.
+   */
+  static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); }
};

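With these helpers in place, the canonical grid-stride loop no longer spells out the index arithmetic by hand. A minimal sketch of the intended usage (illustrative only; scale_kernel and its parameters are hypothetical, not part of this PR):

__global__ void scale_kernel(float* data, float factor, cudf::thread_index_type n)
{
  // Each thread starts at its global index and advances by the total
  // number of threads in the grid until all n elements are covered.
  auto const stride = cudf::detail::grid_1d::grid_stride();
  for (auto i = cudf::detail::grid_1d::global_thread_id(); i < n; i += stride) {
    data[i] *= factor;
  }
}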
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/valid_if.cuh
@@ -49,8 +49,8 @@ __global__ void valid_if_kernel(
{
constexpr size_type leader_lane{0};
auto const lane_id{threadIdx.x % warp_size};
-  thread_index_type i = threadIdx.x + blockIdx.x * blockDim.x;
-  thread_index_type const stride = blockDim.x * gridDim.x;
+  auto i = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
size_type warp_valid_count{0};

auto active_mask = __ballot_sync(0xFFFF'FFFFu, i < size);
6 changes: 3 additions & 3 deletions cpp/src/bitmask/null_mask.cu
@@ -108,7 +108,7 @@ __global__ void set_null_mask_kernel(bitmask_type* __restrict__ destination,
thread_index_type const last_word = word_index(end_bit) - word_index(begin_bit);
bitmask_type fill_value = valid ? 0xffff'ffff : 0;

-  thread_index_type const stride = blockDim.x * gridDim.x;
+  auto const stride = cudf::detail::grid_1d::grid_stride();

for (thread_index_type destination_word_index = grid_1d::global_thread_id();
destination_word_index < number_of_mask_words;
@@ -191,7 +191,7 @@ __global__ void copy_offset_bitmask(bitmask_type* __restrict__ destination,
size_type source_end_bit,
size_type number_of_mask_words)
{
-  thread_index_type const stride = blockDim.x * gridDim.x;
+  auto const stride = cudf::detail::grid_1d::grid_stride();
for (thread_index_type destination_word_index = grid_1d::global_thread_id();
destination_word_index < number_of_mask_words;
destination_word_index += stride) {
@@ -265,7 +265,7 @@ __global__ void count_set_bits_kernel(bitmask_type const* bitmask,
auto const first_word_index{word_index(first_bit_index)};
auto const last_word_index{word_index(last_bit_index)};
thread_index_type const tid = grid_1d::global_thread_id();
-  thread_index_type const stride = blockDim.x * gridDim.x;
+  thread_index_type const stride = grid_1d::grid_stride();
thread_index_type thread_word_index = tid + first_word_index;
size_type thread_count{0};

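Beyond centralizing the pattern, the helper also keeps the arithmetic in thread_index_type. The old expression blockDim.x * gridDim.x multiplies two unsigned ints and wraps modulo 2^32 for sufficiently large launches, whereas grid_stride() takes thread_index_type (64-bit) parameters, so both operands widen before the multiply. A sketch of the difference (demo_kernel is hypothetical, not from this PR):

__global__ void demo_kernel(cudf::thread_index_type* out)
{
  // Old form: both operands are unsigned int, so the product wraps
  // modulo 2^32 when blockDim.x * gridDim.x exceeds UINT_MAX.
  auto const narrow = blockDim.x * gridDim.x;

  // New form: grid_stride()'s parameters are thread_index_type, so the
  // multiplication is performed in 64-bit arithmetic.
  auto const wide = cudf::detail::grid_1d::grid_stride();

  if (cudf::detail::grid_1d::global_thread_id() == 0) {
    out[0] = static_cast<cudf::thread_index_type>(narrow);  // possibly wrapped
    out[1] = wide;                                          // exact
  }
}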
4 changes: 2 additions & 2 deletions cpp/src/copying/scatter.cu
@@ -52,8 +52,8 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
MapIterator scatter_map,
size_type num_scatter_rows)
{
-  thread_index_type row = threadIdx.x + blockIdx.x * blockDim.x;
-  thread_index_type const stride = blockDim.x * gridDim.x;
+  auto row = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();

while (row < num_scatter_rows) {
size_type const output_row = scatter_map[row];
18 changes: 8 additions & 10 deletions cpp/src/partitioning/partitioning.cu
@@ -134,8 +134,8 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher,
// Accumulate histogram of the size of each partition in shared memory
extern __shared__ size_type shared_partition_sizes[];

-  auto tid = cudf::thread_index_type{threadIdx.x} +
-             cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x};
+  auto tid = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();

// Initialize local histogram
size_type partition_number = threadIdx.x;
@@ -160,7 +160,7 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher,
row_partition_offset[row_number] =
atomicAdd(&(shared_partition_sizes[partition_number]), size_type(1));

-    tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x};
+    tid += stride;
}

__syncthreads();
@@ -215,8 +215,8 @@ __global__ void compute_row_output_locations(size_type* __restrict__ row_partiti
}
__syncthreads();

-  auto tid = cudf::thread_index_type{threadIdx.x} +
-             cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x};
+  auto tid = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();

// Get each row's partition number, and get it's output location by
// incrementing block's offset counter for that partition number
@@ -234,7 +234,7 @@ __global__ void compute_row_output_locations(size_type* __restrict__ row_partiti
// Store the row's output location in-place
row_partition_numbers[row_number] = row_output_location;

-    tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x};
+    tid += stride;
}
}

@@ -311,10 +311,8 @@ __global__ void copy_block_partitions(InputIter input_iter,
__syncthreads();

// Fetch the input data to shared memory
-  for (auto tid = cudf::thread_index_type{threadIdx.x} +
-                  cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x};
-       tid < num_rows;
-       tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}) {
+  for (auto tid = cudf::detail::grid_1d::global_thread_id(); tid < num_rows;
+       tid += cudf::detail::grid_1d::grid_stride()) {
auto const row_number = static_cast<size_type>(tid);
size_type const ipartition = row_partition_numbers[row_number];

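Note that this hunk calls grid_stride() inside the loop increment rather than hoisting it into a local const, as the other kernels in this PR do. Since blockDim and gridDim are fixed for the lifetime of a kernel, hoisting is behavior-preserving; an equivalent sketch, not a suggested change:

  auto const stride = cudf::detail::grid_1d::grid_stride();
  for (auto tid = cudf::detail::grid_1d::global_thread_id(); tid < num_rows; tid += stride) {
    // ... loop body unchanged ...
  }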
12 changes: 6 additions & 6 deletions cpp/src/replace/nulls.cu
@@ -64,9 +64,9 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
char* chars,
cudf::size_type* valid_counter)
{
-  cudf::size_type nrows = input.size();
-  cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x;
-  cudf::thread_index_type const stride = blockDim.x * gridDim.x;
+  cudf::size_type nrows = input.size();
+  auto i = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();

uint32_t active_mask = 0xffff'ffff;
active_mask = __ballot_sync(active_mask, i < nrows);
@@ -117,9 +117,9 @@ __global__ void replace_nulls(cudf::column_device_view input,
cudf::mutable_column_device_view output,
cudf::size_type* output_valid_count)
{
-  cudf::size_type nrows = input.size();
-  cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x;
-  cudf::thread_index_type const stride = blockDim.x * gridDim.x;
+  cudf::size_type nrows = input.size();
+  auto i = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();

uint32_t active_mask = 0xffff'ffff;
active_mask = __ballot_sync(active_mask, i < nrows);
5 changes: 2 additions & 3 deletions cpp/src/transform/compute_column.cu
@@ -69,9 +69,8 @@ __launch_bounds__(max_block_size) __global__

auto thread_intermediate_storage =
&intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
-  auto const start_idx =
-    static_cast<cudf::thread_index_type>(threadIdx.x + blockIdx.x * blockDim.x);
-  auto const stride = static_cast<cudf::thread_index_type>(blockDim.x * gridDim.x);
+  auto start_idx    = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
auto evaluator =
cudf::ast::detail::expression_evaluator<has_nulls>(table, device_expression_data);

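For completeness: on the host side, these device helpers pair with the existing grid_1d launch-configuration struct from the same header. A usage sketch, assuming grid_1d's (overall_num_elements, num_threads_per_block) constructor and its num_blocks/num_threads_per_block members; my_kernel, num_rows, and stream are hypothetical:

  // Compute a 1D launch configuration for num_rows elements.
  constexpr cudf::size_type block_size = 256;
  cudf::detail::grid_1d const config(num_rows, block_size);

  // Inside my_kernel, global_thread_id()/grid_stride() then walk all rows.
  my_kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(/* ... */);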