From efa538c985283ad22c4cac440d76732b050512dd Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Tue, 29 Aug 2023 13:33:53 -0500
Subject: [PATCH 1/4] Use grid_stride for stride computations.

---
 cpp/include/cudf/detail/utilities/cuda.cuh | 23 ++++++++++++++++++++++
 cpp/include/cudf/detail/valid_if.cuh       |  4 ++--
 cpp/src/bitmask/null_mask.cu               |  6 +++---
 cpp/src/copying/scatter.cu                 |  4 ++--
 cpp/src/partitioning/partitioning.cu       | 18 ++++++++---------
 cpp/src/replace/nulls.cu                   | 12 +++++------
 cpp/src/rolling/jit/kernel.cu              |  4 ++--
 cpp/src/transform/compute_column.cu        |  5 ++---
 8 files changed, 48 insertions(+), 28 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh
index c95189f1f94..264302df0e9 100644
--- a/cpp/include/cudf/detail/utilities/cuda.cuh
+++ b/cpp/include/cudf/detail/utilities/cuda.cuh
@@ -92,6 +92,29 @@ class grid_1d {
   {
     return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x);
   }
+
+  /**
+   * @brief Returns the stride of a 1D grid.
+   *
+   * The returned stride is the total number of threads in the grid.
+   *
+   * @param num_threads_per_block The number of threads per block
+   * @param num_blocks_per_grid The number of blocks in the grid
+   *
+   * @return thread_index_type The total number of threads in the grid
+   */
+  static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block,
+                                                 thread_index_type num_blocks_per_grid)
+  {
+    return num_threads_per_block * num_blocks_per_grid;
+  }
+
+  /**
+   * @brief Returns the stride of the current 1D grid.
+   *
+   * @return thread_index_type The number of threads in the grid.
+   */
+  static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); }
 };
 
 /**
diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index bed884a23eb..f3f95dad017 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -49,8 +49,8 @@ __global__ void valid_if_kernel(
 {
   constexpr size_type leader_lane{0};
   auto const lane_id{threadIdx.x % warp_size};
-  thread_index_type i            = threadIdx.x + blockIdx.x * blockDim.x;
-  thread_index_type const stride = blockDim.x * gridDim.x;
+  auto i            = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
   size_type warp_valid_count{0};
 
   auto active_mask = __ballot_sync(0xFFFF'FFFFu, i < size);
diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu
index 33dc7e0556b..5a0d3e4f120 100644
--- a/cpp/src/bitmask/null_mask.cu
+++ b/cpp/src/bitmask/null_mask.cu
@@ -108,7 +108,7 @@ __global__ void set_null_mask_kernel(bitmask_type* __restrict__ destination,
   thread_index_type const last_word = word_index(end_bit) - word_index(begin_bit);
   bitmask_type fill_value           = valid ? 0xffff'ffff : 0;
-  thread_index_type const stride    = blockDim.x * gridDim.x;
+  auto const stride                 = cudf::detail::grid_1d::grid_stride();
 
   for (thread_index_type destination_word_index = grid_1d::global_thread_id();
        destination_word_index < number_of_mask_words;
@@ -191,7 +191,7 @@ __global__ void copy_offset_bitmask(bitmask_type* __restrict__ destination,
                                     size_type source_end_bit,
                                     size_type number_of_mask_words)
 {
-  thread_index_type const stride = blockDim.x * gridDim.x;
+  auto const stride = cudf::detail::grid_1d::grid_stride();
   for (thread_index_type destination_word_index = grid_1d::global_thread_id();
        destination_word_index < number_of_mask_words;
        destination_word_index += stride) {
@@ -265,7 +265,7 @@ __global__ void count_set_bits_kernel(bitmask_type const* bitmask,
   auto const first_word_index{word_index(first_bit_index)};
   auto const last_word_index{word_index(last_bit_index)};
   thread_index_type const tid    = grid_1d::global_thread_id();
-  thread_index_type const stride = blockDim.x * gridDim.x;
+  thread_index_type const stride = grid_1d::grid_stride();
   thread_index_type thread_word_index = tid + first_word_index;
   size_type thread_count{0};
diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index 96e24e9059d..484a6f46faf 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -52,8 +52,8 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
                                        MapIterator scatter_map,
                                        size_type num_scatter_rows)
 {
-  thread_index_type row          = threadIdx.x + blockIdx.x * blockDim.x;
-  thread_index_type const stride = blockDim.x * gridDim.x;
+  auto row          = cudf::detail::grid_1d::global_thread_index();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
 
   while (row < num_scatter_rows) {
     size_type const output_row = scatter_map[row];
diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu
index ff9c4ea2f59..6be03c4fb8e 100644
--- a/cpp/src/partitioning/partitioning.cu
+++ b/cpp/src/partitioning/partitioning.cu
@@ -134,8 +134,8 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher,
   // Accumulate histogram of the size of each partition in shared memory
   extern __shared__ size_type shared_partition_sizes[];
 
-  auto tid = cudf::thread_index_type{threadIdx.x} +
-             cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x};
+  auto tid          = cudf::detail::grid_1d::global_thread_index();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
 
   // Initialize local histogram
   size_type partition_number = threadIdx.x;
@@ -160,7 +160,7 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher,
     row_partition_offset[row_number] =
       atomicAdd(&(shared_partition_sizes[partition_number]), size_type(1));
 
-    tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x};
+    tid += stride;
   }
 
   __syncthreads();
@@ -215,8 +215,8 @@ __global__ void compute_row_output_locations(size_type* __restrict__ row_partiti
   }
   __syncthreads();
 
-  auto tid = cudf::thread_index_type{threadIdx.x} +
-             cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x};
+  auto tid          = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
 
   // Get each row's partition number, and get it's output location by
   // incrementing block's offset counter for that partition number
@@ -234,7 +234,7 @@ __global__ void compute_row_output_locations(size_type* __restrict__ row_partiti
     // Store the row's output location in-place
     row_partition_numbers[row_number] = row_output_location;
 
-    tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x};
+    tid += stride;
   }
 }
@@ -311,10 +311,8 @@ __global__ void copy_block_partitions(InputIter input_iter,
   __syncthreads();
 
   // Fetch the input data to shared memory
-  for (auto tid = cudf::thread_index_type{threadIdx.x} +
-                  cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x};
-       tid < num_rows;
-       tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}) {
+  for (auto tid = cudf::detail::grid_1d::global_thread_index(); tid < num_rows;
+       tid += cudf::detail::grid_1d::grid_stride()) {
     auto const row_number      = static_cast<size_type>(tid);
     size_type const ipartition = row_partition_numbers[row_number];
diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index e033db0e52a..ec157ecde2d 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -64,9 +64,9 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
                                       char* chars,
                                       cudf::size_type* valid_counter)
 {
-  cudf::size_type nrows                = input.size();
-  cudf::thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
-  cudf::thread_index_type const stride = blockDim.x * gridDim.x;
+  cudf::size_type nrows = input.size();
+  auto i                = cudf::detail::grid_1d::global_thread_index();
+  auto const stride     = cudf::detail::grid_1d::grid_stride();
 
   uint32_t active_mask = 0xffff'ffff;
   active_mask          = __ballot_sync(active_mask, i < nrows);
@@ -117,9 +117,9 @@ __global__ void replace_nulls(cudf::column_device_view input,
                              cudf::mutable_column_device_view output,
                              cudf::size_type* output_valid_count)
 {
-  cudf::size_type nrows                = input.size();
-  cudf::thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
-  cudf::thread_index_type const stride = blockDim.x * gridDim.x;
+  cudf::size_type nrows = input.size();
+  auto i                = cudf::detail::grid_1d::global_thread_index();
+  auto const stride     = cudf::detail::grid_1d::grid_stride();
 
   uint32_t active_mask = 0xffff'ffff;
   active_mask          = __ballot_sync(active_mask, i < nrows);
diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu
index 06b224c39ad..6bd37c1c5a5 100644
--- a/cpp/src/rolling/jit/kernel.cu
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -51,8 +51,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
                                 FollowingWindowType following_window_begin,
                                 cudf::size_type min_periods)
 {
-  cudf::thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
-  cudf::thread_index_type const stride = blockDim.x * gridDim.x;
+  auto i            = cudf::detail::grid_1d::global_thread_index();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
 
   cudf::size_type warp_valid_count{0};
diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu
index 61293d51ba2..0cc652eb40f 100644
--- a/cpp/src/transform/compute_column.cu
+++ b/cpp/src/transform/compute_column.cu
@@ -69,9 +69,8 @@ __launch_bounds__(max_block_size) __global__
   auto thread_intermediate_storage =
     &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
 
-  auto const start_idx =
-    static_cast<cudf::thread_index_type>(threadIdx.x + blockIdx.x * blockDim.x);
-  auto const stride = static_cast<cudf::thread_index_type>(blockDim.x * gridDim.x);
+  auto start_idx    = cudf::detail::grid_1d::global_thread_index();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
   auto evaluator =
     cudf::ast::detail::expression_evaluator<has_nulls>(table, device_expression_data);

From 1400472188d0ba06f8a458ed7ea009130bb87380 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Thu, 31 Aug 2023 14:04:00 -0500
Subject: [PATCH 2/4] Fix typo: id, not index.

---
 cpp/src/copying/scatter.cu           | 2 +-
 cpp/src/partitioning/partitioning.cu | 4 ++--
 cpp/src/replace/nulls.cu             | 4 ++--
 cpp/src/rolling/jit/kernel.cu        | 2 +-
 cpp/src/transform/compute_column.cu  | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index 484a6f46faf..11c27fc86e3 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -52,7 +52,7 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
                                        MapIterator scatter_map,
                                        size_type num_scatter_rows)
 {
-  auto row          = cudf::detail::grid_1d::global_thread_index();
+  auto row          = cudf::detail::grid_1d::global_thread_id();
   auto const stride = cudf::detail::grid_1d::grid_stride();
 
   while (row < num_scatter_rows) {
diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu
index 6be03c4fb8e..7b6676346c2 100644
--- a/cpp/src/partitioning/partitioning.cu
+++ b/cpp/src/partitioning/partitioning.cu
@@ -134,7 +134,7 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher,
   // Accumulate histogram of the size of each partition in shared memory
   extern __shared__ size_type shared_partition_sizes[];
 
-  auto tid          = cudf::detail::grid_1d::global_thread_index();
+  auto tid          = cudf::detail::grid_1d::global_thread_id();
   auto const stride = cudf::detail::grid_1d::grid_stride();
 
   // Initialize local histogram
@@ -311,7 +311,7 @@ __global__ void copy_block_partitions(InputIter input_iter,
   __syncthreads();
 
   // Fetch the input data to shared memory
-  for (auto tid = cudf::detail::grid_1d::global_thread_index(); tid < num_rows;
+  for (auto tid = cudf::detail::grid_1d::global_thread_id(); tid < num_rows;
        tid += cudf::detail::grid_1d::grid_stride()) {
     auto const row_number      = static_cast<size_type>(tid);
     size_type const ipartition = row_partition_numbers[row_number];
diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index ec157ecde2d..5b9fd3d9f0f 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -65,7 +65,7 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
                                       cudf::size_type* valid_counter)
 {
   cudf::size_type nrows = input.size();
-  auto i                = cudf::detail::grid_1d::global_thread_index();
+  auto i                = cudf::detail::grid_1d::global_thread_id();
   auto const stride     = cudf::detail::grid_1d::grid_stride();
 
   uint32_t active_mask = 0xffff'ffff;
@@ -118,7 +118,7 @@ __global__ void replace_nulls(cudf::column_device_view input,
                              cudf::size_type* output_valid_count)
 {
   cudf::size_type nrows = input.size();
-  auto i                = cudf::detail::grid_1d::global_thread_index();
+  auto i                = cudf::detail::grid_1d::global_thread_id();
   auto const stride     = cudf::detail::grid_1d::grid_stride();
 
   uint32_t active_mask = 0xffff'ffff;
diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu
index 6bd37c1c5a5..64632a09c44 100644
--- a/cpp/src/rolling/jit/kernel.cu
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -51,7 +51,7 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
                                 FollowingWindowType following_window_begin,
                                 cudf::size_type min_periods)
 {
-  auto i            = cudf::detail::grid_1d::global_thread_index();
+  auto i            = cudf::detail::grid_1d::global_thread_id();
   auto const stride = cudf::detail::grid_1d::grid_stride();
 
   cudf::size_type warp_valid_count{0};
diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu
index 0cc652eb40f..224dd93b048 100644
--- a/cpp/src/transform/compute_column.cu
+++ b/cpp/src/transform/compute_column.cu
@@ -69,7 +69,7 @@ __launch_bounds__(max_block_size) __global__
   auto thread_intermediate_storage =
     &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
 
-  auto start_idx    = cudf::detail::grid_1d::global_thread_index();
+  auto start_idx    = cudf::detail::grid_1d::global_thread_id();
   auto const stride = cudf::detail::grid_1d::grid_stride();
   auto evaluator =
     cudf::ast::detail::expression_evaluator<has_nulls>(table, device_expression_data);

From 6ea6de4fa2d19fc89dcc485a82574016d1aae008 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Thu, 31 Aug 2023 22:09:30 -0500
Subject: [PATCH 3/4] Revert cpp/src/rolling/jit/kernel.cu

---
 cpp/src/rolling/jit/kernel.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu
index 64632a09c44..e9f69b91fd0 100644
--- a/cpp/src/rolling/jit/kernel.cu
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -51,8 +51,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
                                 FollowingWindowType following_window_begin,
                                 cudf::size_type min_periods)
 {
-  auto i            = cudf::detail::grid_1d::global_thread_id();
-  auto const stride = cudf::detail::grid_1d::grid_stride();
+  cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x;
+  cudf::thread_index_type const stride = blockDim.x * gridDim.x;
 
   cudf::size_type warp_valid_count{0};

From 61c7c92feb67b302c5ab741346cda7db4800edb4 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Thu, 31 Aug 2023 22:10:12 -0500
Subject: [PATCH 4/4] Update cpp/src/rolling/jit/kernel.cu

---
 cpp/src/rolling/jit/kernel.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu
index e9f69b91fd0..06b224c39ad 100644
--- a/cpp/src/rolling/jit/kernel.cu
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -51,7 +51,7 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
                                 FollowingWindowType following_window_begin,
                                 cudf::size_type min_periods)
 {
-  cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x;
+  cudf::thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
   cudf::thread_index_type const stride = blockDim.x * gridDim.x;
 
   cudf::size_type warp_valid_count{0};
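
Net effect of the series: 1D kernels compute their starting index and stride
through the grid_1d helpers instead of repeating the blockIdx/blockDim
arithmetic by hand, while patches 3 and 4 restore cpp/src/rolling/jit/kernel.cu
to its original form, leaving the JIT-compiled kernel untouched. A minimal
sketch of the resulting grid-stride loop pattern follows; the kernel and its
arguments are hypothetical and only the grid_1d helpers come from patch 1:

  #include <cudf/detail/utilities/cuda.cuh>
  #include <cudf/types.hpp>

  // Hypothetical example kernel: doubles each of `size` elements. Every thread
  // starts at its global 1D index and advances by the total number of threads
  // in the grid, so any launch configuration covers all elements.
  __global__ void double_elements(int* data, cudf::size_type size)
  {
    auto idx          = cudf::detail::grid_1d::global_thread_id();
    auto const stride = cudf::detail::grid_1d::grid_stride();
    while (idx < size) {
      data[idx] *= 2;
      idx += stride;
    }
  }

Because both helpers return the 64-bit cudf::thread_index_type, the index
arithmetic does not overflow 32-bit int for very large grids, which is the same
reason the hand-written versions routed their products through
cudf::thread_index_type.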