From 5c13bcee03199e896a0f3355ab15bffa7ea7831d Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 25 Aug 2023 17:24:52 -0700 Subject: [PATCH 1/3] thread_index_type --- cpp/src/merge/merge.cu | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 5c54bb5661c..6283b16a4f4 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -78,11 +78,13 @@ __global__ void materialize_merged_bitmask_kernel( size_type const num_destination_rows, index_type const* const __restrict__ merged_indices) { - size_type destination_row = threadIdx.x + blockIdx.x * blockDim.x; + auto tid = cudf::thread_index_type{threadIdx.x} + + cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x}; - auto active_threads = __ballot_sync(0xffff'ffffu, destination_row < num_destination_rows); + auto active_threads = __ballot_sync(0xffff'ffffu, tid < num_destination_rows); - while (destination_row < num_destination_rows) { + while (tid < num_destination_rows) { + auto const destination_row = static_cast(tid); auto const [src_side, src_row] = merged_indices[destination_row]; bool const from_left{src_side == side::LEFT}; bool source_bit_is_valid{true}; @@ -99,8 +101,8 @@ __global__ void materialize_merged_bitmask_kernel( // Only one thread writes output if (0 == threadIdx.x % warpSize) { out_validity[word_index(destination_row)] = result_mask; } - destination_row += blockDim.x * gridDim.x; - active_threads = __ballot_sync(active_threads, destination_row < num_destination_rows); + tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}; + active_threads = __ballot_sync(active_threads, tid < num_destination_rows); } } From 664659ab2fb232c2dac1ee13dbaadbcf3254cffd Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 5 Sep 2023 16:05:35 -0700 Subject: [PATCH 2/3] address suggestion --- cpp/src/merge/merge.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 6283b16a4f4..22f3cbfe438 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -78,11 +78,11 @@ __global__ void materialize_merged_bitmask_kernel( size_type const num_destination_rows, index_type const* const __restrict__ merged_indices) { - auto tid = cudf::thread_index_type{threadIdx.x} + - cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x}; + auto tid = detail::grid_1d::global_thread_id(); auto active_threads = __ballot_sync(0xffff'ffffu, tid < num_destination_rows); + auto stride = detail::grid_1d::grid_stride(); while (tid < num_destination_rows) { auto const destination_row = static_cast(tid); auto const [src_side, src_row] = merged_indices[destination_row]; @@ -101,7 +101,7 @@ __global__ void materialize_merged_bitmask_kernel( // Only one thread writes output if (0 == threadIdx.x % warpSize) { out_validity[word_index(destination_row)] = result_mask; } - tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}; + tid += stride; active_threads = __ballot_sync(active_threads, tid < num_destination_rows); } } From 03934040cdb05021c9407ceba05610c20448e533 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 5 Sep 2023 16:17:58 -0700 Subject: [PATCH 3/3] use const for stride --- cpp/src/merge/merge.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 22f3cbfe438..c0765b48205 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -78,11 +78,12 @@ __global__ void materialize_merged_bitmask_kernel( size_type const num_destination_rows, index_type const* const __restrict__ merged_indices) { + auto const stride = detail::grid_1d::grid_stride(); + auto tid = detail::grid_1d::global_thread_id(); auto active_threads = __ballot_sync(0xffff'ffffu, tid < num_destination_rows); - auto stride = detail::grid_1d::grid_stride(); while (tid < num_destination_rows) { auto const destination_row = static_cast(tid); auto const [src_side, src_row] = merged_indices[destination_row];