From efa538c985283ad22c4cac440d76732b050512dd Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Tue, 29 Aug 2023 13:33:53 -0500
Subject: [PATCH 1/4] Use grid_stride for stride computations.

---
 cpp/include/cudf/detail/utilities/cuda.cuh | 23 ++++++++++++++++++++++
 cpp/include/cudf/detail/valid_if.cuh       |  4 ++--
 cpp/src/bitmask/null_mask.cu               |  6 +++---
 cpp/src/copying/scatter.cu                 |  4 ++--
 cpp/src/partitioning/partitioning.cu       | 18 ++++++++---------
 cpp/src/replace/nulls.cu                   | 12 +++++------
 cpp/src/rolling/jit/kernel.cu              |  4 ++--
 cpp/src/transform/compute_column.cu        |  5 ++---
 8 files changed, 48 insertions(+), 28 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh
index c95189f1f94..264302df0e9 100644
--- a/cpp/include/cudf/detail/utilities/cuda.cuh
+++ b/cpp/include/cudf/detail/utilities/cuda.cuh
@@ -92,6 +92,29 @@ class grid_1d {
   {
     return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x);
   }
+
+  /**
+   * @brief Returns the stride of a 1D grid.
+   *
+   * The returned stride is the total number of threads in the grid.
+   *
+   * @param num_threads_per_block The number of threads per block
+   * @param num_blocks_per_grid The number of blocks in the grid
+   *
+   * @return thread_index_type The total number of threads in the grid
+   */
+  static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block,
+                                                 thread_index_type num_blocks_per_grid)
+  {
+    return num_threads_per_block * num_blocks_per_grid;
+  }
+
+  /**
+   * @brief Returns the stride of the current 1D grid.
+   *
+   * @return thread_index_type The number of threads in the grid.
+   */
+  static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); }
 };
 
 /**
diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index bed884a23eb..f3f95dad017 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -49,8 +49,8 @@ __global__ void valid_if_kernel(
 {
   constexpr size_type leader_lane{0};
   auto const lane_id{threadIdx.x % warp_size};
-  thread_index_type i            = threadIdx.x + blockIdx.x * blockDim.x;
-  thread_index_type const stride = blockDim.x * gridDim.x;
+  auto i            = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
   size_type warp_valid_count{0};
 
   auto active_mask = __ballot_sync(0xFFFF'FFFFu, i < size);
diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu
index 33dc7e0556b..5a0d3e4f120 100644
--- a/cpp/src/bitmask/null_mask.cu
+++ b/cpp/src/bitmask/null_mask.cu
@@ -108,7 +108,7 @@ __global__ void set_null_mask_kernel(bitmask_type* __restrict__ destination,
   thread_index_type const last_word = word_index(end_bit) - word_index(begin_bit);
   bitmask_type fill_value           = valid ? 0xffff'ffff : 0;
-  thread_index_type const stride    = blockDim.x * gridDim.x;
+  auto const stride                 = cudf::detail::grid_1d::grid_stride();
 
   for (thread_index_type destination_word_index = grid_1d::global_thread_id();
        destination_word_index < number_of_mask_words;
@@ -191,7 +191,7 @@ __global__ void copy_offset_bitmask(bitmask_type* __restrict__ destination,
                                     size_type source_end_bit,
                                     size_type number_of_mask_words)
 {
-  thread_index_type const stride = blockDim.x * gridDim.x;
+  auto const stride = cudf::detail::grid_1d::grid_stride();
   for (thread_index_type destination_word_index = grid_1d::global_thread_id();
        destination_word_index < number_of_mask_words;
        destination_word_index += stride) {
@@ -265,7 +265,7 @@ __global__ void count_set_bits_kernel(bitmask_type const* bitmask,
   auto const first_word_index{word_index(first_bit_index)};
   auto const last_word_index{word_index(last_bit_index)};
   thread_index_type const tid    = grid_1d::global_thread_id();
-  thread_index_type const stride = blockDim.x * gridDim.x;
+  thread_index_type const stride = grid_1d::grid_stride();
   thread_index_type thread_word_index = tid + first_word_index;
   size_type thread_count{0};
diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index 96e24e9059d..484a6f46faf 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -52,8 +52,8 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
                                        MapIterator scatter_map,
                                        size_type num_scatter_rows)
 {
-  thread_index_type row          = threadIdx.x + blockIdx.x * blockDim.x;
-  thread_index_type const stride = blockDim.x * gridDim.x;
+  auto row          = cudf::detail::grid_1d::global_thread_index();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
 
   while (row < num_scatter_rows) {
     size_type const output_row = scatter_map[row];
diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu
index ff9c4ea2f59..6be03c4fb8e 100644
--- a/cpp/src/partitioning/partitioning.cu
+++ b/cpp/src/partitioning/partitioning.cu
@@ -134,8 +134,8 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher,
   // Accumulate histogram of the size of each partition in shared memory
   extern __shared__ size_type shared_partition_sizes[];
 
-  auto tid = cudf::thread_index_type{threadIdx.x} +
-             cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x};
+  auto tid          = cudf::detail::grid_1d::global_thread_index();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
 
   // Initialize local histogram
   size_type partition_number = threadIdx.x;
@@ -160,7 +160,7 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher,
     row_partition_offset[row_number] =
       atomicAdd(&(shared_partition_sizes[partition_number]), size_type(1));
 
-    tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x};
+    tid += stride;
   }
 
   __syncthreads();
@@ -215,8 +215,8 @@ __global__ void compute_row_output_locations(size_type* __restrict__ row_partiti
   }
   __syncthreads();
 
-  auto tid = cudf::thread_index_type{threadIdx.x} +
-             cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x};
+  auto tid          = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
 
   // Get each row's partition number, and get it's output location by
   // incrementing block's offset counter for that partition number
@@ -234,7 +234,7 @@ __global__ void compute_row_output_locations(size_type* __restrict__ row_partiti
     // Store the row's output location in-place
     row_partition_numbers[row_number] = row_output_location;
 
-    tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x};
+    tid += stride;
   }
 }
@@ -311,10 +311,8 @@ __global__ void copy_block_partitions(InputIter input_iter,
   __syncthreads();
 
   // Fetch the input data to shared memory
-  for (auto tid = cudf::thread_index_type{threadIdx.x} +
-                  cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x};
-       tid < num_rows;
-       tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}) {
+  for (auto tid = cudf::detail::grid_1d::global_thread_index(); tid < num_rows;
+       tid += cudf::detail::grid_1d::grid_stride()) {
     auto const row_number      = static_cast<size_type>(tid);
     size_type const ipartition = row_partition_numbers[row_number];
diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index e033db0e52a..ec157ecde2d 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -64,9 +64,9 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
                                       char* chars,
                                       cudf::size_type* valid_counter)
 {
-  cudf::size_type nrows                = input.size();
-  cudf::thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
-  cudf::thread_index_type const stride = blockDim.x * gridDim.x;
+  cudf::size_type nrows = input.size();
+  auto i                = cudf::detail::grid_1d::global_thread_index();
+  auto const stride     = cudf::detail::grid_1d::grid_stride();
 
   uint32_t active_mask = 0xffff'ffff;
   active_mask          = __ballot_sync(active_mask, i < nrows);
@@ -117,9 +117,9 @@ __global__ void replace_nulls(cudf::column_device_view input,
                              cudf::mutable_column_device_view output,
                              cudf::size_type* output_valid_count)
 {
-  cudf::size_type nrows                = input.size();
-  cudf::thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
-  cudf::thread_index_type const stride = blockDim.x * gridDim.x;
+  cudf::size_type nrows = input.size();
+  auto i                = cudf::detail::grid_1d::global_thread_index();
+  auto const stride     = cudf::detail::grid_1d::grid_stride();
 
   uint32_t active_mask = 0xffff'ffff;
   active_mask          = __ballot_sync(active_mask, i < nrows);
diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu
index 06b224c39ad..6bd37c1c5a5 100644
--- a/cpp/src/rolling/jit/kernel.cu
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -51,8 +51,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
                                 FollowingWindowType following_window_begin,
                                 cudf::size_type min_periods)
 {
-  cudf::thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
-  cudf::thread_index_type const stride = blockDim.x * gridDim.x;
+  auto i            = cudf::detail::grid_1d::global_thread_index();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
 
   cudf::size_type warp_valid_count{0};
diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu
index 61293d51ba2..0cc652eb40f 100644
--- a/cpp/src/transform/compute_column.cu
+++ b/cpp/src/transform/compute_column.cu
@@ -69,9 +69,8 @@ __launch_bounds__(max_block_size) __global__
   auto thread_intermediate_storage =
     &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
 
-  auto const start_idx =
-    static_cast<cudf::thread_index_type>(threadIdx.x + blockIdx.x * blockDim.x);
-  auto const stride = static_cast<cudf::thread_index_type>(blockDim.x * gridDim.x);
+  auto start_idx    = cudf::detail::grid_1d::global_thread_index();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
   auto evaluator =
     cudf::ast::detail::expression_evaluator<has_nulls>(table, device_expression_data);

From 1400472188d0ba06f8a458ed7ea009130bb87380 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Thu, 31 Aug 2023 14:04:00 -0500
Subject: [PATCH 2/4] Fix typo: id, not index.

---
 cpp/src/copying/scatter.cu           | 2 +-
 cpp/src/partitioning/partitioning.cu | 4 ++--
 cpp/src/replace/nulls.cu             | 4 ++--
 cpp/src/rolling/jit/kernel.cu        | 2 +-
 cpp/src/transform/compute_column.cu  | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index 484a6f46faf..11c27fc86e3 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -52,7 +52,7 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
                                        MapIterator scatter_map,
                                        size_type num_scatter_rows)
 {
-  auto row          = cudf::detail::grid_1d::global_thread_index();
+  auto row          = cudf::detail::grid_1d::global_thread_id();
   auto const stride = cudf::detail::grid_1d::grid_stride();
 
   while (row < num_scatter_rows) {
diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu
index 6be03c4fb8e..7b6676346c2 100644
--- a/cpp/src/partitioning/partitioning.cu
+++ b/cpp/src/partitioning/partitioning.cu
@@ -134,7 +134,7 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher,
   // Accumulate histogram of the size of each partition in shared memory
   extern __shared__ size_type shared_partition_sizes[];
 
-  auto tid          = cudf::detail::grid_1d::global_thread_index();
+  auto tid          = cudf::detail::grid_1d::global_thread_id();
   auto const stride = cudf::detail::grid_1d::grid_stride();
 
   // Initialize local histogram
@@ -311,7 +311,7 @@ __global__ void copy_block_partitions(InputIter input_iter,
   __syncthreads();
 
   // Fetch the input data to shared memory
-  for (auto tid = cudf::detail::grid_1d::global_thread_index(); tid < num_rows;
+  for (auto tid = cudf::detail::grid_1d::global_thread_id(); tid < num_rows;
        tid += cudf::detail::grid_1d::grid_stride()) {
     auto const row_number      = static_cast<size_type>(tid);
     size_type const ipartition = row_partition_numbers[row_number];
diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index ec157ecde2d..5b9fd3d9f0f 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -65,7 +65,7 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
                                       cudf::size_type* valid_counter)
 {
   cudf::size_type nrows = input.size();
-  auto i                = cudf::detail::grid_1d::global_thread_index();
+  auto i                = cudf::detail::grid_1d::global_thread_id();
   auto const stride     = cudf::detail::grid_1d::grid_stride();
 
   uint32_t active_mask = 0xffff'ffff;
@@ -118,7 +118,7 @@ __global__ void replace_nulls(cudf::column_device_view input,
                              cudf::size_type* output_valid_count)
 {
   cudf::size_type nrows = input.size();
-  auto i                = cudf::detail::grid_1d::global_thread_index();
+  auto i                = cudf::detail::grid_1d::global_thread_id();
   auto const stride     = cudf::detail::grid_1d::grid_stride();
 
   uint32_t active_mask = 0xffff'ffff;
diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu
index 6bd37c1c5a5..64632a09c44 100644
--- a/cpp/src/rolling/jit/kernel.cu
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -51,7 +51,7 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
                                 FollowingWindowType following_window_begin,
                                 cudf::size_type min_periods)
 {
-  auto i            = cudf::detail::grid_1d::global_thread_index();
+  auto i            = cudf::detail::grid_1d::global_thread_id();
   auto const stride = cudf::detail::grid_1d::grid_stride();
 
   cudf::size_type warp_valid_count{0};
diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu
index 0cc652eb40f..224dd93b048 100644
--- a/cpp/src/transform/compute_column.cu
+++ b/cpp/src/transform/compute_column.cu
@@ -69,7 +69,7 @@ __launch_bounds__(max_block_size) __global__
   auto thread_intermediate_storage =
     &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
 
-  auto start_idx    = cudf::detail::grid_1d::global_thread_index();
+  auto start_idx    = cudf::detail::grid_1d::global_thread_id();
   auto const stride = cudf::detail::grid_1d::grid_stride();
   auto evaluator =
     cudf::ast::detail::expression_evaluator<has_nulls>(table, device_expression_data);

From 6ea6de4fa2d19fc89dcc485a82574016d1aae008 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Thu, 31 Aug 2023 22:09:30 -0500
Subject: [PATCH 3/4] Revert cpp/src/rolling/jit/kernel.cu

---
 cpp/src/rolling/jit/kernel.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu
index 64632a09c44..e9f69b91fd0 100644
--- a/cpp/src/rolling/jit/kernel.cu
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -51,8 +51,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
                                 FollowingWindowType following_window_begin,
                                 cudf::size_type min_periods)
 {
-  auto i            = cudf::detail::grid_1d::global_thread_id();
-  auto const stride = cudf::detail::grid_1d::grid_stride();
+  cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x;
+  cudf::thread_index_type const stride = blockDim.x * gridDim.x;
 
   cudf::size_type warp_valid_count{0};

From 61c7c92feb67b302c5ab741346cda7db4800edb4 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Thu, 31 Aug 2023 22:10:12 -0500
Subject: [PATCH 4/4] Update cpp/src/rolling/jit/kernel.cu

---
 cpp/src/rolling/jit/kernel.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu
index e9f69b91fd0..06b224c39ad 100644
--- a/cpp/src/rolling/jit/kernel.cu
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -51,7 +51,7 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
                                 FollowingWindowType following_window_begin,
                                 cudf::size_type min_periods)
 {
-  cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x;
+  cudf::thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
   cudf::thread_index_type const stride = blockDim.x * gridDim.x;
 
   cudf::size_type warp_valid_count{0};
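
Net effect of the series: 1D kernels compute their starting index and stride
through the grid_1d helpers instead of repeating the blockIdx/blockDim
arithmetic by hand, while patches 3 and 4 restore cpp/src/rolling/jit/kernel.cu
to its original form, leaving the JIT-compiled kernel untouched. A minimal
sketch of the resulting grid-stride loop pattern follows; the kernel and its
arguments are hypothetical and only the grid_1d helpers come from patch 1:

  #include <cudf/detail/utilities/cuda.cuh>
  #include <cudf/types.hpp>

  // Hypothetical example kernel: doubles each of `size` elements. Every thread
  // starts at its global 1D index and advances by the total number of threads
  // in the grid, so any launch configuration covers all elements.
  __global__ void double_elements(int* data, cudf::size_type size)
  {
    auto idx          = cudf::detail::grid_1d::global_thread_id();
    auto const stride = cudf::detail::grid_1d::grid_stride();
    while (idx < size) {
      data[idx] *= 2;
      idx += stride;
    }
  }

Because both helpers return the 64-bit cudf::thread_index_type, the index
arithmetic does not overflow 32-bit int for very large grids, which is the same
reason the hand-written versions routed their products through
cudf::thread_index_type.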