From e5377a9ad318c3a2b4105d1f86742b41d9c940ef Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 22 Nov 2024 10:55:55 -0500
Subject: [PATCH 1/3] Change binops for-each kernel to thrust::for_each_n

---
 cpp/src/binaryop/compiled/binary_ops.cuh | 29 ++++--------------------
 1 file changed, 4 insertions(+), 25 deletions(-)
diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index 06987139188..7e8334801cd 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -244,24 +244,6 @@ struct binary_op_double_device_dispatcher {
   }
 };
 
-/**
- * @brief Simplified for_each kernel
- *
- * @param size number of elements to process.
- * @param f Functor object to call for each element.
- */
-template <typename Functor>
-CUDF_KERNEL void for_each_kernel(cudf::size_type size, Functor f)
-{
-  auto start        = cudf::detail::grid_1d::global_thread_id();
-  auto const stride = cudf::detail::grid_1d::grid_stride();
-
-#pragma unroll
-  for (auto i = start; i < size; i += stride) {
-    f(i);
-  }
-}
-
 /**
  * @brief Launches Simplified for_each kernel with maximum occupancy grid dimensions.
  *
@@ -273,13 +255,10 @@ CUDF_KERNEL void for_each_kernel(cudf::size_type size, Functor f)
 template <typename Functor>
 void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f)
 {
-  int block_size;
-  int min_grid_size;
-  CUDF_CUDA_TRY(
-    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, for_each_kernel<decltype(f)>));
-  auto grid = cudf::detail::grid_1d(size, block_size, 2 /* elements_per_thread */);
-  for_each_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-    size, std::forward<Functor&&>(f));
+  thrust::for_each_n(rmm::exec_policy(stream),
+                     thrust::counting_iterator<size_type>(0),
+                     size,
+                     std::forward<Functor&&>(f));
 }
 
 template <class BinaryOperator>

From 1158c750aa53a4ef7b346eb524b441e30b96f992 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 22 Nov 2024 12:41:40 -0500
Subject: [PATCH 2/3] use exec_policy_nosync

---
 cpp/src/binaryop/compiled/binary_ops.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index 7e8334801cd..ea7bb72bf12 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -255,7 +255,7 @@ struct binary_op_double_device_dispatcher {
 template <typename Functor>
 void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f)
 {
-  thrust::for_each_n(rmm::exec_policy(stream),
+  thrust::for_each_n(rmm::exec_policy_nosync(stream),
                      thrust::counting_iterator<size_type>(0),
                      size,
                      std::forward<Functor&&>(f));

From c6a02cae3acde310a7c193ffd1f452c4809f4cef Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 22 Nov 2024 15:14:44 -0500
Subject: [PATCH 3/3] remove for_each() utility

---
 cpp/src/binaryop/compiled/binary_ops.cuh  | 35 ++++++---------------
 cpp/src/binaryop/compiled/equality_ops.cu | 38 +++++++++++++----------
 2 files changed, 31 insertions(+), 42 deletions(-)

diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index ea7bb72bf12..ec63504a414 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -244,23 +244,6 @@ struct binary_op_double_device_dispatcher {
   }
 };
 
-/**
- * @brief Launches Simplified for_each kernel with maximum occupancy grid dimensions.
- *
- * @tparam Functor
- * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param size number of elements to process.
- * @param f Functor object to call for each element.
- */
-template <typename Functor>
-void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f)
-{
-  thrust::for_each_n(rmm::exec_policy_nosync(stream),
-                     thrust::counting_iterator<size_type>(0),
-                     size,
-                     std::forward<Functor&&>(f));
-}
-
 template <class BinaryOperator>
 void apply_binary_op(mutable_column_view& out,
                      column_view const& lhs,
@@ -277,16 +260,18 @@ void apply_binary_op(mutable_column_view& out,
   // Create binop functor instance
   if (common_dtype) {
     // Execute it on every element
-    for_each(stream,
-             out.size(),
-             binary_op_device_dispatcher<BinaryOperator>{
-               *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+    thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                       thrust::counting_iterator<size_type>(0),
+                       out.size(),
+                       binary_op_device_dispatcher<BinaryOperator>{
+                         *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
   } else {
     // Execute it on every element
-    for_each(stream,
-             out.size(),
-             binary_op_double_device_dispatcher<BinaryOperator>{
-               *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+    thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                       thrust::counting_iterator<size_type>(0),
+                       out.size(),
+                       binary_op_double_device_dispatcher<BinaryOperator>{
+                         *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
   }
 }
 
diff --git a/cpp/src/binaryop/compiled/equality_ops.cu b/cpp/src/binaryop/compiled/equality_ops.cu
index 041fca76494..d8c50683026 100644
--- a/cpp/src/binaryop/compiled/equality_ops.cu
+++ b/cpp/src/binaryop/compiled/equality_ops.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,27 +34,31 @@ void dispatch_equality_op(mutable_column_view& out,
   auto rhsd         = column_device_view::create(rhs, stream);
   if (common_dtype) {
     if (op == binary_operator::EQUAL) {
-      for_each(stream,
-               out.size(),
-               binary_op_device_dispatcher<ops::Equal>{
-                 *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+      thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                         thrust::counting_iterator<size_type>(0),
+                         out.size(),
+                         binary_op_device_dispatcher<ops::Equal>{
+                           *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
     } else if (op == binary_operator::NOT_EQUAL) {
-      for_each(stream,
-               out.size(),
-               binary_op_device_dispatcher<ops::NotEqual>{
-                 *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+      thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                         thrust::counting_iterator<size_type>(0),
+                         out.size(),
+                         binary_op_device_dispatcher<ops::NotEqual>{
+                           *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
     }
   } else {
     if (op == binary_operator::EQUAL) {
-      for_each(stream,
-               out.size(),
-               binary_op_double_device_dispatcher<ops::Equal>{
-                 *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+      thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                         thrust::counting_iterator<size_type>(0),
+                         out.size(),
+                         binary_op_double_device_dispatcher<ops::Equal>{
+                           *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
     } else if (op == binary_operator::NOT_EQUAL) {
-      for_each(stream,
-               out.size(),
-               binary_op_double_device_dispatcher<ops::NotEqual>{
-                 *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+      thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                         thrust::counting_iterator<size_type>(0),
+                         out.size(),
+                         binary_op_double_device_dispatcher<ops::NotEqual>{
+                           *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
     }
   }
 }