From e5377a9ad318c3a2b4105d1f86742b41d9c940ef Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 22 Nov 2024 10:55:55 -0500 Subject: [PATCH 1/3] Change binops for-each kernel to thrust::for_each_n --- cpp/src/binaryop/compiled/binary_ops.cuh | 29 ++++-------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index 06987139188..7e8334801cd 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -244,24 +244,6 @@ struct binary_op_double_device_dispatcher { } }; -/** - * @brief Simplified for_each kernel - * - * @param size number of elements to process. - * @param f Functor object to call for each element. - */ -template -CUDF_KERNEL void for_each_kernel(cudf::size_type size, Functor f) -{ - auto start = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); - -#pragma unroll - for (auto i = start; i < size; i += stride) { - f(i); - } -} - /** * @brief Launches Simplified for_each kernel with maximum occupancy grid dimensions. * @@ -273,13 +255,10 @@ CUDF_KERNEL void for_each_kernel(cudf::size_type size, Functor f) template void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f) { - int block_size; - int min_grid_size; - CUDF_CUDA_TRY( - cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, for_each_kernel)); - auto grid = cudf::detail::grid_1d(size, block_size, 2 /* elements_per_thread */); - for_each_kernel<<>>( - size, std::forward(f)); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::counting_iterator(0), + size, + std::forward(f)); } template From 1158c750aa53a4ef7b346eb524b441e30b96f992 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 22 Nov 2024 12:41:40 -0500 Subject: [PATCH 2/3] use exec_policy_nosync --- cpp/src/binaryop/compiled/binary_ops.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index 7e8334801cd..ea7bb72bf12 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -255,7 +255,7 @@ struct binary_op_double_device_dispatcher { template void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f) { - thrust::for_each_n(rmm::exec_policy(stream), + thrust::for_each_n(rmm::exec_policy_nosync(stream), thrust::counting_iterator(0), size, std::forward(f)); From c6a02cae3acde310a7c193ffd1f452c4809f4cef Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 22 Nov 2024 15:14:44 -0500 Subject: [PATCH 3/3] remove for_each() utility --- cpp/src/binaryop/compiled/binary_ops.cuh | 35 ++++++--------------- cpp/src/binaryop/compiled/equality_ops.cu | 38 +++++++++++++---------- 2 files changed, 31 insertions(+), 42 deletions(-) diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index ea7bb72bf12..ec63504a414 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -244,23 +244,6 @@ struct binary_op_double_device_dispatcher { } }; -/** - * @brief Launches Simplified for_each kernel with maximum occupancy grid dimensions. - * - * @tparam Functor - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param size number of elements to process. - * @param f Functor object to call for each element. - */ -template -void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f) -{ - thrust::for_each_n(rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - size, - std::forward(f)); -} - template void apply_binary_op(mutable_column_view& out, column_view const& lhs, @@ -277,16 +260,18 @@ void apply_binary_op(mutable_column_view& out, // Create binop functor instance if (common_dtype) { // Execute it on every element - for_each(stream, - out.size(), - binary_op_device_dispatcher{ - *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_device_dispatcher{ + *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } else { // Execute it on every element - for_each(stream, - out.size(), - binary_op_double_device_dispatcher{ - *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_double_device_dispatcher{ + *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } } diff --git a/cpp/src/binaryop/compiled/equality_ops.cu b/cpp/src/binaryop/compiled/equality_ops.cu index 041fca76494..d8c50683026 100644 --- a/cpp/src/binaryop/compiled/equality_ops.cu +++ b/cpp/src/binaryop/compiled/equality_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,27 +34,31 @@ void dispatch_equality_op(mutable_column_view& out, auto rhsd = column_device_view::create(rhs, stream); if (common_dtype) { if (op == binary_operator::EQUAL) { - for_each(stream, - out.size(), - binary_op_device_dispatcher{ - *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_device_dispatcher{ + *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } else if (op == binary_operator::NOT_EQUAL) { - for_each(stream, - out.size(), - binary_op_device_dispatcher{ - *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_device_dispatcher{ + *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } } else { if (op == binary_operator::EQUAL) { - for_each(stream, - out.size(), - binary_op_double_device_dispatcher{ - *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_double_device_dispatcher{ + *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } else if (op == binary_operator::NOT_EQUAL) { - for_each(stream, - out.size(), - binary_op_double_device_dispatcher{ - *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_double_device_dispatcher{ + *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } } }