From 69e6dbbf447a951e4b08f15c737eedcbaf3291da Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Fri, 3 Dec 2021 10:18:04 -0500 Subject: [PATCH] Move the binary_ops common dispatcher logic to be executed on the CPU (#9816) * move NullEquals to separate file * To improve runtime performance move more binary_ops dispatch to host * make sure to forceinline the operator_dispatcher * Correct style issues found by ci * Expand the binary-op compiled benchmark suite * Ensure forceinline is on binary ops device dispatch functions * Correct style issues found by ci Co-authored-by: Karthikeyan Natarajan Co-authored-by: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> --- cpp/CMakeLists.txt | 1 + .../binaryop/compiled_binaryop_benchmark.cpp | 66 ++++++++++--------- .../cudf/utilities/type_dispatcher.hpp | 14 ++-- cpp/src/binaryop/compiled/NullEquals.cu | 26 ++++++++ cpp/src/binaryop/compiled/binary_ops.cu | 2 +- cpp/src/binaryop/compiled/binary_ops.cuh | 63 ++++++++++++------ cpp/src/binaryop/compiled/equality_ops.cu | 41 ++++++++---- 7 files changed, 141 insertions(+), 72 deletions(-) create mode 100644 cpp/src/binaryop/compiled/NullEquals.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 59dc3c74af2..37f93f1868b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -185,6 +185,7 @@ add_library( src/binaryop/compiled/LogicalOr.cu src/binaryop/compiled/Mod.cu src/binaryop/compiled/Mul.cu + src/binaryop/compiled/NullEquals.cu src/binaryop/compiled/NullMax.cu src/binaryop/compiled/NullMin.cu src/binaryop/compiled/PMod.cu diff --git a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp index bc0818ace4b..8d04f8bdcb2 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp @@ -50,14 +50,14 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) } // TODO tparam boolean for null. -#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ +#define BINARYOP_BENCHMARK_DEFINE(name, TypeLhs, TypeRhs, binop, TypeOut) \ BENCHMARK_TEMPLATE_DEFINE_F( \ - COMPILED_BINARYOP, binop, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \ + COMPILED_BINARYOP, name, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \ (::benchmark::State & st) \ { \ BM_compiled_binaryop(st, cudf::binary_operator::binop); \ } \ - BENCHMARK_REGISTER_F(COMPILED_BINARYOP, binop) \ + BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \ ->Unit(benchmark::kMicrosecond) \ ->UseManualTime() \ ->Arg(10000) /* 10k */ \ @@ -70,30 +70,36 @@ using namespace cudf; using namespace numeric; // clang-format off -BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t); -BINARYOP_BENCHMARK_DEFINE(duration_s, duration_D, SUB, duration_ms); -BINARYOP_BENCHMARK_DEFINE(float, float, MUL, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, TRUE_DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, FLOOR_DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(double, double, MOD, double); -BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double); -BINARYOP_BENCHMARK_DEFINE(int32_t, uint8_t, PYMOD, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, POW, double); -BINARYOP_BENCHMARK_DEFINE(float, double, LOG_BASE, double); -BINARYOP_BENCHMARK_DEFINE(float, double, ATAN2, double); -BINARYOP_BENCHMARK_DEFINE(int, int, SHIFT_LEFT, int); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, SHIFT_RIGHT, int); -BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, BITWISE_AND, int16_t); -BINARYOP_BENCHMARK_DEFINE(int16_t, int32_t, BITWISE_OR, int64_t); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, BITWISE_XOR, int32_t); -BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, bool); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, LOGICAL_OR, bool); -BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, EQUAL, bool); -BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NOT_EQUAL, bool); -BINARYOP_BENCHMARK_DEFINE(timestamp_s, timestamp_s, LESS, bool); -BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s, GREATER, bool); -BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, NULL_EQUALS, bool); -BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NULL_MAX, decimal32); -BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s, NULL_MIN, timestamp_s); +BINARYOP_BENCHMARK_DEFINE(ADD_1, float, float, ADD, float); +BINARYOP_BENCHMARK_DEFINE(ADD_2, timestamp_s, duration_s, ADD, timestamp_s); +BINARYOP_BENCHMARK_DEFINE(SUB_1, duration_s, duration_D, SUB, duration_ms); +BINARYOP_BENCHMARK_DEFINE(SUB_2, int64_t, int64_t, SUB, int64_t); +BINARYOP_BENCHMARK_DEFINE(MUL_1, float, float, MUL, int64_t); +BINARYOP_BENCHMARK_DEFINE(MUL_2, duration_s, int64_t, MUL, duration_s); +BINARYOP_BENCHMARK_DEFINE(DIV_1, int64_t, int64_t, DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(DIV_2, duration_ms, int32_t, DIV, duration_ms); +BINARYOP_BENCHMARK_DEFINE(TRUE_DIV, int64_t, int64_t, TRUE_DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(FLOOR_DIV, int64_t, int64_t, FLOOR_DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(MOD_1, double, double, MOD, double); +BINARYOP_BENCHMARK_DEFINE(MOD_2, duration_ms, int64_t, MOD, duration_ms); +BINARYOP_BENCHMARK_DEFINE(PMOD, int32_t, int64_t, PMOD, double); +BINARYOP_BENCHMARK_DEFINE(PYMOD, int32_t, uint8_t, PYMOD, int64_t); +BINARYOP_BENCHMARK_DEFINE(POW, int64_t, int64_t, POW, double); +BINARYOP_BENCHMARK_DEFINE(LOG_BASE, float, double, LOG_BASE, double); +BINARYOP_BENCHMARK_DEFINE(ATAN2, float, double, ATAN2, double); +BINARYOP_BENCHMARK_DEFINE(SHIFT_LEFT, int, int, SHIFT_LEFT, int); +BINARYOP_BENCHMARK_DEFINE(SHIFT_RIGHT, int16_t, int64_t, SHIFT_RIGHT, int); +BINARYOP_BENCHMARK_DEFINE(USHIFT_RIGHT, int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); +BINARYOP_BENCHMARK_DEFINE(BITWISE_AND, int64_t, int32_t, BITWISE_AND, int16_t); +BINARYOP_BENCHMARK_DEFINE(BITWISE_OR, int16_t, int32_t, BITWISE_OR, int64_t); +BINARYOP_BENCHMARK_DEFINE(BITWISE_XOR, int16_t, int64_t, BITWISE_XOR, int32_t); +BINARYOP_BENCHMARK_DEFINE(LOGICAL_AND, double, int8_t, LOGICAL_AND, bool); +BINARYOP_BENCHMARK_DEFINE(LOGICAL_OR, int16_t, int64_t, LOGICAL_OR, bool); +BINARYOP_BENCHMARK_DEFINE(EQUAL_1, int32_t, int64_t, EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(EQUAL_2, duration_ms, duration_ns, EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(NOT_EQUAL, decimal32, decimal32, NOT_EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(LESS, timestamp_s, timestamp_s, LESS, bool); +BINARYOP_BENCHMARK_DEFINE(GREATER, timestamp_ms, timestamp_s, GREATER, bool); +BINARYOP_BENCHMARK_DEFINE(NULL_EQUALS, duration_ms, duration_ns, NULL_EQUALS, bool); +BINARYOP_BENCHMARK_DEFINE(NULL_MAX, decimal32, decimal32, NULL_MAX, decimal32); +BINARYOP_BENCHMARK_DEFINE(NULL_MIN, timestamp_D, timestamp_s, NULL_MIN, timestamp_s); diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index a04b8309142..d7d38aba4f3 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -531,7 +531,7 @@ template struct double_type_dispatcher_second_type { #pragma nv_exec_check_disable template - CUDA_HOST_DEVICE_CALLABLE decltype(auto) operator()(F&& f, Ts&&... args) const + CUDF_HDFI decltype(auto) operator()(F&& f, Ts&&... args) const { return f.template operator()(std::forward(args)...); } @@ -541,9 +541,7 @@ template