From a5d2407b93de444a6a7faf9db4b7dbf4ecbfe9ed Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 8 Apr 2021 16:49:23 -0500 Subject: [PATCH] jitify direct-to-cubin compilation and caching. (#7919) This changes jitify2 `get_kernel` invocations to pass an `-arch=sm_.` argument (the trailing `.` tells jitify to substitute the current device's compute capability), which causes jitify to compile directly to cubin and disk-cache the cubin, rather than caching PTX. This alleviates some compilation issues for specific (possibly unsupported) environments/configurations, but more importantly should provide faster launches from cached kernels. Authors: - Christopher Harris (https://github.com/cwharris) Approvers: - MithunR (https://github.com/mythrocks) - Keith Kraus (https://github.com/kkraus14) - Devavret Makkar (https://github.com/devavret) URL: https://github.com/rapidsai/cudf/pull/7919 --- cpp/src/binaryop/binaryop.cpp | 13 +++++++------ cpp/src/rolling/rolling_detail.cuh | 5 +++-- cpp/src/transform/transform.cpp | 7 ++++--- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 55a7f7a9f1b..11a3383ee87 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -89,7 +89,7 @@ void binary_operation(mutable_column_view& out, get_operator_name(op, op_type)); cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) - .get_kernel(kernel_name) // + .get_kernel(kernel_name, {}, {}, {"-arch=sm_."}) // ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // ->launch(out.size(), cudf::jit::get_data_ptr(out), @@ -108,7 +108,7 @@ void binary_operation(mutable_column_view& out, get_operator_name(op, op_type)); cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) - .get_kernel(kernel_name) // + .get_kernel(kernel_name, {}, {}, {"-arch=sm_."}) // ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // ->launch(out.size(), cudf::jit::get_data_ptr(out), @@ -150,7 +150,7 @@ void binary_operation(mutable_column_view& out, get_operator_name(op, OperatorType::Direct)); 
cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) - .get_kernel(kernel_name) // + .get_kernel(kernel_name, {}, {}, {"-arch=sm_."}) // ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // ->launch(out.size(), cudf::jit::get_data_ptr(out), @@ -170,7 +170,7 @@ void binary_operation(mutable_column_view& out, get_operator_name(op, OperatorType::Direct)); cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) - .get_kernel(kernel_name) // + .get_kernel(kernel_name, {}, {}, {"-arch=sm_."}) // ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // ->launch(out.size(), cudf::jit::get_data_ptr(out), @@ -200,8 +200,9 @@ void binary_operation(mutable_column_view& out, get_operator_name(binary_operator::GENERIC_BINARY, OperatorType::Direct)); cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) - .get_kernel(kernel_name, {}, {{"binaryop/jit/operation-udf.hpp", cuda_source}}) // - ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // + .get_kernel( + kernel_name, {}, {{"binaryop/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."}) // + ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // ->launch(out.size(), cudf::jit::get_data_ptr(out), cudf::jit::get_data_ptr(lhs), diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index bb431fad537..c6439486461 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -1295,8 +1295,9 @@ std::unique_ptr rolling_window_udf(column_view const& input, following_window_str.c_str()); cudf::jit::get_program_cache(*rolling_jit_kernel_cu_jit) - .get_kernel(kernel_name, {}, {{"rolling/jit/operation-udf.hpp", cuda_source}}) // - ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // + .get_kernel( + kernel_name, {}, {{"rolling/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."}) // + ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // ->launch(input.size(), cudf::jit::get_data_ptr(input), input.null_mask(), diff --git 
a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index 8f176d035d2..40feab00b3c 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -55,9 +55,10 @@ void unary_operation(mutable_column_view output, "GENERIC_UNARY_OP"); cudf::jit::get_program_cache(*transform_jit_kernel_cu_jit) - .get_kernel(kernel_name, {}, {{"transform/jit/operation-udf.hpp", cuda_source}}) // - ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // - ->launch(output.size(), // + .get_kernel( + kernel_name, {}, {{"transform/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."}) // + ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // + ->launch(output.size(), // cudf::jit::get_data_ptr(output), cudf::jit::get_data_ptr(input)); }