Skip to content

Commit

Permalink
jitify direct-to-cubin compilation and caching. (#7919)
Browse files Browse the repository at this point in the history
This changes jitify2 `get_kernel` invocations to pass an `-arch=sm_.` argument, which causes jitify to compile and disk-cache directly to cubin, rather than caching PTX. This alleviates some compilation issues in specific (possibly unsupported) environments/configurations, but more importantly should provide faster kernel launches from the cache.

Authors:
  - Christopher Harris (https://github.com/cwharris)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - Keith Kraus (https://github.com/kkraus14)
  - Devavret Makkar (https://github.com/devavret)

URL: #7919
Loading branch information
cwharris authored Apr 8, 2021
1 parent 192ff46 commit a5d2407
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 11 deletions.
13 changes: 7 additions & 6 deletions cpp/src/binaryop/binaryop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ void binary_operation(mutable_column_view& out,
get_operator_name(op, op_type));

cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
.get_kernel(kernel_name) //
.get_kernel(kernel_name, {}, {}, {"-arch=sm_."}) //
->configure_1d_max_occupancy(0, 0, 0, stream.value()) //
->launch(out.size(),
cudf::jit::get_data_ptr(out),
Expand All @@ -108,7 +108,7 @@ void binary_operation(mutable_column_view& out,
get_operator_name(op, op_type));

cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
.get_kernel(kernel_name) //
.get_kernel(kernel_name, {}, {}, {"-arch=sm_."}) //
->configure_1d_max_occupancy(0, 0, 0, stream.value()) //
->launch(out.size(),
cudf::jit::get_data_ptr(out),
Expand Down Expand Up @@ -150,7 +150,7 @@ void binary_operation(mutable_column_view& out,
get_operator_name(op, OperatorType::Direct));

cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
.get_kernel(kernel_name) //
.get_kernel(kernel_name, {}, {}, {"-arch=sm_."}) //
->configure_1d_max_occupancy(0, 0, 0, stream.value()) //
->launch(out.size(),
cudf::jit::get_data_ptr(out),
Expand All @@ -170,7 +170,7 @@ void binary_operation(mutable_column_view& out,
get_operator_name(op, OperatorType::Direct));

cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
.get_kernel(kernel_name) //
.get_kernel(kernel_name, {}, {}, {"-arch=sm_."}) //
->configure_1d_max_occupancy(0, 0, 0, stream.value()) //
->launch(out.size(),
cudf::jit::get_data_ptr(out),
Expand Down Expand Up @@ -200,8 +200,9 @@ void binary_operation(mutable_column_view& out,
get_operator_name(binary_operator::GENERIC_BINARY, OperatorType::Direct));

cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
.get_kernel(kernel_name, {}, {{"binaryop/jit/operation-udf.hpp", cuda_source}}) //
->configure_1d_max_occupancy(0, 0, 0, stream.value()) //
.get_kernel(
kernel_name, {}, {{"binaryop/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."}) //
->configure_1d_max_occupancy(0, 0, 0, stream.value()) //
->launch(out.size(),
cudf::jit::get_data_ptr(out),
cudf::jit::get_data_ptr(lhs),
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/rolling/rolling_detail.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1295,8 +1295,9 @@ std::unique_ptr<column> rolling_window_udf(column_view const& input,
following_window_str.c_str());

cudf::jit::get_program_cache(*rolling_jit_kernel_cu_jit)
.get_kernel(kernel_name, {}, {{"rolling/jit/operation-udf.hpp", cuda_source}}) //
->configure_1d_max_occupancy(0, 0, 0, stream.value()) //
.get_kernel(
kernel_name, {}, {{"rolling/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."}) //
->configure_1d_max_occupancy(0, 0, 0, stream.value()) //
->launch(input.size(),
cudf::jit::get_data_ptr(input),
input.null_mask(),
Expand Down
7 changes: 4 additions & 3 deletions cpp/src/transform/transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,10 @@ void unary_operation(mutable_column_view output,
"GENERIC_UNARY_OP");

cudf::jit::get_program_cache(*transform_jit_kernel_cu_jit)
.get_kernel(kernel_name, {}, {{"transform/jit/operation-udf.hpp", cuda_source}}) //
->configure_1d_max_occupancy(0, 0, 0, stream.value()) //
->launch(output.size(), //
.get_kernel(
kernel_name, {}, {{"transform/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."}) //
->configure_1d_max_occupancy(0, 0, 0, stream.value()) //
->launch(output.size(), //
cudf::jit::get_data_ptr(output),
cudf::jit::get_data_ptr(input));
}
Expand Down

0 comments on commit a5d2407

Please sign in to comment.