Adds launch bounds hints to mixed join kernels to address regression …

…seen in NDS q72 in Spark (#10534) The following change addresses a performance degradation we noticed in the `mixed_join` and `compute_mixed_join_output_size` that looks to be tied to the theoretical occupancy of these kernels, as limited by the number of registers used. The regression is triggered by this patch: #9727, which improves handling of unreachable code paths. That said, somehow, this change is altering the number of registers these kernels need. Both `mixed_join` and `compute_mixed_join_output_size` are very sensitive to the register count, per NSight compute. With the patch, the register required changed from 92 to 102, and 118 to 141 respectively. The fix here hints the compiler what our block size is (128 threads). This, from our testing, allows the compiler to reduce the number of registers required to 128 for `compute_mixed_join_output_size` and 96 for `mixed_join`. This lead to better occupancy (I think @nvdbaranec measured it going from 30% to 50%) and I saw the wall clock time of q72 (which started all this) to go from 133s to 121s, which is within the ballpark I'd expect. Authors: - Alessandro Bellina (https://github.com/abellina) Approvers: - Mike Wilson (https://github.com/hyperbolic2346)
rapidsai · Mar 30, 2022 · 4770599 · 4770599
1 parent c42cee3
commit 4770599
Show file tree

Hide file tree

Showing 4 changed files with 27 additions and 25 deletions.
diff --git a/cpp/src/join/mixed_join_kernels.cu b/cpp/src/join/mixed_join_kernels.cu
@@ -35,18 +35,19 @@ namespace detail {
 namespace cg = cooperative_groups;
 
 template <cudf::size_type block_size, bool has_nulls>
-__global__ void mixed_join(table_device_view left_table,
-                           table_device_view right_table,
-                           table_device_view probe,
-                           table_device_view build,
-                           row_equality const equality_probe,
-                           join_kind const join_type,
-                           cudf::detail::mixed_multimap_type::device_view hash_table_view,
-                           size_type* join_output_l,
-                           size_type* join_output_r,
-                           cudf::ast::detail::expression_device_view device_expression_data,
-                           cudf::size_type const* join_result_offsets,
-                           bool const swap_tables)
+__launch_bounds__(block_size) __global__
+  void mixed_join(table_device_view left_table,
+                  table_device_view right_table,
+                  table_device_view probe,
+                  table_device_view build,
+                  row_equality const equality_probe,
+                  join_kind const join_type,
+                  cudf::detail::mixed_multimap_type::device_view hash_table_view,
+                  size_type* join_output_l,
+                  size_type* join_output_r,
+                  cudf::ast::detail::expression_device_view device_expression_data,
+                  cudf::size_type const* join_result_offsets,
+                  bool const swap_tables)
 {
   // Normally the casting of a shared memory array is used to create multiple
   // arrays of different types from the shared memory buffer, but here it is

diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu
@@ -32,17 +32,18 @@ namespace detail {
 namespace cg = cooperative_groups;
 
 template <cudf::size_type block_size, bool has_nulls>
-__global__ void mixed_join_semi(table_device_view left_table,
-                                table_device_view right_table,
-                                table_device_view probe,
-                                table_device_view build,
-                                row_equality const equality_probe,
-                                join_kind const join_type,
-                                cudf::detail::semi_map_type::device_view hash_table_view,
-                                size_type* join_output_l,
-                                cudf::ast::detail::expression_device_view device_expression_data,
-                                cudf::size_type const* join_result_offsets,
-                                bool const swap_tables)
+__launch_bounds__(block_size) __global__
+  void mixed_join_semi(table_device_view left_table,
+                       table_device_view right_table,
+                       table_device_view probe,
+                       table_device_view build,
+                       row_equality const equality_probe,
+                       join_kind const join_type,
+                       cudf::detail::semi_map_type::device_view hash_table_view,
+                       size_type* join_output_l,
+                       cudf::ast::detail::expression_device_view device_expression_data,
+                       cudf::size_type const* join_result_offsets,
+                       bool const swap_tables)
 {
   // Normally the casting of a shared memory array is used to create multiple
   // arrays of different types from the shared memory buffer, but here it is

diff --git a/cpp/src/join/mixed_join_size_kernels.cu b/cpp/src/join/mixed_join_size_kernels.cu
@@ -35,7 +35,7 @@ namespace detail {
 namespace cg = cooperative_groups;
 
 template <int block_size, bool has_nulls>
-__global__ void compute_mixed_join_output_size(
+__launch_bounds__(block_size) __global__ void compute_mixed_join_output_size(
   table_device_view left_table,
   table_device_view right_table,
   table_device_view probe,

diff --git a/cpp/src/join/mixed_join_size_kernels_semi.cu b/cpp/src/join/mixed_join_size_kernels_semi.cu
@@ -32,7 +32,7 @@ namespace detail {
 namespace cg = cooperative_groups;
 
 template <int block_size, bool has_nulls>
-__global__ void compute_mixed_join_output_size_semi(
+__launch_bounds__(block_size) __global__ void compute_mixed_join_output_size_semi(
   table_device_view left_table,
   table_device_view right_table,
   table_device_view probe,