Add accurate hash join size functions (#8453)

Addresses #8237. This PR adds three join size APIs (`hash_join::inner_join_size`, `hash_join::left_join_size` and `hash_join::full_join_size`) to the `hash_join` class, one for each type of join; each returns the exact number of matches with the specified probe table. It completely removes the deprecated size estimation logic from the current implementation. This PR also updates the existing join APIs by adding an optional `output_size` argument. If `output_size.has_value()`, that value is used directly for further computation. Otherwise, the target join internally invokes its corresponding size function. `TODO`: the current `full_join_size` uses a 2-step algorithm similar to the one used in `hash_join::full_join`. It duplicates certain computations performed by `full_join` and thus should be refactored during `cuco` integration. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Robert Maynard (https://github.com/robertmaynard) - Robert (Bobby) Evans (https://github.com/revans2) URL: #8453
rapidsai · Jun 14, 2021 · 82005fe · 82005fe
1 parent 79735aa
commit 82005fe
Show file tree

Hide file tree

Showing 5 changed files with 367 additions and 142 deletions.
diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp
@@ -22,6 +22,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include <optional>
 #include <vector>
 
 namespace cudf {
@@ -522,13 +523,15 @@ class hash_join {
 
   /**
    * Returns the row indices that can be used to construct the result of performing
-   * an inner join between two tables. @see cudf::inner_join().
+   * an inner join between two tables. @see cudf::inner_join(). Behavior is undefined if the
+   * provided `output_size` is smaller than the actual output size.
    *
    * @param probe The probe table, from which the tuples are probed.
    * @param compare_nulls Controls whether null join-key values should match or not.
+   * @param output_size Optional value which allows users to specify the exact output size.
+   * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource used to allocate the returned table and columns' device
    * memory.
-   * @param stream CUDA stream used for device memory operations and kernel launches
    *
    * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct
    * the result of performing an inner join between two tables with `build` and `probe`
@@ -537,19 +540,22 @@ class hash_join {
   std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
             std::unique_ptr<rmm::device_uvector<size_type>>>
   inner_join(cudf::table_view const& probe,
-             null_equality compare_nulls         = null_equality::EQUAL,
-             rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
-             rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
+             null_equality compare_nulls            = null_equality::EQUAL,
+             std::optional<std::size_t> output_size = {},
+             rmm::cuda_stream_view stream           = rmm::cuda_stream_default,
+             rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource()) const;
 
   /**
    * Returns the row indices that can be used to construct the result of performing
-   * a left join between two tables. @see cudf::left_join().
+   * a left join between two tables. @see cudf::left_join(). Behavior is undefined if the
+   * provided `output_size` is smaller than the actual output size.
    *
    * @param probe The probe table, from which the tuples are probed.
    * @param compare_nulls Controls whether null join-key values should match or not.
+   * @param output_size Optional value which allows users to specify the exact output size.
+   * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource used to allocate the returned table and columns' device
    * memory.
-   * @param stream CUDA stream used for device memory operations and kernel launches
    *
    * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct
    * the result of performing a left join between two tables with `build` and `probe`
@@ -558,19 +564,22 @@ class hash_join {
   std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
             std::unique_ptr<rmm::device_uvector<size_type>>>
   left_join(cudf::table_view const& probe,
-            null_equality compare_nulls         = null_equality::EQUAL,
-            rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
-            rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
+            null_equality compare_nulls            = null_equality::EQUAL,
+            std::optional<std::size_t> output_size = {},
+            rmm::cuda_stream_view stream           = rmm::cuda_stream_default,
+            rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource()) const;
 
   /**
    * Returns the row indices that can be used to construct the result of performing
-   * a full join between two tables. @see cudf::full_join().
+   * a full join between two tables. @see cudf::full_join(). Behavior is undefined if the
+   * provided `output_size` is smaller than the actual output size.
    *
    * @param probe The probe table, from which the tuples are probed.
    * @param compare_nulls Controls whether null join-key values should match or not.
+   * @param output_size Optional value which allows users to specify the exact output size.
+   * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource used to allocate the returned table and columns' device
    * memory.
-   * @param stream CUDA stream used for device memory operations and kernel launches
    *
    * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct
    * the result of performing a full join between two tables with `build` and `probe`
@@ -579,9 +588,59 @@ class hash_join {
   std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
             std::unique_ptr<rmm::device_uvector<size_type>>>
   full_join(cudf::table_view const& probe,
-            null_equality compare_nulls         = null_equality::EQUAL,
-            rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
-            rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
+            null_equality compare_nulls            = null_equality::EQUAL,
+            std::optional<std::size_t> output_size = {},
+            rmm::cuda_stream_view stream           = rmm::cuda_stream_default,
+            rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource()) const;
+
+  /**
+   * Returns the exact number of matches (rows) when performing an inner join with the specified
+   * probe table.
+   *
+   * @param probe The probe table, from which the tuples are probed.
+   * @param compare_nulls Controls whether null join-key values should match or not.
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   *
+   * @return The exact number of output rows when performing an inner join between two tables
+   * with `build` and `probe` as the join keys.
+   */
+  std::size_t inner_join_size(cudf::table_view const& probe,
+                              null_equality compare_nulls  = null_equality::EQUAL,
+                              rmm::cuda_stream_view stream = rmm::cuda_stream_default) const;
+
+  /**
+   * Returns the exact number of matches (rows) when performing a left join with the specified probe
+   * table.
+   *
+   * @param probe The probe table, from which the tuples are probed.
+   * @param compare_nulls Controls whether null join-key values should match or not.
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   *
+   * @return The exact number of output rows when performing a left join between two tables with
+   * `build` and `probe` as the join keys.
+   */
+  std::size_t left_join_size(cudf::table_view const& probe,
+                             null_equality compare_nulls  = null_equality::EQUAL,
+                             rmm::cuda_stream_view stream = rmm::cuda_stream_default) const;
+
+  /**
+   * Returns the exact number of matches (rows) when performing a full join with the specified probe
+   * table.
+   *
+   * @param probe The probe table, from which the tuples are probed.
+   * @param compare_nulls Controls whether null join-key values should match or not.
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   * @param mr Device memory resource used to allocate the intermediate table and columns' device
+   * memory.
+   *
+   * @return The exact number of output rows when performing a full join between two tables with
+   * `build` and `probe` as the join keys.
+   */
+  std::size_t full_join_size(
+    cudf::table_view const& probe,
+    null_equality compare_nulls         = null_equality::EQUAL,
+    rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
 
  private:
   struct hash_join_impl;