Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix hash join when the input tables have nulls on only one side #13120

Merged
merged 22 commits into from
Apr 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions cpp/benchmarks/join/join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@ void nvbench_inner_join(nvbench::state& state,
cudf::table_view const& right_input,
cudf::null_equality compare_nulls,
rmm::cuda_stream_view stream) {
cudf::hash_join hj_obj(left_input, compare_nulls, stream);
auto const has_nulls = cudf::has_nested_nulls(left_input) || cudf::has_nested_nulls(right_input)
? cudf::nullable_join::YES
: cudf::nullable_join::NO;
cudf::hash_join hj_obj(left_input, has_nulls, compare_nulls, stream);
return hj_obj.inner_join(right_input, std::nullopt, stream);
};

Expand All @@ -44,7 +47,10 @@ void nvbench_left_join(nvbench::state& state,
cudf::table_view const& right_input,
cudf::null_equality compare_nulls,
rmm::cuda_stream_view stream) {
cudf::hash_join hj_obj(left_input, compare_nulls, stream);
auto const has_nulls = cudf::has_nested_nulls(left_input) || cudf::has_nested_nulls(right_input)
? cudf::nullable_join::YES
: cudf::nullable_join::NO;
cudf::hash_join hj_obj(left_input, has_nulls, compare_nulls, stream);
return hj_obj.left_join(right_input, std::nullopt, stream);
};

Expand All @@ -61,7 +67,10 @@ void nvbench_full_join(nvbench::state& state,
cudf::table_view const& right_input,
cudf::null_equality compare_nulls,
rmm::cuda_stream_view stream) {
cudf::hash_join hj_obj(left_input, compare_nulls, stream);
auto const has_nulls = cudf::has_nested_nulls(left_input) || cudf::has_nested_nulls(right_input)
? cudf::nullable_join::YES
: cudf::nullable_join::NO;
cudf::hash_join hj_obj(left_input, has_nulls, compare_nulls, stream);
return hj_obj.full_join(right_input, std::nullopt, stream);
};

Expand Down
11 changes: 7 additions & 4 deletions cpp/include/cudf/detail/join.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,10 @@ struct hash_join {
hash_join& operator=(hash_join&&) = delete;

private:
bool const _is_empty; ///< true if `_hash_table` is empty
rmm::device_buffer const _composite_bitmask; ///< Bitmask to denote whether a row is valid
cudf::null_equality const _nulls_equal; ///< whether to consider nulls as equal
cudf::table_view _build; ///< input table to build the hash map
bool const _is_empty; ///< true if `_hash_table` is empty
bool const _has_nulls; ///< true if nulls are present in either build table or any probe table
cudf::null_equality const _nulls_equal; ///< whether to consider nulls as equal
cudf::table_view _build; ///< input table to build the hash map
std::shared_ptr<cudf::experimental::row::equality::preprocessed_table>
_preprocessed_build; ///< input table preprocessed for row operators
map_type _hash_table; ///< hash table built on `_build`
Expand All @@ -89,10 +89,13 @@ struct hash_join {
* @throw cudf::logic_error if the number of rows in `build` table exceeds MAX_JOIN_SIZE.
*
* @param build The build table, from which the hash table is built.
* @param has_nulls Flag to indicate whether any nulls exist in the `build` table or in
* any `probe` table that will be used later for the join.
* @param compare_nulls Controls whether null join-key values should match or not.
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
hash_join(cudf::table_view const& build,
bool has_nulls,
cudf::null_equality compare_nulls,
rmm::cuda_stream_view stream);

Expand Down
41 changes: 40 additions & 1 deletion cpp/include/cudf/join.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -256,6 +256,16 @@ std::unique_ptr<cudf::table> cross_join(
cudf::table_view const& right,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @brief Specifies whether any of the tables taking part in a hash join (the `build` table and
 * every `probe` table used afterwards) contains nulls.
 *
 * The value is supplied when a hash_join object is constructed and describes null existence
 * across all of the possible input tables. When it is not known whether nulls exist, `YES`
 * should be used as the default option.
 *
 * @note The underlying type is `bool` and `YES` is the first enumerator, so `YES` has the
 * underlying value `false` and `NO` has the underlying value `true`. Compare against the
 * enumerators instead of converting to `bool`.
 */
enum class nullable_join : bool {
  YES = false,  ///< Nulls exist, or their existence is unknown
  NO  = true    ///< No nulls exist in the build table or in any probe table
};

/**
* @brief Hash join that builds hash table in creation and probes results in subsequent `*_join`
* member functions.
Expand Down Expand Up @@ -289,6 +299,17 @@ class hash_join {
null_equality compare_nulls,
rmm::cuda_stream_view stream = cudf::get_default_stream());

/**
* @copydoc hash_join(cudf::table_view const&, null_equality, rmm::cuda_stream_view)
*
* @param has_nulls Flag to indicate whether any nulls exist in the `build` table or in
* any `probe` table that will be used later for the join.
*/
hash_join(cudf::table_view const& build,
nullable_join has_nulls,
null_equality compare_nulls,
rmm::cuda_stream_view stream = cudf::get_default_stream());

/**
* Returns the row indices that can be used to construct the result of performing
* an inner join between two tables. @see cudf::inner_join(). Behavior is undefined if the
Expand All @@ -300,6 +321,9 @@ class hash_join {
* @param mr Device memory resource used to allocate the returned table and columns' device
* memory.
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct
* the result of performing an inner join between two tables with `build` and `probe`
* as the join keys.
Expand All @@ -322,6 +346,9 @@ class hash_join {
* @param mr Device memory resource used to allocate the returned table and columns' device
* memory.
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct
* the result of performing a left join between two tables with `build` and `probe`
* as the join keys.
Expand All @@ -344,6 +371,9 @@ class hash_join {
* @param mr Device memory resource used to allocate the returned table and columns' device
* memory.
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct
* the result of performing a full join between two tables with `build` and `probe`
* as the join keys.
Expand All @@ -362,6 +392,9 @@ class hash_join {
* @param probe The probe table, from which the tuples are probed
* @param stream CUDA stream used for device memory operations and kernel launches
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return The exact number of output when performing an inner join between two tables with
* `build` and `probe` as the join keys.
*/
Expand All @@ -375,6 +408,9 @@ class hash_join {
* @param probe The probe table, from which the tuples are probed
* @param stream CUDA stream used for device memory operations and kernel launches
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return The exact number of output when performing a left join between two tables with `build`
* and `probe` as the join keys.
*/
Expand All @@ -390,6 +426,9 @@ class hash_join {
* @param mr Device memory resource used to allocate the intermediate table and columns' device
* memory.
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return The exact number of output when performing a full join between two tables with `build`
* and `probe` as the join keys.
*/
Expand Down
118 changes: 71 additions & 47 deletions cpp/src/join/hash_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -359,11 +359,11 @@ std::size_t get_full_join_size(

template <typename Hasher>
hash_join<Hasher>::hash_join(cudf::table_view const& build,
bool has_nulls,
cudf::null_equality compare_nulls,
rmm::cuda_stream_view stream)
: _is_empty{build.num_rows() == 0},
_composite_bitmask{
cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource()).first},
: _has_nulls(has_nulls),
_is_empty{build.num_rows() == 0},
_nulls_equal{compare_nulls},
_hash_table{compute_hash_table_size(build.num_rows()),
cuco::empty_key{std::numeric_limits<hash_value_type>::max()},
Expand All @@ -381,11 +381,14 @@ hash_join<Hasher>::hash_join(cudf::table_view const& build,

if (_is_empty) { return; }

auto const row_bitmask =
cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource()).first;
cudf::detail::build_join_hash_table(_build,
_preprocessed_build,
_hash_table,
_has_nulls,
_nulls_equal,
static_cast<bitmask_type const*>(_composite_bitmask.data()),
reinterpret_cast<bitmask_type const*>(row_bitmask.data()),
stream);
}

Expand Down Expand Up @@ -434,19 +437,21 @@ std::size_t hash_join<Hasher>::inner_join_size(cudf::table_view const& probe,
// Return directly if build table is empty
if (_is_empty) { return 0; }

CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe),
"Probe table has nulls while build table was not hashed with null check.");

auto const preprocessed_probe =
cudf::experimental::row::equality::preprocessed_table::create(probe, stream);

return cudf::detail::compute_join_output_size(
_build,
probe,
_preprocessed_build,
preprocessed_probe,
_hash_table,
cudf::detail::join_kind::INNER_JOIN,
cudf::has_nested_nulls(probe) || cudf::has_nested_nulls(_build),
_nulls_equal,
stream);
return cudf::detail::compute_join_output_size(_build,
probe,
_preprocessed_build,
preprocessed_probe,
_hash_table,
cudf::detail::join_kind::INNER_JOIN,
_has_nulls,
_nulls_equal,
stream);
}

template <typename Hasher>
Expand All @@ -458,19 +463,21 @@ std::size_t hash_join<Hasher>::left_join_size(cudf::table_view const& probe,
// Trivial left join case - exit early
if (_is_empty) { return probe.num_rows(); }

CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe),
"Probe table has nulls while build table was not hashed with null check.");
ttnghia marked this conversation as resolved.
Show resolved Hide resolved

auto const preprocessed_probe =
cudf::experimental::row::equality::preprocessed_table::create(probe, stream);

return cudf::detail::compute_join_output_size(
_build,
probe,
_preprocessed_build,
preprocessed_probe,
_hash_table,
cudf::detail::join_kind::LEFT_JOIN,
cudf::has_nested_nulls(probe) || cudf::has_nested_nulls(_build),
_nulls_equal,
stream);
return cudf::detail::compute_join_output_size(_build,
probe,
_preprocessed_build,
preprocessed_probe,
_hash_table,
cudf::detail::join_kind::LEFT_JOIN,
_has_nulls,
_nulls_equal,
stream);
}

template <typename Hasher>
Expand All @@ -483,19 +490,21 @@ std::size_t hash_join<Hasher>::full_join_size(cudf::table_view const& probe,
// Trivial left join case - exit early
if (_is_empty) { return probe.num_rows(); }

CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe),
"Probe table has nulls while build table was not hashed with null check.");

auto const preprocessed_probe =
cudf::experimental::row::equality::preprocessed_table::create(probe, stream);

return cudf::detail::get_full_join_size(
_build,
probe,
_preprocessed_build,
preprocessed_probe,
_hash_table,
cudf::has_nested_nulls(probe) || cudf::has_nested_nulls(_build),
_nulls_equal,
stream,
mr);
return cudf::detail::get_full_join_size(_build,
probe,
_preprocessed_build,
preprocessed_probe,
_hash_table,
_has_nulls,
_nulls_equal,
stream,
mr);
}

template <typename Hasher>
Expand All @@ -514,20 +523,22 @@ hash_join<Hasher>::probe_join_indices(cudf::table_view const& probe_table,

CUDF_EXPECTS(!_is_empty, "Hash table of hash join is null.");

CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe_table),
"Probe table has nulls while build table was not hashed with null check.");

auto const preprocessed_probe =
cudf::experimental::row::equality::preprocessed_table::create(probe_table, stream);
auto join_indices = cudf::detail::probe_join_hash_table(
_build,
probe_table,
_preprocessed_build,
preprocessed_probe,
_hash_table,
join,
cudf::has_nested_nulls(probe_table) || cudf::has_nested_nulls(_build),
_nulls_equal,
output_size,
stream,
mr);
auto join_indices = cudf::detail::probe_join_hash_table(_build,
probe_table,
_preprocessed_build,
preprocessed_probe,
_hash_table,
join,
_has_nulls,
_nulls_equal,
output_size,
stream,
mr);

if (join == cudf::detail::join_kind::FULL_JOIN) {
auto complement_indices = detail::get_left_join_indices_complement(
Expand All @@ -553,6 +564,9 @@ hash_join<Hasher>::compute_hash_join(cudf::table_view const& probe,
CUDF_EXPECTS(_build.num_columns() == probe.num_columns(),
"Mismatch in number of columns to be joined on");

CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe),
"Probe table has nulls while build table was not hashed with null check.");

if (is_trivial_join(probe, _build, join)) {
return std::pair(std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr),
std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
Expand All @@ -574,7 +588,17 @@ hash_join::~hash_join() = default;
hash_join::hash_join(cudf::table_view const& build,
null_equality compare_nulls,
rmm::cuda_stream_view stream)
: _impl{std::make_unique<const impl_type>(build, compare_nulls, stream)}
// If we cannot know beforehand about null existence then let's assume that there are nulls.
: hash_join(build, nullable_join::YES, compare_nulls, stream)
{
}

// Constructor overload that takes an explicit null-existence hint. The `nullable_join` value is
// converted to the boolean flag forwarded to the `impl_type` constructor: the flag is `true`
// only when `has_nulls` is `nullable_join::YES`. `build`, `compare_nulls` and `stream` are
// forwarded unchanged.
hash_join::hash_join(cudf::table_view const& build,
nullable_join has_nulls,
null_equality compare_nulls,
rmm::cuda_stream_view stream)
: _impl{std::make_unique<const impl_type>(
build, has_nulls == nullable_join::YES, compare_nulls, stream)}
{
}

Expand Down
Loading