Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add row bitmask as a detail::hash_join member #10248

Merged
merged 18 commits into from
May 2, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cpp/src/join/hash_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build,
null_equality compare_nulls,
rmm::cuda_stream_view stream)
: _is_empty{build.num_rows() == 0},
_hash_table{compute_hash_table_size(build.num_rows()),
_hash_table{0,
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
std::numeric_limits<hash_value_type>::max(),
cudf::detail::JoinNoneValue,
stream.value(),
Expand All @@ -253,7 +253,7 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build,

if (_is_empty) { return; }

build_join_hash_table(_build, _hash_table, compare_nulls, stream);
_hash_table = std::move(build_join_hash_table(_build, compare_nulls, stream));
}

std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
Expand Down
38 changes: 24 additions & 14 deletions cpp/src/join/hash_join.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -148,42 +148,52 @@ std::unique_ptr<cudf::table> combine_table_pair(std::unique_ptr<cudf::table>&& l
/**
* @brief Builds the hash table based on the given `build_table`.
*
* @tparam MultimapType The type of the hash table
*
* @param build Table of columns used to build join hash.
* @param hash_table Build hash table.
* @param compare_nulls Controls whether null join-key values should match or not.
* @param stream CUDA stream used for device memory operations and kernel launches.
*
*/
template <typename MultimapType>
void build_join_hash_table(cudf::table_view const& build,
MultimapType& hash_table,
null_equality compare_nulls,
rmm::cuda_stream_view stream)
cudf::detail::multimap_type build_join_hash_table(cudf::table_view const& build,
null_equality compare_nulls,
rmm::cuda_stream_view stream)
{
auto build_table_ptr = cudf::table_device_view::create(build, stream);
auto build_table_ptr = cudf::table_device_view::create(build, stream);
auto const build_table_num_rows = build.num_rows();

CUDF_EXPECTS(0 != build_table_ptr->num_columns(), "Selected build dataset is empty");
CUDF_EXPECTS(0 != build_table_ptr->num_rows(), "Build side table has no rows");
CUDF_EXPECTS(0 != build_table_num_rows, "Build side table has no rows");

cudf::detail::multimap_type hash_table{
compute_hash_table_size(build_table_num_rows),
std::numeric_limits<hash_value_type>::max(),
cudf::detail::JoinNoneValue,
stream.value(),
detail::hash_table_allocator_type{default_allocator<char>{}, stream}};

row_hash hash_build{nullate::DYNAMIC{cudf::has_nulls(build)}, *build_table_ptr};
auto const empty_key_sentinel = hash_table.get_empty_key_sentinel();
make_pair_function pair_func{hash_build, empty_key_sentinel};
make_pair_function pair_func{hash_build, hash_table.get_empty_key_sentinel()};

auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func);

size_type const build_table_num_rows{build_table_ptr->num_rows()};
if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) {
hash_table.insert(iter, iter + build_table_num_rows, stream.value());
} else {
thrust::counting_iterator<size_type> stencil(0);
auto const row_bitmask = cudf::detail::bitmask_and(build, stream).first;
auto const [row_bitmask, count] = cudf::detail::bitmask_and(build, stream);
row_is_valid pred{static_cast<bitmask_type const*>(row_bitmask.data())};

// resize the hash table
hash_table = std::move(cudf::detail::multimap_type{
compute_hash_table_size(build_table_num_rows - count),
std::numeric_limits<hash_value_type>::max(),
cudf::detail::JoinNoneValue,
stream.value(),
detail::hash_table_allocator_type{default_allocator<char>{}, stream}});

// insert valid rows
hash_table.insert_if(iter, iter + build_table_num_rows, stencil, pred, stream.value());
}
return hash_table;
}
} // namespace detail

Expand Down