Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Additional refactoring of hash functions #10462

Merged
merged 18 commits into from
Apr 20, 2022
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
231 changes: 118 additions & 113 deletions cpp/include/cudf/detail/utilities/hash_functions.cuh

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions cpp/src/groupby/hash/groupby.cu
Original file line number Diff line number Diff line change
Expand Up @@ -429,17 +429,19 @@ auto create_hash_map(table_device_view const& d_keys,
size_type constexpr unused_key{std::numeric_limits<size_type>::max()};
size_type constexpr unused_value{std::numeric_limits<size_type>::max()};

using map_type = concurrent_unordered_map<size_type,
size_type,
row_hasher<default_hash, nullate::DYNAMIC>,
row_equality_comparator<nullate::DYNAMIC>>;
using map_type =
concurrent_unordered_map<size_type,
size_type,
row_hasher<cudf::detail::default_hash, nullate::DYNAMIC>,
row_equality_comparator<nullate::DYNAMIC>>;

using allocator_type = typename map_type::allocator_type;

auto const null_keys_are_equal =
include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL;

row_hasher<default_hash, nullate::DYNAMIC> hasher{nullate::DYNAMIC{keys_have_nulls}, d_keys};
row_hasher<cudf::detail::default_hash, nullate::DYNAMIC> hasher{nullate::DYNAMIC{keys_have_nulls},
d_keys};
row_equality_comparator rows_equal{
nullate::DYNAMIC{keys_have_nulls}, d_keys, d_keys, null_keys_are_equal};

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/hash/concurrent_unordered_map.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ union pair_packer<pair_type, std::enable_if_t<is_packable<pair_type>()>> {
*/
template <typename Key,
typename Element,
typename Hasher = default_hash<Key>,
typename Hasher = cudf::detail::default_hash<Key>,
typename Equality = equal_to<Key>,
typename Allocator = default_allocator<thrust::pair<Key, Element>>>
class concurrent_unordered_map {
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/io/json/json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@ __device__ field_descriptor next_field_descriptor(const char* begin,
? field_descriptor{field_idx, begin, cudf::io::gpu::seek_field_end(begin, end, opts, true)}
: [&]() {
auto const key_range = get_next_key(begin, end, opts.quotechar);
auto const key_hash = MurmurHash3_32<cudf::string_view>{}(
auto const key_hash = cudf::detail::MurmurHash3_32<cudf::string_view>{}(
cudf::string_view(key_range.first, key_range.second - key_range.first));
auto const hash_col = col_map.find(key_hash);
// Fall back to field index if not found (parsing error)
Expand Down Expand Up @@ -667,7 +667,8 @@ __global__ void collect_keys_info_kernel(parse_options_view const options,
keys_info->column(0).element<uint64_t>(idx) = field_range.key_begin - data.begin();
keys_info->column(1).element<uint16_t>(idx) = len;
keys_info->column(2).element<uint32_t>(idx) =
MurmurHash3_32<cudf::string_view>{}(cudf::string_view(field_range.key_begin, len));
cudf::detail::MurmurHash3_32<cudf::string_view>{}(
cudf::string_view(field_range.key_begin, len));
}
}
}
Expand Down
5 changes: 4 additions & 1 deletion cpp/src/io/parquet/chunk_dict.cu
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,10 @@ struct equality_functor {
template <typename T>
struct hash_functor {
column_device_view const& col;
__device__ auto operator()(size_type idx) { return MurmurHash3_32<T>{}(col.element<T>(idx)); }
__device__ auto operator()(size_type idx)
bdice marked this conversation as resolved.
Show resolved Hide resolved
{
return cudf::detail::MurmurHash3_32<T>{}(col.element<T>(idx));
}
};

struct map_insert_fn {
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/partitioning/partitioning.cu
Original file line number Diff line number Diff line change
Expand Up @@ -779,10 +779,10 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition(
if (!is_numeric(input.column(column_id).type()))
CUDF_FAIL("IdentityHash does not support this data type");
}
return detail::local::hash_partition<IdentityHash>(
return detail::local::hash_partition<detail::IdentityHash>(
input, columns_to_hash, num_partitions, seed, stream, mr);
case (hash_id::HASH_MURMUR3):
return detail::local::hash_partition<MurmurHash3_32>(
return detail::local::hash_partition<detail::MurmurHash3_32>(
input, columns_to_hash, num_partitions, seed, stream, mr);
default: CUDF_FAIL("Unsupported hash function in hash_partition");
}
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/text/subword/bpe_tokenizer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/hash_functions.cuh>
#include <cudf/strings/detail/combine.hpp>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/detail/utilities.hpp>
Expand Down Expand Up @@ -144,8 +145,8 @@ struct byte_pair_encoding_fn {
* @param rhs Second string.
* @return The hash value to match with `d_map`.
*/
__device__ hash_value_type compute_hash(cudf::string_view const& lhs,
cudf::string_view const& rhs)
__device__ cudf::hash_value_type compute_hash(cudf::string_view const& lhs,
cudf::string_view const& rhs)
{
__shared__ char shmem[48 * 1024]; // max for Pascal
auto const total_size = lhs.size_bytes() + rhs.size_bytes() + 1;
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/text/subword/bpe_tokenizer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,12 @@ namespace detail {

using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allocator<char>>;

using merge_pairs_map_type = cuco::static_map<hash_value_type,
using merge_pairs_map_type = cuco::static_map<cudf::hash_value_type,
cudf::size_type,
cuda::thread_scope_device,
hash_table_allocator_type>;

using string_hasher_type = MurmurHash3_32<cudf::string_view>;
using string_hasher_type = cudf::detail::MurmurHash3_32<cudf::string_view>;

} // namespace detail

Expand Down
13 changes: 7 additions & 6 deletions cpp/src/text/subword/load_merges_file.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/hash_functions.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/utilities/error.hpp>

Expand All @@ -42,7 +43,7 @@ struct make_pair_function {
/**
* @brief Hash the merge pair entry
*/
__device__ cuco::pair_type<hash_value_type, cudf::size_type> operator()(cudf::size_type idx)
__device__ cuco::pair_type<cudf::hash_value_type, cudf::size_type> operator()(cudf::size_type idx)
{
auto const result = _hasher(d_strings.element<cudf::string_view>(idx));
return cuco::make_pair(result, idx);
Expand Down Expand Up @@ -105,9 +106,9 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
// Ensure capacity is at least (size/0.7) as documented here:
// https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182
auto merge_pairs_map = std::make_unique<merge_pairs_map_type>(
static_cast<size_t>(input.size() * 2), // capacity is 2x;
std::numeric_limits<hash_value_type>::max(), // empty key;
-1, // empty value is not used
static_cast<size_t>(input.size() * 2), // capacity is 2x;
std::numeric_limits<cudf::hash_value_type>::max(), // empty key;
-1, // empty value is not used
hash_table_allocator_type{default_allocator<char>{}, stream},
stream.value());

Expand All @@ -117,8 +118,8 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(

merge_pairs_map->insert(iter,
iter + input.size(),
cuco::detail::MurmurHash3_32<hash_value_type>{},
thrust::equal_to<hash_value_type>{},
cuco::detail::MurmurHash3_32<cudf::hash_value_type>{},
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of scope for this PR (really, out of scope for any PR in this repository), but it seems bad that we're relying on a cuco detail namespace here. @jrhemstad do we need MurmurHash3_32 to be exposed more publicly in cuco? If we expect callers to use it as a provided hash function then it shouldn't be detail.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree this is awkward / undesirable. This may be resolved or improved by #10401.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, no. The issue is that we need to provide a stream and that requires re-supplying the other defaulted arguments. https://github.com/NVIDIA/cuCollections/blob/fb58a38701f1c24ecfe07d8f1f208bbe80930da5/include/cuco/static_map.cuh#L224-L231

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, but IMO it's a design issue in cuco if supplying a stream while using the default hash requires pulling the default hash out of a detail namespace.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with that. (Sorry, the "Actually, no" was about whether #10401 would improve this situation. It would not.)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. I think in the long run we probably don't want to be able to provide the hash function and equality comparator as template parameters of these methods, but rather as parameters of the constructor. It doesn't really make sense to be able to insert and query with different ones. Unfortunately we currently abuse this ability in libcudf, so I don't think removing it is feasible in the short term, but in the longer term getting rid of this would make it easier to provide streams without having this problem since the hash/equality operators would be defined on construction and the user wouldn't need to provide those templates unless they wanted to override the default.

thrust::equal_to<cudf::hash_value_type>{},
stream.value());

return merge_pairs_map;
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/resample.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION &
# SPDX-FileCopyrightText: Copyright (c) 2021-2022, NVIDIA CORPORATION &
bdice marked this conversation as resolved.
Show resolved Hide resolved
# AFFILIATES. All rights reserved. SPDX-License-Identifier:
# Apache-2.0
#
Expand Down