Skip to content

Commit

Permalink
Additional refactoring of hash functions (#10462)
Browse files Browse the repository at this point in the history
Additional work related to #10081.

This is a breaking change because it reorganizes several public names and namespaces.

Summary of changes in this PR:
- The contents of `hash_functions.cuh` are now wrapped in the `cudf` namespace, and several previously public names (e.g. `default_hash`, `MurmurHash3_32`, `IdentityHash`) now live in `cudf::detail` and are treated as `detail` APIs.
- `SparkMurmurHash3_32` has been updated to align with the design and naming conventions of `MurmurHash3_32`.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #10462
  • Loading branch information
bdice authored Apr 20, 2022
1 parent 5f6b70a commit c8c7271
Show file tree
Hide file tree
Showing 9 changed files with 147 additions and 134 deletions.
231 changes: 118 additions & 113 deletions cpp/include/cudf/detail/utilities/hash_functions.cuh

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions cpp/src/groupby/hash/groupby.cu
Original file line number Diff line number Diff line change
Expand Up @@ -429,17 +429,19 @@ auto create_hash_map(table_device_view const& d_keys,
size_type constexpr unused_key{std::numeric_limits<size_type>::max()};
size_type constexpr unused_value{std::numeric_limits<size_type>::max()};

using map_type = concurrent_unordered_map<size_type,
size_type,
row_hasher<default_hash, nullate::DYNAMIC>,
row_equality_comparator<nullate::DYNAMIC>>;
using map_type =
concurrent_unordered_map<size_type,
size_type,
row_hasher<cudf::detail::default_hash, nullate::DYNAMIC>,
row_equality_comparator<nullate::DYNAMIC>>;

using allocator_type = typename map_type::allocator_type;

auto const null_keys_are_equal =
include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL;

row_hasher<default_hash, nullate::DYNAMIC> hasher{nullate::DYNAMIC{keys_have_nulls}, d_keys};
row_hasher<cudf::detail::default_hash, nullate::DYNAMIC> hasher{nullate::DYNAMIC{keys_have_nulls},
d_keys};
row_equality_comparator rows_equal{
nullate::DYNAMIC{keys_have_nulls}, d_keys, d_keys, null_keys_are_equal};

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/hash/concurrent_unordered_map.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ union pair_packer<pair_type, std::enable_if_t<is_packable<pair_type>()>> {
*/
template <typename Key,
typename Element,
typename Hasher = default_hash<Key>,
typename Hasher = cudf::detail::default_hash<Key>,
typename Equality = equal_to<Key>,
typename Allocator = default_allocator<thrust::pair<Key, Element>>>
class concurrent_unordered_map {
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/io/json/json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@ __device__ field_descriptor next_field_descriptor(const char* begin,
? field_descriptor{field_idx, begin, cudf::io::gpu::seek_field_end(begin, end, opts, true)}
: [&]() {
auto const key_range = get_next_key(begin, end, opts.quotechar);
auto const key_hash = MurmurHash3_32<cudf::string_view>{}(
auto const key_hash = cudf::detail::MurmurHash3_32<cudf::string_view>{}(
cudf::string_view(key_range.first, key_range.second - key_range.first));
auto const hash_col = col_map.find(key_hash);
// Fall back to field index if not found (parsing error)
Expand Down Expand Up @@ -667,7 +667,8 @@ __global__ void collect_keys_info_kernel(parse_options_view const options,
keys_info->column(0).element<uint64_t>(idx) = field_range.key_begin - data.begin();
keys_info->column(1).element<uint16_t>(idx) = len;
keys_info->column(2).element<uint32_t>(idx) =
MurmurHash3_32<cudf::string_view>{}(cudf::string_view(field_range.key_begin, len));
cudf::detail::MurmurHash3_32<cudf::string_view>{}(
cudf::string_view(field_range.key_begin, len));
}
}
}
Expand Down
5 changes: 4 additions & 1 deletion cpp/src/io/parquet/chunk_dict.cu
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,10 @@ struct equality_functor {
template <typename T>
struct hash_functor {
column_device_view const& col;
__device__ auto operator()(size_type idx) { return MurmurHash3_32<T>{}(col.element<T>(idx)); }
__device__ auto operator()(size_type idx) const
{
return cudf::detail::MurmurHash3_32<T>{}(col.element<T>(idx));
}
};

struct map_insert_fn {
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/partitioning/partitioning.cu
Original file line number Diff line number Diff line change
Expand Up @@ -779,10 +779,10 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition(
if (!is_numeric(input.column(column_id).type()))
CUDF_FAIL("IdentityHash does not support this data type");
}
return detail::local::hash_partition<IdentityHash>(
return detail::local::hash_partition<detail::IdentityHash>(
input, columns_to_hash, num_partitions, seed, stream, mr);
case (hash_id::HASH_MURMUR3):
return detail::local::hash_partition<MurmurHash3_32>(
return detail::local::hash_partition<detail::MurmurHash3_32>(
input, columns_to_hash, num_partitions, seed, stream, mr);
default: CUDF_FAIL("Unsupported hash function in hash_partition");
}
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/text/subword/bpe_tokenizer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/hash_functions.cuh>
#include <cudf/strings/detail/combine.hpp>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/detail/utilities.hpp>
Expand Down Expand Up @@ -144,8 +145,8 @@ struct byte_pair_encoding_fn {
* @param rhs Second string.
* @return The hash value to match with `d_map`.
*/
__device__ hash_value_type compute_hash(cudf::string_view const& lhs,
cudf::string_view const& rhs)
__device__ cudf::hash_value_type compute_hash(cudf::string_view const& lhs,
cudf::string_view const& rhs)
{
__shared__ char shmem[48 * 1024]; // max for Pascal
auto const total_size = lhs.size_bytes() + rhs.size_bytes() + 1;
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/text/subword/bpe_tokenizer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,12 @@ namespace detail {

using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allocator<char>>;

using merge_pairs_map_type = cuco::static_map<hash_value_type,
using merge_pairs_map_type = cuco::static_map<cudf::hash_value_type,
cudf::size_type,
cuda::thread_scope_device,
hash_table_allocator_type>;

using string_hasher_type = MurmurHash3_32<cudf::string_view>;
using string_hasher_type = cudf::detail::MurmurHash3_32<cudf::string_view>;

} // namespace detail

Expand Down
13 changes: 7 additions & 6 deletions cpp/src/text/subword/load_merges_file.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/hash_functions.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/utilities/error.hpp>

Expand All @@ -42,7 +43,7 @@ struct make_pair_function {
/**
* @brief Hash the merge pair entry
*/
__device__ cuco::pair_type<hash_value_type, cudf::size_type> operator()(cudf::size_type idx)
__device__ cuco::pair_type<cudf::hash_value_type, cudf::size_type> operator()(cudf::size_type idx)
{
auto const result = _hasher(d_strings.element<cudf::string_view>(idx));
return cuco::make_pair(result, idx);
Expand Down Expand Up @@ -105,9 +106,9 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
// Ensure capacity is at least (size/0.7) as documented here:
// https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182
auto merge_pairs_map = std::make_unique<merge_pairs_map_type>(
static_cast<size_t>(input.size() * 2), // capacity is 2x;
std::numeric_limits<hash_value_type>::max(), // empty key;
-1, // empty value is not used
static_cast<size_t>(input.size() * 2), // capacity is 2x;
std::numeric_limits<cudf::hash_value_type>::max(), // empty key;
-1, // empty value is not used
hash_table_allocator_type{default_allocator<char>{}, stream},
stream.value());

Expand All @@ -117,8 +118,8 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(

merge_pairs_map->insert(iter,
iter + input.size(),
cuco::detail::MurmurHash3_32<hash_value_type>{},
thrust::equal_to<hash_value_type>{},
cuco::detail::MurmurHash3_32<cudf::hash_value_type>{},
thrust::equal_to<cudf::hash_value_type>{},
stream.value());

return merge_pairs_map;
Expand Down

0 comments on commit c8c7271

Please sign in to comment.