Skip to content

Commit

Permalink
Create separate libcudf hash APIs for each supported hash function (#13626)
Browse files Browse the repository at this point in the history

Deprecates `cudf::hash` in favor of individual hash functions in the new `cudf::hashing` namespace. This will help with adding new hash algorithms.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Karthikeyan (https://github.com/karthikeyann)
  - Nghia Truong (https://github.com/ttnghia)

URL: #13626
  • Loading branch information
davidwendt authored Jul 10, 2023
1 parent ecdaa91 commit 2ca4630
Show file tree
Hide file tree
Showing 12 changed files with 159 additions and 77 deletions.
2 changes: 1 addition & 1 deletion cpp/include/cudf/detail/aggregation/result_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ struct pair_column_aggregation_equal_to {
struct pair_column_aggregation_hash {
size_t operator()(std::pair<column_view, aggregation const&> const& key) const
{
return hash_combine(shallow_hash(key.first), key.second.do_hash());
return cudf::hashing::detail::hash_combine(shallow_hash(key.first), key.second.do_hash());
}
};

Expand Down
34 changes: 18 additions & 16 deletions cpp/include/cudf/detail/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,9 @@
#include <functional>

namespace cudf {
namespace hashing {
namespace detail {

/**
* @copydoc cudf::hash
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function,
uint32_t seed,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

std::unique_ptr<column> murmur_hash3_32(table_view const& input,
uint32_t seed,
rmm::cuda_stream_view,
Expand All @@ -47,9 +37,9 @@ std::unique_ptr<column> spark_murmur_hash3_32(table_view const& input,
rmm::cuda_stream_view,
rmm::mr::device_memory_resource* mr);

std::unique_ptr<column> md5_hash(table_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);
std::unique_ptr<column> md5(table_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/* Copyright 2005-2014 Daniel James.
*
Expand Down Expand Up @@ -94,6 +84,18 @@ constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs)
}

} // namespace detail

/**
* @copydoc cudf::hash
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function,
uint32_t seed,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);
} // namespace hashing
} // namespace cudf

// specialization of std::hash for cudf::data_type
Expand All @@ -102,8 +104,8 @@ template <>
struct hash<cudf::data_type> {
std::size_t operator()(cudf::data_type const& type) const noexcept
{
return cudf::detail::hash_combine(std::hash<int32_t>{}(static_cast<int32_t>(type.id())),
std::hash<int32_t>{}(type.scale()));
return cudf::hashing::detail::hash_combine(
std::hash<int32_t>{}(static_cast<int32_t>(type.id())), std::hash<int32_t>{}(type.scale()));
}
};
} // namespace std
70 changes: 67 additions & 3 deletions cpp/include/cudf/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,21 @@

namespace cudf {

using hash_value_type = uint32_t; ///< Type of hash value

/**
* @addtogroup column_hash
* @{
* @file
*/

/**
* @brief Identifies the hash function to be used
* @brief Type of hash value
*
*/
using hash_value_type = uint32_t;

/**
* @brief Identifies the hash function to be used
*
*/
enum class hash_id {
HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed
Expand All @@ -48,6 +53,8 @@ static constexpr uint32_t DEFAULT_HASH_SEED = 0;
/**
* @brief Computes the hash value of each row in the input set of columns.
*
* @deprecated Since 23.08
*
* @param input The table of columns to hash
* @param hash_function The hash function enum to use
* @param seed Optional seed value to use for the hash function
Expand All @@ -63,5 +70,62 @@ std::unique_ptr<column> hash(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

namespace hashing {

/**
* @brief Computes the MurmurHash3 32-bit hash value of each row in the given table
*
* This function computes the hash of each column using the `seed` for the first column
* and the resulting hash as a seed for the next column and so on.
* The result is a uint32 value for each row.
*
* @param input The table of columns to hash
* @param seed Optional seed value to use for the hash function
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*
* @returns A column where each row is the hash of a row from the input
*/
std::unique_ptr<column> murmur_hash3_32(
table_view const& input,
uint32_t seed = DEFAULT_HASH_SEED,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Computes the Spark-compatible MurmurHash3 32-bit hash value of each row in the given table
*
* This function computes a hash similar to MurmurHash3_32, with special processing
* so that the results match those of Spark's implementation.
*
* @param input The table of columns to hash
* @param seed Optional seed value to use for the hash function
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*
* @returns A column where each row is the hash of a row from the input
*/
std::unique_ptr<column> spark_murmur_hash3_32(
table_view const& input,
uint32_t seed = DEFAULT_HASH_SEED,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Computes the MD5 hash of each row in the given table
*
* @param input The table of columns to hash
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*
* @returns A column where each row is the hash of a row from the input
*/
std::unique_ptr<column> md5(
table_view const& input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace hashing

/** @} */ // end of group
} // namespace cudf
9 changes: 5 additions & 4 deletions cpp/include/cudf/table/experimental/row_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1813,7 +1813,7 @@ class device_row_hasher {

// Hash each element and combine all the hash values together
return detail::accumulate(it, it + _table.num_columns(), _seed, [](auto hash, auto h) {
return cudf::detail::hash_combine(hash, h);
return cudf::hashing::detail::hash_combine(hash, h);
});
}

Expand Down Expand Up @@ -1854,7 +1854,8 @@ class device_row_hasher {
auto validity_it = detail::make_validity_iterator<true>(curr_col);
hash = detail::accumulate(
validity_it, validity_it + curr_col.size(), hash, [](auto hash, auto is_valid) {
return cudf::detail::hash_combine(hash, is_valid ? NON_NULL_HASH : NULL_HASH);
return cudf::hashing::detail::hash_combine(hash,
is_valid ? NON_NULL_HASH : NULL_HASH);
});
}
if (curr_col.type().id() == type_id::STRUCT) {
Expand All @@ -1866,13 +1867,13 @@ class device_row_hasher {
auto list_sizes = make_list_size_iterator(list_col);
hash = detail::accumulate(
list_sizes, list_sizes + list_col.size(), hash, [](auto hash, auto size) {
return cudf::detail::hash_combine(hash, hash_fn<size_type>{}(size));
return cudf::hashing::detail::hash_combine(hash, hash_fn<size_type>{}(size));
});
curr_col = list_col.get_sliced_child();
}
}
for (int i = 0; i < curr_col.size(); ++i) {
hash = cudf::detail::hash_combine(
hash = cudf::hashing::detail::hash_combine(
hash,
type_dispatcher<dispatch_void_if_nested>(curr_col.type(), _element_hasher, curr_col, i));
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cudf/table/row_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -600,7 +600,7 @@ class row_hasher {
__device__ auto operator()(size_type row_index) const
{
// Hash the first column w/ the seed
auto const initial_hash = cudf::detail::hash_combine(
auto const initial_hash = cudf::hashing::detail::hash_combine(
hash_value_type{0},
type_dispatcher<dispatch_storage_type>(
_table.column(0).type(),
Expand All @@ -626,7 +626,7 @@ class row_hasher {
hasher,
initial_hash,
[](hash_value_type lhs, hash_value_type rhs) {
return cudf::detail::hash_combine(lhs, rhs);
return cudf::hashing::detail::hash_combine(lhs, rhs);
});
}

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/column/column_view.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ struct HashValue {
explicit HashValue(std::size_t h) : hash{h} {}
HashValue operator^(HashValue const& other) const
{
return HashValue{hash_combine(hash, other.hash)};
return HashValue{cudf::hashing::detail::hash_combine(hash, other.hash)};
}
};

Expand All @@ -97,7 +97,7 @@ std::size_t shallow_hash_impl(column_view const& c, bool is_parent_empty = false
c.child_end(),
init,
[&c, is_parent_empty](std::size_t hash, auto const& child) {
return hash_combine(
return cudf::hashing::detail::hash_combine(
hash, shallow_hash_impl(child, c.is_empty() or is_parent_empty));
});
}
Expand Down
52 changes: 31 additions & 21 deletions cpp/src/hash/hashing.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,26 +31,8 @@
#include <algorithm>

namespace cudf {
namespace hashing {
namespace detail {
namespace {

template <typename IterType>
std::vector<column_view> to_leaf_columns(IterType iter_begin, IterType iter_end)
{
std::vector<column_view> leaf_columns;
std::for_each(iter_begin, iter_end, [&leaf_columns](column_view const& col) {
if (is_nested(col.type())) {
CUDF_EXPECTS(col.type().id() == type_id::STRUCT, "unsupported nested type");
auto child_columns = to_leaf_columns(col.child_begin(), col.child_end());
leaf_columns.insert(leaf_columns.end(), child_columns.begin(), child_columns.end());
} else {
leaf_columns.emplace_back(col);
}
});
return leaf_columns;
}

} // namespace

std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function,
Expand All @@ -61,21 +43,49 @@ std::unique_ptr<column> hash(table_view const& input,
switch (hash_function) {
case (hash_id::HASH_MURMUR3): return murmur_hash3_32(input, seed, stream, mr);
case (hash_id::HASH_SPARK_MURMUR3): return spark_murmur_hash3_32(input, seed, stream, mr);
case (hash_id::HASH_MD5): return md5_hash(input, stream, mr);
case (hash_id::HASH_MD5): return md5(input, stream, mr);
default: CUDF_FAIL("Unsupported hash function.");
}
}

} // namespace detail

// Public cudf::hashing entry point for the MurmurHash3 32-bit row hash.
// Invokes the CUDF_FUNC_RANGE() macro (defined elsewhere; presumably a profiling
// range -- confirm) and then forwards every argument unchanged to
// detail::murmur_hash3_32, returning its result.
std::unique_ptr<column> murmur_hash3_32(table_view const& input,
uint32_t seed,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::murmur_hash3_32(input, seed, stream, mr);
}

// Public cudf::hashing entry point for the Spark-matching MurmurHash3 32-bit row hash.
// Invokes the CUDF_FUNC_RANGE() macro (defined elsewhere; presumably a profiling
// range -- confirm) and then forwards every argument unchanged to
// detail::spark_murmur_hash3_32, returning its result.
std::unique_ptr<column> spark_murmur_hash3_32(table_view const& input,
uint32_t seed,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::spark_murmur_hash3_32(input, seed, stream, mr);
}

// Public cudf::hashing entry point for the MD5 row hash. Unlike the murmur
// variants above, this function takes no seed parameter.
// Invokes the CUDF_FUNC_RANGE() macro (defined elsewhere; presumably a profiling
// range -- confirm) and then forwards every argument unchanged to detail::md5,
// returning its result.
std::unique_ptr<column> md5(table_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::md5(input, stream, mr);
}

} // namespace hashing

std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function,
uint32_t seed,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::hash(input, hash_function, seed, stream, mr);
return hashing::detail::hash(input, hash_function, seed, stream, mr);
}

} // namespace cudf
19 changes: 10 additions & 9 deletions cpp/src/hash/md5_hash.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
#include <iterator>

namespace cudf {

namespace hashing {
namespace detail {

namespace {
Expand Down Expand Up @@ -81,7 +81,7 @@ struct MD5Hasher {
sizeof(message_length_in_bits));

for (int i = 0; i < 4; ++i) {
uint32ToLowercaseHexString(hash_values[i], result_location + (8 * i));
cudf::detail::uint32ToLowercaseHexString(hash_values[i], result_location + (8 * i));
}
}

Expand All @@ -93,8 +93,8 @@ struct MD5Hasher {
template <typename Element>
void __device__ inline process(Element const& element)
{
auto const normalized_element = normalize_nans_and_zeros(element);
auto const [element_ptr, size] = get_element_pointer_and_size(normalized_element);
auto const normalized_element = cudf::detail::normalize_nans_and_zeros(element);
auto const [element_ptr, size] = cudf::detail::get_element_pointer_and_size(normalized_element);
buffer.put(element_ptr, size);
message_length += size;
}
Expand Down Expand Up @@ -142,7 +142,7 @@ struct MD5Hasher {
A = D;
D = C;
C = B;
B = B + rotate_bits_left(F, md5_shift_constants[((j / 16) * 4) + (j % 4)]);
B = B + cudf::detail::rotate_bits_left(F, md5_shift_constants[((j / 16) * 4) + (j % 4)]);
}

hash_values[0] += A;
Expand All @@ -153,7 +153,7 @@ struct MD5Hasher {
};

char* result_location;
hash_circular_buffer<message_chunk_size, md5_hash_step> buffer;
cudf::detail::hash_circular_buffer<message_chunk_size, md5_hash_step> buffer;
uint64_t message_length = 0;
uint32_t hash_values[4] = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476};
};
Expand Down Expand Up @@ -215,9 +215,9 @@ inline bool md5_leaf_type_check(data_type dt)

} // namespace

std::unique_ptr<column> md5_hash(table_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
std::unique_ptr<column> md5(table_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
if (input.num_columns() == 0 || input.num_rows() == 0) {
// Return the MD5 hash of a zero-length input.
Expand Down Expand Up @@ -281,4 +281,5 @@ std::unique_ptr<column> md5_hash(table_view const& input,
}

} // namespace detail
} // namespace hashing
} // namespace cudf
Loading

0 comments on commit 2ca4630

Please sign in to comment.