Skip to content

Commit

Permalink
Large strings support in MD5 and SHA hashers (#15631)
Browse files Browse the repository at this point in the history
Updates the MD5 and SHA hash functions so that they can produce large strings results.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)

URL: #15631
  • Loading branch information
davidwendt authored May 3, 2024
1 parent c60860d commit 18f2e7a
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 16 deletions.
4 changes: 2 additions & 2 deletions cpp/src/hash/md5_hash.cu
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ std::unique_ptr<column> md5(table_view const& input,
// Result column allocation and creation
auto begin = thrust::make_constant_iterator(digest_size);
auto [offsets_column, bytes] =
cudf::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr);
cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr);

rmm::device_uvector<char> chars(bytes, stream, mr);
auto d_chars = chars.data();
Expand All @@ -322,7 +322,7 @@ std::unique_ptr<column> md5(table_view const& input,
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(input.num_rows()),
[d_chars, device_input = *device_input] __device__(auto row_index) {
MD5Hasher hasher(d_chars + (row_index * digest_size));
MD5Hasher hasher(d_chars + (static_cast<int64_t>(row_index) * digest_size));
for (auto const& col : device_input) {
if (col.is_valid(row_index)) {
if (col.type().id() == type_id::LIST) {
Expand Down
29 changes: 15 additions & 14 deletions cpp/src/hash/sha_hash.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -518,27 +518,28 @@ std::unique_ptr<column> sha_hash(table_view const& input,
// Result column allocation and creation
auto begin = thrust::make_constant_iterator(Hasher::digest_size);
auto [offsets_column, bytes] =
cudf::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr);
cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr);

auto chars = rmm::device_uvector<char>(bytes, stream, mr);
auto d_chars = chars.data();

auto const device_input = table_device_view::create(input, stream);

// Hash each row, hashing each element sequentially left to right
thrust::for_each(rmm::exec_policy(stream),
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(input.num_rows()),
[d_chars, device_input = *device_input] __device__(auto row_index) {
Hasher hasher(d_chars + (row_index * Hasher::digest_size));
for (auto const& col : device_input) {
if (col.is_valid(row_index)) {
cudf::type_dispatcher<dispatch_storage_type>(
col.type(), HasherDispatcher(&hasher, col), row_index);
}
}
hasher.finalize();
});
thrust::for_each(
rmm::exec_policy(stream),
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(input.num_rows()),
[d_chars, device_input = *device_input] __device__(auto row_index) {
Hasher hasher(d_chars + (static_cast<int64_t>(row_index) * Hasher::digest_size));
for (auto const& col : device_input) {
if (col.is_valid(row_index)) {
cudf::type_dispatcher<dispatch_storage_type>(
col.type(), HasherDispatcher(&hasher, col), row_index);
}
}
hasher.finalize();
});

return make_strings_column(input.num_rows(), std::move(offsets_column), chars.release(), 0, {});
}
Expand Down

0 comments on commit 18f2e7a

Please sign in to comment.