From 18f2e7a84a03342bf6305f63ae1f8164ffbccd99 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 3 May 2024 09:03:59 -0400 Subject: [PATCH] Large strings support in MD5 and SHA hashers (#15631) Updates the hash functions for md5 and sha to support creating large strings results. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15631 --- cpp/src/hash/md5_hash.cu | 4 ++-- cpp/src/hash/sha_hash.cuh | 29 +++++++++++++++-------------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu index 8f490ada8ff..0b559e8e86c 100644 --- a/cpp/src/hash/md5_hash.cu +++ b/cpp/src/hash/md5_hash.cu @@ -309,7 +309,7 @@ std::unique_ptr<column> md5(table_view const& input, // Result column allocation and creation auto begin = thrust::make_constant_iterator(digest_size); auto [offsets_column, bytes] = - cudf::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); + cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); rmm::device_uvector<char> chars(bytes, stream, mr); auto d_chars = chars.data(); @@ -322,7 +322,7 @@ std::unique_ptr<column> md5(table_view const& input, thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.num_rows()), [d_chars, device_input = *device_input] __device__(auto row_index) { - MD5Hasher hasher(d_chars + (row_index * digest_size)); + MD5Hasher hasher(d_chars + (static_cast<int64_t>(row_index) * digest_size)); for (auto const& col : device_input) { if (col.is_valid(row_index)) { if (col.type().id() == type_id::LIST) { diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh index 005578cb2c2..6976241057e 100644 --- a/cpp/src/hash/sha_hash.cuh +++ b/cpp/src/hash/sha_hash.cuh @@ -518,7 +518,7 @@ std::unique_ptr<column> sha_hash(table_view const& 
input, // Result column allocation and creation auto begin = thrust::make_constant_iterator(Hasher::digest_size); auto [offsets_column, bytes] = - cudf::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); + cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); auto chars = rmm::device_uvector<char>(bytes, stream, mr); auto d_chars = chars.data(); @@ -526,19 +526,20 @@ std::unique_ptr<column> sha_hash(table_view const& input, auto const device_input = table_device_view::create(input, stream); // Hash each row, hashing each element sequentially left to right - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.num_rows()), - [d_chars, device_input = *device_input] __device__(auto row_index) { - Hasher hasher(d_chars + (row_index * Hasher::digest_size)); - for (auto const& col : device_input) { - if (col.is_valid(row_index)) { - cudf::type_dispatcher<dispatch_storage_type>( - col.type(), HasherDispatcher(&hasher, col), row_index); - } - } - hasher.finalize(); - }); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.num_rows()), + [d_chars, device_input = *device_input] __device__(auto row_index) { + Hasher hasher(d_chars + (static_cast<int64_t>(row_index) * Hasher::digest_size)); + for (auto const& col : device_input) { + if (col.is_valid(row_index)) { + cudf::type_dispatcher<dispatch_storage_type>( + col.type(), HasherDispatcher(&hasher, col), row_index); + } + } + hasher.finalize(); + }); return make_strings_column(input.num_rows(), std::move(offsets_column), chars.release(), 0, {}); }