From 4483b879047efecc234de74e99bd6b5afcb829a4 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 16 May 2023 11:30:29 -0400 Subject: [PATCH] Performance improvement for nvtext::minhash (#13333) Improves performance of `nvtext::minhash` by minimizing character counting in the internal logic. The MinHash strings are expected to be very long (`> 1KB`). Improvement is measured to be up to 2x. Authors: - David Wendt (https://github.com/davidwendt) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) - Mark Harris (https://github.com/harrism) - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/13333 --- cpp/src/text/minhash.cu | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index eb3b9092185..e9aa6c2693c 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -74,20 +74,20 @@ struct minhash_fn { } __syncwarp(); - auto const begin = d_str.begin() + lane_idx; - auto const end = [d_str, width = width] { - auto const length = d_str.length(); - if (length > width) { return (d_str.end() - (width - 1)); } - return d_str.begin() + static_cast(length > 0); - }(); - - // each lane hashes substrings of the given width - for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { - auto const offset = itr.byte_offset(); - auto const hash_str = - cudf::string_view(d_str.data() + offset, (itr + width).byte_offset() - offset); + auto const begin = d_str.data() + lane_idx; + auto const end = d_str.data() + d_str.size_bytes(); - // hashing each seed on the same section of string is 10x faster than + // each lane hashes 'width' substrings of d_str + for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { + if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; } 
auto const check_str = // used for counting 'width' characters + cudf::string_view(itr, static_cast(thrust::distance(itr, end))); + auto const [bytes, left] = + cudf::strings::detail::bytes_to_character_position(check_str, width); + if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string + + auto const hash_str = cudf::string_view(itr, bytes); + // hashing with each seed on the same section of the string is 10x faster than // computing the substrings for each seed for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { auto const hasher = cudf::detail::MurmurHash3_32{seeds[seed_idx]};