From 4483b879047efecc234de74e99bd6b5afcb829a4 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 16 May 2023 11:30:29 -0400
Subject: [PATCH] Performance improvement for nvtext::minhash (#13333)

Improves performance of `nvtext::minhash` by minimizing character counting in the internal logic. The MinHash strings are expected to be very long (`> 1KB`). The improvement is measured to be up to 2x.

Authors:
  - David Wendt (https://github.com/davidwendt)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Mark Harris (https://github.com/harrism)
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/13333
---
 cpp/src/text/minhash.cu | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index eb3b9092185..e9aa6c2693c 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -74,20 +74,20 @@ struct minhash_fn {
     }
     __syncwarp();
 
-    auto const begin = d_str.begin() + lane_idx;
-    auto const end   = [d_str, width = width] {
-      auto const length = d_str.length();
-      if (length > width) { return (d_str.end() - (width - 1)); }
-      return d_str.begin() + static_cast<cudf::size_type>(length > 0);
-    }();
-
-    // each lane hashes substrings of the given width
-    for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) {
-      auto const offset = itr.byte_offset();
-      auto const hash_str =
-        cudf::string_view(d_str.data() + offset, (itr + width).byte_offset() - offset);
+    auto const begin = d_str.data() + lane_idx;
+    auto const end   = d_str.data() + d_str.size_bytes();
 
-      // hashing each seed on the same section of string is 10x faster than
+    // each lane hashes substrings of 'width' characters from d_str
+    for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) {
+      if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; }
+      auto const check_str =  // used for counting 'width' characters
+        cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
+      auto const [bytes, left] =
+        cudf::strings::detail::bytes_to_character_position(check_str, width);
+      if ((itr != d_str.data()) && (left > 0)) { continue; }  // true if past the end of the string
+
+      auto const hash_str = cudf::string_view(itr, bytes);
+      // hashing with each seed on the same section of the string is 10x faster than
       // computing the substrings for each seed
       for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) {
         auto const hasher = cudf::detail::MurmurHash3_32<cudf::string_view>{seeds[seed_idx]};