From aeef0a1f4159d4c87f987d20225401040973d10f Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 18 Jul 2024 16:56:30 -0400
Subject: [PATCH 1/2] Remove hash_character_ngrams dependency from
 jaccard_index (#16241)

Removes internal dependency of `nvtext::hash_character_ngrams` from `nvtext::jaccard_index`.
Works around the size-type limit imposed by `hash_character_ngrams` which returns a `list` column.
This also specializes the hashing logic for the jaccard calculation specifically.

The overall algorithm has not changed. Code has moved around a bit and internal list-columns have been replaced with just offsets and values vectors.

Closes #16157

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/16241
---
 cpp/benchmarks/text/jaccard.cpp |   4 +-
 cpp/src/text/jaccard.cu         | 478 ++++++++++++++++++++++----------
 2 files changed, 339 insertions(+), 143 deletions(-)

diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp
index d05c195d077..d5b74da6773 100644
--- a/cpp/benchmarks/text/jaccard.cpp
+++ b/cpp/benchmarks/text/jaccard.cpp
@@ -59,6 +59,6 @@ static void bench_jaccard(nvbench::state& state)
 
 NVBENCH_BENCH(bench_jaccard)
   .set_name("jaccard")
-  .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144})
-  .add_int64_axis("row_width", {128, 512, 2048})
+  .add_int64_axis("num_rows", {32768, 131072, 262144})
+  .add_int64_axis("row_width", {128, 512, 1024, 2048})
   .add_int64_axis("substring_width", {5, 10});
diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu
index 9cf934165f6..e465fb79c89 100644
--- a/cpp/src/text/jaccard.cu
+++ b/cpp/src/text/jaccard.cu
@@ -19,16 +19,19 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
-#include <cudf/lists/lists_column_view.hpp>
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <nvtext/detail/generate_ngrams.hpp>
 #include <nvtext/jaccard.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
@@ -36,127 +39,375 @@
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/scan.h>
+#include <thrust/sequence.h>
 #include <thrust/transform.h>
 
 namespace nvtext {
 namespace detail {
 namespace {
 
+constexpr cudf::thread_index_type block_size       = 256;
+constexpr cudf::thread_index_type bytes_per_thread = 4;
+
 /**
  * @brief Retrieve the row data (span) for the given column/row-index
  *
- * @param d_input Input lists column
+ * @param values Flat vector of all values
+ * @param offsets Offsets identifying rows within values
  * @param idx Row index to retrieve
  * @return A device-span of the row values
  */
-__device__ auto get_row(cudf::column_device_view const& d_input, cudf::size_type idx)
+__device__ auto get_row(uint32_t const* values, int64_t const* offsets, cudf::size_type row_idx)
 {
-  auto const offsets =
-    d_input.child(cudf::lists_column_view::offsets_column_index).data<cudf::size_type>();
-  auto const offset = offsets[idx];
-  auto const size   = offsets[idx + 1] - offset;
-  auto const begin =
-    d_input.child(cudf::lists_column_view::child_column_index).data<uint32_t>() + offset;
+  auto const offset = offsets[row_idx];
+  auto const size   = offsets[row_idx + 1] - offset;
+  auto const begin  = values + offset;
   return cudf::device_span<uint32_t const>(begin, size);
 }
 
 /**
- * @brief Count the unique values within each row of the input column
+ * @brief Kernel to count the unique values within each row of the input column
+ *
+ * This is called with a warp per row.
  *
- * This is called with a warp per row
+ * @param d_values Sorted hash values to count uniqueness
+ * @param d_offsets Offsets to each set of row elements in d_values
+ * @param rows Number of rows in the output
+ * @param d_results Number of unique values in each row
  */
-struct sorted_unique_fn {
-  cudf::column_device_view const d_input;
-  cudf::size_type* d_results;
+CUDF_KERNEL void sorted_unique_fn(uint32_t const* d_values,
+                                  int64_t const* d_offsets,
+                                  cudf::size_type rows,
+                                  cudf::size_type* d_results)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= (static_cast<cudf::thread_index_type>(rows) * cudf::detail::warp_size)) { return; }
 
-  // warp per row
-  __device__ void operator()(cudf::size_type idx) const
-  {
-    using warp_reduce = cub::WarpReduce<cudf::size_type>;
-    __shared__ typename warp_reduce::TempStorage temp_storage;
+  using warp_reduce = cub::WarpReduce<cudf::size_type>;
+  __shared__ typename warp_reduce::TempStorage temp_storage;
 
-    auto const row_idx  = idx / cudf::detail::warp_size;
-    auto const lane_idx = idx % cudf::detail::warp_size;
-    auto const row      = get_row(d_input, row_idx);
-    auto const begin    = row.begin();
+  auto const row_idx  = idx / cudf::detail::warp_size;
+  auto const lane_idx = idx % cudf::detail::warp_size;
+  auto const row      = get_row(d_values, d_offsets, row_idx);
+  auto const begin    = row.begin();
 
-    cudf::size_type count = 0;
-    for (auto itr = begin + lane_idx; itr < row.end(); itr += cudf::detail::warp_size) {
-      count += (itr == begin || *itr != *(itr - 1));
-    }
-    auto const result = warp_reduce(temp_storage).Sum(count);
-    if (lane_idx == 0) { d_results[row_idx] = result; }
+  cudf::size_type count = 0;
+  for (auto itr = begin + lane_idx; itr < row.end(); itr += cudf::detail::warp_size) {
+    count += (itr == begin || *itr != *(itr - 1));
   }
-};
+  auto const result = warp_reduce(temp_storage).Sum(count);
+  if (lane_idx == 0) { d_results[row_idx] = result; }
+}
 
-rmm::device_uvector<cudf::size_type> compute_unique_counts(cudf::column_view const& input,
+/**
+ * @brief Count the unique values within each row of the input column
+ *
+ * @param values Sorted hash values to count uniqueness
+ * @param offsets Offsets to each set of row elements in d_values
+ * @param rows Number of rows in the output
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return Number of unique values
+ */
+rmm::device_uvector<cudf::size_type> compute_unique_counts(uint32_t const* values,
+                                                           int64_t const* offsets,
+                                                           cudf::size_type rows,
                                                            rmm::cuda_stream_view stream)
 {
-  auto const d_input = cudf::column_device_view::create(input, stream);
-  auto d_results     = rmm::device_uvector<cudf::size_type>(input.size(), stream);
-  sorted_unique_fn fn{*d_input, d_results.data()};
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::counting_iterator<cudf::size_type>(0),
-                     input.size() * cudf::detail::warp_size,
-                     fn);
+  auto d_results        = rmm::device_uvector<cudf::size_type>(rows, stream);
+  auto const num_blocks = cudf::util::div_rounding_up_safe(
+    static_cast<cudf::thread_index_type>(rows) * cudf::detail::warp_size, block_size);
+  sorted_unique_fn<<<num_blocks, block_size, 0, stream.value()>>>(
+    values, offsets, rows, d_results.data());
   return d_results;
 }
 
+/**
+ * @brief Kernel to count the number of common values within each row of the 2 input columns
+ *
+ * This is called with a warp per row.
+ *
+ * @param d_values1 Sorted hash values to check against d_values2
+ * @param d_offsets1 Offsets to each set of row elements in d_values1
+ * @param d_values2 Sorted hash values to check against d_values1
+ * @param d_offsets2 Offsets to each set of row elements in d_values2
+ * @param rows Number of rows in the output
+ * @param d_results Number of common values in each row
+ */
+CUDF_KERNEL void sorted_intersect_fn(uint32_t const* d_values1,
+                                     int64_t const* d_offsets1,
+                                     uint32_t const* d_values2,
+                                     int64_t const* d_offsets2,
+                                     cudf::size_type rows,
+                                     cudf::size_type* d_results)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= (static_cast<cudf::thread_index_type>(rows) * cudf::detail::warp_size)) { return; }
+
+  using warp_reduce = cub::WarpReduce<cudf::size_type>;
+  __shared__ typename warp_reduce::TempStorage temp_storage;
+
+  auto const row_idx  = idx / cudf::detail::warp_size;
+  auto const lane_idx = idx % cudf::detail::warp_size;
+
+  auto const needles  = get_row(d_values1, d_offsets1, row_idx);
+  auto const haystack = get_row(d_values2, d_offsets2, row_idx);
+
+  auto begin     = haystack.begin();
+  auto const end = haystack.end();
+
+  cudf::size_type count = 0;
+  for (auto itr = needles.begin() + lane_idx; itr < needles.end() && begin < end;
+       itr += cudf::detail::warp_size) {
+    if (itr != needles.begin() && *itr == *(itr - 1)) { continue; }  // skip duplicates
+    // search haystack for this needle (*itr)
+    auto const found = thrust::lower_bound(thrust::seq, begin, end, *itr);
+    count += (found != end) && (*found == *itr);  // increment if found;
+    begin = found;                                // shorten the next lower-bound range
+  }
+  // sum up the counts across this warp
+  auto const result = warp_reduce(temp_storage).Sum(count);
+  if (lane_idx == 0) { d_results[row_idx] = result; }
+}
+
 /**
  * @brief Count the number of common values within each row of the 2 input columns
  *
- * This is called with a warp per row
+ * @param d_values1 Sorted hash values to check against d_values2
+ * @param d_offsets1 Offsets to each set of row elements in d_values1
+ * @param d_values2 Sorted hash values to check against d_values1
+ * @param d_offsets2 Offsets to each set of row elements in d_values2
+ * @param rows Number of rows in the output
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return Number of common values
  */
-struct sorted_intersect_fn {
-  cudf::column_device_view const d_input1;
-  cudf::column_device_view const d_input2;
-  cudf::size_type* d_results;
+rmm::device_uvector<cudf::size_type> compute_intersect_counts(uint32_t const* values1,
+                                                              int64_t const* offsets1,
+                                                              uint32_t const* values2,
+                                                              int64_t const* offsets2,
+                                                              cudf::size_type rows,
+                                                              rmm::cuda_stream_view stream)
+{
+  auto d_results        = rmm::device_uvector<cudf::size_type>(rows, stream);
+  auto const num_blocks = cudf::util::div_rounding_up_safe(
+    static_cast<cudf::thread_index_type>(rows) * cudf::detail::warp_size, block_size);
+  sorted_intersect_fn<<<num_blocks, block_size, 0, stream.value()>>>(
+    values1, offsets1, values2, offsets2, rows, d_results.data());
+  return d_results;
+}
 
-  // warp per row
-  __device__ void operator()(cudf::size_type idx) const
-  {
-    using warp_reduce = cub::WarpReduce<cudf::size_type>;
-    __shared__ typename warp_reduce::TempStorage temp_storage;
+/**
+ * @brief Counts the number of substrings in each row of the given strings column
+ *
+ * Each warp processes a single string.
+ * Formula is `count = max(1, str.length() - width + 1)`
+ * If a string has less than width characters (but not empty), the count is 1
+ * since the entire string is still hashed.
+ *
+ * @param d_strings Input column of strings
+ * @param width Substring size in characters
+ * @param d_counts Output number of substring per row of input
+ */
+CUDF_KERNEL void count_substrings_kernel(cudf::column_device_view const d_strings,
+                                         cudf::size_type width,
+                                         int64_t* d_counts)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= (static_cast<cudf::thread_index_type>(d_strings.size()) * cudf::detail::warp_size)) {
+    return;
+  }
 
-    auto const row_idx  = idx / cudf::detail::warp_size;
-    auto const lane_idx = idx % cudf::detail::warp_size;
+  auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
+  if (d_strings.is_null(str_idx)) {
+    d_counts[str_idx] = 0;
+    return;
+  }
 
-    auto const needles  = get_row(d_input1, row_idx);
-    auto const haystack = get_row(d_input2, row_idx);
+  auto const d_str = d_strings.element<cudf::string_view>(str_idx);
+  if (d_str.empty()) {
+    d_counts[str_idx] = 0;
+    return;
+  }
 
-    auto begin     = haystack.begin();
-    auto const end = haystack.end();
+  using warp_reduce = cub::WarpReduce<cudf::size_type>;
+  __shared__ typename warp_reduce::TempStorage temp_storage;
 
-    // TODO: investigate cuCollections device-side static-map to match row values
+  auto const end        = d_str.data() + d_str.size_bytes();
+  auto const lane_idx   = idx % cudf::detail::warp_size;
+  cudf::size_type count = 0;
+  for (auto itr = d_str.data() + (lane_idx * bytes_per_thread); itr < end;
+       itr += cudf::detail::warp_size * bytes_per_thread) {
+    for (auto s = itr; (s < (itr + bytes_per_thread)) && (s < end); ++s) {
+      count += static_cast<cudf::size_type>(cudf::strings::detail::is_begin_utf8_char(*s));
+    }
+  }
+  auto const char_count = warp_reduce(temp_storage).Sum(count);
+  if (lane_idx == 0) { d_counts[str_idx] = std::max(1, char_count - width + 1); }
+}
+
+/**
+ * @brief Kernel to hash the substrings for each input row
+ *
+ * Each warp processes a single string.
+ * Substrings of string "hello world" with width=4 produce:
+ *   "hell", "ello", "llo ", "lo w", "o wo", " wor", "worl", "orld"
+ * Each of these substrings is hashed and the hash stored in d_results
+ *
+ * @param d_strings Input column of strings
+ * @param width Substring size in characters
+ * @param d_output_offsets Offsets into d_results
+ * @param d_results Hash values for each substring
+ */
+CUDF_KERNEL void substring_hash_kernel(cudf::column_device_view const d_strings,
+                                       cudf::size_type width,
+                                       int64_t const* d_output_offsets,
+                                       uint32_t* d_results)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= (static_cast<cudf::thread_index_type>(d_strings.size()) * cudf::detail::warp_size)) {
+    return;
+  }
 
-    cudf::size_type count = 0;
-    for (auto itr = needles.begin() + lane_idx; itr < needles.end() && begin < end;
-         itr += cudf::detail::warp_size) {
-      if (itr != needles.begin() && *itr == *(itr - 1)) { continue; }  // skip duplicates
-      // search haystack for this needle (*itr)
-      auto const found = thrust::lower_bound(thrust::seq, begin, end, *itr);
-      count += (found != end) && (*found == *itr);  // increment if found;
-      begin = found;                                // shorten the next lower-bound range
+  auto const str_idx  = idx / cudf::detail::warp_size;
+  auto const lane_idx = idx % cudf::detail::warp_size;
+
+  if (d_strings.is_null(str_idx)) { return; }
+  auto const d_str = d_strings.element<cudf::string_view>(str_idx);
+  if (d_str.empty()) { return; }
+
+  __shared__ uint32_t hvs[block_size];  // temp store for hash values
+
+  auto const hasher     = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{0};
+  auto const end        = d_str.data() + d_str.size_bytes();
+  auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1;
+
+  auto d_hashes = d_results + d_output_offsets[str_idx];
+  auto itr      = d_str.data() + lane_idx;
+  for (auto i = 0; i < warp_count; ++i) {
+    uint32_t hash = 0;
+    if (itr < end && cudf::strings::detail::is_begin_utf8_char(*itr)) {
+      // resolve substring
+      auto const sub_str =
+        cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
+      auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(sub_str, width);
+      // hash only if we have the full width of characters or this is the beginning of the string
+      if ((left == 0) || (itr == d_str.data())) { hash = hasher(cudf::string_view(itr, bytes)); }
     }
-    // sum up the counts across this warp
-    auto const result = warp_reduce(temp_storage).Sum(count);
-    if (lane_idx == 0) { d_results[row_idx] = result; }
+    hvs[threadIdx.x] = hash;  // store hash into shared memory
+    __syncwarp();
+    if (lane_idx == 0) {
+      // copy valid hash values for this warp into d_hashes
+      auto const hashes     = &hvs[threadIdx.x];
+      auto const hashes_end = hashes + cudf::detail::warp_size;
+      d_hashes =
+        thrust::copy_if(thrust::seq, hashes, hashes_end, d_hashes, [](auto h) { return h != 0; });
+    }
+    __syncwarp();
+    itr += cudf::detail::warp_size;
   }
-};
+}
 
-rmm::device_uvector<cudf::size_type> compute_intersect_counts(cudf::column_view const& input1,
-                                                              cudf::column_view const& input2,
-                                                              rmm::cuda_stream_view stream)
+void segmented_sort(uint32_t const* input,
+                    uint32_t* output,
+                    int64_t items,
+                    cudf::size_type segments,
+                    int64_t const* offsets,
+                    rmm::cuda_stream_view stream)
 {
-  auto const d_input1 = cudf::column_device_view::create(input1, stream);
-  auto const d_input2 = cudf::column_device_view::create(input2, stream);
-  auto d_results      = rmm::device_uvector<cudf::size_type>(input1.size(), stream);
-  sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()};
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::counting_iterator<cudf::size_type>(0),
-                     input1.size() * cudf::detail::warp_size,
-                     fn);
-  return d_results;
+  rmm::device_buffer temp;
+  std::size_t temp_bytes = 0;
+  cub::DeviceSegmentedSort::SortKeys(
+    temp.data(), temp_bytes, input, output, items, segments, offsets, offsets + 1, stream.value());
+  temp = rmm::device_buffer(temp_bytes, stream);
+  cub::DeviceSegmentedSort::SortKeys(
+    temp.data(), temp_bytes, input, output, items, segments, offsets, offsets + 1, stream.value());
+}
+
+/**
+ * @brief Create hashes for each substring
+ *
+ * The hashes are sorted using a segmented-sort as setup to
+ * perform the unique and intersect operations.
+ *
+ * @param input Input strings column to hash
+ * @param width Substring width in characters
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return The sorted hash values and offsets to each row
+ */
+std::pair<rmm::device_uvector<uint32_t>, rmm::device_uvector<int64_t>> hash_substrings(
+  cudf::strings_column_view const& input, cudf::size_type width, rmm::cuda_stream_view stream)
+{
+  auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
+
+  // count substrings
+  auto offsets          = rmm::device_uvector<int64_t>(input.size() + 1, stream);
+  auto const num_blocks = cudf::util::div_rounding_up_safe(
+    static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size);
+  count_substrings_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
+    *d_strings, width, offsets.data());
+  auto const total_hashes =
+    cudf::detail::sizes_to_offsets(offsets.begin(), offsets.end(), offsets.begin(), stream);
+
+  // hash substrings
+  rmm::device_uvector<uint32_t> hashes(total_hashes, stream);
+  substring_hash_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
+    *d_strings, width, offsets.data(), hashes.data());
+
+  // sort hashes
+  rmm::device_uvector<uint32_t> sorted(total_hashes, stream);
+  if (total_hashes < static_cast<int64_t>(std::numeric_limits<int>::max())) {
+    segmented_sort(
+      hashes.begin(), sorted.begin(), sorted.size(), input.size(), offsets.begin(), stream);
+  } else {
+    // The CUB segmented sort can only handle max<int> total values
+    // so this code calls it in sections.
+    auto const section_size   = std::numeric_limits<int>::max() / 2L;
+    auto const sort_sections  = cudf::util::div_rounding_up_safe(total_hashes, section_size);
+    auto const offset_indices = [&] {
+      // build a set of indices that point to offsets subsections
+      auto sub_offsets = rmm::device_uvector<int64_t>(sort_sections + 1, stream);
+      thrust::sequence(
+        rmm::exec_policy(stream), sub_offsets.begin(), sub_offsets.end(), 0L, section_size);
+      auto indices = rmm::device_uvector<int64_t>(sub_offsets.size(), stream);
+      thrust::lower_bound(rmm::exec_policy(stream),
+                          offsets.begin(),
+                          offsets.end(),
+                          sub_offsets.begin(),
+                          sub_offsets.end(),
+                          indices.begin());
+      return cudf::detail::make_std_vector_sync(indices, stream);
+    }();
+
+    // Call segmented sort with the sort sections
+    for (auto i = 0L; i < sort_sections; ++i) {
+      auto const index1 = offset_indices[i];
+      auto const index2 = std::min(offset_indices[i + 1], static_cast<int64_t>(offsets.size() - 1));
+      auto const offset1 = offsets.element(index1, stream);
+      auto const offset2 = offsets.element(index2, stream);
+
+      auto const num_items    = offset2 - offset1;
+      auto const num_segments = index2 - index1;
+
+      // There is a bug in the CUB segmented sort and the workaround is to
+      // shift the offset values so the first offset is 0.
+      // This transform can be removed once the bug is fixed.
+      auto sort_offsets = rmm::device_uvector<int64_t>(num_segments + 1, stream);
+      thrust::transform(rmm::exec_policy(stream),
+                        offsets.begin() + index1,
+                        offsets.begin() + index2 + 1,
+                        sort_offsets.begin(),
+                        [offset1] __device__(auto const o) { return o - offset1; });
+
+      segmented_sort(hashes.begin() + offset1,
+                     sorted.begin() + offset1,
+                     num_items,
+                     num_segments,
+                     sort_offsets.begin(),
+                     stream);
+    }
+  }
+  return std::make_pair(std::move(sorted), std::move(offsets));
 }
 
 /**
@@ -186,62 +437,6 @@ struct jaccard_fn {
   }
 };
 
-/**
- * @brief Create hashes for each substring
- *
- * Uses the hash_character_ngrams to hash substrings of the input column.
- * This returns a lists column where each row is the hashes for the substrings
- * of the corresponding input string row.
- *
- * The hashes are then sorted using a segmented-sort as setup to
- * perform the unique and intersect operations.
- */
-std::unique_ptr<cudf::column> hash_substrings(cudf::strings_column_view const& col,
-                                              cudf::size_type width,
-                                              rmm::cuda_stream_view stream)
-{
-  auto hashes = hash_character_ngrams(col, width, stream, rmm::mr::get_current_device_resource());
-  auto const input   = cudf::lists_column_view(hashes->view());
-  auto const offsets = input.offsets_begin();
-  auto const data    = input.child().data<uint32_t>();
-
-  rmm::device_uvector<uint32_t> sorted(input.child().size(), stream);
-
-  // this is wicked fast and much faster than using cudf::lists::detail::sort_list
-  rmm::device_buffer d_temp_storage;
-  size_t temp_storage_bytes = 0;
-  cub::DeviceSegmentedSort::SortKeys(d_temp_storage.data(),
-                                     temp_storage_bytes,
-                                     data,
-                                     sorted.data(),
-                                     sorted.size(),
-                                     input.size(),
-                                     offsets,
-                                     offsets + 1,
-                                     stream.value());
-  d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream};
-  cub::DeviceSegmentedSort::SortKeys(d_temp_storage.data(),
-                                     temp_storage_bytes,
-                                     data,
-                                     sorted.data(),
-                                     sorted.size(),
-                                     input.size(),
-                                     offsets,
-                                     offsets + 1,
-                                     stream.value());
-
-  auto contents = hashes->release();
-  // the offsets are taken from the hashes column since they are the same
-  // before and after the segmented-sort
-  return cudf::make_lists_column(
-    col.size(),
-    std::move(contents.children.front()),
-    std::make_unique<cudf::column>(std::move(sorted), rmm::device_buffer{}, 0),
-    0,
-    rmm::device_buffer{},
-    stream,
-    rmm::mr::get_current_device_resource());
-}
 }  // namespace
 
 std::unique_ptr<cudf::column> jaccard_index(cudf::strings_column_view const& input1,
@@ -261,13 +456,14 @@ std::unique_ptr<cudf::column> jaccard_index(cudf::strings_column_view const& inp
 
   auto const [d_uniques1, d_uniques2, d_intersects] = [&] {
     // build hashes of the substrings
-    auto const hash1 = hash_substrings(input1, width, stream);
-    auto const hash2 = hash_substrings(input2, width, stream);
+    auto const [hash1, offsets1] = hash_substrings(input1, width, stream);
+    auto const [hash2, offsets2] = hash_substrings(input2, width, stream);
 
     // compute the unique counts in each set and the intersection counts
-    auto d_uniques1   = compute_unique_counts(hash1->view(), stream);
-    auto d_uniques2   = compute_unique_counts(hash2->view(), stream);
-    auto d_intersects = compute_intersect_counts(hash1->view(), hash2->view(), stream);
+    auto d_uniques1   = compute_unique_counts(hash1.data(), offsets1.data(), input1.size(), stream);
+    auto d_uniques2   = compute_unique_counts(hash2.data(), offsets2.data(), input2.size(), stream);
+    auto d_intersects = compute_intersect_counts(
+      hash1.data(), offsets1.data(), hash2.data(), offsets2.data(), input1.size(), stream);
 
     return std::tuple{std::move(d_uniques1), std::move(d_uniques2), std::move(d_intersects)};
   }();

From 4acca4d57303f52907aa158a2ef996c9d42a73d6 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 18 Jul 2024 11:07:07 -1000
Subject: [PATCH 2/2] Use Column.can_cast_safely instead of some ad-hoc dtype
 functions in .where (#16303)

There were a couple of dedicated functions in `python/cudf/cudf/utils/dtypes.py` specific to `.where` that could be subsumed by `Column.can_cast_safely`.

The minor downside is that we need to cast where's argument to a Column first, but IMO it's probably OK given the deduplication

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16303
---
 python/cudf/cudf/core/_internals/where.py | 78 ++++++++++++++----
 python/cudf/cudf/utils/dtypes.py          | 96 +----------------------
 2 files changed, 62 insertions(+), 112 deletions(-)

diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py
index 4a36be76b6d..6003a0f6aea 100644
--- a/python/cudf/cudf/core/_internals/where.py
+++ b/python/cudf/cudf/core/_internals/where.py
@@ -9,12 +9,7 @@
 import cudf
 from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
 from cudf.core.dtypes import CategoricalDtype
-from cudf.utils.dtypes import (
-    _can_cast,
-    _dtype_can_hold_element,
-    find_common_type,
-    is_mixed_with_object_dtype,
-)
+from cudf.utils.dtypes import find_common_type, is_mixed_with_object_dtype
 
 if TYPE_CHECKING:
     from cudf._typing import ScalarLike
@@ -44,6 +39,8 @@ def _check_and_cast_columns_with_other(
     inplace: bool,
 ) -> tuple[ColumnBase, ScalarLike | ColumnBase]:
     # Returns type-casted `source_col` & `other` based on `inplace`.
+    from cudf.core.column import as_column
+
     source_dtype = source_col.dtype
     if isinstance(source_dtype, CategoricalDtype):
         return _normalize_categorical(source_col, other)
@@ -84,17 +81,9 @@ def _check_and_cast_columns_with_other(
             )
         return _normalize_categorical(source_col, other.astype(source_dtype))
 
-    if (
-        _is_non_decimal_numeric_dtype(source_dtype)
-        and not other_is_scalar  # can-cast fails for Python scalars
-        and _can_cast(other, source_dtype)
-    ):
-        common_dtype = source_dtype
-    elif (
-        isinstance(source_col, cudf.core.column.NumericalColumn)
-        and other_is_scalar
-        and _dtype_can_hold_element(source_dtype, other)
-    ):
+    if _is_non_decimal_numeric_dtype(source_dtype) and as_column(
+        other
+    ).can_cast_safely(source_dtype):
         common_dtype = source_dtype
     else:
         common_dtype = find_common_type(
@@ -130,3 +119,58 @@ def _make_categorical_like(result, column):
             ordered=column.ordered,
         )
     return result
+
+
+def _can_cast(from_dtype, to_dtype):
+    """
+    Utility function to determine if we can cast
+    from `from_dtype` to `to_dtype`. This function primarily calls
+    `np.can_cast` but with some special handling around
+    cudf specific dtypes.
+    """
+    if cudf.utils.utils.is_na_like(from_dtype):
+        return True
+    if isinstance(from_dtype, type):
+        from_dtype = cudf.dtype(from_dtype)
+    if isinstance(to_dtype, type):
+        to_dtype = cudf.dtype(to_dtype)
+
+    # TODO : Add precision & scale checking for
+    # decimal types in future
+
+    if isinstance(from_dtype, cudf.core.dtypes.DecimalDtype):
+        if isinstance(to_dtype, cudf.core.dtypes.DecimalDtype):
+            return True
+        elif isinstance(to_dtype, np.dtype):
+            if to_dtype.kind in {"i", "f", "u", "U", "O"}:
+                return True
+            else:
+                return False
+    elif isinstance(from_dtype, np.dtype):
+        if isinstance(to_dtype, np.dtype):
+            return np.can_cast(from_dtype, to_dtype)
+        elif isinstance(to_dtype, cudf.core.dtypes.DecimalDtype):
+            if from_dtype.kind in {"i", "f", "u", "U", "O"}:
+                return True
+            else:
+                return False
+        elif isinstance(to_dtype, cudf.core.types.CategoricalDtype):
+            return True
+        else:
+            return False
+    elif isinstance(from_dtype, cudf.core.dtypes.ListDtype):
+        # TODO: Add level based checks too once casting of
+        # list columns is supported
+        if isinstance(to_dtype, cudf.core.dtypes.ListDtype):
+            return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type)
+        else:
+            return False
+    elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype):
+        if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype):
+            return True
+        elif isinstance(to_dtype, np.dtype):
+            return np.can_cast(from_dtype._categories.dtype, to_dtype)
+        else:
+            return False
+    else:
+        return np.can_cast(from_dtype, to_dtype)
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 59e5ec1df04..af912bee342 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -10,8 +10,6 @@
 from pandas.core.dtypes.common import infer_dtype_from_object
 
 import cudf
-from cudf._typing import DtypeObj
-from cudf.api.types import is_bool, is_float, is_integer
 
 """Map numpy dtype to pyarrow types.
 Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special
@@ -584,61 +582,6 @@ def _dtype_pandas_compatible(dtype):
     return dtype
 
 
-def _can_cast(from_dtype, to_dtype):
-    """
-    Utility function to determine if we can cast
-    from `from_dtype` to `to_dtype`. This function primarily calls
-    `np.can_cast` but with some special handling around
-    cudf specific dtypes.
-    """
-    if cudf.utils.utils.is_na_like(from_dtype):
-        return True
-    if isinstance(from_dtype, type):
-        from_dtype = cudf.dtype(from_dtype)
-    if isinstance(to_dtype, type):
-        to_dtype = cudf.dtype(to_dtype)
-
-    # TODO : Add precision & scale checking for
-    # decimal types in future
-
-    if isinstance(from_dtype, cudf.core.dtypes.DecimalDtype):
-        if isinstance(to_dtype, cudf.core.dtypes.DecimalDtype):
-            return True
-        elif isinstance(to_dtype, np.dtype):
-            if to_dtype.kind in {"i", "f", "u", "U", "O"}:
-                return True
-            else:
-                return False
-    elif isinstance(from_dtype, np.dtype):
-        if isinstance(to_dtype, np.dtype):
-            return np.can_cast(from_dtype, to_dtype)
-        elif isinstance(to_dtype, cudf.core.dtypes.DecimalDtype):
-            if from_dtype.kind in {"i", "f", "u", "U", "O"}:
-                return True
-            else:
-                return False
-        elif isinstance(to_dtype, cudf.core.types.CategoricalDtype):
-            return True
-        else:
-            return False
-    elif isinstance(from_dtype, cudf.core.dtypes.ListDtype):
-        # TODO: Add level based checks too once casting of
-        # list columns is supported
-        if isinstance(to_dtype, cudf.core.dtypes.ListDtype):
-            return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type)
-        else:
-            return False
-    elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype):
-        if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype):
-            return True
-        elif isinstance(to_dtype, np.dtype):
-            return np.can_cast(from_dtype._categories.dtype, to_dtype)
-        else:
-            return False
-    else:
-        return np.can_cast(from_dtype, to_dtype)
-
-
 def _maybe_convert_to_default_type(dtype):
     """Convert `dtype` to default if specified by user.
 
@@ -661,44 +604,7 @@ def _maybe_convert_to_default_type(dtype):
     return dtype
 
 
-def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool:
-    if not len(rng):
-        return True
-    return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype)
-
-
-def _dtype_can_hold_element(dtype: np.dtype, element) -> bool:
-    if dtype.kind in {"i", "u"}:
-        if isinstance(element, range):
-            if _dtype_can_hold_range(element, dtype):
-                return True
-            return False
-
-        elif is_integer(element) or (
-            is_float(element) and element.is_integer()
-        ):
-            info = np.iinfo(dtype)
-            if info.min <= element <= info.max:
-                return True
-            return False
-
-    elif dtype.kind == "f":
-        if is_integer(element) or is_float(element):
-            casted = dtype.type(element)
-            if np.isnan(casted) or casted == element:
-                return True
-            # otherwise e.g. overflow see TestCoercionFloat32
-            return False
-
-    elif dtype.kind == "b":
-        if is_bool(element):
-            return True
-        return False
-
-    raise NotImplementedError(f"Unsupported dtype: {dtype}")
-
-
-def _get_base_dtype(dtype: DtypeObj) -> DtypeObj:
+def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype:
     # TODO: replace the use of this function with just `dtype.base`
     # when Pandas 2.1.0 is the minimum version we support:
     # https://github.com/pandas-dev/pandas/pull/52706