From cd6d25065c0b1606b30ecf6b5fbdbb46760004d5 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 30 Nov 2022 11:10:44 -0500
Subject: [PATCH 1/7] Rework nvtext::detokenize to use indexalator for row
 indices

---
 cpp/src/text/detokenize.cu | 121 +++++++++++++------------------------
 1 file changed, 41 insertions(+), 80 deletions(-)
diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu
index 76994dd0da4..3b40a6e051b 100644
--- a/cpp/src/text/detokenize.cu
+++ b/cpp/src/text/detokenize.cu
@@ -19,9 +19,9 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/get_value.cuh>
+#include <cudf/detail/indexalator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/sorting.hpp>
+#include <cudf/detail/sorting.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
@@ -29,17 +29,13 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/traits.hpp>
-#include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/count.h>
-#include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
 
 namespace nvtext {
 namespace detail {
@@ -55,10 +51,10 @@ struct detokenizer_fn {
   int32_t const* d_row_map;                  // indices sorted by output row
   cudf::size_type const* d_token_offsets;    // to each input token array
   cudf::string_view const d_separator;       // append after each token
-  int32_t const* d_offsets{};                // offsets to output buffer d_chars
+  cudf::size_type* d_offsets{};              // offsets to output buffer d_chars
   char* d_chars{};                           // output buffer for characters
 
-  __device__ cudf::size_type operator()(cudf::size_type idx)
+  __device__ void operator()(cudf::size_type idx)
   {
     auto const offset      = d_token_offsets[idx];
     auto d_tokens          = d_row_map + offset;
@@ -78,61 +74,41 @@ struct detokenizer_fn {
         nbytes += d_separator.size_bytes();
       }
     }
-    return (nbytes > 0) ? (nbytes - d_separator.size_bytes()) : 0;
+    if (!d_chars) { d_offsets[idx] = (nbytes > 0) ? (nbytes - d_separator.size_bytes()) : 0; }
   }
 };
 
-/**
- * @brief Identifies indexes where the row value changes.
- */
-template <typename IndexType>
 struct index_changed_fn {
-  IndexType const* d_rows;
-  int32_t const* d_row_map;
-  __device__ bool operator()(cudf::size_type idx)
+  cudf::detail::input_indexalator const d_rows;
+  cudf::size_type const* d_row_map;
+  __device__ bool operator()(cudf::size_type idx) const
   {
     return (idx == 0) || (d_rows[d_row_map[idx]] != d_rows[d_row_map[idx - 1]]);
   }
 };
 
-/**
- * @brief This is a type-dispatch function to convert the row indices
- * into token offsets.
- */
-struct token_row_offsets_fn {
-  cudf::column_view const row_indices;
-  cudf::column_view const sorted_indices;
-  cudf::size_type const tokens_counts;
-
-  template <typename T, std::enable_if_t<cudf::is_index_type<T>()>* = nullptr>
-  std::unique_ptr<rmm::device_uvector<cudf::size_type>> operator()(
-    rmm::cuda_stream_view stream) const
-  {
-    index_changed_fn<T> pfn{row_indices.data<T>(), sorted_indices.template data<int32_t>()};
-    auto const output_count =
-      thrust::count_if(rmm::exec_policy(stream),
-                       thrust::make_counting_iterator<cudf::size_type>(0),
-                       thrust::make_counting_iterator<cudf::size_type>(tokens_counts),
-                       pfn);
-    auto tokens_offsets =
-      std::make_unique<rmm::device_uvector<cudf::size_type>>(output_count + 1, stream);
-    thrust::copy_if(rmm::exec_policy(stream),
-                    thrust::make_counting_iterator<cudf::size_type>(0),
-                    thrust::make_counting_iterator<cudf::size_type>(tokens_counts),
-                    tokens_offsets->begin(),
-                    pfn);
-    // set the last element to the total number of tokens
-    tokens_offsets->set_element(output_count, tokens_counts, stream);
-    return tokens_offsets;
-  }
-
-  // non-integral types throw an exception
-  template <typename T, typename... Args, std::enable_if_t<not cudf::is_index_type<T>()>* = nullptr>
-  std::unique_ptr<rmm::device_uvector<cudf::size_type>> operator()(Args&&...) const
-  {
-    CUDF_FAIL("The detokenize indices parameter must be an integer type.");
-  }
-};
+rmm::device_uvector<cudf::size_type> create_token_row_offsets(cudf::column_view row_indices,
+                                                              cudf::column_view sorted_indices,
+                                                              cudf::size_type tokens_counts,
+                                                              rmm::cuda_stream_view stream)
+{
+  index_changed_fn pfn{cudf::detail::indexalator_factory::make_input_iterator(row_indices),
+                       sorted_indices.data<cudf::size_type>()};
+  auto const output_count =
+    thrust::count_if(rmm::exec_policy(stream),
+                     thrust::make_counting_iterator<cudf::size_type>(0),
+                     thrust::make_counting_iterator<cudf::size_type>(tokens_counts),
+                     pfn);
+  auto tokens_offsets = rmm::device_uvector<cudf::size_type>(output_count + 1, stream);
+  thrust::copy_if(rmm::exec_policy(stream),
+                  thrust::make_counting_iterator<cudf::size_type>(0),
+                  thrust::make_counting_iterator<cudf::size_type>(tokens_counts),
+                  tokens_offsets.begin(),
+                  pfn);
+  // set the last element to the total number of tokens
+  tokens_offsets.set_element(output_count, tokens_counts, stream);
+  return tokens_offsets;
+}
 
 }  // namespace
 
@@ -155,42 +131,27 @@ std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& string
     return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
 
   auto strings_column = cudf::column_device_view::create(strings.parent(), stream);
-  // the indices may not be in order so we need to sort them
-  auto sorted_rows     = cudf::stable_sorted_order(cudf::table_view({row_indices}));
+  // the indices may not be in order so we need to build a sorted map
+  auto sorted_rows = cudf::detail::stable_sorted_order(
+    cudf::table_view({row_indices}), {}, {}, stream, rmm::mr::get_current_device_resource());
   auto const d_row_map = sorted_rows->view().data<int32_t>();
 
   // create offsets for the tokens for each output string
   auto tokens_offsets =
-    cudf::type_dispatcher(row_indices.type(),
-                          token_row_offsets_fn{row_indices, sorted_rows->view(), tokens_counts},
-                          stream);
-  auto const output_count = tokens_offsets->size() - 1;  // number of output strings
+    create_token_row_offsets(row_indices, sorted_rows->view(), tokens_counts, stream);
+  auto const output_count = tokens_offsets.size() - 1;  // number of output strings
 
-  // create output strings offsets by calculating the size of each output string
   cudf::string_view const d_separator(separator.data(), separator.size());
-  auto offsets_transformer_itr = thrust::make_transform_iterator(
-    thrust::make_counting_iterator<cudf::size_type>(0),
-    detokenizer_fn{*strings_column, d_row_map, tokens_offsets->data(), d_separator});
-  auto offsets_column = cudf::strings::detail::make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + output_count, stream, mr);
-  auto d_offsets = offsets_column->view().data<int32_t>();
-
-  // build the chars column - append each source token to the appropriate output row
-  cudf::size_type const total_bytes =
-    cudf::detail::get_value<int32_t>(offsets_column->view(), output_count, stream);
-  auto chars_column = cudf::strings::detail::create_chars_child_column(total_bytes, stream, mr);
-  auto d_chars      = chars_column->mutable_view().data<char>();
-  thrust::for_each_n(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<cudf::size_type>(0),
+
+  auto children = cudf::strings::detail::make_strings_children(
+    detokenizer_fn{*strings_column, d_row_map, tokens_offsets.data(), d_separator},
     output_count,
-    detokenizer_fn{
-      *strings_column, d_row_map, tokens_offsets->data(), d_separator, d_offsets, d_chars});
-  chars_column->set_null_count(0);
+    stream,
+    mr);
 
   // make the output strings column from the offsets and chars column
   return cudf::make_strings_column(
-    output_count, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{});
+    output_count, std::move(children.first), std::move(children.second), 0, rmm::device_buffer{});
 }
 
 }  // namespace detail

From c92d89f25864abc2de9e8a8c0317b1fe139cdb15 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 1 Dec 2022 08:48:02 -0500
Subject: [PATCH 2/7] add back doxygen for internal utility

---
 cpp/src/text/detokenize.cu | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu
index 3b40a6e051b..e9900a7bcb4 100644
--- a/cpp/src/text/detokenize.cu
+++ b/cpp/src/text/detokenize.cu
@@ -87,6 +87,14 @@ struct index_changed_fn {
   }
 };
 
+/**
+ * @brief Convert the row indices into token offsets
+ *
+ * @param row_indices Indices where each token should land
+ * @param sorted_indices Indices that arrange row_indices in sorted order
+ * @param tokens_counts Token counts for each row
+ * @param stream CUDA stream used for kernel launches
+ */
 rmm::device_uvector<cudf::size_type> create_token_row_offsets(cudf::column_view row_indices,
                                                               cudf::column_view sorted_indices,
                                                               cudf::size_type tokens_counts,

From a407931dddcd94402ff54051372ce6736806268c Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 5 Dec 2022 09:39:33 -0500
Subject: [PATCH 3/7] pass column_view params by const reference

---
 cpp/src/text/detokenize.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu
index e9900a7bcb4..8749bb7c9a7 100644
--- a/cpp/src/text/detokenize.cu
+++ b/cpp/src/text/detokenize.cu
@@ -95,8 +95,8 @@ struct index_changed_fn {
  * @param tokens_counts Token counts for each row
  * @param stream CUDA stream used for kernel launches
  */
-rmm::device_uvector<cudf::size_type> create_token_row_offsets(cudf::column_view row_indices,
-                                                              cudf::column_view sorted_indices,
+rmm::device_uvector<cudf::size_type> create_token_row_offsets(cudf::column_view const& row_indices,
+                                                              cudf::column_view const& sorted_indices,
                                                               cudf::size_type tokens_counts,
                                                               rmm::cuda_stream_view stream)
 {

From e9b3ab0c3e580494e169648db890bb3ee71051c0 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 5 Dec 2022 09:40:14 -0500
Subject: [PATCH 4/7] fix style

---
 cpp/src/text/detokenize.cu | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu
index 8749bb7c9a7..14b86989491 100644
--- a/cpp/src/text/detokenize.cu
+++ b/cpp/src/text/detokenize.cu
@@ -95,10 +95,11 @@ struct index_changed_fn {
  * @param tokens_counts Token counts for each row
  * @param stream CUDA stream used for kernel launches
  */
-rmm::device_uvector<cudf::size_type> create_token_row_offsets(cudf::column_view const& row_indices,
-                                                              cudf::column_view const& sorted_indices,
-                                                              cudf::size_type tokens_counts,
-                                                              rmm::cuda_stream_view stream)
+rmm::device_uvector<cudf::size_type> create_token_row_offsets(
+  cudf::column_view const& row_indices,
+  cudf::column_view const& sorted_indices,
+  cudf::size_type tokens_counts,
+  rmm::cuda_stream_view stream)
 {
   index_changed_fn pfn{cudf::detail::indexalator_factory::make_input_iterator(row_indices),
                        sorted_indices.data<cudf::size_type>()};

From 609e3bb9e6bb5a6f38555cf759f50e306438aea7 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 6 Dec 2022 08:26:32 -0500
Subject: [PATCH 5/7] change row_map decl to size_type

---
 cpp/src/text/detokenize.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu
index 14b86989491..fc4026c3cbb 100644
--- a/cpp/src/text/detokenize.cu
+++ b/cpp/src/text/detokenize.cu
@@ -48,7 +48,7 @@ namespace {
  */
 struct detokenizer_fn {
   cudf::column_device_view const d_strings;  // these are the tokens
-  int32_t const* d_row_map;                  // indices sorted by output row
+  cudf::size_type const* d_row_map;          // indices sorted by output row
   cudf::size_type const* d_token_offsets;    // to each input token array
   cudf::string_view const d_separator;       // append after each token
   cudf::size_type* d_offsets{};              // offsets to output buffer d_chars
@@ -143,7 +143,7 @@ std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& string
   // the indices may not be in order so we need to build a sorted map
   auto sorted_rows = cudf::detail::stable_sorted_order(
     cudf::table_view({row_indices}), {}, {}, stream, rmm::mr::get_current_device_resource());
-  auto const d_row_map = sorted_rows->view().data<int32_t>();
+  auto const d_row_map = sorted_rows->view().data<cudf::size_type>();
 
   // create offsets for the tokens for each output string
   auto tokens_offsets =

From 0998f8df05e6cc150165023fd472ccdabacfa1a8 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 6 Dec 2022 13:33:27 -0500
Subject: [PATCH 6/7] remove count-if call

---
 cpp/src/text/detokenize.cu | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu
index fc4026c3cbb..1a58e53b8a8 100644
--- a/cpp/src/text/detokenize.cu
+++ b/cpp/src/text/detokenize.cu
@@ -101,19 +101,19 @@ rmm::device_uvector<cudf::size_type> create_token_row_offsets(
   cudf::size_type tokens_counts,
   rmm::cuda_stream_view stream)
 {
-  index_changed_fn pfn{cudf::detail::indexalator_factory::make_input_iterator(row_indices),
-                       sorted_indices.data<cudf::size_type>()};
-  auto const output_count =
-    thrust::count_if(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<cudf::size_type>(0),
-                     thrust::make_counting_iterator<cudf::size_type>(tokens_counts),
-                     pfn);
-  auto tokens_offsets = rmm::device_uvector<cudf::size_type>(output_count + 1, stream);
-  thrust::copy_if(rmm::exec_policy(stream),
-                  thrust::make_counting_iterator<cudf::size_type>(0),
-                  thrust::make_counting_iterator<cudf::size_type>(tokens_counts),
-                  tokens_offsets.begin(),
-                  pfn);
+  auto tokens_offsets = rmm::device_uvector<cudf::size_type>(tokens_counts + 1, stream);
+
+  auto end_itr = thrust::copy_if(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator<cudf::size_type>(0),
+    thrust::make_counting_iterator<cudf::size_type>(tokens_counts),
+    tokens_offsets.begin(),
+    index_changed_fn{cudf::detail::indexalator_factory::make_input_iterator(row_indices),
+                     sorted_indices.data<cudf::size_type>()});
+
+  auto const output_count = std::distance(tokens_offsets.begin(), end_itr);
+  tokens_offsets.resize(output_count + 1, stream);
+
   // set the last element to the total number of tokens
   tokens_offsets.set_element(output_count, tokens_counts, stream);
   return tokens_offsets;

From 64deeb1bbadb05f38ef124768fd242711dd34511 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 6 Dec 2022 13:40:57 -0500
Subject: [PATCH 7/7] put count-if back in

---
 cpp/src/text/detokenize.cu | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu
index 1a58e53b8a8..a17583cf649 100644
--- a/cpp/src/text/detokenize.cu
+++ b/cpp/src/text/detokenize.cu
@@ -101,18 +101,22 @@ rmm::device_uvector<cudf::size_type> create_token_row_offsets(
   cudf::size_type tokens_counts,
   rmm::cuda_stream_view stream)
 {
-  auto tokens_offsets = rmm::device_uvector<cudf::size_type>(tokens_counts + 1, stream);
-
-  auto end_itr = thrust::copy_if(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<cudf::size_type>(0),
-    thrust::make_counting_iterator<cudf::size_type>(tokens_counts),
-    tokens_offsets.begin(),
-    index_changed_fn{cudf::detail::indexalator_factory::make_input_iterator(row_indices),
-                     sorted_indices.data<cudf::size_type>()});
-
-  auto const output_count = std::distance(tokens_offsets.begin(), end_itr);
-  tokens_offsets.resize(output_count + 1, stream);
+  index_changed_fn fn{cudf::detail::indexalator_factory::make_input_iterator(row_indices),
+                      sorted_indices.data<cudf::size_type>()};
+
+  auto const output_count =
+    thrust::count_if(rmm::exec_policy(stream),
+                     thrust::make_counting_iterator<cudf::size_type>(0),
+                     thrust::make_counting_iterator<cudf::size_type>(tokens_counts),
+                     fn);
+
+  auto tokens_offsets = rmm::device_uvector<cudf::size_type>(output_count + 1, stream);
+
+  thrust::copy_if(rmm::exec_policy(stream),
+                  thrust::make_counting_iterator<cudf::size_type>(0),
+                  thrust::make_counting_iterator<cudf::size_type>(tokens_counts),
+                  tokens_offsets.begin(),
+                  fn);
 
   // set the last element to the total number of tokens
   tokens_offsets.set_element(output_count, tokens_counts, stream);