From e7f9db81eb698ba9ecdc999529e62c39f387d49d Mon Sep 17 00:00:00 2001
From: David Wendt
Date: Thu, 18 Jan 2024 14:03:11 -0500
Subject: [PATCH 1/3] Use offsetalator in nvtext tokenize functions

---
 cpp/src/text/tokenize.cu                | 56 +++++++++++--------------
 cpp/src/text/utilities/tokenize_ops.cuh | 20 ++++-----
 2 files changed, 35 insertions(+), 41 deletions(-)

diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu
index c43b9dda56c..97896f20f4f 100644
--- a/cpp/src/text/tokenize.cu
+++ b/cpp/src/text/tokenize.cu
@@ -22,8 +22,8 @@
 #include
 #include
 #include
-#include
 #include
+#include
 #include
 #include
 #include
@@ -38,7 +38,6 @@
 #include
 #include
 #include
-#include
 #include

 namespace nvtext {
@@ -80,18 +79,17 @@ std::unique_ptr<cudf::column> tokenize_fn(cudf::size_type strings_count,
     token_count_fn(strings_count, tokenizer, stream, rmm::mr::get_current_device_resource());
   auto d_token_counts = token_counts->view();
   // create token-index offsets from the counts
-  rmm::device_uvector<cudf::size_type> token_offsets(strings_count + 1, stream);
-  thrust::inclusive_scan(rmm::exec_policy(stream),
-                         d_token_counts.template begin<cudf::size_type>(),
-                         d_token_counts.template end<cudf::size_type>(),
-                         token_offsets.begin() + 1);
-  token_offsets.set_element_to_zero_async(0, stream);
-  auto const total_tokens = token_offsets.back_element(stream);
-  // build a list of pointers to each token
+  auto [token_offsets, total_tokens] =
+    cudf::detail::make_offsets_child_column(d_token_counts.template begin<cudf::size_type>(),
+                                            d_token_counts.template end<cudf::size_type>(),
+                                            stream,
+                                            rmm::mr::get_current_device_resource());
+  // build a list of pointers to each token
   rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
   // now go get the tokens
-  tokenizer.d_offsets = token_offsets.data();
-  tokenizer.d_tokens  = tokens.data();
+  tokenizer.d_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(token_offsets->view());
+  tokenizer.d_tokens = tokens.data();
   thrust::for_each_n(rmm::exec_policy(stream),
                      thrust::make_counting_iterator<cudf::size_type>(0),
                      strings_count,
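
For reference, the sizes-to-offsets step that cudf::detail::make_offsets_child_column performs here (and which the removed lines did by hand with an inclusive_scan plus a zeroed first element) can be sketched standalone in Thrust. A minimal sketch with made-up counts, not libcudf code:

    #include <thrust/device_vector.h>
    #include <thrust/scan.h>
    #include <cstdio>

    int main()
    {
      // Per-string token counts, as token_count_fn would produce them.
      thrust::device_vector<int> counts(3);
      counts[0] = 3; counts[1] = 2; counts[2] = 4;

      // One extra slot: offsets[i] is where string i's tokens begin and
      // the last element holds the grand total.
      thrust::device_vector<int> offsets(counts.size() + 1, 0);
      thrust::inclusive_scan(counts.begin(), counts.end(), offsets.begin() + 1);

      int const total = offsets.back();  // 3 + 2 + 4 = 9
      std::printf("total tokens = %d\n", total);
      return 0;
    }

The libcudf helper additionally returns the offsets as a column paired with the total count, which is why the back_element/set_element_to_zero_async bookkeeping above becomes unnecessary.
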
@@ -178,8 +176,8 @@ std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const
   }

   auto offsets = strings_column.offsets();
-  auto offset      = cudf::detail::get_value<cudf::size_type>(offsets, strings_column.offset(), stream);
-  auto chars_bytes = cudf::detail::get_value<cudf::size_type>(
+  auto offset = cudf::strings::detail::get_offset_value(offsets, strings_column.offset(), stream);
+  auto chars_bytes = cudf::strings::detail::get_offset_value(
                        offsets, strings_column.offset() + strings_count, stream) -
                      offset;
   auto d_chars =

   // create output offsets column
   // -- conditionally copy a counting iterator where
   //    the first byte of each character is located
-  auto offsets_column =
-    cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()},
-                              num_characters + 1,
-                              cudf::mask_state::UNALLOCATED,
-                              stream,
-                              mr);
-  auto d_new_offsets = offsets_column->mutable_view().begin<cudf::size_type>();
-  thrust::copy_if(
-    rmm::exec_policy(stream),
-    thrust::counting_iterator<cudf::size_type>(0),
-    thrust::counting_iterator<cudf::size_type>(chars_bytes + 1),
+  auto offsets_column = cudf::make_numeric_column(
+    offsets.type(), num_characters + 1, cudf::mask_state::UNALLOCATED, stream, mr);
+  auto d_new_offsets =
+    cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view());
+  cudf::detail::copy_if_safe(
+    thrust::counting_iterator<cudf::size_type>(0),
+    thrust::counting_iterator<cudf::size_type>(chars_bytes + 1),
     d_new_offsets,
     [d_chars, chars_bytes] __device__(auto idx) {
       // this will also set the final value to the size chars_bytes
       return idx < chars_bytes ? cudf::strings::detail::is_begin_utf8_char(d_chars[idx]) : true;
-    });
+    },
+    stream);

-  // create the output chars column -- just a copy of the input's chars column
-  cudf::column_view chars_view(
-    cudf::data_type{cudf::type_id::INT8}, chars_bytes, d_chars, nullptr, 0);
-  auto chars_column = std::make_unique<cudf::column>(chars_view, stream, mr);
+  // create the output chars buffer -- just a copy of the input's chars
+  rmm::device_uvector<char> output_chars(chars_bytes, stream, mr);
+  thrust::copy(rmm::exec_policy(stream), d_chars, d_chars + chars_bytes, output_chars.data());

   // return new strings column
   return cudf::make_strings_column(
-    num_characters, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{});
+    num_characters, std::move(offsets_column), output_chars.release(), 0, rmm::device_buffer{});
 }

 } // namespace detail

diff --git a/cpp/src/text/utilities/tokenize_ops.cuh b/cpp/src/text/utilities/tokenize_ops.cuh
index a84e94a6924..0901dc37e56 100644
--- a/cpp/src/text/utilities/tokenize_ops.cuh
+++ b/cpp/src/text/utilities/tokenize_ops.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -147,10 +147,10 @@ struct characters_tokenizer {
  * positions into the d_tokens vector.
  */
 struct strings_tokenizer {
-  cudf::column_device_view const d_strings;  ///< strings to tokenize
-  cudf::string_view const d_delimiter;       ///< delimiter characters to tokenize around
-  cudf::size_type* d_offsets{};              ///< offsets into the d_tokens vector for each string
-  string_index_pair* d_tokens{};             ///< token positions in device memory
+  cudf::column_device_view const d_strings;    ///< strings to tokenize
+  cudf::string_view const d_delimiter;         ///< delimiter characters to tokenize around
+  cudf::detail::input_offsetalator d_offsets;  ///< offsets into the d_tokens vector for each string
+  string_index_pair* d_tokens{};               ///< token positions in device memory

   /**
    * @brief Identifies the token positions within each string.
    *
    * @param idx Index of the string to tokenize in the d_strings column.
    */
   __device__ void operator()(cudf::size_type idx)
@@ -191,11 +191,11 @@ using delimiterator = cudf::column_device_view::const_iterator<cudf::string_view>;
 struct multi_delimiter_strings_tokenizer {
-  cudf::column_device_view const d_strings;  ///< strings column to tokenize
-  delimiterator delimiters_begin;            ///< first delimiter
-  delimiterator delimiters_end;              ///< last delimiter
-  cudf::size_type* d_offsets{};              ///< offsets into the d_tokens output vector
-  string_index_pair* d_tokens{};             ///< token positions found for each string
+  cudf::column_device_view const d_strings;    ///< strings column to tokenize
+  delimiterator delimiters_begin;              ///< first delimiter
+  delimiterator delimiters_end;                ///< last delimiter
+  cudf::detail::input_offsetalator d_offsets;  ///< offsets into the d_tokens output vector
+  string_index_pair* d_tokens{};               ///< token positions found for each string
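
A note on the iterator type these patches thread through the device functors: cudf::detail::input_offsetalator reads an offsets buffer whether it stores 32-bit or 64-bit values, always yielding a 64-bit index. A conceptual sketch only, assuming nothing about libcudf's real implementation:

    #include <cstdint>
    #include <cstddef>

    // Conceptual stand-in for an input offsetalator: one type that can
    // walk either an int32_t or an int64_t offsets buffer, normalizing
    // reads to int64_t. Functors like strings_tokenizer can then hold a
    // single member type instead of a raw cudf::size_type const*.
    struct offsets_reader {
      void const* data;  // INT32 or INT64 offsets buffer
      bool is_int64;     // true when the buffer holds int64_t values

      __host__ __device__ int64_t operator[](std::size_t idx) const
      {
        return is_int64 ? static_cast<int64_t const*>(data)[idx]
                        : static_cast<int64_t>(static_cast<int32_t const*>(data)[idx]);
      }
    };

This is also why patch 2 below adds a static_cast when differencing two offsets: the subtraction happens in the wide type and is narrowed back to cudf::size_type only where the count is known to fit.
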
From: David Wendt
Date: Thu, 18 Jan 2024 17:07:07 -0500
Subject: [PATCH 2/3] add ngrams_tokenizer

---
 cpp/src/text/ngrams_tokenize.cu | 97 +++++++++++++--------------------
 1 file changed, 39 insertions(+), 58 deletions(-)

diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu
index bc5cd04eac6..99ea85a7a0d 100644
--- a/cpp/src/text/ngrams_tokenize.cu
+++ b/cpp/src/text/ngrams_tokenize.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -37,12 +38,9 @@
 #include
 #include
 #include
-#include
 #include
-#include

 namespace nvtext {
 namespace detail {
 namespace {
@@ -60,10 +58,10 @@ namespace {
  * member.
  */
 struct string_tokens_positions_fn {
-  cudf::column_device_view const d_strings;  // strings to tokenize
-  cudf::string_view const d_delimiter;       // delimiter to tokenize around
-  cudf::size_type const* d_token_offsets;    // offsets into the d_token_positions for each string
-  position_pair* d_token_positions;          // token positions in each string
+  cudf::column_device_view const d_strings;          // strings to tokenize
+  cudf::string_view const d_delimiter;               // delimiter to tokenize around
+  cudf::detail::input_offsetalator d_token_offsets;  // offsets of d_token_positions for each string
+  position_pair* d_token_positions;                  // token positions in each string

   __device__ void operator()(cudf::size_type idx)
   {
@@ -95,12 +93,12 @@ struct ngram_builder_fn {
   cudf::column_device_view const d_strings;  // strings to generate ngrams from
   cudf::string_view const d_separator;       // separator to place between them 'grams
   cudf::size_type const ngrams;              // ngram number to generate (2=bi-gram, 3=tri-gram)
-  cudf::size_type const* d_token_offsets;    // offsets for token position for each string
-  position_pair const* d_token_positions;    // token positions for each string
-  cudf::size_type const* d_chars_offsets{};  // offsets for each string's ngrams
-  char* d_chars{};                           // write ngram strings to here
-  cudf::size_type const* d_ngram_offsets{};  // offsets for sizes of each string's ngrams
-  cudf::size_type* d_ngram_sizes{};          // write ngram sizes to here
+  cudf::detail::input_offsetalator d_token_offsets;    // offsets for token position for each string
+  position_pair const* d_token_positions;              // token positions for each string
+  cudf::detail::input_offsetalator d_chars_offsets{};  // offsets for each string's ngrams
+  char* d_chars{};                                     // write ngram strings to here
+  cudf::detail::input_offsetalator d_ngram_offsets{};  // offsets for sizes of each string's ngrams
+  cudf::size_type* d_ngram_sizes{};                    // write ngram sizes to here

   __device__ cudf::size_type operator()(cudf::size_type idx)
   {
@@ -165,16 +163,12 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
   // first, get the number of tokens per string to get the token-offsets
   // Ex. token-counts = [3,2]; token-offsets = [0,3,5]
-  rmm::device_uvector<cudf::size_type> token_offsets(strings_count + 1, stream);
-  auto d_token_offsets = token_offsets.data();
-  thrust::transform_inclusive_scan(rmm::exec_policy(stream),
-                                   thrust::make_counting_iterator<cudf::size_type>(0),
-                                   thrust::make_counting_iterator<cudf::size_type>(strings_count),
-                                   d_token_offsets + 1,
-                                   strings_tokenizer{d_strings, d_delimiter},
-                                   thrust::plus<cudf::size_type>());
-  token_offsets.set_element_to_zero_async(0, stream);
-  auto const total_tokens = token_offsets.back_element(stream);  // Ex. 5 tokens
+  auto const count_itr =
+    cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{d_strings, d_delimiter});
+  auto [token_offsets, total_tokens] = cudf::detail::make_offsets_child_column(
+    count_itr, count_itr + strings_count, stream, rmm::mr::get_current_device_resource());
+  auto d_token_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(token_offsets->view());

   // get the token positions (in bytes) per string
   // Ex. start/end pairs: [(0,1),(2,4),(5,8), (0,2),(3,4)]
   rmm::device_uvector<position_pair> token_positions(total_tokens, stream);
   thrust::for_each_n(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator<cudf::size_type>(0),
     strings_count,
     string_tokens_positions_fn{d_strings, d_delimiter, d_token_offsets, token_positions.data()});
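
The counting-transform-iterator pattern above feeds per-string counts to make_offsets_child_column lazily, so no intermediate counts vector is allocated before the offsets scan. A standalone sketch with a hypothetical count functor standing in for strings_tokenizer:

    #include <thrust/execution_policy.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/reduce.h>
    #include <cstdio>

    // Hypothetical stand-in for strings_tokenizer: yields a token count
    // for row idx on demand, without materializing a counts array.
    struct count_fn {
      __host__ __device__ int operator()(int idx) const { return idx % 3 + 1; }
    };

    int main()
    {
      auto counts =
        thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), count_fn{});
      // A scan-based offsets helper can consume this lazy sequence directly;
      // reducing it here just shows it behaves like an ordinary range.
      int const total = thrust::reduce(thrust::device, counts, counts + 5);
      std::printf("total = %d\n", total);  // 1+2+3+1+2 = 9
      return 0;
    }
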
   // compute the number of ngrams per string to get the total number of ngrams to generate
   // Ex. ngram-counts = [2,1]; ngram-offsets = [0,2,3]; total = 3 bigrams
-  rmm::device_uvector<cudf::size_type> ngram_offsets(strings_count + 1, stream);
-  auto d_ngram_offsets = ngram_offsets.data();
-  thrust::transform_inclusive_scan(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<cudf::size_type>(0),
-    thrust::make_counting_iterator<cudf::size_type>(strings_count),
-    d_ngram_offsets + 1,
+  auto const ngram_counts = cudf::detail::make_counting_transform_iterator(
+    0,
     cuda::proclaim_return_type<cudf::size_type>(
       [d_token_offsets, ngrams] __device__(cudf::size_type idx) {
-        auto token_count = d_token_offsets[idx + 1] - d_token_offsets[idx];
+        auto token_count =
+          static_cast<cudf::size_type>(d_token_offsets[idx + 1] - d_token_offsets[idx]);
         return (token_count >= ngrams) ? token_count - ngrams + 1 : 0;
-      }),
-    thrust::plus<cudf::size_type>{});
-  ngram_offsets.set_element_to_zero_async(0, stream);
-  auto const total_ngrams = ngram_offsets.back_element(stream);
+      }));
+  auto [ngram_offsets, total_ngrams] = cudf::detail::make_offsets_child_column(
+    ngram_counts, ngram_counts + strings_count, stream, rmm::mr::get_current_device_resource());
+  auto d_ngram_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(ngram_offsets->view());

   // Compute the total size of the ngrams for each string (not for each ngram)
   // Ex. 2 bigrams in 1st string total to 10 bytes; 1 bigram in 2nd string is 4 bytes
   // ngrams for each string.
   // Ex. bigram for first string produces 2 bigrams ("a_bb","bb_ccc") which
   //     is built in memory like this: "a_bbbb_ccc"
-  rmm::device_uvector<cudf::size_type> chars_offsets(strings_count + 1, stream);
-  // First compute the output sizes for each string (this is not the final output result)
-  thrust::transform(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<cudf::size_type>(0),
-    thrust::make_counting_iterator<cudf::size_type>(strings_count),
-    chars_offsets.begin(),
-    ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions});
-  // Convert the sizes to offsets
-  auto const output_chars_size = cudf::detail::sizes_to_offsets(
-    chars_offsets.begin(), chars_offsets.end(), chars_offsets.begin(), stream);
-  CUDF_EXPECTS(
-    output_chars_size <= static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
-    "Size of output exceeds the column size limit",
-    std::overflow_error);
+
+  // First compute the output sizes for each string (this is not the final output result)
+  auto const sizes_itr = cudf::detail::make_counting_transform_iterator(
+    0, ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions});
+  auto [chars_offsets, output_chars_size] = cudf::detail::make_offsets_child_column(
+    sizes_itr, sizes_itr + strings_count, stream, rmm::mr::get_current_device_resource());
+  auto d_chars_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(chars_offsets->view());

   // This will contain the size in bytes of each ngram to generate
   rmm::device_uvector<cudf::size_type> ngram_sizes(total_ngrams, stream);

   // build output chars column
-  auto chars_column = cudf::strings::detail::create_chars_child_column(
-    static_cast<cudf::size_type>(output_chars_size), stream, mr);
-  auto d_chars = chars_column->mutable_view().data<char>();
+  rmm::device_uvector<char> chars(output_chars_size, stream, mr);
+  auto d_chars = chars.data();
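
The per-string ngram count used in the device lambda above is easy to check on the host. A small sketch reproducing the worked example from the comments:

    #include <cstdio>

    // A string with t tokens yields max(t - n + 1, 0) ngrams of size n,
    // the same expression the device lambda computes.
    int ngram_count(int token_count, int ngrams)
    {
      return (token_count >= ngrams) ? token_count - ngrams + 1 : 0;
    }

    int main()
    {
      // Matches the comment above: token-counts [3,2] with bigrams (n=2)
      // give ngram-counts [2,1], hence ngram-offsets [0,2,3] and 3 bigrams.
      std::printf("%d %d\n", ngram_count(3, 2), ngram_count(2, 2));  // 2 1
      return 0;
    }
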
   // Generate the ngrams into the chars column data buffer.
   // The ngram_builder_fn functor also fills the ngram_sizes vector with the
   // size of each ngram.
@@ -246,18 +229,16 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
                        ngrams,
                        d_token_offsets,
                        d_token_positions,
-                       chars_offsets.data(),
+                       d_chars_offsets,
                        d_chars,
                        d_ngram_offsets,
                        ngram_sizes.data()});

   // build the offsets column -- converting the ngram sizes into offsets
   auto offsets_column = std::get<0>(
     cudf::detail::make_offsets_child_column(ngram_sizes.begin(), ngram_sizes.end(), stream, mr));
-  chars_column->set_null_count(0);
-  offsets_column->set_null_count(0);

   // create the output strings column
   return make_strings_column(
-    total_ngrams, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{});
+    total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{});
 }

 } // namespace detail

From 703ee0a9b4e73389ee68d8eecb4b3e193658c6e7 Mon Sep 17 00:00:00 2001
From: David Wendt
Date: Thu, 18 Jan 2024 17:20:32 -0500
Subject: [PATCH 3/3] add tokenize_with_vocabulary

---
 cpp/src/text/vocabulary_tokenize.cu | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu
index a9e8d4d9a24..86c61cc4241 100644
--- a/cpp/src/text/vocabulary_tokenize.cu
+++ b/cpp/src/text/vocabulary_tokenize.cu
@@ -21,14 +21,15 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -299,7 +300,7 @@ struct vocabulary_tokenizer_fn {
   cudf::string_view const d_delimiter;
   MapRefType d_map;
   cudf::size_type const default_id;
-  cudf::size_type const* d_offsets;
+  cudf::detail::input_offsetalator d_offsets;
   cudf::size_type* d_results;

   __device__ void operator()(cudf::size_type idx) const
@@ -380,7 +381,7 @@ std::unique_ptr<cudf::column> tokenize_with_vocabulary(cudf::strings_column_view
   auto tokens = cudf::make_numeric_column(
     output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr);
   auto d_tokens  = tokens->mutable_view().data<cudf::size_type>();
-  auto d_offsets = token_offsets->view().data<cudf::size_type>();
+  auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(token_offsets->view());
   vocabulary_tokenizer_fn<decltype(map_ref)> tokenizer{
     *d_strings, d_delimiter, map_ref, default_id, d_offsets, d_tokens};
   thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), tokenizer);
@@ -396,11 +397,11 @@ std::unique_ptr<cudf::column> tokenize_with_vocabulary(cudf::strings_column_view
   // longer strings perform better with warp-parallel approach
   auto const first_offset = (input.offset() == 0) ? 0
-                            : cudf::detail::get_value<cudf::size_type>(
+                            : cudf::strings::detail::get_offset_value(
                                 input.offsets(), input.offset(), stream);
   auto const last_offset  = (input.offset() == 0 && input.size() == input.offsets().size() - 1)
                              ? input.chars_size(stream)
-                             : cudf::detail::get_value<cudf::size_type>(
+                             : cudf::strings::detail::get_offset_value(
                                  input.offsets(), input.size() + input.offset(), stream);
   auto const chars_size    = last_offset - first_offset;
   auto const d_input_chars = input.chars_begin(stream) + first_offset;
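
The first_offset/last_offset logic in this final hunk exists to handle sliced input columns; get_offset_value just reads one element of the offsets child regardless of its width. A host-side sketch of the arithmetic with made-up numbers:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
      // Hypothetical offsets for a 5-row strings column: row i occupies
      // bytes [offsets[i], offsets[i+1]) of the chars buffer.
      int64_t const offsets[] = {0, 4, 9, 15, 22, 30};
      int const offset = 2;  // column sliced to rows [2, 5)
      int const size   = 3;

      int64_t const first_offset = offsets[offset];         //  9
      int64_t const last_offset  = offsets[offset + size];  // 30
      std::printf("chars_size = %lld\n",
                  static_cast<long long>(last_offset - first_offset));  // 21
      return 0;
    }
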