From fa66823cce02c646dcd8c53e9bc584c79d183e07 Mon Sep 17 00:00:00 2001 From: David <45795991+davidwendt@users.noreply.github.com> Date: Thu, 11 Mar 2021 09:57:38 -0500 Subject: [PATCH] Change device_vector to device_uvector in nvtext source files (#7512) Reference #7287 This PR changes `cpp/src/text/*` source files to use `rmm::device_uvector` instead of `rmm::device_vector`. This allows keeping the memory operations on the provided kernel stream. Authors: - David (@davidwendt) Approvers: - Paul Taylor (@trxcllnt) - Mark Harris (@harrism) URL: https://github.com/rapidsai/cudf/pull/7512 --- cpp/src/text/generate_ngrams.cu | 14 ++++--------- cpp/src/text/ngrams_tokenize.cu | 35 +++++++++++++++++---------------- cpp/src/text/tokenize.cu | 9 ++++++--- 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 79154232394..3c583622ed8 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -221,7 +221,7 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie auto const d_strings = *strings_column; // create a vector of ngram offsets for each string - rmm::device_vector ngram_offsets(strings_count + 1); + rmm::device_uvector ngram_offsets(strings_count + 1, stream); thrust::transform_exclusive_scan( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -235,14 +235,8 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie cudf::size_type{0}, thrust::plus()); - // total count is the last entry - auto const d_ngram_offsets = ngram_offsets.data().get(); - cudf::size_type total_ngrams = 0; - CUDA_TRY(cudaMemcpyAsync(&total_ngrams, - d_ngram_offsets + strings_count, - sizeof(cudf::size_type), - cudaMemcpyDeviceToHost, - stream.value())); + // total ngrams count is the last entry + cudf::size_type const total_ngrams = ngram_offsets.back_element(stream); CUDF_EXPECTS(total_ngrams > 0, "Insufficient number of characters in each 
string to generate ngrams"); @@ -254,7 +248,7 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie mr); auto d_offsets = offsets_column->mutable_view().data(); // compute the size of each ngram -- output goes in d_offsets - character_ngram_generator_fn generator{d_strings, ngrams, d_ngram_offsets, d_offsets}; + character_ngram_generator_fn generator{d_strings, ngrams, ngram_offsets.data(), d_offsets}; thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), strings_count, diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu index 18bc86f6478..96b06e7a1eb 100644 --- a/cpp/src/text/ngrams_tokenize.cu +++ b/cpp/src/text/ngrams_tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -158,21 +158,22 @@ std::unique_ptr ngrams_tokenize( // first, get the number of tokens per string to get the token-offsets // Ex. token-counts = [3,2]; token-offsets = [0,3,5] - rmm::device_vector token_offsets(strings_count + 1); - auto d_token_offsets = token_offsets.data().get(); + rmm::device_uvector token_offsets(strings_count + 1, stream); + auto d_token_offsets = token_offsets.data(); thrust::transform_inclusive_scan(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_token_offsets + 1, strings_tokenizer{d_strings, d_delimiter}, thrust::plus()); - CUDA_TRY(cudaMemsetAsync(d_token_offsets, 0, sizeof(int32_t), stream.value())); - auto total_tokens = token_offsets[strings_count]; // Ex. 5 tokens + int32_t const zero = 0; + token_offsets.set_element_async(0, zero, stream); + auto const total_tokens = token_offsets.back_element(stream); // Ex. 5 tokens // get the token positions (in bytes) per string // Ex. 
start/end pairs: [(0,1),(2,4),(5,8), (0,2),(3,4)] - rmm::device_vector token_positions(total_tokens); - auto d_token_positions = token_positions.data().get(); + rmm::device_uvector token_positions(total_tokens, stream); + auto d_token_positions = token_positions.data(); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -181,8 +182,8 @@ std::unique_ptr ngrams_tokenize( // compute the number of ngrams per string to get the total number of ngrams to generate // Ex. ngram-counts = [2,1]; ngram-offsets = [0,2,3]; total = 3 bigrams - rmm::device_vector ngram_offsets(strings_count + 1); - auto d_ngram_offsets = ngram_offsets.data().get(); + rmm::device_uvector ngram_offsets(strings_count + 1, stream); + auto d_ngram_offsets = ngram_offsets.data(); thrust::transform_inclusive_scan( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -193,8 +194,8 @@ std::unique_ptr ngrams_tokenize( return (token_count >= ngrams) ? token_count - ngrams + 1 : 0; }, thrust::plus()); - CUDA_TRY(cudaMemsetAsync(d_ngram_offsets, 0, sizeof(int32_t), stream.value())); - auto total_ngrams = ngram_offsets[strings_count]; + ngram_offsets.set_element_async(0, zero, stream); + auto const total_ngrams = ngram_offsets.back_element(stream); // Compute the total size of the ngrams for each string (not for each ngram) // Ex. 2 bigrams in 1st string total to 10 bytes; 1 bigram in 2nd string is 4 bytes @@ -204,8 +205,8 @@ std::unique_ptr ngrams_tokenize( // ngrams for each string. // Ex. 
bigram for first string produces 2 bigrams ("a_bb","bb_ccc") which // is built in memory like this: "a_bbbb_ccc" - rmm::device_vector chars_offsets(strings_count + 1); // output memory offsets - auto d_chars_offsets = chars_offsets.data().get(); // per input string + rmm::device_uvector chars_offsets(strings_count + 1, stream); // output memory offsets + auto d_chars_offsets = chars_offsets.data(); // per input string thrust::transform_inclusive_scan( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -213,11 +214,11 @@ std::unique_ptr ngrams_tokenize( d_chars_offsets + 1, ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions}, thrust::plus()); - CUDA_TRY(cudaMemsetAsync(d_chars_offsets, 0, sizeof(int32_t), stream.value())); - auto output_chars_size = chars_offsets[strings_count]; // Ex. 14 output bytes total + chars_offsets.set_element_async(0, zero, stream); + auto const output_chars_size = chars_offsets.back_element(stream); // Ex. 14 output bytes total - rmm::device_vector ngram_sizes(total_ngrams); // size in bytes of each - auto d_ngram_sizes = ngram_sizes.data().get(); // ngram to generate + rmm::device_uvector ngram_sizes(total_ngrams, stream); // size in bytes of each + auto d_ngram_sizes = ngram_sizes.data(); // ngram to generate // build chars column auto chars_column = cudf::strings::detail::create_chars_child_column( diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index 0ba51f7639f..1b7e457367e 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -27,9 +28,10 @@ #include #include +#include #include -#include +#include #include namespace nvtext { @@ -75,7 +77,8 @@ std::unique_ptr tokenize_fn(cudf::size_type strings_count, d_token_counts.template begin(), d_token_counts.template end(), token_offsets.begin() + 1); - CUDA_TRY(cudaMemsetAsync(token_offsets.data(), 0, sizeof(int32_t), 
stream.value())); + int32_t const zero = 0; + token_offsets.set_element_async(0, zero, stream); auto const total_tokens = token_offsets.back_element(stream); // build a list of pointers to each token rmm::device_uvector tokens(total_tokens, stream); @@ -87,7 +90,7 @@ std::unique_ptr tokenize_fn(cudf::size_type strings_count, strings_count, tokenizer); // create the strings column using the tokens pointers - return cudf::make_strings_column(tokens, stream, mr); + return cudf::strings::detail::make_strings_column(tokens.begin(), tokens.end(), stream, mr); } } // namespace