From a77dc711966a8a80bf0b2c14d1d6def691168ad0 Mon Sep 17 00:00:00 2001
From: davidwendt <dwendt@nvidia.com>
Date: Tue, 23 Feb 2021 11:51:39 -0500
Subject: [PATCH 1/3] Add gbenchmarks for strings::split functions

---
 cpp/benchmarks/CMakeLists.txt             |  1 +
 cpp/benchmarks/string/split_benchmark.cpp | 89 +++++++++++++++++++++++
 2 files changed, 90 insertions(+)
 create mode 100644 cpp/benchmarks/string/split_benchmark.cpp
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 0f38138fff9..001a4cce122 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -311,6 +311,7 @@ set(STRINGS_BENCH_SRC
   "${CMAKE_CURRENT_SOURCE_DIR}/string/copy_benchmark.cpp"
   "${CMAKE_CURRENT_SOURCE_DIR}/string/find_benchmark.cpp"
   "${CMAKE_CURRENT_SOURCE_DIR}/string/replace_benchmark.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/string/split_benchmark.cpp"
   "${CMAKE_CURRENT_SOURCE_DIR}/string/url_decode_benchmark.cpp")
 
 ConfigureBench(STRINGS_BENCH "${STRINGS_BENCH_SRC}")
diff --git a/cpp/benchmarks/string/split_benchmark.cpp b/cpp/benchmarks/string/split_benchmark.cpp
new file mode 100644
index 00000000000..35bedb1b767
--- /dev/null
+++ b/cpp/benchmarks/string/split_benchmark.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/split/split.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <limits>
+
+class StringSplit : public cudf::benchmark {  // fixture for the split benchmarks in this file
+};
+
+enum split_type { split, split_ws, record, record_ws };  // selects the call made in BM_split
+
+static void BM_split(benchmark::State& state, split_type rt)
+{
+  // state.range(0) is the row count and state.range(1) the maximum string length
+  auto const num_rows   = static_cast<cudf::size_type>(state.range(0));
+  auto const max_length = static_cast<cudf::size_type>(state.range(1));
+  data_profile profile;
+  profile.set_distribution_params(cudf::type_id::STRING, distribution_id::NORMAL, 0, max_length);
+  auto const strings_table =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{num_rows}, profile);
+  cudf::strings_column_view input(strings_table->view().column(0));
+  cudf::string_scalar delimiter("+");
+  // each iteration runs the API selected by `rt` once; the returned result is discarded
+  for (auto _ : state) {
+    cuda_event_timer timer(state, true, 0);
+    switch (rt) {
+      case split_type::split: cudf::strings::split(input, delimiter); break;
+      case split_type::split_ws: cudf::strings::split(input); break;
+      case split_type::record: cudf::strings::split_record(input, delimiter); break;
+      case split_type::record_ws: cudf::strings::split_record(input); break;
+    }
+  }
+  // bytes processed = iterations x size of the input column's chars
+  state.SetBytesProcessed(state.iterations() * input.chars_size());
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  // sweep rows in [2^12, 2^24] by x8 and row lengths in [2^5, 2^13] by x4
+  constexpr int first_rows   = 1 << 12;
+  constexpr int last_rows    = 1 << 24;
+  constexpr int rows_step    = 8;
+  constexpr int first_length = 1 << 5;
+  constexpr int last_length  = 1 << 13;
+  constexpr int length_step  = 4;
+  // exclusive upper bound on rows x row length for a combination to be registered
+  constexpr auto chars_limit = static_cast<size_t>(std::numeric_limits<cudf::size_type>::max());
+  for (int rows = first_rows; rows <= last_rows; rows *= rows_step) {
+    for (int length = first_length; length <= last_length; length *= length_step) {
+      // skip combinations at or above the limit; widen before multiplying to avoid int overflow
+      if (static_cast<size_t>(rows) * length < chars_limit) { b->Args({rows, length}); }
+    }
+  }
+}
+
+#define STRINGS_BENCHMARK_DEFINE(name)                          \
+  BENCHMARK_DEFINE_F(StringSplit, name)                         \
+  (::benchmark::State & st) { BM_split(st, split_type::name); } \
+  BENCHMARK_REGISTER_F(StringSplit, name)                       \
+    ->Apply(generate_bench_args)                                \
+    ->UseManualTime()                                           \
+    ->Unit(benchmark::kMillisecond);
+// each use defines and registers a StringSplit benchmark that calls BM_split with that enumerator
+STRINGS_BENCHMARK_DEFINE(split)
+STRINGS_BENCHMARK_DEFINE(split_ws)
+STRINGS_BENCHMARK_DEFINE(record)
+STRINGS_BENCHMARK_DEFINE(record_ws)

From f488f67e278f1a11164e1105cb92ca41f3d60ce4 Mon Sep 17 00:00:00 2001
From: davidwendt <dwendt@nvidia.com>
Date: Tue, 23 Feb 2021 11:52:10 -0500
Subject: [PATCH 2/3] change some device_vectors to device_uvector

---
 cpp/src/strings/split/split.cu | 71 +++++++++++++++++++---------------
 1 file changed, 39 insertions(+), 32 deletions(-)

diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu
index 00e4998b688..be6ace5e1fe 100644
--- a/cpp/src/strings/split/split.cu
+++ b/cpp/src/strings/split/split.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/strings_column_factories.cuh>
@@ -34,7 +35,7 @@
 #include <thrust/binary_search.h>  // upper_bound()
 #include <thrust/copy.h>           // copy_if()
 #include <thrust/count.h>          // count_if()
-#include <thrust/extrema.h>        // max()
+#include <thrust/reduce.h>         // reduce()
 #include <thrust/transform.h>      // transform()
 
 namespace cudf {
@@ -429,7 +430,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
                                 rmm::mr::device_memory_resource* mr)
 {
   std::vector<std::unique_ptr<column>> results;
-  auto strings_count = strings_column.size();
+  auto const strings_count = strings_column.size();
   if (strings_count == 0) {
     results.push_back(make_empty_strings_column(stream, mr));
     return std::make_unique<table>(std::move(results));
@@ -437,11 +438,13 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
 
   auto d_offsets = strings_column.offsets().data<int32_t>();
   d_offsets += strings_column.offset();  // nvbug-2808421 : do not combine with the previous line
-  auto chars_bytes = thrust::device_pointer_cast(d_offsets)[strings_count] -
-                     thrust::device_pointer_cast(d_offsets)[0];
+  auto const chars_bytes =
+    cudf::detail::get_value<int32_t>(
+      strings_column.offsets(), strings_column.offset() + strings_count, stream) -
+    cudf::detail::get_value<int32_t>(strings_column.offsets(), strings_column.offset(), stream);
 
   // count the number of delimiters in the entire column
-  size_type delimiter_count =
+  auto const delimiter_count =
     thrust::count_if(rmm::exec_policy(stream),
                      thrust::make_counting_iterator<size_type>(0),
                      thrust::make_counting_iterator<size_type>(chars_bytes),
@@ -450,8 +453,8 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
                      });
 
   // create vector of every delimiter position in the chars column
-  rmm::device_vector<size_type> delimiter_positions(delimiter_count);
-  auto d_positions = delimiter_positions.data().get();
+  rmm::device_uvector<size_type> delimiter_positions(delimiter_count, stream);
+  auto d_positions = delimiter_positions.data();
   auto copy_end    = thrust::copy_if(rmm::exec_policy(stream),
                                   thrust::make_counting_iterator<size_type>(0),
                                   thrust::make_counting_iterator<size_type>(chars_bytes),
@@ -461,8 +464,8 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
                                   });
 
   // create vector of string indices for each delimiter
-  rmm::device_vector<size_type> string_indices(delimiter_count);  // these will be strings that
-  auto d_string_indices = string_indices.data().get();            // only contain delimiters
+  rmm::device_uvector<size_type> string_indices(delimiter_count, stream);  // these will
+  auto d_string_indices = string_indices.data();  // be strings that only contain delimiters
   thrust::upper_bound(rmm::exec_policy(stream),
                       d_offsets,
                       d_offsets + strings_count,
@@ -471,8 +474,8 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
                       string_indices.begin());
 
   // compute the number of tokens per string
-  rmm::device_vector<size_type> token_counts(strings_count);
-  auto d_token_counts = token_counts.data().get();
+  rmm::device_uvector<size_type> token_counts(strings_count, stream);
+  auto d_token_counts = token_counts.data();
   // first, initialize token counts for strings without delimiters in them
   thrust::transform(rmm::exec_policy(stream),
                     thrust::make_counting_iterator<size_type>(0),
@@ -482,6 +485,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
                       // null are 0, all others 1
                       return static_cast<size_type>(tokenizer.is_valid(idx));
                     });
+
   // now compute the number of tokens in each string
   thrust::for_each_n(
     rmm::exec_policy(stream),
@@ -493,8 +497,11 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
     });
 
   // the columns_count is the maximum number of tokens for any string
-  size_type columns_count =
-    *thrust::max_element(rmm::exec_policy(stream), token_counts.begin(), token_counts.end());
+  auto const columns_count = thrust::reduce(rmm::exec_policy(stream),
+                                            token_counts.begin(),
+                                            token_counts.end(),
+                                            0,
+                                            thrust::maximum<size_type>{});
   // boundary case: if no columns, return one null column (custrings issue #119)
   if (columns_count == 0) {
     results.push_back(std::make_unique<column>(
@@ -506,8 +513,8 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
   }
 
   // create working area to hold all token positions
-  rmm::device_vector<string_index_pair> tokens(columns_count * strings_count);
-  string_index_pair* d_tokens = tokens.data().get();
+  rmm::device_uvector<string_index_pair> tokens(columns_count * strings_count, stream);
+  string_index_pair* d_tokens = tokens.data();
   // initialize the token positions
   // -- accounts for nulls, empty, and strings with no delimiter in them
   thrust::for_each_n(rmm::exec_policy(stream),
@@ -748,20 +755,20 @@ std::unique_ptr<table> whitespace_split_fn(size_type strings_count,
                                            rmm::mr::device_memory_resource* mr)
 {
   // compute the number of tokens per string
-  size_type columns_count = 0;
-  rmm::device_vector<size_type> token_counts(strings_count);
-  auto d_token_counts = token_counts.data().get();
-  if (strings_count > 0) {
-    thrust::transform(
-      rmm::exec_policy(stream),
-      thrust::make_counting_iterator<size_type>(0),
-      thrust::make_counting_iterator<size_type>(strings_count),
-      d_token_counts,
-      [tokenizer] __device__(size_type idx) { return tokenizer.count_tokens(idx); });
-    // column count is the maximum number of tokens for any string
-    columns_count =
-      *thrust::max_element(rmm::exec_policy(stream), token_counts.begin(), token_counts.end());
-  }
+  rmm::device_uvector<size_type> token_counts(strings_count, stream);
+  auto d_token_counts = token_counts.data();
+  thrust::transform(rmm::exec_policy(stream),
+                    thrust::make_counting_iterator<size_type>(0),
+                    thrust::make_counting_iterator<size_type>(strings_count),
+                    d_token_counts,
+                    [tokenizer] __device__(size_type idx) { return tokenizer.count_tokens(idx); });
+
+  // column count is the maximum number of tokens for any string
+  size_type const columns_count = thrust::reduce(rmm::exec_policy(stream),
+                                                 token_counts.begin(),
+                                                 token_counts.end(),
+                                                 0,
+                                                 thrust::maximum<size_type>{});
 
   std::vector<std::unique_ptr<column>> results;
   // boundary case: if no columns, return one null column (issue #119)
@@ -775,8 +782,8 @@ std::unique_ptr<table> whitespace_split_fn(size_type strings_count,
   }
 
   // get the positions for every token
-  rmm::device_vector<string_index_pair> tokens(columns_count * strings_count);
-  string_index_pair* d_tokens = tokens.data().get();
+  rmm::device_uvector<string_index_pair> tokens(columns_count * strings_count, stream);
+  string_index_pair* d_tokens = tokens.data();
   thrust::fill(rmm::exec_policy(stream),
                d_tokens,
                d_tokens + (columns_count * strings_count),

From 30f4159387a9fe734f838f31534c62259228d37c Mon Sep 17 00:00:00 2001
From: davidwendt <dwendt@nvidia.com>
Date: Thu, 25 Feb 2021 10:45:27 -0500
Subject: [PATCH 3/3] change one more device_vector to device_uvector

---
 cpp/src/strings/split/split_record.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu
index f171dcabc5d..b780791c7a5 100644
--- a/cpp/src/strings/split/split_record.cu
+++ b/cpp/src/strings/split/split_record.cu
@@ -243,9 +243,9 @@ std::unique_ptr<column> split_record_fn(strings_column_view const& strings,
   // last entry is the total number of tokens to be generated
   auto total_tokens = cudf::detail::get_value<int32_t>(offsets->view(), strings_count, stream);
   // split each string into an array of index-pair values
-  rmm::device_vector<string_index_pair> tokens(total_tokens);
+  rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
   reader.d_token_offsets = d_offsets;
-  reader.d_tokens        = tokens.data().get();
+  reader.d_tokens        = tokens.data();
   thrust::for_each_n(
     rmm::exec_policy(stream), thrust::make_counting_iterator<size_type>(0), strings_count, reader);
   // convert the index-pairs into one big strings column