Add gbenchmarks for strings filter functions (#7438)

Reference #5698. This creates a gbenchmark for the `cudf::strings::filter_characters`, `cudf::strings::filter_characters_of_type`, and `cudf::strings::strip` functions. This PR also includes changes to `strip.cu` and `filter_chars.cu` to use the more efficient `make_strings_children` utility. This improved the performance of these functions by 2x on average. Authors: - David (@davidwendt) Approvers: - Keith Kraus (@kkraus14) - Conor Hoekstra (@codereport) - Karthikeyan (@karthikeyann) URL: #7438
rapidsai · Feb 26, 2021 · 862559f · 862559f
1 parent 6b579c7
commit 862559f
Show file tree

Hide file tree

Showing 4 changed files with 178 additions and 119 deletions.
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -309,6 +309,7 @@ set(STRINGS_BENCH_SRC
   "${CMAKE_CURRENT_SOURCE_DIR}/string/contains_benchmark.cpp"
   "${CMAKE_CURRENT_SOURCE_DIR}/string/convert_durations_benchmark.cpp"
   "${CMAKE_CURRENT_SOURCE_DIR}/string/copy_benchmark.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/string/filter_benchmark.cpp"
   "${CMAKE_CURRENT_SOURCE_DIR}/string/find_benchmark.cpp"
   "${CMAKE_CURRENT_SOURCE_DIR}/string/replace_benchmark.cpp"
   "${CMAKE_CURRENT_SOURCE_DIR}/string/split_benchmark.cpp"

diff --git a/cpp/benchmarks/string/filter_benchmark.cpp b/cpp/benchmarks/string/filter_benchmark.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/char_types/char_types.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/strings/strip.hpp>
+#include <cudf/strings/translate.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <limits>
+#include <vector>
+
+enum FilterAPI { filter, filter_chars, strip };
+
+class StringFilterChars : public cudf::benchmark {
+};
+
+static void BM_filter_chars(benchmark::State& state, FilterAPI api)
+{
+  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
+  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
+  data_profile table_profile;
+  table_profile.set_distribution_params(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const table =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
+  cudf::strings_column_view input(table->view().column(0));
+
+  auto const types = cudf::strings::string_character_types::SPACE;
+  std::vector<std::pair<cudf::char_utf8, cudf::char_utf8>> filter_table{
+    {cudf::char_utf8{'a'}, cudf::char_utf8{'c'}}};
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);
+    switch (api) {
+      case filter: cudf::strings::filter_characters_of_type(input, types); break;
+      case filter_chars: cudf::strings::filter_characters(input, filter_table); break;
+      case strip: cudf::strings::strip(input); break;
+    }
+  }
+
+  state.SetBytesProcessed(state.iterations() * input.chars_size());
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  int const min_rows   = 1 << 12;
+  int const max_rows   = 1 << 24;
+  int const row_mult   = 8;
+  int const min_rowlen = 1 << 5;
+  int const max_rowlen = 1 << 13;
+  int const len_mult   = 4;
+  for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) {
+    for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) {
+      // avoid generating combinations that exceed the cudf column limit
+      size_t total_chars = static_cast<size_t>(row_count) * rowlen;
+      if (total_chars < std::numeric_limits<cudf::size_type>::max()) {
+        b->Args({row_count, rowlen});
+      }
+    }
+  }
+}
+
+#define STRINGS_BENCHMARK_DEFINE(name)                                \
+  BENCHMARK_DEFINE_F(StringFilterChars, name)                         \
+  (::benchmark::State & st) { BM_filter_chars(st, FilterAPI::name); } \
+  BENCHMARK_REGISTER_F(StringFilterChars, name)                       \
+    ->Apply(generate_bench_args)                                      \
+    ->UseManualTime()                                                 \
+    ->Unit(benchmark::kMillisecond);
+
+STRINGS_BENCHMARK_DEFINE(filter)
+STRINGS_BENCHMARK_DEFINE(filter_chars)
+STRINGS_BENCHMARK_DEFINE(strip)
diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu
@@ -49,10 +49,10 @@ namespace {
 struct filter_fn {
   column_device_view const d_strings;
   filter_type keep_characters;
-  rmm::device_vector<char_range>::iterator table_begin;
-  rmm::device_vector<char_range>::iterator table_end;
+  rmm::device_uvector<char_range>::iterator table_begin;
+  rmm::device_uvector<char_range>::iterator table_end;
   string_view const d_replacement;
-  int32_t const* d_offsets{};
+  int32_t* d_offsets{};
   char* d_chars{};
 
   /**
@@ -78,23 +78,28 @@ struct filter_fn {
    * This is also used to calculate the size of the output.
    *
    * @param idx Index of the current string to process.
-   * @return The size of the output for this string.
    */
-  __device__ size_type operator()(size_type idx)
+  __device__ void operator()(size_type idx)
   {
-    if (d_strings.is_null(idx)) return 0;
-    string_view d_str = d_strings.element<string_view>(idx);
-    size_type nbytes  = d_str.size_bytes();
-    auto const in_ptr = d_str.data();
-    auto out_ptr      = d_chars ? d_chars + d_offsets[idx] : nullptr;
+    if (d_strings.is_null(idx)) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
+    auto const d_str = d_strings.element<string_view>(idx);
+
+    auto nbytes  = d_str.size_bytes();
+    auto out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr;
     for (auto itr = d_str.begin(); itr != d_str.end(); ++itr) {
-      auto const char_size = bytes_in_char_utf8(*itr);
-      string_view const d_newchar =
-        remove_char(*itr) ? d_replacement : string_view(in_ptr + itr.byte_offset(), char_size);
-      nbytes += d_newchar.size_bytes() - char_size;
-      if (out_ptr) out_ptr = cudf::strings::detail::copy_string(out_ptr, d_newchar);
+      auto const char_size        = bytes_in_char_utf8(*itr);
+      string_view const d_newchar = remove_char(*itr)
+                                      ? d_replacement
+                                      : string_view(d_str.data() + itr.byte_offset(), char_size);
+      if (out_ptr)
+        out_ptr = cudf::strings::detail::copy_string(out_ptr, d_newchar);
+      else
+        nbytes += d_newchar.size_bytes() - char_size;
     }
-    return nbytes;
+    if (!out_ptr) d_offsets[idx] = nbytes;
   }
 };
 
@@ -123,36 +128,25 @@ std::unique_ptr<column> filter_characters(
     characters_to_filter.begin(), characters_to_filter.end(), htable.begin(), [](auto entry) {
       return char_range{entry.first, entry.second};
     });
-  rmm::device_vector<char_range> table(htable);  // copy filter table to device memory
-
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto d_strings      = *strings_column;
-
-  // create null mask
-  rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
-
-  // create offsets column
-  filter_fn ffn{d_strings, keep_characters, table.begin(), table.end(), d_replacement};
-  auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator(0, ffn);
-  auto offsets_column          = make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  ffn.d_offsets = offsets_column->view().data<int32_t>();
-
-  // build chars column
-  size_type bytes = cudf::detail::get_value<int32_t>(offsets_column->view(), strings_count, stream);
-  auto chars_column = strings::detail::create_chars_child_column(
-    strings_count, strings.null_count(), bytes, stream, mr);
-  ffn.d_chars = chars_column->mutable_view().data<char>();
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<cudf::size_type>(0),
-                     strings_count,
-                     ffn);
+  rmm::device_uvector<char_range> table(table_size, stream);
+  CUDA_TRY(cudaMemcpyAsync(table.data(),
+                           htable.data(),
+                           table_size * sizeof(char_range),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
+
+  auto d_strings = column_device_view::create(strings.parent(), stream);
+
+  // this utility calls the filter_fn to build the offsets and chars columns
+  filter_fn ffn{*d_strings, keep_characters, table.begin(), table.end(), d_replacement};
+  auto children = cudf::strings::detail::make_strings_children(
+    ffn, strings.size(), strings.null_count(), stream, mr);
 
   return make_strings_column(strings_count,
-                             std::move(offsets_column),
-                             std::move(chars_column),
+                             std::move(children.first),
+                             std::move(children.second),
                              strings.null_count(),
-                             std::move(null_mask),
+                             cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                              stream,
                              mr);
 }

diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -35,16 +35,6 @@ namespace cudf {
 namespace strings {
 namespace detail {
 namespace {
-/**
- * @brief Used as template parameter to divide size calculation from
- * the actual string operation within a function.
- *
- * Useful when most of the logic is identical for both passes.
- */
-enum TwoPass {
-  SizeOnly = 0,  ///< calculate the size only
-  ExecuteOp      ///< run the string operation
-};
 
 /**
  * @brief Strip characters from the beginning and/or end of a string.
@@ -53,51 +43,52 @@ enum TwoPass {
  * of any characters found in d_to_strip or whitespace if
  * d_to_strip is empty.
  *
- * @tparam Pass Allows computing only the size of the output
- *              or writing the output to device memory.
  */
-template <TwoPass Pass = SizeOnly>
 struct strip_fn {
   column_device_view const d_strings;
-  strip_type stype;  // right, left, or both
-  string_view d_to_strip;
-  int32_t const* d_offsets{};
+  strip_type const stype;  // right, left, or both
+  string_view const d_to_strip;
+  int32_t* d_offsets{};
   char* d_chars{};
 
-  __device__ bool is_strip_character(char_utf8 chr)
-  {
-    return d_to_strip.empty() ? (chr <= ' ') :  // whitespace check
-             thrust::any_of(
-               thrust::seq, d_to_strip.begin(), d_to_strip.end(), [chr] __device__(char_utf8 c) {
-                 return c == chr;
-               });
-  }
-
-  __device__ size_type operator()(size_type idx)
+  __device__ void operator()(size_type idx)
   {
-    if (d_strings.is_null(idx)) return 0;
-    string_view d_str     = d_strings.element<string_view>(idx);
-    size_type length      = d_str.length();
-    size_type left_offset = 0;
-    auto itr              = d_str.begin();
-    if (stype == strip_type::LEFT || stype == strip_type::BOTH) {
-      for (; itr != d_str.end();) {
-        if (!is_strip_character(*itr++)) break;
-        left_offset = itr.byte_offset();
-      }
+    if (d_strings.is_null(idx)) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
     }
+    auto const d_str = d_strings.element<string_view>(idx);
+
+    auto is_strip_character = [d_to_strip = d_to_strip] __device__(char_utf8 chr) -> bool {
+      return d_to_strip.empty() ? (chr <= ' ') :  // whitespace check
+               thrust::any_of(
+                 thrust::seq, d_to_strip.begin(), d_to_strip.end(), [chr] __device__(char_utf8 c) {
+                   return c == chr;
+                 });
+    };
+
+    size_type const left_offset = [&] {
+      if (stype != strip_type::LEFT && stype != strip_type::BOTH) return 0;
+      auto const itr =
+        thrust::find_if_not(thrust::seq, d_str.begin(), d_str.end(), is_strip_character);
+      return itr != d_str.end() ? itr.byte_offset() : d_str.size_bytes();
+    }();
+
     size_type right_offset = d_str.size_bytes();
     if (stype == strip_type::RIGHT || stype == strip_type::BOTH) {
-      itr = d_str.end();
+      auto const length = d_str.length();
+      auto itr          = d_str.end();
       for (size_type n = 0; n < length; ++n) {
         if (!is_strip_character(*(--itr))) break;
         right_offset = itr.byte_offset();
       }
     }
-    size_type bytes = 0;
-    if (right_offset > left_offset) bytes = right_offset - left_offset;
-    if (Pass == ExecuteOp) memcpy(d_chars + d_offsets[idx], d_str.data() + left_offset, bytes);
-    return bytes;
+
+    auto const bytes = (right_offset > left_offset) ? right_offset - left_offset : 0;
+    if (d_chars)
+      memcpy(d_chars + d_offsets[idx], d_str.data() + left_offset, bytes);
+    else
+      d_offsets[idx] = bytes;
   }
 };
 
@@ -110,42 +101,22 @@ std::unique_ptr<column> strip(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  auto strings_count = strings.size();
-  if (strings_count == 0) return detail::make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return detail::make_empty_strings_column(stream, mr);
 
   CUDF_EXPECTS(to_strip.is_valid(), "Parameter to_strip must be valid");
-  string_view d_to_strip(to_strip.data(), to_strip.size());
-
-  auto strings_column  = column_device_view::create(strings.parent(), stream);
-  auto d_column        = *strings_column;
-  size_type null_count = strings.null_count();
-
-  // copy null mask
-  rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
-
-  // build offsets column -- calculate the size of each output string
-  auto offsets_transformer_itr = thrust::make_transform_iterator(
-    thrust::make_counting_iterator<size_type>(0), strip_fn<SizeOnly>{d_column, stype, d_to_strip});
-  auto offsets_column = make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto offsets_view = offsets_column->view();
-  auto d_offsets    = offsets_view.data<int32_t>();
-
-  // build the chars column -- convert characters based on case_flag parameter
-  size_type bytes   = thrust::device_pointer_cast(d_offsets)[strings_count];
-  auto chars_column = create_chars_child_column(strings_count, null_count, bytes, stream, mr);
-  auto chars_view   = chars_column->mutable_view();
-  auto d_chars      = chars_view.data<char>();
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<size_type>(0),
-                     strings_count,
-                     strip_fn<ExecuteOp>{d_column, stype, d_to_strip, d_offsets, d_chars});
-
-  return make_strings_column(strings_count,
-                             std::move(offsets_column),
-                             std::move(chars_column),
-                             null_count,
-                             std::move(null_mask),
+  string_view const d_to_strip(to_strip.data(), to_strip.size());
+
+  auto const d_column = column_device_view::create(strings.parent(), stream);
+
+  // this utility calls the strip_fn to build the offsets and chars columns
+  auto children = cudf::strings::detail::make_strings_children(
+    strip_fn{*d_column, stype, d_to_strip}, strings.size(), strings.null_count(), stream, mr);
+
+  return make_strings_column(strings.size(),
+                             std::move(children.first),
+                             std::move(children.second),
+                             strings.null_count(),
+                             cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                              stream,
                              mr);
 }