Add gbenchmark for cudf::strings::translate function (#7617)

Reference #5698 This creates a gbenchmark for the `cudf::strings::translate()` API. The benchmark measures various row counts as well as string lengths and translate table sizes. This PR also includes changes to the `translate.cu` implementation, cleaning up the code and using the more efficient `make_strings_children`. This change improved performance for all 4 functions on average by 2-3x. A further improvement was to sort the translation table input so that matches can be looked up more quickly in device code. This added another 2x improvement when using longer translate tables. Authors: - David (@davidwendt) Approvers: - Nghia Truong (@ttnghia) - Mark Harris (@harrism) URL: #7617
rapidsai · Mar 23, 2021 · 2bf22d1 · 2bf22d1
1 parent 500f42c
commit 2bf22d1
Show file tree

Hide file tree

Showing 3 changed files with 134 additions and 46 deletions.
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -193,4 +193,5 @@ ConfigureBench(STRINGS_BENCH
   string/replace_benchmark.cpp
   string/split_benchmark.cpp
   string/substring_benchmark.cpp
+  string/translate_benchmark.cpp
   string/url_decode_benchmark.cpp)
diff --git a/cpp/benchmarks/string/translate_benchmark.cpp b/cpp/benchmarks/string/translate_benchmark.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "string_bench_args.hpp"
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/strings/translate.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <algorithm>
+
+#include <thrust/iterator/counting_iterator.h>
+
+class StringTranslate : public cudf::benchmark {  // benchmark fixture; adds no members of its own
+};
+
+using entry_type = std::pair<cudf::char_utf8, cudf::char_utf8>;  // one translate-table entry
+/**
+ * @brief Shared body of the translate benchmarks.
+ *
+ * Creates a single random STRING column, builds a translate table holding
+ * `entry_count` entries, and calls cudf::strings::translate() on them once per
+ * benchmark iteration. Afterwards it reports bytes processed as
+ * iterations * input.chars_size().
+ *
+ * @param state Benchmark state; range(0) is used as the row count and range(1)
+ *              as `max_str_length` for the string distribution.
+ * @param entry_count Number of entries placed in the translate table.
+ */
+static void BM_translate(benchmark::State& state, int entry_count)
+{
+  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
+  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
+  data_profile table_profile;
+  table_profile.set_distribution_params(  // NOTE(review): 0 and max_str_length look like length bounds - confirm
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const table =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
+  cudf::strings_column_view input(table->view().column(0));  // view over the table's only column
+
+  std::vector<entry_type> entries(entry_count);
+  std::transform(thrust::counting_iterator<int>(0),
+                 thrust::counting_iterator<int>(entry_count),
+                 entries.begin(),
+                 [](auto idx) -> entry_type {
+                   return entry_type{'!' + idx, '~' - idx};  // idx-th entry pairs ('!' + idx) with ('~' - idx)
+                 });
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);  // scoped timer; NOTE(review): args presumably a cache-flush flag and a stream - confirm
+    cudf::strings::translate(input, entries);  // return value is not kept
+  }
+
+  state.SetBytesProcessed(state.iterations() * input.chars_size());
+}
+/**
+ * @brief Supplies the argument ranges for the translate benchmarks.
+ *
+ * Forwards fixed row-count and row-length bounds, each with its own multiplier,
+ * to generate_string_bench_args().
+ * NOTE(review): how that helper expands the ranges is not visible here
+ * (presumably declared in string_bench_args.hpp) - confirm.
+ *
+ * @param b Benchmark being configured; handed to generate_string_bench_args().
+ */
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  int const min_rows   = 1 << 12;  // 4,096
+  int const max_rows   = 1 << 24;  // 16,777,216
+  int const row_mult   = 8;
+  int const min_rowlen = 1 << 5;  // 32
+  int const max_rowlen = 1 << 13;  // 8,192
+  int const len_mult   = 4;
+  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+}
+/**
+ * @brief Defines and registers one StringTranslate fixture benchmark.
+ *
+ * Expands to BENCHMARK_DEFINE_F(StringTranslate, name) with a body that calls
+ * BM_translate(st, entries), followed by BENCHMARK_REGISTER_F configured with
+ * Apply(generate_bench_args), UseManualTime() and Unit(benchmark::kMillisecond).
+ * The expansion already ends with a semicolon, so invocations are written
+ * without one.
+ *
+ * @param name    Benchmark name given to the fixture macros.
+ * @param entries Translate-table entry count forwarded to BM_translate.
+ */
+#define STRINGS_BENCHMARK_DEFINE(name, entries)            \
+  BENCHMARK_DEFINE_F(StringTranslate, name)                \
+  (::benchmark::State & st) { BM_translate(st, entries); } \
+  BENCHMARK_REGISTER_F(StringTranslate, name)              \
+    ->Apply(generate_bench_args)                           \
+    ->UseManualTime()                                      \
+    ->Unit(benchmark::kMillisecond);
+
+STRINGS_BENCHMARK_DEFINE(translate_small, 5)  // 5-entry translate table
+STRINGS_BENCHMARK_DEFINE(translate_medium, 25)  // 25-entry translate table
+STRINGS_BENCHMARK_DEFINE(translate_large, 50)  // 50-entry translate table
diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu
@@ -19,7 +19,6 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/utilities.hpp>
@@ -30,7 +29,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <thrust/find.h>
+#include <thrust/binary_search.h>
+#include <thrust/sort.h>
 
 #include <algorithm>
 
@@ -46,31 +46,37 @@ namespace {
  */
 struct translate_fn {
   column_device_view const d_strings;
-  rmm::device_vector<translate_table>::iterator table_begin;
-  rmm::device_vector<translate_table>::iterator table_end;
-  int32_t const* d_offsets{};
+  rmm::device_uvector<translate_table>::iterator table_begin;
+  rmm::device_uvector<translate_table>::iterator table_end;
+  int32_t* d_offsets{};  // receives per-row byte counts while d_chars is null; read as output offsets otherwise
   char* d_chars{};  // output characters; when null, nothing is written and only sizes are stored
 
-  __device__ size_type operator()(size_type idx)
+  __device__ void operator()(size_type idx)
   {
-    if (d_strings.is_null(idx)) return 0;
-    string_view d_str = d_strings.element<string_view>(idx);
-    size_type bytes   = d_str.size_bytes();
-    char* out_ptr     = d_offsets ? d_chars + d_offsets[idx] : nullptr;
+    if (d_strings.is_null(idx)) {
+      if (!d_chars) d_offsets[idx] = 0;  // a null row contributes zero bytes
+      return;
+    }
+    string_view const d_str = d_strings.element<string_view>(idx);
+
+    size_type bytes = d_str.size_bytes();  // start from the input size, then adjust per translated character
+    char* out_ptr   = d_chars ? d_chars + d_offsets[idx] : nullptr;
     for (auto chr : d_str) {
-      auto entry =
-        thrust::find_if(thrust::seq, table_begin, table_end, [chr] __device__(auto const& te) {
-          return te.first == chr;
-        });
-      if (entry != table_end) {
+      auto const entry =
+        thrust::lower_bound(thrust::seq,  // binary search on .first; requires the table to be ordered by .first
+                            table_begin,
+                            table_end,
+                            translate_table{chr, 0},  // probe value: the comparator only looks at .first
+                            [](auto const& lhs, auto const& rhs) { return lhs.first < rhs.first; });
+      if (entry != table_end && entry->first == chr) {  // lower_bound finds the first entry not less than chr, so check for an exact match
         bytes -= bytes_in_char_utf8(chr);
-        chr = static_cast<translate_table>(*entry).second;
+        chr = entry->second;
         if (chr)  // if null, skip the character
           bytes += bytes_in_char_utf8(chr);
       }
       if (chr && out_ptr) out_ptr += from_char_utf8(chr, out_ptr);
     }
-    return bytes;
+    if (!d_chars) d_offsets[idx] = bytes;  // size-only call: store the translated byte count for this row
   }
 };
 
@@ -83,44 +89,40 @@ std::unique_ptr<column> translate(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  size_type strings_count = strings.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_strings_column(stream, mr);
 
   size_type table_size = static_cast<size_type>(chars_table.size());
   // convert input table
   thrust::host_vector<translate_table> htable(table_size);
   std::transform(chars_table.begin(), chars_table.end(), htable.begin(), [](auto entry) {
     return translate_table{entry.first, entry.second};
   });
+  // The size of this table is usually much less than 100 so it was
+  // found to be more efficient to sort on the CPU than the GPU.
+  thrust::sort(htable.begin(), htable.end(), [](auto const& lhs, auto const& rhs) {
+    return lhs.first < rhs.first;
+  });
   // copy translate table to device memory
-  rmm::device_vector<translate_table> table(htable);
-
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto d_strings      = *strings_column;
-  // create null mask
-  rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
-  // create offsets column
-  auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator(
-    0, translate_fn{d_strings, table.begin(), table.end()});
-  auto offsets_column = make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_offsets = offsets_column->view().data<int32_t>();
-
-  // build chars column
-  size_type bytes   = thrust::device_pointer_cast(d_offsets)[strings_count];
-  auto chars_column = strings::detail::create_chars_child_column(
-    strings_count, strings.null_count(), bytes, stream, mr);
-  auto d_chars = chars_column->mutable_view().data<char>();
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<cudf::size_type>(0),
-                     strings_count,
-                     translate_fn{d_strings, table.begin(), table.end(), d_offsets, d_chars});
-
-  return make_strings_column(strings_count,
-                             std::move(offsets_column),
-                             std::move(chars_column),
+  rmm::device_uvector<translate_table> table(htable.size(), stream);
+  CUDA_TRY(cudaMemcpyAsync(table.data(),
+                           htable.data(),
+                           sizeof(translate_table) * htable.size(),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
+
+  auto d_strings = column_device_view::create(strings.parent(), stream);
+
+  auto children = make_strings_children(translate_fn{*d_strings, table.begin(), table.end()},
+                                        strings.size(),
+                                        strings.null_count(),
+                                        stream,
+                                        mr);
+
+  return make_strings_column(strings.size(),
+                             std::move(children.first),
+                             std::move(children.second),
                              strings.null_count(),
-                             std::move(null_mask),
+                             cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                              stream,
                              mr);
 }