Performance improvement for libcudf upper/lower conversion for long s…

…trings (#13142) Improves on performance for longer strings with `cudf::strings::to_lower()` `cudf::strings::to_upper()` and `cudf::strings::swapcase()` APIs. The current implementation works well with smallish strings and so this new implementation splits into a longish string algorithm when average number of bytes per string is 64 bytes or greater. The new implementation is similar but computes the output size of each string with a warp per string function. In addition, a check is added for long strings testing if all bytes are ASCII and thereby can run a faster kernel for this case. Reference #13048 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mark Harris (https://github.com/harrism) - Karthikeyan (https://github.com/karthikeyann) URL: #13142
rapidsai · May 9, 2023 · 7246a9e · 7246a9e
1 parent ac158da
commit 7246a9e
Show file tree

Hide file tree

Showing 4 changed files with 307 additions and 93 deletions.
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -280,7 +280,6 @@ ConfigureNVBench(TEXT_NVBENCH text/minhash.cpp)
 # * strings benchmark -------------------------------------------------------------------
 ConfigureBench(
   STRINGS_BENCH
-  string/case.cpp
   string/combine.cpp
   string/contains.cpp
   string/convert_datetime.cpp
@@ -301,7 +300,9 @@ ConfigureBench(
   string/url_decode.cu
 )
 
-ConfigureNVBench(STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp)
+ConfigureNVBench(
+  STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp string/case.cpp
+)
 
 # ##################################################################################################
 # * json benchmark -------------------------------------------------------------------

diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp
@@ -15,36 +15,64 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
+#include <benchmarks/fixture/rmm_pool_raii.hpp>
 
 #include <cudf/strings/case.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
-class StringCase : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>
 
-static void BM_case(benchmark::State& state)
+void bench_case(nvbench::state& state)
 {
-  cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
-  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows});
-  cudf::strings_column_view input(column->view());
+  auto const n_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const max_width = static_cast<int32_t>(state.get_int64("width"));
+  auto const encoding  = state.get_string("encoding");
 
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    cudf::strings::to_lower(input);
+  if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(max_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
   }
 
-  state.SetBytesProcessed(state.iterations() * input.chars_size());
-}
+  data_profile const profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width);
+  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
+
+  auto col_view = column->view();
+
+  cudf::column::contents ascii_contents;
+  if (encoding == "ascii") {
+    data_profile ascii_profile = data_profile_builder().no_validity().distribution(
+      cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126);  // nice ASCII range
+    auto input = cudf::strings_column_view(col_view);
+    auto ascii_column =
+      create_random_column(cudf::type_id::INT8, row_count{input.chars_size()}, ascii_profile);
+    auto ascii_data = ascii_column->view();
 
-#define SORT_BENCHMARK_DEFINE(name)          \
-  BENCHMARK_DEFINE_F(StringCase, name)       \
-  (::benchmark::State & st) { BM_case(st); } \
-  BENCHMARK_REGISTER_F(StringCase, name)     \
-    ->RangeMultiplier(8)                     \
-    ->Ranges({{1 << 12, 1 << 24}})           \
-    ->UseManualTime()                        \
-    ->Unit(benchmark::kMillisecond);
+    col_view = cudf::column_view(col_view.type(),
+                                 col_view.size(),
+                                 nullptr,
+                                 col_view.null_mask(),
+                                 col_view.null_count(),
+                                 0,
+                                 {input.offsets(), ascii_data});
+
+    ascii_contents = ascii_column->release();
+  }
+  auto input = cudf::strings_column_view(col_view);
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+
+  state.add_element_count(input.chars_size(), "chars_size");
+  state.add_global_memory_reads<nvbench::int8_t>(input.chars_size());
+  state.add_global_memory_writes<nvbench::int8_t>(input.chars_size());
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { auto result = cudf::strings::to_lower(input); });
+}
 
-SORT_BENCHMARK_DEFINE(to_lower)
+NVBENCH_BENCH(bench_case)
+  .set_name("strings_case")
+  .add_int64_axis("width", {32, 64, 128, 256, 512, 1024, 2048})
+  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
+  .add_string_axis("encoding", {"ascii", "utf8"});