Skip to content

Commit

Permalink
Performance improvement for libcudf upper/lower conversion for long s…
Browse files Browse the repository at this point in the history
…trings (#13142)

Improves on performance for longer strings with `cudf::strings::to_lower()` `cudf::strings::to_upper()` and `cudf::strings::swapcase()` APIs. 

The current implementation works well with smallish strings and so this new implementation splits into a longish string algorithm when average number of bytes per string is 64 bytes or greater. The new implementation is similar but computes the output size of each string with a warp per string function. In addition, a check is added for long strings testing if all bytes are ASCII and thereby can run a faster kernel for this case.

Reference #13048

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #13142
  • Loading branch information
davidwendt authored May 9, 2023
1 parent ac158da commit 7246a9e
Show file tree
Hide file tree
Showing 4 changed files with 307 additions and 93 deletions.
5 changes: 3 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,6 @@ ConfigureNVBench(TEXT_NVBENCH text/minhash.cpp)
# * strings benchmark -------------------------------------------------------------------
ConfigureBench(
STRINGS_BENCH
string/case.cpp
string/combine.cpp
string/contains.cpp
string/convert_datetime.cpp
Expand All @@ -301,7 +300,9 @@ ConfigureBench(
string/url_decode.cu
)

ConfigureNVBench(STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp)
ConfigureNVBench(
STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp string/case.cpp
)

# ##################################################################################################
# * json benchmark -------------------------------------------------------------------
Expand Down
70 changes: 49 additions & 21 deletions cpp/benchmarks/string/case.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,36 +15,64 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>
#include <benchmarks/fixture/rmm_pool_raii.hpp>

#include <cudf/strings/case.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

class StringCase : public cudf::benchmark {};
#include <nvbench/nvbench.cuh>

static void BM_case(benchmark::State& state)
void bench_case(nvbench::state& state)
{
cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows});
cudf::strings_column_view input(column->view());
auto const n_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const max_width = static_cast<int32_t>(state.get_int64("width"));
auto const encoding = state.get_string("encoding");

for (auto _ : state) {
cuda_event_timer raii(state, true, cudf::get_default_stream());
cudf::strings::to_lower(input);
if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(max_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

state.SetBytesProcessed(state.iterations() * input.chars_size());
}
data_profile const profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width);
auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);

auto col_view = column->view();

cudf::column::contents ascii_contents;
if (encoding == "ascii") {
data_profile ascii_profile = data_profile_builder().no_validity().distribution(
cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126); // nice ASCII range
auto input = cudf::strings_column_view(col_view);
auto ascii_column =
create_random_column(cudf::type_id::INT8, row_count{input.chars_size()}, ascii_profile);
auto ascii_data = ascii_column->view();

#define SORT_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(StringCase, name) \
(::benchmark::State & st) { BM_case(st); } \
BENCHMARK_REGISTER_F(StringCase, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 12, 1 << 24}}) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);
col_view = cudf::column_view(col_view.type(),
col_view.size(),
nullptr,
col_view.null_mask(),
col_view.null_count(),
0,
{input.offsets(), ascii_data});

ascii_contents = ascii_column->release();
}
auto input = cudf::strings_column_view(col_view);

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));

state.add_element_count(input.chars_size(), "chars_size");
state.add_global_memory_reads<nvbench::int8_t>(input.chars_size());
state.add_global_memory_writes<nvbench::int8_t>(input.chars_size());

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::strings::to_lower(input); });
}

SORT_BENCHMARK_DEFINE(to_lower)
NVBENCH_BENCH(bench_case)
.set_name("strings_case")
.add_int64_axis("width", {32, 64, 128, 256, 512, 1024, 2048})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
.add_string_axis("encoding", {"ascii", "utf8"});
Loading

0 comments on commit 7246a9e

Please sign in to comment.