Skip to content

Commit

Permalink
Add gbenchmark for cudf::strings::to_lower (#7316)
Browse files Browse the repository at this point in the history
Reference #5698 
This creates a gbenchmark for `cudf::strings::to_lower`. The device logic is the same for `cudf::strings::to_upper` and `cudf::strings::swapcase`, so this is a good measure for all 3 APIs.

This PR is dependent on changes in PR #7292 
These are mostly in the `generate_benchmark_input.cpp`

The initial results were as follows:
```
--------------------------------------------------------------------------------------------------
Benchmark                                        Time             CPU   Iterations UserCounters...
--------------------------------------------------------------------------------------------------
StringCase/strings/4096/manual_time          0.278 ms        0.296 ms         2514 bytes_per_second=248.756M/s
StringCase/strings/32768/manual_time         0.289 ms        0.307 ms         2421 bytes_per_second=1.86625G/s
StringCase/strings/262144/manual_time        0.419 ms        0.438 ms         1662 bytes_per_second=10.2869G/s
StringCase/strings/2097152/manual_time        2.59 ms         2.61 ms          269 bytes_per_second=13.3449G/s
StringCase/strings/16777216/manual_time       25.9 ms         25.9 ms           27 bytes_per_second=10.6531G/s
```

The `convert_case` code here is a bit old. I changed it to use the more efficient `make_strings_children` utility and found that performance improved by roughly 2x:

```
--------------------------------------------------------------------------------------------------
Benchmark                                        Time             CPU   Iterations UserCounters...
--------------------------------------------------------------------------------------------------
StringCase/strings/4096/manual_time          0.117 ms        0.135 ms         5877 bytes_per_second=592.795M/s
StringCase/strings/32768/manual_time         0.122 ms        0.140 ms         5641 bytes_per_second=4.42664G/s
StringCase/strings/262144/manual_time        0.274 ms        0.292 ms         2535 bytes_per_second=15.768G/s
StringCase/strings/2097152/manual_time        1.59 ms         1.61 ms          441 bytes_per_second=21.759G/s
StringCase/strings/16777216/manual_time       12.1 ms         12.1 ms           58 bytes_per_second=22.8626G/s
```

So these changes are also included in this PR.

Authors:
  - David (@davidwendt)

Approvers:
  - Conor Hoekstra (@codereport)
  - Vukasin Milovanovic (@vuule)
  - Mark Harris (@harrism)

URL: #7316
  • Loading branch information
davidwendt authored Feb 10, 2021
1 parent 74a7c76 commit f7b3f75
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 88 deletions.
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@ ConfigureBench(SUBWORD_TOKENIZER_BENCH "${SUBWORD_TOKENIZER_BENCH_SRC}")
# - strings benchmark -------------------------------------------------------------------

set(STRINGS_BENCH_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/string/case_benchmark.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/string/convert_durations_benchmark.cpp")

ConfigureBench(STRINGS_BENCH "${STRINGS_BENCH_SRC}")
51 changes: 51 additions & 0 deletions cpp/benchmarks/string/case_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/strings/case.hpp>
#include <cudf/strings/strings_column_view.hpp>

/// Benchmark fixture for the string case-conversion benchmarks; adds nothing
/// beyond what it inherits from cudf::benchmark.
class StringCase : public cudf::benchmark {};

/**
 * @brief Benchmark body that measures `cudf::strings::to_lower` on a random strings column.
 *
 * The row count is read from `state.range(0)`. A single-column table of STRING type is
 * generated once, before the benchmark loop, and `to_lower` is invoked on that column in
 * every iteration. After the loop, the bytes-processed counter is set to the number of
 * iterations multiplied by the input column's `chars_size()`.
 *
 * @param state Google Benchmark state that supplies the row count and receives the results.
 */
static void BM_case(benchmark::State& state)
{
  // Named cast instead of a C-style cast: state.range() yields a wider integer type.
  auto const n_rows = static_cast<cudf::size_type>(state.range(0));
  auto const table  = create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows});
  cudf::strings_column_view input(table->view().column(0));

  for (auto _ : state) {
    // NOTE(review): presumably a scoped timer that reports manual time for this iteration
    // (the benchmark is registered with UseManualTime) -- confirm the meaning of the
    // `true` and `0` arguments against the cuda_event_timer declaration.
    cuda_event_timer raii(state, true, 0);
    cudf::strings::to_lower(input);
  }

  state.SetBytesProcessed(state.iterations() * input.chars_size());
}

// Defines a StringCase fixture benchmark named `bench_name` whose body forwards to BM_case,
// then registers it for row counts from 2^12 to 2^24 in multiples of 8, using manual
// timing and reporting in milliseconds.
// NOTE(review): the macro name says SORT although it registers a StringCase benchmark --
// presumably carried over from another benchmark file; consider renaming.
#define SORT_BENCHMARK_DEFINE(bench_name)            \
  BENCHMARK_DEFINE_F(StringCase, bench_name)         \
  (::benchmark::State & state) { BM_case(state); }   \
  BENCHMARK_REGISTER_F(StringCase, bench_name)       \
    ->RangeMultiplier(8)                             \
    ->Ranges({{1 << 12, 1 << 24}})                   \
    ->UseManualTime()                                \
    ->Unit(benchmark::kMillisecond);

SORT_BENCHMARK_DEFINE(to_lower)
129 changes: 47 additions & 82 deletions cpp/src/strings/case.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -19,6 +19,7 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/case.hpp>
Expand All @@ -37,30 +38,18 @@ namespace cudf {
namespace strings {
namespace detail {
namespace {
/**
* @brief Used as template parameter to divide size calculation from
* the actual string operation within a function.
*
* Useful when most of the logic is identical for both passes.
*/
enum TwoPass {
SizeOnly = 0, ///< calculate the size only
ExecuteOp ///< run the string operation
};

/**
* @brief Per string logic for case conversion functions.
*
* @tparam Pass Determines if size calculation or output write is being performed.
*/
template <TwoPass Pass = SizeOnly>
struct upper_lower_fn {
const column_device_view d_column;
character_flags_table_type case_flag; // flag to check with on each character
const character_flags_table_type* d_flags;
const character_cases_table_type* d_case_table;
const special_case_mapping* d_special_case_mapping;
const int32_t* d_offsets{};
int32_t* d_offsets{};
char* d_chars{};

__device__ special_case_mapping get_special_case_mapping(uint32_t code_point)
Expand All @@ -70,7 +59,7 @@ struct upper_lower_fn {

// compute-size / copy the bytes representing the special case mapping for this codepoint
__device__ int32_t handle_special_case_bytes(uint32_t code_point,
char*& d_buffer,
char* d_buffer,
detail::character_flags_table_type flag)
{
special_case_mapping m = get_special_case_mapping(code_point);
Expand All @@ -79,47 +68,45 @@ struct upper_lower_fn {
auto const count = IS_LOWER(flag) ? m.num_upper_chars : m.num_lower_chars;
auto const* chars = IS_LOWER(flag) ? m.upper : m.lower;
for (uint16_t idx = 0; idx < count; idx++) {
if (Pass == SizeOnly) {
bytes += detail::bytes_in_char_utf8(detail::codepoint_to_utf8(chars[idx]));
} else {
bytes += detail::from_char_utf8(detail::codepoint_to_utf8(chars[idx]), d_buffer + bytes);
}
bytes += d_buffer
? detail::from_char_utf8(detail::codepoint_to_utf8(chars[idx]), d_buffer + bytes)
: detail::bytes_in_char_utf8(detail::codepoint_to_utf8(chars[idx]));
}
if (d_buffer != nullptr) { d_buffer += bytes; }
return bytes;
}

__device__ int32_t operator()(size_type idx)
__device__ void operator()(size_type idx)
{
if (d_column.is_null(idx)) return 0; // null string
string_view d_str = d_column.template element<string_view>(idx);
int32_t bytes = 0;
char* d_buffer = nullptr;
if (Pass == ExecuteOp) d_buffer = d_chars + d_offsets[idx];
if (d_column.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
return;
}
auto const d_str = d_column.template element<string_view>(idx);
int32_t bytes = 0;
char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr;
for (auto itr = d_str.begin(); itr != d_str.end(); ++itr) {
uint32_t code_point = detail::utf8_to_codepoint(*itr);
uint32_t code_point = detail::utf8_to_codepoint(*itr);

detail::character_flags_table_type flag = code_point <= 0x00FFFF ? d_flags[code_point] : 0;

// we apply special mapping in two cases:
// - uncased characters with the special mapping flag, always
// - cased characters with the special mapping flag, when matching the input case_flag
//
if (IS_SPECIAL(flag) && ((flag & case_flag) || !IS_UPPER_OR_LOWER(flag))) {
bytes += handle_special_case_bytes(code_point, d_buffer, case_flag);
} else if (flag & case_flag) {
if (Pass == SizeOnly)
bytes += detail::bytes_in_char_utf8(detail::codepoint_to_utf8(d_case_table[code_point]));
else
d_buffer +=
detail::from_char_utf8(detail::codepoint_to_utf8(d_case_table[code_point]), d_buffer);
auto const new_bytes = handle_special_case_bytes(code_point, d_buffer, case_flag);
bytes += new_bytes;
if (d_buffer) d_buffer += new_bytes;
} else {
if (Pass == SizeOnly)
bytes += detail::bytes_in_char_utf8(*itr);
char_utf8 new_char =
(flag & case_flag) ? detail::codepoint_to_utf8(d_case_table[code_point]) : *itr;
if (!d_buffer)
bytes += detail::bytes_in_char_utf8(new_char);
else
d_buffer += detail::from_char_utf8(*itr, d_buffer);
d_buffer += detail::from_char_utf8(new_char, d_buffer);
}
}
return bytes;
if (!d_buffer) d_offsets[idx] = bytes;
}
};

Expand All @@ -138,49 +125,27 @@ std::unique_ptr<column> convert_case(strings_column_view const& strings,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto strings_count = strings.size();
if (strings_count == 0) return detail::make_empty_strings_column(stream, mr);

auto strings_column = column_device_view::create(strings.parent(), stream);
auto d_column = *strings_column;
size_type null_count = strings.null_count();

// copy null mask
rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
// get the lookup tables used for case conversion
auto d_flags = get_character_flags_table();

auto d_case_table = get_character_cases_table();
auto d_special_case_mapping = get_special_case_mapping_table();

// build offsets column -- calculate the size of each output string
auto offsets_transformer_itr = thrust::make_transform_iterator(
thrust::make_counting_iterator<size_type>(0),
upper_lower_fn<SizeOnly>{d_column, case_flag, d_flags, d_case_table, d_special_case_mapping});
auto offsets_column = detail::make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
auto offsets_view = offsets_column->view();
auto d_new_offsets = offsets_view.data<int32_t>();

// build the chars column -- convert characters based on case_flag parameter
size_type bytes = thrust::device_pointer_cast(d_new_offsets)[strings_count];
auto chars_column =
strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr);
auto chars_view = chars_column->mutable_view();
auto d_chars = chars_view.data<char>();

thrust::for_each_n(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
strings_count,
upper_lower_fn<ExecuteOp>{
d_column, case_flag, d_flags, d_case_table, d_special_case_mapping, d_new_offsets, d_chars});

return make_strings_column(strings_count,
std::move(offsets_column),
std::move(chars_column),
null_count,
std::move(null_mask),
if (strings.is_empty()) return detail::make_empty_strings_column(stream, mr);

auto strings_column = column_device_view::create(strings.parent(), stream);
auto d_column = *strings_column;

// build functor with lookup tables used for case conversion
upper_lower_fn functor{d_column,
case_flag,
get_character_flags_table(),
get_character_cases_table(),
get_special_case_mapping_table()};

// this utility calls the functor to build the offsets and chars columns
auto children = cudf::strings::detail::make_strings_children(
functor, strings.size(), strings.null_count(), stream, mr);

return make_strings_column(strings.size(),
std::move(children.first),
std::move(children.second),
strings.null_count(),
cudf::detail::copy_bitmask(strings.parent(), stream, mr),
stream,
mr);
}
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/strings/char_types/char_cases.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -188,9 +188,9 @@ void generate_special_mapping_hash_table()
printf(
"// the special case mapping table is a perfect hash table with no collisions, allowing us\n"
"// to 'hash' by simply modding by the incoming codepoint\n"
"inline __device__ uint16_t get_special_case_hash_index(uint32_t code_point){\n"
"constexpr uint16_t get_special_case_hash_index(uint32_t code_point){\n"
" constexpr uint16_t special_case_prime = %d;\n"
" return code_point %% special_case_prime;"
" return static_cast<uint16_t>(code_point %% special_case_prime);"
"\n}\n",
hash_prime);
}
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/strings/char_types/char_cases.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -5236,10 +5236,10 @@ constexpr special_case_mapping g_special_case_mappings[] = {
};
// the special case mapping table is a perfect hash table with no collisions, allowing us
// to 'hash' by simply modding by the incoming codepoint
inline __device__ uint16_t get_special_case_hash_index(uint32_t code_point)
constexpr uint16_t get_special_case_hash_index(uint32_t code_point)
{
constexpr uint16_t special_case_prime = 499;
return code_point % special_case_prime;
return static_cast<uint16_t>(code_point % special_case_prime);
}

} // namespace detail
Expand Down

0 comments on commit f7b3f75

Please sign in to comment.