Skip to content

Commit

Permalink
Add gbenchmark for cudf::strings::translate function (#7617)
Browse files Browse the repository at this point in the history
Reference #5698
This creates a gbenchmark for the `cudf::strings::translate()` API. The benchmark measures various row counts as well as string lengths and translate table sizes.
This PR also includes changes to the `translate.cu` implementation, cleaning up the code and using the more efficient make_strings_children. This change improved performance for all 4 functions on average by 2-3x.
A further improvement was to sort the translation table input so that matches can be looked up more quickly in device code. This added another 2x improvement when using longer translate tables.

Authors:
  - David (@davidwendt)

Approvers:
  - Nghia Truong (@ttnghia)
  - Mark Harris (@harrism)

URL: #7617
  • Loading branch information
davidwendt authored Mar 23, 2021
1 parent 500f42c commit 2bf22d1
Show file tree
Hide file tree
Showing 3 changed files with 134 additions and 46 deletions.
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -193,4 +193,5 @@ ConfigureBench(STRINGS_BENCH
string/replace_benchmark.cpp
string/split_benchmark.cpp
string/substring_benchmark.cpp
string/translate_benchmark.cpp
string/url_decode_benchmark.cpp)
85 changes: 85 additions & 0 deletions cpp/benchmarks/string/translate_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "string_bench_args.hpp"

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/strings/strings_column_view.hpp>
#include <cudf/strings/translate.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <algorithm>

#include <thrust/iterator/counting_iterator.h>

// Benchmark fixture for the cudf::strings::translate benchmarks. It adds no
// members of its own; it only derives from cudf::benchmark so the benchmarks
// below can be defined/registered with the *_F fixture macros.
class StringTranslate : public cudf::benchmark {
};

// One translate-table entry as passed to cudf::strings::translate():
// a pair of UTF-8 character values (character to match, replacement character).
using entry_type = std::pair<cudf::char_utf8, cudf::char_utf8>;

/**
 * @brief Benchmark body for cudf::strings::translate().
 *
 * Builds a single random STRING column and a translate table, then times
 * repeated calls to cudf::strings::translate() on that column.
 *
 * @param state       Google Benchmark state; range(0) is the number of rows and
 *                    range(1) is the upper bound given to the string-length
 *                    distribution.
 * @param entry_count Number of entries to place in the translate table.
 */
static void BM_translate(benchmark::State& state, int entry_count)
{
  auto const n_rows         = static_cast<cudf::size_type>(state.range(0));
  auto const max_str_length = static_cast<cudf::size_type>(state.range(1));

  // Random input: one STRING column whose lengths follow a normal
  // distribution parameterized with 0 and max_str_length.
  data_profile table_profile;
  table_profile.set_distribution_params(
    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
  auto const table =
    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
  cudf::strings_column_view input(table->view().column(0));

  // Translate table: entry i maps ('!' + i) to ('~' - i), i.e. source characters
  // walk up from '!' while replacement characters walk down from '~'.
  std::vector<entry_type> entries;
  entries.reserve(entry_count);
  for (int idx = 0; idx < entry_count; ++idx) {
    entries.push_back(entry_type{'!' + idx, '~' - idx});
  }

  for (auto _ : state) {
    // Scoped timer: measures this iteration and reports it to `state`
    // (the benchmarks are registered with UseManualTime()).
    cuda_event_timer timer(state, true, 0);
    cudf::strings::translate(input, entries);
  }

  // Throughput is reported in terms of the input column's character bytes.
  state.SetBytesProcessed(state.iterations() * input.chars_size());
}

/**
 * @brief Registers the argument combinations used by the translate benchmarks.
 *
 * Forwards fixed row-count and row-length bounds (and their multipliers) to
 * generate_string_bench_args(), which adds the resulting argument sets to `b`.
 *
 * @param b Benchmark instance to which the argument sets are applied.
 */
static void generate_bench_args(benchmark::internal::Benchmark* b)
{
  // Row-count bounds: 2^12 .. 2^24, multiplier 8.
  constexpr int min_rows = 1 << 12;
  constexpr int max_rows = 1 << 24;
  constexpr int row_mult = 8;

  // Row-length bounds: 2^5 .. 2^13, multiplier 4.
  constexpr int min_rowlen = 1 << 5;
  constexpr int max_rowlen = 1 << 13;
  constexpr int len_mult   = 4;

  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
}

// Defines a StringTranslate fixture benchmark called `name` whose body runs
// BM_translate with `entries` translate-table entries, then registers it with
// the argument sets from generate_bench_args, manual timing (the time is
// reported by the timer inside BM_translate), and millisecond units.
// The macro expansion already ends with ';', so invocations need none.
#define STRINGS_BENCHMARK_DEFINE(name, entries) \
BENCHMARK_DEFINE_F(StringTranslate, name) \
(::benchmark::State & st) { BM_translate(st, entries); } \
BENCHMARK_REGISTER_F(StringTranslate, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// Three translate-table sizes: 5, 25 and 50 entries.
STRINGS_BENCHMARK_DEFINE(translate_small, 5)
STRINGS_BENCHMARK_DEFINE(translate_medium, 25)
STRINGS_BENCHMARK_DEFINE(translate_large, 50)
94 changes: 48 additions & 46 deletions cpp/src/strings/translate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/detail/utilities.hpp>
Expand All @@ -30,7 +29,8 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/find.h>
#include <thrust/binary_search.h>
#include <thrust/sort.h>

#include <algorithm>

Expand All @@ -46,31 +46,37 @@ namespace {
*/
struct translate_fn {
column_device_view const d_strings;
rmm::device_vector<translate_table>::iterator table_begin;
rmm::device_vector<translate_table>::iterator table_end;
int32_t const* d_offsets{};
rmm::device_uvector<translate_table>::iterator table_begin;
rmm::device_uvector<translate_table>::iterator table_end;
int32_t* d_offsets{};
char* d_chars{};

__device__ size_type operator()(size_type idx)
__device__ void operator()(size_type idx)
{
if (d_strings.is_null(idx)) return 0;
string_view d_str = d_strings.element<string_view>(idx);
size_type bytes = d_str.size_bytes();
char* out_ptr = d_offsets ? d_chars + d_offsets[idx] : nullptr;
if (d_strings.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
return;
}
string_view const d_str = d_strings.element<string_view>(idx);

size_type bytes = d_str.size_bytes();
char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr;
for (auto chr : d_str) {
auto entry =
thrust::find_if(thrust::seq, table_begin, table_end, [chr] __device__(auto const& te) {
return te.first == chr;
});
if (entry != table_end) {
auto const entry =
thrust::lower_bound(thrust::seq,
table_begin,
table_end,
translate_table{chr, 0},
[](auto const& lhs, auto const& rhs) { return lhs.first < rhs.first; });
if (entry != table_end && entry->first == chr) {
bytes -= bytes_in_char_utf8(chr);
chr = static_cast<translate_table>(*entry).second;
chr = entry->second;
if (chr) // if null, skip the character
bytes += bytes_in_char_utf8(chr);
}
if (chr && out_ptr) out_ptr += from_char_utf8(chr, out_ptr);
}
return bytes;
if (!d_chars) d_offsets[idx] = bytes;
}
};

Expand All @@ -83,44 +89,40 @@ std::unique_ptr<column> translate(
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
size_type strings_count = strings.size();
if (strings_count == 0) return make_empty_strings_column(stream, mr);
if (strings.is_empty()) return make_empty_strings_column(stream, mr);

size_type table_size = static_cast<size_type>(chars_table.size());
// convert input table
thrust::host_vector<translate_table> htable(table_size);
std::transform(chars_table.begin(), chars_table.end(), htable.begin(), [](auto entry) {
return translate_table{entry.first, entry.second};
});
// The size of this table is usually much less than 100, so it was
// found to be more efficient to sort on the CPU than the GPU.
thrust::sort(htable.begin(), htable.end(), [](auto const& lhs, auto const& rhs) {
return lhs.first < rhs.first;
});
// copy translate table to device memory
rmm::device_vector<translate_table> table(htable);

auto strings_column = column_device_view::create(strings.parent(), stream);
auto d_strings = *strings_column;
// create null mask
rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
// create offsets column
auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator(
0, translate_fn{d_strings, table.begin(), table.end()});
auto offsets_column = make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
auto d_offsets = offsets_column->view().data<int32_t>();

// build chars column
size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count];
auto chars_column = strings::detail::create_chars_child_column(
strings_count, strings.null_count(), bytes, stream, mr);
auto d_chars = chars_column->mutable_view().data<char>();
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
strings_count,
translate_fn{d_strings, table.begin(), table.end(), d_offsets, d_chars});

return make_strings_column(strings_count,
std::move(offsets_column),
std::move(chars_column),
rmm::device_uvector<translate_table> table(htable.size(), stream);
CUDA_TRY(cudaMemcpyAsync(table.data(),
htable.data(),
sizeof(translate_table) * htable.size(),
cudaMemcpyHostToDevice,
stream.value()));

auto d_strings = column_device_view::create(strings.parent(), stream);

auto children = make_strings_children(translate_fn{*d_strings, table.begin(), table.end()},
strings.size(),
strings.null_count(),
stream,
mr);

return make_strings_column(strings.size(),
std::move(children.first),
std::move(children.second),
strings.null_count(),
std::move(null_mask),
cudf::detail::copy_bitmask(strings.parent(), stream, mr),
stream,
mr);
}
Expand Down

0 comments on commit 2bf22d1

Please sign in to comment.