Skip to content

Commit

Permalink
Improve string gather performance (#7433)
Browse files Browse the repository at this point in the history
This improves string gather performance by using an algorithm with character-level parallelism.  A string gather benchmark with varying row counts and row lengths has also been added.

Authors:
  - Jason Lowe (@jlowe)

Approvers:
  - David (@davidwendt)
  - Mark Harris (@harrism)

URL: #7433
  • Loading branch information
jlowe authored Mar 2, 2021
1 parent 3135f1b commit fb7f7c3
Show file tree
Hide file tree
Showing 5 changed files with 187 additions and 64 deletions.
55 changes: 40 additions & 15 deletions cpp/benchmarks/string/copy_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,38 +26,63 @@
#include <algorithm>
#include <random>

#include "string_bench_args.hpp"

// Benchmark fixture under which the string copy benchmarks in this file are defined and
// registered (see BENCHMARK_DEFINE_F / BENCHMARK_REGISTER_F below). It derives from
// cudf::benchmark and declares no members of its own.
class StringCopy : public cudf::benchmark {
};

static void BM_copy(benchmark::State& state)
enum copy_type { gather, scatter };

static void BM_copy(benchmark::State& state, copy_type ct)
{
cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
auto const source = create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows});
auto const target = create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows});
cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
data_profile table_profile;
table_profile.set_distribution_params(
cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);

auto const source =
create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
auto const target =
create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);

// scatter indices
std::vector<cudf::size_type> host_map_data(n_rows);
std::iota(host_map_data.begin(), host_map_data.end(), 0);
std::random_shuffle(host_map_data.begin(), host_map_data.end());
cudf::test::fixed_width_column_wrapper<cudf::size_type> scatter_map(host_map_data.begin(),
host_map_data.end());
cudf::test::fixed_width_column_wrapper<cudf::size_type> index_map(host_map_data.begin(),
host_map_data.end());

for (auto _ : state) {
cuda_event_timer raii(state, true, 0);
cudf::scatter(source->view(), scatter_map, target->view());
switch (ct) {
case gather: cudf::gather(source->view(), index_map); break;
case scatter: cudf::scatter(source->view(), index_map, target->view()); break;
}
}

state.SetBytesProcessed(state.iterations() *
cudf::strings_column_view(source->view().column(0)).chars_size());
}

#define SORT_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(StringCopy, name) \
(::benchmark::State & st) { BM_copy(st); } \
BENCHMARK_REGISTER_F(StringCopy, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 12, 1 << 24}}) \
->UseManualTime() \
// Supplies the (row count, row length) argument pairs for the string copy benchmarks by
// forwarding fixed ranges to generate_string_bench_args.
static void generate_bench_args(benchmark::internal::Benchmark* b)
{
  // Row counts run from 2^12 up to 2^24, growing by a factor of 8.
  constexpr int rows_lo   = 1 << 12;
  constexpr int rows_hi   = 1 << 24;
  constexpr int rows_step = 8;

  // Row lengths run from 2^5 up to 2^13, growing by a factor of 4.
  constexpr int len_lo   = 1 << 5;
  constexpr int len_hi   = 1 << 13;
  constexpr int len_step = 4;

  generate_string_bench_args(b, rows_lo, rows_hi, rows_step, len_lo, len_hi, len_step);
}

// Defines a StringCopy fixture benchmark called `name` whose body forwards to
// BM_copy(st, copy_type::name), so `name` must be an enumerator of copy_type. The benchmark
// is then registered with the arguments produced by generate_bench_args, manual timing,
// and millisecond time units.
#define COPY_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(StringCopy, name) \
(::benchmark::State & st) { BM_copy(st, copy_type::name); } \
BENCHMARK_REGISTER_F(StringCopy, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

SORT_BENCHMARK_DEFINE(scatter)
COPY_BENCHMARK_DEFINE(gather)
COPY_BENCHMARK_DEFINE(scatter)
12 changes: 3 additions & 9 deletions cpp/benchmarks/string/replace_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@

#include <limits>

#include "string_bench_args.hpp"

// Benchmark fixture type, presumably for the string replace benchmarks in this file
// (its uses fall outside the visible hunk). It derives from cudf::benchmark and declares
// no members of its own.
class StringReplace : public cudf::benchmark {
};

Expand Down Expand Up @@ -69,15 +71,7 @@ static void generate_bench_args(benchmark::internal::Benchmark* b)
int const min_rowlen = 1 << 5;
int const max_rowlen = 1 << 13;
int const len_mult = 4;
for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) {
for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) {
// avoid generating combinations that exceed the cudf column limit
size_t total_chars = static_cast<size_t>(row_count) * rowlen;
if (total_chars < std::numeric_limits<cudf::size_type>::max()) {
b->Args({row_count, rowlen});
}
}
}
generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
}

#define STRINGS_BENCHMARK_DEFINE(name) \
Expand Down
52 changes: 52 additions & 0 deletions cpp/benchmarks/string/string_bench_args.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <benchmark/benchmark.h>

#include <cudf/types.hpp>

#include <limits>

/**
 * @brief Generate row count and row length argument ranges for a string benchmark.
 *
 * Generates a series of row count and row length arguments for string benchmarks.
 * Row counts start at `min_rows` and are repeatedly multiplied by `rows_mult` for as long as
 * the result does not exceed `max_rows`; row lengths are produced the same way from
 * `min_rowlen`, `max_rowlen` and `rowlen_mult`.
 * Combinations of row count and row length that would exceed the maximum string character
 * column data length are not generated.
 *
 * A series ends as soon as multiplying would not produce a strictly larger value (for
 * example a multiplier smaller than 2, or a starting value of 0), so such inputs yield only
 * the starting value instead of looping forever. The next value is computed in 64 bits so
 * the multiplication cannot overflow `int`.
 *
 * @param b Benchmark to update with row count and row length arguments.
 * @param min_rows Minimum row count argument to generate.
 * @param max_rows Maximum row count argument to generate.
 * @param rows_mult Row count multiplier to generate intermediate row count arguments.
 * @param min_rowlen Minimum row length argument to generate.
 * @param max_rowlen Maximum row length argument to generate.
 * @param rowlen_mult Row length multiplier to generate intermediate row length arguments.
 */
inline void generate_string_bench_args(benchmark::internal::Benchmark* b,
                                       int min_rows,
                                       int max_rows,
                                       int rows_mult,
                                       int min_rowlen,
                                       int max_rowlen,
                                       int rowlen_mult)
{
  // Moves `value` to `value * mult`. Returns false, leaving `value` untouched, when the
  // product would pass `limit` or would not grow; the latter is what previously made the
  // loops spin forever for multipliers below 2 or a zero start value.
  auto const advance = [](int& value, int mult, int limit) {
    auto const next = static_cast<long long>(value) * mult;
    if (next <= value || next > limit) { return false; }
    value = static_cast<int>(next);  // safe: next <= limit, which is an int
    return true;
  };

  auto const max_chars = static_cast<size_t>(std::numeric_limits<cudf::size_type>::max());

  for (int row_count = min_rows; row_count <= max_rows;) {
    for (int rowlen = min_rowlen; rowlen <= max_rowlen;) {
      // avoid generating combinations that exceed the cudf column limit
      size_t const total_chars = static_cast<size_t>(row_count) * rowlen;
      if (total_chars < max_chars) { b->Args({row_count, rowlen}); }
      if (!advance(rowlen, rowlen_mult, max_rowlen)) { break; }
    }
    if (!advance(row_count, rows_mult, max_rows)) { break; }
  }
}
87 changes: 48 additions & 39 deletions cpp/include/cudf/strings/detail/gather.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -20,10 +20,16 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/binary_search.h>
#include <thrust/distance.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>

namespace cudf {

template <typename Iterator>
Expand All @@ -34,6 +40,7 @@ constexpr inline bool is_signed_iterator()

namespace strings {
namespace detail {

/**
* @brief Returns a new strings column using the specified indices to select
* elements from the `strings` column.
Expand Down Expand Up @@ -65,67 +72,69 @@ std::unique_ptr<cudf::column> gather(
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto output_count = std::distance(begin, end);
auto strings_count = strings.size();
auto const output_count = std::distance(begin, end);
auto const strings_count = strings.size();
if (output_count == 0) return make_empty_strings_column(stream, mr);

auto strings_column = column_device_view::create(strings.parent(), stream);
auto d_strings = *strings_column;

// build offsets column
auto offsets_column = make_numeric_column(
// allocate offsets column and use memory to compute string size in each output row
auto out_offsets_column = make_numeric_column(
data_type{type_id::INT32}, output_count + 1, mask_state::UNALLOCATED, stream, mr);
auto d_offsets = offsets_column->mutable_view().template data<int32_t>();
auto const d_out_offsets = out_offsets_column->mutable_view().template data<int32_t>();
auto const d_in_offsets =
(strings_count > 0) ? strings.offsets().data<int32_t>() + strings.offset() : nullptr;
thrust::transform(rmm::exec_policy(stream),
begin,
end,
d_offsets,
[d_strings, strings_count] __device__(size_type idx) {
if (NullifyOutOfBounds && ((idx < 0) || (idx >= strings_count))) return 0;
if (d_strings.is_null(idx)) return 0;
return d_strings.element<string_view>(idx).size_bytes();
d_out_offsets,
[d_in_offsets, strings_count] __device__(size_type in_idx) {
if (NullifyOutOfBounds && (in_idx < 0 || in_idx >= strings_count)) return 0;
return d_in_offsets[in_idx + 1] - d_in_offsets[in_idx];
});

// check total size is not too large
size_t total_bytes = thrust::transform_reduce(
size_t const total_bytes = thrust::transform_reduce(
rmm::exec_policy(stream),
d_offsets,
d_offsets + output_count,
d_out_offsets,
d_out_offsets + output_count,
[] __device__(auto size) { return static_cast<size_t>(size); },
size_t{0},
thrust::plus<size_t>{});
CUDF_EXPECTS(total_bytes < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"total size of output strings is too large for a cudf column");

// create offsets from sizes
// In-place convert output sizes into offsets
thrust::exclusive_scan(
rmm::exec_policy(stream), d_offsets, d_offsets + output_count + 1, d_offsets);
rmm::exec_policy(stream), d_out_offsets, d_out_offsets + output_count + 1, d_out_offsets);

// build chars column
size_type bytes = static_cast<size_type>(total_bytes);
auto chars_column = create_chars_child_column(output_count, 0, bytes, stream, mr);
auto d_chars = chars_column->mutable_view().template data<char>();
size_type const out_chars_bytes = static_cast<size_type>(total_bytes);
auto out_chars_column = create_chars_child_column(output_count, 0, out_chars_bytes, stream, mr);
auto const d_out_chars = out_chars_column->mutable_view().template data<char>();

// fill in chars
auto gather_chars =
[d_strings, begin, strings_count, d_offsets, d_chars] __device__(size_type idx) {
auto index = begin[idx];
if (NullifyOutOfBounds) {
if (is_signed_iterator<MapIterator>() ? ((index < 0) || (index >= strings_count))
: (index >= strings_count))
return;
}
if (d_strings.is_null(index)) return;
string_view d_str = d_strings.element<string_view>(index);
memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes());
cudf::detail::device_span<int32_t const> const d_out_offsets_span(d_out_offsets,
output_count + 1);
auto const d_in_chars = (strings_count > 0) ? strings.chars().data<char>() : nullptr;
auto gather_chars_fn =
[d_out_offsets_span, begin, d_in_offsets, d_in_chars] __device__(size_type out_char_idx) {
// find output row index for this output char index
auto const next_row_ptr = thrust::upper_bound(
thrust::seq, d_out_offsets_span.begin(), d_out_offsets_span.end(), out_char_idx);
auto const out_row_idx = thrust::distance(d_out_offsets_span.begin(), next_row_ptr) - 1;
auto const str_char_offset = out_char_idx - d_out_offsets_span[out_row_idx];
auto const in_row_idx = begin[out_row_idx];
auto const in_char_offset = d_in_offsets[in_row_idx] + str_char_offset;
return d_in_chars[in_char_offset];
};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
output_count,
gather_chars);
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(out_chars_bytes),
d_out_chars,
gather_chars_fn);

return make_strings_column(output_count,
std::move(offsets_column),
std::move(chars_column),
std::move(out_offsets_column),
std::move(out_chars_column),
0,
rmm::device_buffer{0, stream, mr},
stream,
Expand Down
45 changes: 44 additions & 1 deletion cpp/tests/copying/gather_str_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,35 @@ TEST_F(GatherTestStr, StringColumn)
CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got->view());
}

TEST_F(GatherTestStr, GatherSlicedStringsColumn)
{
  // Six-row strings input; the final row carries a validity flag of 0.
  cudf::test::strings_column_wrapper input{{"This", "is", "not", "a", "string", "type"},
                                           {1, 1, 1, 1, 1, 0}};
  // Begin/end index pairs handed to cudf::slice; the expectations below correspond to the
  // row ranges [0,2), [2,3) and [3,6) of the input.
  std::vector<cudf::size_type> bounds{0, 2, 2, 3, 3, 6};
  auto slices = cudf::slice(input, bounds);

  // First slice: gather with a repeated index.
  {
    cudf::test::fixed_width_column_wrapper<int16_t> map{{1, 0, 1}};
    cudf::test::strings_column_wrapper want_column{{"is", "This", "is"}, {1, 1, 1}};
    cudf::table_view want{{want_column}};
    auto got = cudf::gather(cudf::table_view{{slices[0]}}, map);
    CUDF_TEST_EXPECT_TABLES_EQUAL(want, got->view());
  }

  // Second slice: a single-row slice gathered three times.
  {
    cudf::test::fixed_width_column_wrapper<int16_t> map{{0, 0, 0}};
    cudf::test::strings_column_wrapper want_column{{"not", "not", "not"}, {1, 1, 1}};
    cudf::table_view want{{want_column}};
    auto got = cudf::gather(cudf::table_view{{slices[1]}}, map);
    CUDF_TEST_EXPECT_TABLES_EQUAL(want, got->view());
  }

  // Third slice: reversed order, where the first gathered row is the one flagged invalid.
  {
    cudf::test::fixed_width_column_wrapper<int16_t> map{{2, 1, 0}};
    cudf::test::strings_column_wrapper want_column{{"", "string", "a"}, {0, 1, 1}};
    cudf::table_view want{{want_column}};
    auto got = cudf::gather(cudf::table_view{{slices[2]}}, map);
    CUDF_TEST_EXPECT_TABLES_EQUAL(want, got->view());
  }
}

TEST_F(GatherTestStr, Gather)
{
std::vector<const char*> h_strings{"eee", "bb", "", "aa", "bbb", "ééé"};
Expand Down Expand Up @@ -98,7 +127,7 @@ TEST_F(GatherTestStr, GatherDontCheckOutOfBounds)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected);
}

TEST_F(GatherTestStr, GatherZeroSizeStringsColumn)
TEST_F(GatherTestStr, GatherEmptyMapStringsColumn)
{
cudf::column_view zero_size_strings_column(
cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0);
Expand All @@ -109,3 +138,17 @@ TEST_F(GatherTestStr, GatherZeroSizeStringsColumn)
cudf::out_of_bounds_policy::NULLIFY);
cudf::test::expect_strings_empty(results->get_column(0).view());
}

TEST_F(GatherTestStr, GatherZeroSizeStringsColumn)
{
  // Source strings column with zero rows and no data or mask buffers.
  cudf::column_view empty_input(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0);

  // A one-element gather map asking for row 0, which does not exist in the source.
  cudf::test::fixed_width_column_wrapper<int32_t> map_wrapper({0});
  cudf::column_view map_view = map_wrapper;

  // With the NULLIFY policy the expected output is one row whose validity flag is false.
  cudf::test::strings_column_wrapper expected{std::pair<std::string, bool>{"", false}};

  auto gathered = cudf::detail::gather(cudf::table_view({empty_input}),
                                       map_view.begin<int32_t>(),
                                       map_view.end<int32_t>(),
                                       cudf::out_of_bounds_policy::NULLIFY);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, gathered->get_column(0).view());
}

0 comments on commit fb7f7c3

Please sign in to comment.