Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve string gather performance #7433

Merged
merged 5 commits into from
Mar 2, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 40 additions & 15 deletions cpp/benchmarks/string/copy_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,38 +26,63 @@
#include <algorithm>
#include <random>

#include "string_bench_args.hpp"

// Fixture type named by the BENCHMARK_DEFINE_F / BENCHMARK_REGISTER_F calls for the
// strings copy benchmarks in this file.
class StringCopy : public cudf::benchmark {};

static void BM_copy(benchmark::State& state)
enum copy_type { gather, scatter };

static void BM_copy(benchmark::State& state, copy_type ct)
{
cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
auto const source = create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows});
auto const target = create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows});
cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
data_profile table_profile;
table_profile.set_distribution_params(
cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);

auto const source =
create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
auto const target =
create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);

// scatter indices
std::vector<cudf::size_type> host_map_data(n_rows);
std::iota(host_map_data.begin(), host_map_data.end(), 0);
std::random_shuffle(host_map_data.begin(), host_map_data.end());
cudf::test::fixed_width_column_wrapper<cudf::size_type> scatter_map(host_map_data.begin(),
host_map_data.end());
cudf::test::fixed_width_column_wrapper<cudf::size_type> index_map(host_map_data.begin(),
host_map_data.end());

for (auto _ : state) {
cuda_event_timer raii(state, true, 0);
cudf::scatter(source->view(), scatter_map, target->view());
switch (ct) {
case gather: cudf::gather(source->view(), index_map); break;
case scatter: cudf::scatter(source->view(), index_map, target->view()); break;
}
}

state.SetBytesProcessed(state.iterations() *
cudf::strings_column_view(source->view().column(0)).chars_size());
}

#define SORT_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(StringCopy, name) \
(::benchmark::State & st) { BM_copy(st); } \
BENCHMARK_REGISTER_F(StringCopy, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 12, 1 << 24}}) \
->UseManualTime() \
static void generate_bench_args(benchmark::internal::Benchmark* b)
{
int const min_rows = 1 << 12;
int const max_rows = 1 << 24;
int const row_mult = 8;
int const min_rowlen = 1 << 5;
int const max_rowlen = 1 << 13;
int const len_mult = 4;
generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
}

#define COPY_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(StringCopy, name) \
(::benchmark::State & st) { BM_copy(st, copy_type::name); } \
BENCHMARK_REGISTER_F(StringCopy, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

SORT_BENCHMARK_DEFINE(scatter)
COPY_BENCHMARK_DEFINE(gather)
COPY_BENCHMARK_DEFINE(scatter)
12 changes: 3 additions & 9 deletions cpp/benchmarks/string/replace_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@

#include <limits>

#include "string_bench_args.hpp"

// Fixture type for the strings replace benchmarks in this file.
class StringReplace : public cudf::benchmark {};

Expand Down Expand Up @@ -69,15 +71,7 @@ static void generate_bench_args(benchmark::internal::Benchmark* b)
int const min_rowlen = 1 << 5;
int const max_rowlen = 1 << 13;
int const len_mult = 4;
for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) {
for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) {
// avoid generating combinations that exceed the cudf column limit
size_t total_chars = static_cast<size_t>(row_count) * rowlen;
if (total_chars < std::numeric_limits<cudf::size_type>::max()) {
b->Args({row_count, rowlen});
}
}
}
generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
}

#define STRINGS_BENCHMARK_DEFINE(name) \
Expand Down
52 changes: 52 additions & 0 deletions cpp/benchmarks/string/string_bench_args.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <benchmark/benchmark.h>

#include <cudf/types.hpp>

#include <limits>

/**
 * @brief Generate row count and row length argument ranges for a string benchmark.
 *
 * Generates a series of row count and row length arguments for string benchmarks.
 * Combinations of row count and row length that would exceed the maximum string character
 * column data length are not generated.
 *
 * Each series starts at its minimum and is multiplied by its multiplier for as long as it stays
 * at or below its maximum. A series that cannot grow (start value not positive, or multiplier of
 * 1 or less) contributes only its first value, provided that value is within range.
 *
 * @param b Benchmark to update with row count and row length arguments.
 * @param min_rows Minimum row count argument to generate.
 * @param max_rows Maximum row count argument to generate.
 * @param rows_mult Row count multiplier to generate intermediate row count arguments.
 * @param min_rowlen Minimum row length argument to generate.
 * @param max_rowlen Maximum row length argument to generate.
 * @param rowlen_mult Row length multiplier to generate intermediate row length arguments.
 */
inline void generate_string_bench_args(benchmark::internal::Benchmark* b,
                                       int min_rows,
                                       int max_rows,
                                       int rows_mult,
                                       int min_rowlen,
                                       int max_rowlen,
                                       int rowlen_mult)
{
  // Exclusive upper bound on the number of characters in one generated column.
  long long const max_chars = std::numeric_limits<cudf::size_type>::max();

  // Next element of a geometric series, computed in `long long` so the product of two `int`
  // values cannot overflow. When the series cannot make progress, a value larger than any `int`
  // bound is returned so the enclosing loop terminates.
  auto const next_value = [](long long value, int mult) {
    bool const can_grow = (value > 0) && (mult > 1);
    return can_grow ? value * mult : std::numeric_limits<long long>::max();
  };

  for (long long row_count = min_rows; row_count <= max_rows;
       row_count           = next_value(row_count, rows_mult)) {
    for (long long rowlen = min_rowlen; rowlen <= max_rowlen;
         rowlen           = next_value(rowlen, rowlen_mult)) {
      // avoid generating combinations that exceed the cudf column limit
      if (row_count * rowlen < max_chars) {
        // Both values are bounded by an `int` maximum here, so the casts are lossless.
        b->Args({static_cast<int>(row_count), static_cast<int>(rowlen)});
      }
    }
  }
}
103 changes: 64 additions & 39 deletions cpp/include/cudf/strings/detail/gather.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -20,10 +20,17 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/binary_search.h>
#include <thrust/distance.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>

namespace cudf {

template <typename Iterator>
Expand All @@ -34,6 +41,7 @@ constexpr inline bool is_signed_iterator()

namespace strings {
namespace detail {

/**
* @brief Returns a new strings column using the specified indices to select
* elements from the `strings` column.
Expand Down Expand Up @@ -65,67 +73,84 @@ std::unique_ptr<cudf::column> gather(
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto output_count = std::distance(begin, end);
auto strings_count = strings.size();
auto const output_count = std::distance(begin, end);
auto const strings_count = strings.size();
if (output_count == 0) return make_empty_strings_column(stream, mr);

auto strings_column = column_device_view::create(strings.parent(), stream);
auto d_strings = *strings_column;

// build offsets column
auto offsets_column = make_numeric_column(
// allocate offsets column
auto out_offsets_column = make_numeric_column(
data_type{type_id::INT32}, output_count + 1, mask_state::UNALLOCATED, stream, mr);
auto d_offsets = offsets_column->mutable_view().template data<int32_t>();
auto const d_out_offsets = out_offsets_column->mutable_view().template data<int32_t>();

if (strings_count == 0) {
jlowe marked this conversation as resolved.
Show resolved Hide resolved
// Gathering from an empty strings column, so generate null offsets with no chars.
CUDF_EXPECTS(NullifyOutOfBounds, "gathering from an empty strings column");
CUDA_TRY(cudaMemsetAsync(
d_out_offsets, 0, out_offsets_column->size() * sizeof(int32_t), stream.value()));
return make_strings_column(output_count,
std::move(out_offsets_column),
create_chars_child_column(0, 0, 0, stream, mr),
0,
rmm::device_buffer{0, stream, mr},
stream,
mr);
}

// use output offsets memory to compute string size in each output row
auto const d_in_offsets = strings.offsets().data<int32_t>() + strings.offset();
thrust::transform(rmm::exec_policy(stream),
begin,
end,
d_offsets,
[d_strings, strings_count] __device__(size_type idx) {
if (NullifyOutOfBounds && ((idx < 0) || (idx >= strings_count))) return 0;
if (d_strings.is_null(idx)) return 0;
return d_strings.element<string_view>(idx).size_bytes();
d_out_offsets,
[d_in_offsets, strings_count] __device__(size_type in_idx) {
if (NullifyOutOfBounds && (in_idx < 0 || in_idx >= strings_count)) return 0;
return d_in_offsets[in_idx + 1] - d_in_offsets[in_idx];
});

// check total size is not too large
size_t total_bytes = thrust::transform_reduce(
size_t const total_bytes = thrust::transform_reduce(
rmm::exec_policy(stream),
d_offsets,
d_offsets + output_count,
d_out_offsets,
d_out_offsets + output_count,
[] __device__(auto size) { return static_cast<size_t>(size); },
size_t{0},
thrust::plus<size_t>{});
CUDF_EXPECTS(total_bytes < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"total size of output strings is too large for a cudf column");

// create offsets from sizes
// In-place convert output sizes into offsets
thrust::exclusive_scan(
rmm::exec_policy(stream), d_offsets, d_offsets + output_count + 1, d_offsets);
rmm::exec_policy(stream), d_out_offsets, d_out_offsets + output_count + 1, d_out_offsets);

// build chars column
size_type bytes = static_cast<size_type>(total_bytes);
auto chars_column = create_chars_child_column(output_count, 0, bytes, stream, mr);
auto d_chars = chars_column->mutable_view().template data<char>();
size_type const out_chars_bytes = static_cast<size_type>(total_bytes);
auto out_chars_column = create_chars_child_column(output_count, 0, out_chars_bytes, stream, mr);
auto const d_out_chars = out_chars_column->mutable_view().template data<char>();

// fill in chars
auto gather_chars =
[d_strings, begin, strings_count, d_offsets, d_chars] __device__(size_type idx) {
auto index = begin[idx];
if (NullifyOutOfBounds) {
if (is_signed_iterator<MapIterator>() ? ((index < 0) || (index >= strings_count))
: (index >= strings_count))
return;
}
if (d_strings.is_null(index)) return;
string_view d_str = d_strings.element<string_view>(index);
memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes());
cudf::detail::device_span<int32_t const> const d_out_offsets_span(d_out_offsets,
output_count + 1);
auto const d_in_chars = strings.chars().data<char>();
auto gather_chars_fn =
[d_out_offsets_span, begin, d_in_offsets, d_in_chars] __device__(size_type out_char_idx) {
// find output row index for this output char index
auto const next_row_ptr = thrust::upper_bound(
thrust::seq, d_out_offsets_span.begin(), d_out_offsets_span.end(), out_char_idx);
auto const out_row_idx = thrust::distance(d_out_offsets_span.begin(), next_row_ptr) - 1;
auto const str_char_offset = out_char_idx - d_out_offsets_span[out_row_idx];
auto const in_row_idx = begin[out_row_idx];
auto const in_char_offset = d_in_offsets[in_row_idx] + str_char_offset;
jlowe marked this conversation as resolved.
Show resolved Hide resolved
return d_in_chars[in_char_offset];
};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
output_count,
gather_chars);
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(out_chars_bytes),
d_out_chars,
gather_chars_fn);

return make_strings_column(output_count,
std::move(offsets_column),
std::move(chars_column),
std::move(out_offsets_column),
std::move(out_chars_column),
0,
rmm::device_buffer{0, stream, mr},
stream,
Expand Down
46 changes: 45 additions & 1 deletion cpp/tests/copying/gather_str_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,35 @@ TEST_F(GatherTestStr, StringColumn)
CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got->view());
}

// Gathers from each slice of a strings column and checks that the gather map is applied
// relative to the slice, including a slice whose last row is null.
TEST_F(GatherTestStr, GatherSlicedStringsColumn)
{
  // Six rows with the last one null, sliced at the index pairs (0,2), (2,3) and (3,6).
  cudf::test::strings_column_wrapper input{{"This", "is", "not", "a", "string", "type"},
                                           {1, 1, 1, 1, 1, 0}};
  std::vector<cudf::size_type> split_points{0, 2, 2, 3, 3, 6};
  auto slices = cudf::slice(input, split_points);

  {  // first slice
    cudf::test::fixed_width_column_wrapper<int16_t> map{{1, 0, 1}};
    cudf::test::strings_column_wrapper expected_col{{"is", "This", "is"}, {1, 1, 1}};
    cudf::table_view expected_tbl{{expected_col}};
    auto gathered = cudf::gather(cudf::table_view{{slices[0]}}, map);
    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_tbl, gathered->view());
  }

  {  // second slice: a single row gathered three times
    cudf::test::fixed_width_column_wrapper<int16_t> map{{0, 0, 0}};
    cudf::test::strings_column_wrapper expected_col{{"not", "not", "not"}, {1, 1, 1}};
    cudf::table_view expected_tbl{{expected_col}};
    auto gathered = cudf::gather(cudf::table_view{{slices[1]}}, map);
    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_tbl, gathered->view());
  }

  {  // third slice: gathered in reverse, so the null row comes first
    cudf::test::fixed_width_column_wrapper<int16_t> map{{2, 1, 0}};
    cudf::test::strings_column_wrapper expected_col{{"", "string", "a"}, {0, 1, 1}};
    cudf::table_view expected_tbl{{expected_col}};
    auto gathered = cudf::gather(cudf::table_view{{slices[2]}}, map);
    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_tbl, gathered->view());
  }
}

TEST_F(GatherTestStr, Gather)
{
std::vector<const char*> h_strings{"eee", "bb", "", "aa", "bbb", "ééé"};
Expand Down Expand Up @@ -98,7 +127,7 @@ TEST_F(GatherTestStr, GatherDontCheckOutOfBounds)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected);
}

TEST_F(GatherTestStr, GatherZeroSizeStringsColumn)
TEST_F(GatherTestStr, GatherEmptyMapStringsColumn)
{
cudf::column_view zero_size_strings_column(
cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0);
Expand All @@ -109,3 +138,18 @@ TEST_F(GatherTestStr, GatherZeroSizeStringsColumn)
cudf::out_of_bounds_policy::NULLIFY);
cudf::test::expect_strings_empty(results->get_column(0).view());
}

// Gathering index 0 from a zero-row strings column with the NULLIFY out-of-bounds policy is
// expected to yield a single null row.
TEST_F(GatherTestStr, GatherZeroSizeStringsColumn)
{
  cudf::column_view zero_size_strings_column(
    cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0);
  std::vector<int32_t> h_map{0};
  cudf::test::fixed_width_column_wrapper<int32_t> gather_map(h_map.begin(), h_map.end());
  cudf::column_view gather_view = gather_map;
  // One row, marked invalid.
  cudf::test::strings_column_wrapper expected{std::pair<std::string, bool>{"", false}};
  auto results = cudf::detail::gather(cudf::table_view({zero_size_strings_column}),
                                      gather_view.begin<int32_t>(),
                                      gather_view.end<int32_t>(),
                                      cudf::out_of_bounds_policy::NULLIFY);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, results->get_column(0).view());
}