Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize cudf::make_strings_column for long strings #7576

Merged
merged 15 commits into from
Mar 18, 2021
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ ConfigureBench(STRINGS_BENCH
string/convert_floats_benchmark.cpp
string/copy_benchmark.cpp
string/extract_benchmark.cpp
string/factory_benchmark.cu
string/filter_benchmark.cpp
string/find_benchmark.cpp
string/replace_benchmark.cpp
Expand Down
93 changes: 93 additions & 0 deletions cpp/benchmarks/string/factory_benchmark.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "string_bench_args.hpp"

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <rmm/device_uvector.hpp>

#include <thrust/execution_policy.h>
#include <thrust/transform.h>

#include <limits>

namespace {
using string_pair = thrust::pair<char const*, cudf::size_type>;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"string_pair" should mean a pair of strings, so should we use some other name?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Most usages of 'pair' indicate two different types.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, I don't agree with that statement... but I also don't have a better name suggestion.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pair contains data pointer and data size, similar to a span. How about string_span?

struct string_view_to_pair {
__device__ string_pair operator()(thrust::pair<cudf::string_view, bool> const& p)
{
return (p.second) ? string_pair{p.first.data(), p.first.size_bytes()} : string_pair{nullptr, 0};
}
};
} // namespace

class StringsFactory : public cudf::benchmark {
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
};

static void BM_factory(benchmark::State& state)
{
cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
data_profile table_profile;
table_profile.set_distribution_params(
cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
auto const table =
create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
auto d_column = cudf::column_device_view::create(table->view().column(0));
rmm::device_vector<string_pair> pairs(d_column->size());
thrust::transform(thrust::device,
d_column->pair_begin<cudf::string_view, true>(),
d_column->pair_end<cudf::string_view, true>(),
pairs.data(),
string_view_to_pair{});

for (auto _ : state) {
cuda_event_timer raii(state, true, 0);
cudf::make_strings_column(pairs);
}

cudf::strings_column_view input(table->view().column(0));
state.SetBytesProcessed(state.iterations() * input.chars_size());
}

static void generate_bench_args(benchmark::internal::Benchmark* b)
{
int const min_rows = 1 << 12;
int const max_rows = 1 << 24;
int const row_mult = 8;
int const min_rowlen = 1 << 5;
int const max_rowlen = 1 << 13;
int const len_mult = 4;
generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
}

#define STRINGS_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(StringsFactory, name) \
(::benchmark::State & st) { BM_factory(st); } \
BENCHMARK_REGISTER_F(StringsFactory, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

STRINGS_BENCHMARK_DEFINE(factory)
2 changes: 2 additions & 0 deletions cpp/benchmarks/string/string_bench_args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

#include <benchmark/benchmark.h>

#include <cudf/types.hpp>

/**
* @brief Generate row count and row length argument ranges for a string benchmark.
*
Expand Down
3 changes: 1 addition & 2 deletions cpp/include/cudf/detail/gather.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,7 @@ struct gather_bitmask_functor {
auto col = input.column(mask_idx);

if (Op != gather_bitmask_op::DONT_CHECK) {
bool out_of_range = is_signed_iterator<MapIterator>() ? (row_idx < 0 || row_idx >= col.size())
: row_idx >= col.size();
bool out_of_range = (row_idx < 0 || row_idx >= col.size());
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
if (out_of_range) {
if (Op == gather_bitmask_op::PASSTHROUGH) {
return bit_is_set(masks[mask_idx], bit_idx);
Expand Down
87 changes: 59 additions & 28 deletions cpp/include/cudf/strings/detail/gather.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,60 @@
#include <thrust/transform.h>

namespace cudf {
namespace strings {
namespace detail {

template <typename Iterator>
constexpr inline bool is_signed_iterator()
/**
* @brief Returns a new chars column using the specified indices to select
* strings from the input iterator.
*
* This uses a character-parallel gather CUDA kernel that performs very
* well on a strings column with long strings (e.g. average > 64 bytes).
*
* @tparam StringIterator Iterator should produce `string_view` objects.
* @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`.
*
* @param strings_begin Start of the iterator to retrieve `string_view` instances
* @param map_begin Start of index iterator.
* @param map_end End of index iterator.
* @param offsets The offset values to be associated with the output chars column.
* @param chars_bytes The total number of bytes for the output chars column.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @return New chars column fit for a strings column.
*/
template <typename StringIterator, typename MapIterator>
std::unique_ptr<cudf::column> gather_chars(StringIterator strings_begin,
MapIterator map_begin,
MapIterator map_end,
cudf::device_span<int32_t const> const offsets,
size_type chars_bytes,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
return std::is_signed<typename std::iterator_traits<Iterator>::value_type>::value;
}
auto const output_count = std::distance(map_begin, map_end);
if (output_count == 0) return make_empty_column(data_type{type_id::INT8});

namespace strings {
namespace detail {
auto chars_column = create_chars_child_column(output_count, 0, chars_bytes, stream, mr);
auto const d_chars = chars_column->mutable_view().template data<char>();

auto gather_chars_fn = [strings_begin, map_begin, offsets] __device__(size_type out_idx) -> char {
auto const out_row =
thrust::prev(thrust::upper_bound(thrust::seq, offsets.begin(), offsets.end(), out_idx));
auto const row_idx = map_begin[thrust::distance(offsets.begin(), out_row)]; // get row index
auto const d_str = strings_begin[row_idx]; // get row's string
auto const offset = out_idx - *out_row; // get string's char
return d_str.data()[offset];
};

thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(chars_bytes),
d_chars,
gather_chars_fn);

return chars_column;
}

/**
* @brief Returns a new strings column using the specified indices to select
Expand Down Expand Up @@ -107,29 +152,15 @@ std::unique_ptr<cudf::column> gather(
rmm::exec_policy(stream), d_out_offsets, d_out_offsets + output_count + 1, d_out_offsets);

// build chars column
size_type const out_chars_bytes = static_cast<size_type>(total_bytes);
auto out_chars_column = create_chars_child_column(output_count, 0, out_chars_bytes, stream, mr);
auto const d_out_chars = out_chars_column->mutable_view().template data<char>();

// fill in chars
cudf::device_span<int32_t const> const d_out_offsets_span(d_out_offsets, output_count + 1);
auto const d_in_chars = (strings_count > 0) ? strings.chars().data<char>() : nullptr;
auto gather_chars_fn =
[d_out_offsets_span, begin, d_in_offsets, d_in_chars] __device__(size_type out_char_idx) {
// find output row index for this output char index
auto const next_row_ptr = thrust::upper_bound(
thrust::seq, d_out_offsets_span.begin(), d_out_offsets_span.end(), out_char_idx);
auto const out_row_idx = thrust::distance(d_out_offsets_span.begin(), next_row_ptr) - 1;
auto const str_char_offset = out_char_idx - d_out_offsets_span[out_row_idx];
auto const in_row_idx = begin[out_row_idx];
auto const in_char_offset = d_in_offsets[in_row_idx] + str_char_offset;
return d_in_chars[in_char_offset];
};
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(out_chars_bytes),
d_out_chars,
gather_chars_fn);
auto const d_strings = column_device_view::create(strings.parent(), stream);
auto out_chars_column = gather_chars(d_strings->begin<string_view>(),
begin,
end,
d_out_offsets_span,
static_cast<size_type>(total_bytes),
stream,
mr);

return make_strings_column(output_count,
std::move(out_offsets_column),
Expand Down
98 changes: 79 additions & 19 deletions cpp/include/cudf/strings/detail/strings_column_factories.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,43 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/valid_if.cuh>
#include <cudf/strings/detail/gather.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/thrust_rmm_allocator.h>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/copy.h>
#include <thrust/for_each.h>
#include <thrust/transform_reduce.h>

namespace cudf {
namespace strings {
namespace detail {

// Create a strings-type column from iterators of pointer/size pairs
/**
* @brief Average string byte-length threshold for deciding character-level
* vs. row-level parallel algorithm.
*
* This value was determined by running the factory_benchmark against different
* string lengths and observing the point where the performance is faster for
* long strings.
*/
constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64;

/**
* @brief Create a strings-type column from iterators of pointer/size pairs
*
* @tparam IndexPairIterator iterator over type `pair<char const*,size_type>` values
*
* @param begin First string row (inclusive)
* @param end Last string row (exclusive)
* @param stream CUDA stream used for device memory operations
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column
*/
template <typename IndexPairIterator>
std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
IndexPairIterator end,
Expand All @@ -51,7 +73,7 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
auto size_checker = [] __device__(string_index_pair const& item) {
return (item.first != nullptr) ? item.second : 0;
};
size_t bytes = thrust::transform_reduce(
size_t const bytes = thrust::transform_reduce(
rmm::exec_policy(stream), begin, end, size_checker, 0, thrust::plus<size_t>());
CUDF_EXPECTS(bytes < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"total size of strings is too large for cudf column");
Expand All @@ -65,26 +87,49 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);

// create null mask
auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; };
auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr);
auto null_count = new_nulls.second;
auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; };
auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr);
auto const null_count = new_nulls.second;
auto null_mask =
(null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr};

// build chars column
auto chars_column =
strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr);
auto d_chars = chars_column->mutable_view().template data<char>();
auto copy_chars = [d_chars] __device__(auto item) {
string_index_pair str = thrust::get<0>(item);
size_type offset = thrust::get<1>(item);
if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second);
};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_zip_iterator(
thrust::make_tuple(begin, offsets_column->view().template begin<int32_t>())),
strings_count,
copy_chars);
std::unique_ptr<column> chars_column = [&] {
// use a character-parallel kernel for long string lengths
auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1);
if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) {
auto const d_offsets =
device_span<size_type const>{offsets_column->view().template data<int32_t>(),
static_cast<std::size_t>(offsets_column->size())};
auto const str_begin = thrust::make_transform_iterator(begin, [] __device__(auto ip) {
return string_view{ip.first, ip.second};
});

return gather_chars(str_begin,
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(strings_count),
d_offsets,
static_cast<size_type>(bytes),
stream,
mr);
} else {
// this approach is 2-3x faster for large number smaller string lengths
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
auto chars_column =
strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr);
auto d_chars = chars_column->mutable_view().template data<char>();
auto copy_chars = [d_chars] __device__(auto item) {
string_index_pair const str = thrust::get<0>(item);
size_type const offset = thrust::get<1>(item);
if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second);
};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_zip_iterator(thrust::make_tuple(
begin, offsets_column->view().template begin<int32_t>())),
strings_count,
copy_chars);
return chars_column;
}
}();

return make_strings_column(strings_count,
std::move(offsets_column),
Expand All @@ -95,7 +140,22 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
mr);
}

// Create a strings-type column from iterators to chars, offsets, and bitmask.
/**
* @brief Create a strings-type column from iterators to chars, offsets, and bitmask.
*
* @tparam CharIterator iterator over character bytes (int8)
* @tparam OffsetIterator iterator over offset values (size_type)
*
* @param chars_begin First character byte (inclusive)
* @param chars_end Last character byte (exclusive)
* @param offset_begin First offset value (inclusive)
* @param offset_end Last offset value (exclusive)
* @param null_count Number of null rows
* @param null_mask The validity bitmask in Arrow format
* @param stream CUDA stream used for device memory operations
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column
*/
template <typename CharIterator, typename OffsetIterator>
std::unique_ptr<column> make_strings_column(CharIterator chars_begin,
CharIterator chars_end,
Expand Down
Loading