Skip to content

Commit

Permalink
Optimize cudf::make_strings_column for long strings (#7576)
Browse files Browse the repository at this point in the history
Reference #7571 
This improves the performance of the `cudf::make_strings_column` for long strings. It uses a similar approach from `cudf::strings::detail::gather` and also use thresholding as in the optimized `cudf::strings::replace`. 
This may not be the right solution for overall optimizing #7571 but may be helpful in other places where long strings are used for created a strings column in libcudf.
This PR also includes a gbenchmark to help measure the performance results of this factory function. The results of the benchmark are that longer strings (~ >64 bytes on average) showed about a 10x improvement. I can post benchmark results here if needed. The character-parallel algorithm was slower for shorter strings so the existing algorithm is used based on the a threshold calculation.
I also added an additional gtest with a mixture of nulls and empty strings to make sure the new algorithm handles these correctly.

Authors:
  - David (@davidwendt)

Approvers:
  - Jason Lowe (@jlowe)
  - Nghia Truong (@ttnghia)
  - Jake Hemstad (@jrhemstad)

URL: #7576
  • Loading branch information
davidwendt authored Mar 18, 2021
1 parent 99001d2 commit 9aa33ef
Show file tree
Hide file tree
Showing 7 changed files with 280 additions and 47 deletions.
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ ConfigureBench(STRINGS_BENCH
string/convert_floats_benchmark.cpp
string/copy_benchmark.cpp
string/extract_benchmark.cpp
string/factory_benchmark.cu
string/filter_benchmark.cpp
string/find_benchmark.cpp
string/replace_benchmark.cpp
Expand Down
93 changes: 93 additions & 0 deletions cpp/benchmarks/string/factory_benchmark.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "string_bench_args.hpp"

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <rmm/device_uvector.hpp>

#include <thrust/execution_policy.h>
#include <thrust/transform.h>

#include <limits>

namespace {
using string_pair = thrust::pair<char const*, cudf::size_type>;
struct string_view_to_pair {
__device__ string_pair operator()(thrust::pair<cudf::string_view, bool> const& p)
{
return (p.second) ? string_pair{p.first.data(), p.first.size_bytes()} : string_pair{nullptr, 0};
}
};
} // namespace

class StringsFactory : public cudf::benchmark {
};

static void BM_factory(benchmark::State& state)
{
cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
data_profile table_profile;
table_profile.set_distribution_params(
cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
auto const table =
create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
auto d_column = cudf::column_device_view::create(table->view().column(0));
rmm::device_vector<string_pair> pairs(d_column->size());
thrust::transform(thrust::device,
d_column->pair_begin<cudf::string_view, true>(),
d_column->pair_end<cudf::string_view, true>(),
pairs.data(),
string_view_to_pair{});

for (auto _ : state) {
cuda_event_timer raii(state, true, 0);
cudf::make_strings_column(pairs);
}

cudf::strings_column_view input(table->view().column(0));
state.SetBytesProcessed(state.iterations() * input.chars_size());
}

static void generate_bench_args(benchmark::internal::Benchmark* b)
{
int const min_rows = 1 << 12;
int const max_rows = 1 << 24;
int const row_mult = 8;
int const min_rowlen = 1 << 5;
int const max_rowlen = 1 << 13;
int const len_mult = 4;
generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
}

#define STRINGS_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(StringsFactory, name) \
(::benchmark::State & st) { BM_factory(st); } \
BENCHMARK_REGISTER_F(StringsFactory, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

STRINGS_BENCHMARK_DEFINE(factory)
2 changes: 2 additions & 0 deletions cpp/benchmarks/string/string_bench_args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

#include <benchmark/benchmark.h>

#include <cudf/types.hpp>

/**
* @brief Generate row count and row length argument ranges for a string benchmark.
*
Expand Down
87 changes: 59 additions & 28 deletions cpp/include/cudf/strings/detail/gather.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,60 @@
#include <thrust/transform.h>

namespace cudf {
namespace strings {
namespace detail {

template <typename Iterator>
constexpr inline bool is_signed_iterator()
/**
* @brief Returns a new chars column using the specified indices to select
* strings from the input iterator.
*
* This uses a character-parallel gather CUDA kernel that performs very
* well on a strings column with long strings (e.g. average > 64 bytes).
*
* @tparam StringIterator Iterator should produce `string_view` objects.
* @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`.
*
* @param strings_begin Start of the iterator to retrieve `string_view` instances
* @param map_begin Start of index iterator.
* @param map_end End of index iterator.
* @param offsets The offset values to be associated with the output chars column.
* @param chars_bytes The total number of bytes for the output chars column.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @return New chars column fit for a strings column.
*/
template <typename StringIterator, typename MapIterator>
std::unique_ptr<cudf::column> gather_chars(StringIterator strings_begin,
MapIterator map_begin,
MapIterator map_end,
cudf::device_span<int32_t const> const offsets,
size_type chars_bytes,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
return std::is_signed<typename std::iterator_traits<Iterator>::value_type>::value;
}
auto const output_count = std::distance(map_begin, map_end);
if (output_count == 0) return make_empty_column(data_type{type_id::INT8});

namespace strings {
namespace detail {
auto chars_column = create_chars_child_column(output_count, 0, chars_bytes, stream, mr);
auto const d_chars = chars_column->mutable_view().template data<char>();

auto gather_chars_fn = [strings_begin, map_begin, offsets] __device__(size_type out_idx) -> char {
auto const out_row =
thrust::prev(thrust::upper_bound(thrust::seq, offsets.begin(), offsets.end(), out_idx));
auto const row_idx = map_begin[thrust::distance(offsets.begin(), out_row)]; // get row index
auto const d_str = strings_begin[row_idx]; // get row's string
auto const offset = out_idx - *out_row; // get string's char
return d_str.data()[offset];
};

thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(chars_bytes),
d_chars,
gather_chars_fn);

return chars_column;
}

/**
* @brief Returns a new strings column using the specified indices to select
Expand Down Expand Up @@ -107,29 +152,15 @@ std::unique_ptr<cudf::column> gather(
rmm::exec_policy(stream), d_out_offsets, d_out_offsets + output_count + 1, d_out_offsets);

// build chars column
size_type const out_chars_bytes = static_cast<size_type>(total_bytes);
auto out_chars_column = create_chars_child_column(output_count, 0, out_chars_bytes, stream, mr);
auto const d_out_chars = out_chars_column->mutable_view().template data<char>();

// fill in chars
cudf::device_span<int32_t const> const d_out_offsets_span(d_out_offsets, output_count + 1);
auto const d_in_chars = (strings_count > 0) ? strings.chars().data<char>() : nullptr;
auto gather_chars_fn =
[d_out_offsets_span, begin, d_in_offsets, d_in_chars] __device__(size_type out_char_idx) {
// find output row index for this output char index
auto const next_row_ptr = thrust::upper_bound(
thrust::seq, d_out_offsets_span.begin(), d_out_offsets_span.end(), out_char_idx);
auto const out_row_idx = thrust::distance(d_out_offsets_span.begin(), next_row_ptr) - 1;
auto const str_char_offset = out_char_idx - d_out_offsets_span[out_row_idx];
auto const in_row_idx = begin[out_row_idx];
auto const in_char_offset = d_in_offsets[in_row_idx] + str_char_offset;
return d_in_chars[in_char_offset];
};
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(out_chars_bytes),
d_out_chars,
gather_chars_fn);
auto const d_strings = column_device_view::create(strings.parent(), stream);
auto out_chars_column = gather_chars(d_strings->begin<string_view>(),
begin,
end,
d_out_offsets_span,
static_cast<size_type>(total_bytes),
stream,
mr);

return make_strings_column(output_count,
std::move(out_offsets_column),
Expand Down
98 changes: 79 additions & 19 deletions cpp/include/cudf/strings/detail/strings_column_factories.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,43 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/valid_if.cuh>
#include <cudf/strings/detail/gather.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/thrust_rmm_allocator.h>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/copy.h>
#include <thrust/for_each.h>
#include <thrust/transform_reduce.h>

namespace cudf {
namespace strings {
namespace detail {

// Create a strings-type column from iterators of pointer/size pairs
/**
* @brief Average string byte-length threshold for deciding character-level
* vs. row-level parallel algorithm.
*
* This value was determined by running the factory_benchmark against different
* string lengths and observing the point where the performance is faster for
* long strings.
*/
constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64;

/**
* @brief Create a strings-type column from iterators of pointer/size pairs
*
* @tparam IndexPairIterator iterator over type `pair<char const*,size_type>` values
*
* @param begin First string row (inclusive)
* @param end Last string row (exclusive)
* @param stream CUDA stream used for device memory operations
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column
*/
template <typename IndexPairIterator>
std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
IndexPairIterator end,
Expand All @@ -51,7 +73,7 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
auto size_checker = [] __device__(string_index_pair const& item) {
return (item.first != nullptr) ? item.second : 0;
};
size_t bytes = thrust::transform_reduce(
size_t const bytes = thrust::transform_reduce(
rmm::exec_policy(stream), begin, end, size_checker, 0, thrust::plus<size_t>());
CUDF_EXPECTS(bytes < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"total size of strings is too large for cudf column");
Expand All @@ -65,26 +87,49 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);

// create null mask
auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; };
auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr);
auto null_count = new_nulls.second;
auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; };
auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr);
auto const null_count = new_nulls.second;
auto null_mask =
(null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr};

// build chars column
auto chars_column =
strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr);
auto d_chars = chars_column->mutable_view().template data<char>();
auto copy_chars = [d_chars] __device__(auto item) {
string_index_pair str = thrust::get<0>(item);
size_type offset = thrust::get<1>(item);
if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second);
};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_zip_iterator(
thrust::make_tuple(begin, offsets_column->view().template begin<int32_t>())),
strings_count,
copy_chars);
std::unique_ptr<column> chars_column = [&] {
// use a character-parallel kernel for long string lengths
auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1);
if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) {
auto const d_offsets =
device_span<size_type const>{offsets_column->view().template data<int32_t>(),
static_cast<std::size_t>(offsets_column->size())};
auto const str_begin = thrust::make_transform_iterator(begin, [] __device__(auto ip) {
return string_view{ip.first, ip.second};
});

return gather_chars(str_begin,
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(strings_count),
d_offsets,
static_cast<size_type>(bytes),
stream,
mr);
} else {
// this approach is 2-3x faster for a large number of smaller string lengths
auto chars_column =
strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr);
auto d_chars = chars_column->mutable_view().template data<char>();
auto copy_chars = [d_chars] __device__(auto item) {
string_index_pair const str = thrust::get<0>(item);
size_type const offset = thrust::get<1>(item);
if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second);
};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_zip_iterator(thrust::make_tuple(
begin, offsets_column->view().template begin<int32_t>())),
strings_count,
copy_chars);
return chars_column;
}
}();

return make_strings_column(strings_count,
std::move(offsets_column),
Expand All @@ -95,7 +140,22 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
mr);
}

// Create a strings-type column from iterators to chars, offsets, and bitmask.
/**
* @brief Create a strings-type column from iterators to chars, offsets, and bitmask.
*
* @tparam CharIterator iterator over character bytes (int8)
* @tparam OffsetIterator iterator over offset values (size_type)
*
* @param chars_begin First character byte (inclusive)
* @param chars_end Last character byte (exclusive)
* @param offset_begin First offset value (inclusive)
* @param offset_end Last offset value (exclusive)
* @param null_count Number of null rows
* @param null_mask The validity bitmask in Arrow format
* @param stream CUDA stream used for device memory operations
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column
*/
template <typename CharIterator, typename OffsetIterator>
std::unique_ptr<column> make_strings_column(CharIterator chars_begin,
CharIterator chars_end,
Expand Down
12 changes: 12 additions & 0 deletions cpp/include/cudf/utilities/traits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,18 @@ constexpr inline bool is_unsigned(data_type type)
return cudf::type_dispatcher(type, is_unsigned_impl{});
}

/**
* @brief Indicates whether the `Iterator` value type is unsigned.
*
* @tparam Iterator The type to verify
* @return true if the iterator's value type is unsigned
*/
template <typename Iterator>
constexpr inline bool is_signed_iterator()
{
return std::is_signed<typename std::iterator_traits<Iterator>::value_type>::value;
}

/**
* @brief Indicates whether the type `T` is a floating point type.
*
Expand Down
Loading

0 comments on commit 9aa33ef

Please sign in to comment.