From feea040125fe66b78e6442f6091f22b491e6e747 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 18 Apr 2023 10:05:14 -0400 Subject: [PATCH] Improve performance of slice_strings for long strings (#13057) Improves on performance for longer strings with `cudf::strings::slice_strings()` API. The `cudf::string_view::substr` was reworked to minimize counting characters and the gather version of `make_strings_children` is used to build the resulting strings column. This version is already optimized for small and large strings. Additionally, the code was refactored so the common case of `step==1 and start < stop` can also make use of the gather approach. Common code was also grouped closer together to help navigate the source file better. The `slice.cpp` benchmark was updated to better measure large strings with comparable slice boundaries. The benchmark showed performance improvement was up to 9x for larger strings with no significant degradation for smaller strings. Reference #13048 and #12445 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Elias Stehle (https://github.com/elstehle) URL: https://github.com/rapidsai/cudf/pull/13057 --- cpp/benchmarks/string/slice.cpp | 8 +- cpp/include/cudf/strings/string_view.cuh | 16 +- cpp/include/cudf/strings/string_view.hpp | 4 +- cpp/src/strings/slice.cu | 252 ++++++++++++----------- 4 files changed, 142 insertions(+), 138 deletions(-) diff --git a/cpp/benchmarks/string/slice.cpp b/cpp/benchmarks/string/slice.cpp index ae6327f4ce0..e0b801ea0a7 100644 --- a/cpp/benchmarks/string/slice.cpp +++ b/cpp/benchmarks/string/slice.cpp @@ -43,7 +43,7 @@ static void BM_slice(benchmark::State& state, slice_type rt) cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); cudf::strings_column_view input(column->view()); - auto starts_itr = thrust::constant_iterator(1); + auto starts_itr = thrust::constant_iterator(max_str_length / 3); auto stops_itr = thrust::constant_iterator(max_str_length / 2); cudf::test::fixed_width_column_wrapper starts(starts_itr, starts_itr + n_rows); cudf::test::fixed_width_column_wrapper stops(stops_itr, stops_itr + n_rows); @@ -53,7 +53,9 @@ static void BM_slice(benchmark::State& state, slice_type rt) for (auto _ : state) { cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (rt) { - case position: cudf::strings::slice_strings(input, 1, max_str_length / 2); break; + case position: + cudf::strings::slice_strings(input, max_str_length / 3, max_str_length / 2); + break; case multi_position: cudf::strings::slice_strings(input, starts, stops); break; case delimiter: cudf::strings::slice_strings(input, std::string{" "}, 1); break; case multi_delimiter: @@ -72,7 +74,7 @@ static void generate_bench_args(benchmark::internal::Benchmark* b) int const row_mult = 8; int const min_rowlen = 1 << 5; int const max_rowlen = 1 << 13; - int const len_mult = 4; + int const len_mult = 2; generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); } diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 29062167f11..97d14a1e24b 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -137,7 +137,7 @@ __device__ inline string_view::const_iterator string_view::const_iterator::opera } __device__ inline string_view::const_iterator string_view::const_iterator::operator+( - string_view::const_iterator::difference_type offset) + string_view::const_iterator::difference_type offset) const { const_iterator tmp(*this); size_type adjust = abs(offset); @@ -181,7 +181,7 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper } __device__ inline string_view::const_iterator string_view::const_iterator::operator-( - string_view::const_iterator::difference_type offset) + string_view::const_iterator::difference_type offset) const { const_iterator tmp(*this); size_type adjust = abs(offset); @@ -396,12 +396,12 @@ __device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, siz } // parameters are character position values -__device__ inline string_view string_view::substr(size_type pos, size_type length) const +__device__ inline string_view string_view::substr(size_type pos, size_type count) const { - size_type spos = byte_offset(pos); - size_type epos = byte_offset(pos + length); - if (epos > size_bytes()) epos = size_bytes(); - if (spos >= epos) return string_view("", 0); + if (pos < 0 || pos >= length()) { return string_view{}; } + auto const itr = begin() + pos; + auto const spos = itr.byte_offset(); + auto const epos = count >= 0 ? (itr + count).byte_offset() : size_bytes(); return string_view(data() + spos, epos - spos); } diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 28f9d57e9bd..680b7c395ca 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -88,11 +88,11 @@ class string_view { __device__ inline const_iterator& operator++(); __device__ inline const_iterator operator++(int); __device__ inline const_iterator& operator+=(difference_type); - __device__ inline const_iterator operator+(difference_type); + __device__ inline const_iterator operator+(difference_type) const; __device__ inline const_iterator& operator--(); __device__ inline const_iterator operator--(int); __device__ inline const_iterator& operator-=(difference_type); - __device__ inline const_iterator operator-(difference_type); + __device__ inline const_iterator operator-(difference_type) const; __device__ inline bool operator==(const const_iterator&) const; __device__ inline bool operator!=(const const_iterator&) const; __device__ inline bool operator<(const const_iterator&) const; diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index f63a15a5b96..fbe54c01bba 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -30,12 +30,44 @@ #include #include +#include #include +#include namespace cudf { namespace strings { namespace detail { namespace { +/** + * @brief Function logic for compute_substrings_from_fn API + * + * This computes the output size and resolves the substring + */ +template +struct substring_from_fn { + column_device_view const d_column; + IndexIterator const starts; + IndexIterator const stops; + + __device__ string_view operator()(size_type idx) const + { + if (d_column.is_null(idx)) { return string_view{nullptr, 0}; } + auto const d_str = d_column.template element(idx); + auto const length = d_str.length(); + auto const start = std::max(starts[idx], 0); + if (start >= length) { return string_view{}; } + + auto const stop = stops[idx]; + auto const end = (((stop < 0) || (stop > length)) ? length : stop); + return d_str.substr(start, end - start); + } + + substring_from_fn(column_device_view const& d_column, IndexIterator starts, IndexIterator stops) + : d_column(d_column), starts(starts), stops(stops) + { + } +}; + /** * @brief Function logic for the substring API. * @@ -92,14 +124,50 @@ struct substring_fn { char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; auto itr = begin; while (step > 0 ? itr < end : end < itr) { - bytes += bytes_in_char_utf8(*itr); - if (d_buffer) d_buffer += from_char_utf8(*itr, d_buffer); + if (d_buffer) { + d_buffer += from_char_utf8(*itr, d_buffer); + } else { + bytes += bytes_in_char_utf8(*itr); + } itr += step; } if (!d_chars) d_offsets[idx] = bytes; } }; +/** + * @brief Common utility function for the slice_strings APIs + * + * It wraps calling the functors appropriately to build the output strings column. + * + * The input iterators may have unique position values per string in `d_column`. + * This can also be called with constant value iterators to handle special + * slice functions if possible. + * + * @tparam IndexIterator Iterator type for character position values + * + * @param d_column Input strings column to substring + * @param starts Start positions index iterator + * @param stops Stop positions index iterator + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +template +std::unique_ptr compute_substrings_from_fn(column_device_view const& d_column, + IndexIterator starts, + IndexIterator stops, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto results = rmm::device_uvector(d_column.size(), stream); + thrust::transform(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(d_column.size()), + results.begin(), + substring_from_fn{d_column, starts, stops}); + return make_strings_column(results, string_view{nullptr, 0}, stream, mr); +} + } // namespace // @@ -112,116 +180,69 @@ std::unique_ptr slice_strings(strings_column_view const& strings, { if (strings.is_empty()) return make_empty_column(type_id::STRING); - if (step.is_valid(stream)) CUDF_EXPECTS(step.value(stream) != 0, "Step parameter must not be 0"); + auto const step_valid = step.is_valid(stream); + auto const step_value = step_valid ? step.value(stream) : 0; + if (step_valid) { CUDF_EXPECTS(step_value != 0, "Step parameter must not be 0"); } auto const d_column = column_device_view::create(strings.parent(), stream); - auto const d_start = get_scalar_device_view(const_cast&>(start)); - auto const d_stop = get_scalar_device_view(const_cast&>(stop)); - auto const d_step = get_scalar_device_view(const_cast&>(step)); - auto children = make_strings_children( + // optimization for (step==1 and start < stop) -- expect this to be most common + if (step_value == 1 and start.is_valid(stream) and stop.is_valid(stream)) { + auto const start_value = start.value(stream); + auto const stop_value = stop.value(stream); + // note that any negative values here must use the alternate function below + if ((start_value >= 0) && (start_value < stop_value)) { + // this is about 2x faster on long strings for this common case + return compute_substrings_from_fn(*d_column, + thrust::constant_iterator(start_value), + thrust::constant_iterator(stop_value), + stream, + mr); + } + } + + auto const d_start = get_scalar_device_view(const_cast&>(start)); + auto const d_stop = get_scalar_device_view(const_cast&>(stop)); + auto const d_step = get_scalar_device_view(const_cast&>(step)); + + auto [offsets, chars] = make_strings_children( substring_fn{*d_column, d_start, d_stop, d_step}, strings.size(), stream, mr); return make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets), + std::move(chars), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } -} // namespace detail - -// external API - std::unique_ptr slice_strings(strings_column_view const& strings, - numeric_scalar const& start, - numeric_scalar const& stop, - numeric_scalar const& step, + column_view const& starts_column, + column_view const& stops_column, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_FUNC_RANGE(); - return detail::slice_strings(strings, start, stop, step, cudf::get_default_stream(), mr); + size_type strings_count = strings.size(); + if (strings_count == 0) return make_empty_column(type_id::STRING); + CUDF_EXPECTS(starts_column.size() == strings_count, + "Parameter starts must have the same number of rows as strings."); + CUDF_EXPECTS(stops_column.size() == strings_count, + "Parameter stops must have the same number of rows as strings."); + CUDF_EXPECTS(starts_column.type() == stops_column.type(), + "Parameters starts and stops must be of the same type."); + CUDF_EXPECTS(starts_column.null_count() == 0, "Parameter starts must not contain nulls."); + CUDF_EXPECTS(stops_column.null_count() == 0, "Parameter stops must not contain nulls."); + CUDF_EXPECTS(starts_column.type().id() != data_type{type_id::BOOL8}.id(), + "Positions values must not be bool type."); + CUDF_EXPECTS(is_fixed_width(starts_column.type()), "Positions values must be fixed width type."); + + auto strings_column = column_device_view::create(strings.parent(), stream); + auto starts_iter = cudf::detail::indexalator_factory::make_input_iterator(starts_column); + auto stops_iter = cudf::detail::indexalator_factory::make_input_iterator(stops_column); + return compute_substrings_from_fn(*strings_column, starts_iter, stops_iter, stream, mr); } -namespace detail { namespace { -/** - * @brief Function logic for substring_from API. - * - * This both calculates the output size and executes the substring. - */ -struct substring_from_fn { - column_device_view const d_column; - cudf::detail::input_indexalator const starts; - cudf::detail::input_indexalator const stops; - int32_t* d_offsets{}; - char* d_chars{}; - - __device__ void operator()(size_type idx) - { - if (d_column.is_null(idx)) { - if (!d_chars) d_offsets[idx] = 0; - return; - } - auto const d_str = d_column.template element(idx); - auto const length = d_str.length(); - auto const start = std::max(starts[idx], 0); - if (start >= length) { - if (!d_chars) d_offsets[idx] = 0; - return; - } - auto const stop = stops[idx]; - auto const end = (((stop < 0) || (stop > length)) ? length : stop); - - auto const d_substr = d_str.substr(start, end - start); - if (d_chars) - memcpy(d_chars + d_offsets[idx], d_substr.data(), d_substr.size_bytes()); - else - d_offsets[idx] = d_substr.size_bytes(); - } -}; - -/** - * @brief Common utility function for the slice_strings APIs. - * - * It wraps calling the functors appropriately to build the output strings column. - * - * The input iterators may have unique position values per string in `d_column`. - * - * @param d_column Input strings column to substring. - * @param null_count Number of nulls for the output column. - * @param starts Start positions index iterator. - * @param stops Stop positions index iterator. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. - */ -std::unique_ptr compute_substrings_from_fn(column_device_view const& d_column, - size_type null_count, - cudf::detail::input_indexalator starts, - cudf::detail::input_indexalator stops, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto strings_count = d_column.size(); - - // Copy the null mask - rmm::device_buffer null_mask = - !d_column.nullable() - ? rmm::device_buffer{0, stream, mr} - : rmm::device_buffer( - d_column.null_mask(), cudf::bitmask_allocation_size_bytes(strings_count), stream, mr); - - auto children = - make_strings_children(substring_from_fn{d_column, starts, stops}, strings_count, stream, mr); - - return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), - null_count, - std::move(null_mask)); -} - /** * @brief Compute slice indices for each string. * @@ -287,34 +308,6 @@ void compute_substring_indices(column_device_view const& d_column, } // namespace -// -std::unique_ptr slice_strings(strings_column_view const& strings, - column_view const& starts_column, - column_view const& stops_column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(type_id::STRING); - CUDF_EXPECTS(starts_column.size() == strings_count, - "Parameter starts must have the same number of rows as strings."); - CUDF_EXPECTS(stops_column.size() == strings_count, - "Parameter stops must have the same number of rows as strings."); - CUDF_EXPECTS(starts_column.type() == stops_column.type(), - "Parameters starts and stops must be of the same type."); - CUDF_EXPECTS(starts_column.null_count() == 0, "Parameter starts must not contain nulls."); - CUDF_EXPECTS(stops_column.null_count() == 0, "Parameter stops must not contain nulls."); - CUDF_EXPECTS(starts_column.type().id() != data_type{type_id::BOOL8}.id(), - "Positions values must not be bool type."); - CUDF_EXPECTS(is_fixed_width(starts_column.type()), "Positions values must be fixed width type."); - - auto strings_column = column_device_view::create(strings.parent(), stream); - auto starts_iter = cudf::detail::indexalator_factory::make_input_iterator(starts_column); - auto stops_iter = cudf::detail::indexalator_factory::make_input_iterator(stops_column); - return compute_substrings_from_fn( - *strings_column, strings.null_count(), starts_iter, stops_iter, stream, mr); -} - template std::unique_ptr slice_strings(strings_column_view const& strings, DelimiterItrT const delimiter_itr, @@ -354,8 +347,7 @@ std::unique_ptr slice_strings(strings_column_view const& strings, cudf::detail::indexalator_factory::make_input_iterator(start_chars_pos_vec->view()); auto stops_iter = cudf::detail::indexalator_factory::make_input_iterator(stop_chars_pos_vec->view()); - return compute_substrings_from_fn( - d_column, strings.null_count(), starts_iter, stops_iter, stream, mr); + return compute_substrings_from_fn(d_column, starts_iter, stops_iter, stream, mr); } std::unique_ptr slice_strings(strings_column_view const& strings, @@ -387,6 +379,16 @@ std::unique_ptr slice_strings(strings_column_view const& strings, // external API +std::unique_ptr slice_strings(strings_column_view const& strings, + numeric_scalar const& start, + numeric_scalar const& stop, + numeric_scalar const& step, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::slice_strings(strings, start, stop, step, cudf::get_default_stream(), mr); +} + std::unique_ptr slice_strings(strings_column_view const& strings, column_view const& starts_column, column_view const& stops_column,