From 655cd937d10bb2e1531cd1db07081038a63447e9 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 8 May 2023 18:02:40 -0400 Subject: [PATCH] Performance improvement in cudf::strings::find/rfind for long strings (#13226) Improves performance for longer strings with `cudf::strings::find()` and `cudf::strings::rfind()` APIs. The current implementation works well with small-ish strings and so this new implementation splits into a long-ish string algorithm when the average number of bytes per string is 64 bytes or greater. The new implementation searches for the target string by applying a warp per string kernel. Additionally, the special logic needed for matching an empty target (easily checked in host code) is factored out into its own transform functor. For longer strings, the performance improvement is about 2-6x. Reference https://github.com/rapidsai/cudf/issues/13048 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/13226 --- cpp/benchmarks/string/find.cpp | 2 +- cpp/include/cudf/strings/string_view.cuh | 36 +++- cpp/src/strings/search/find.cu | 244 ++++++++++++++++------- cpp/tests/strings/find_tests.cpp | 69 ++++--- 4 files changed, 243 insertions(+), 108 deletions(-) diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index f5ca75d025f..5f2e6946b8b 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -66,7 +66,7 @@ static void generate_bench_args(benchmark::internal::Benchmark* b) int const row_mult = 8; int const min_rowlen = 1 << 5; int const max_rowlen = 1 << 13; - int const len_mult = 4; + int const len_mult = 2; for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) { for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { // avoid generating combinations that exceed the cudf column limit diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 34ab17d367b..fc4e3d57cfb 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -63,6 +63,32 @@ __device__ inline size_type characters_in_string(const char* str, size_type byte #endif } +/** + * @brief Count the bytes to a specified character position + * + * Returns the number of bytes and any left over position value. + * The returned position is > 0 if the given position would read past + * the end of the input string. + * + * @param d_str Input string to count bytes within + * @param pos Character position to count to + * @return The number of bytes and the left over non-counted position value + */ +__device__ inline std::pair bytes_to_character_position(string_view d_str, + size_type pos) +{ + size_type bytes = 0; + auto ptr = d_str.data(); + auto const end_ptr = ptr + d_str.size_bytes(); + while ((pos > 0) && (ptr < end_ptr)) { + auto const width = strings::detail::bytes_in_utf8_byte(static_cast(*ptr)); + if (width) { --pos; } + bytes += width; + ++ptr; + } + return {bytes, pos}; +} + /** * @brief string value for sentinel which is used in min, max reduction * operators @@ -266,16 +292,8 @@ __device__ inline char_utf8 string_view::operator[](size_type pos) const __device__ inline size_type string_view::byte_offset(size_type pos) const { - size_type offset = 0; - const char* sptr = _data; - const char* eptr = sptr + _bytes; if (length() == size_bytes()) return pos; - while ((pos > 0) && (sptr < eptr)) { - size_type charbytes = strings::detail::bytes_in_utf8_byte(static_cast(*sptr++)); - if (charbytes) --pos; - offset += charbytes; - } - return offset; + return std::get<0>(strings::detail::bytes_to_character_position(*this, pos)); } __device__ inline int string_view::compare(const string_view& in) const diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index e6384d5d6e1..5e6a273958c 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,103 +41,207 @@ namespace cudf { namespace strings { namespace detail { namespace { + /** - * @brief Utility to return integer column indicating the position of - * target string within each string in a strings column. + * @brief Threshold to decide on using string or warp parallel functions. * - * Null string entries return corresponding null output column entries. + * If the average byte length of a string in a column exceeds this value then + * a warp-parallel function is used. * - * @tparam FindFunction Returns integer character position value given a string and target. + * Note that this value is shared by find, rfind, and contains functions. + */ +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; + +/** + * @brief Find function handles a string per thread + */ +template +struct finder_fn { + column_device_view const d_strings; + string_view const d_target; + size_type const start; + size_type const stop; + + __device__ size_type operator()(size_type idx) const + { + if (d_strings.is_null(idx)) { return -1; } + auto d_str = d_strings.element(idx); + + auto const length = d_str.length(); + auto const begin = (start > length) ? length : start; + auto const end = (stop < 0) || (stop > length) ? length : stop; + return forward ? d_str.find(d_target, begin, end - begin) + : d_str.rfind(d_target, begin, end - begin); + } +}; + +/** + * @brief Special logic handles an empty target for find/rfind * - * @param strings Strings column to search for target. - * @param target String to search for in each string in the strings column. - * @param start First character position to start the search. - * @param stop Last character position (exclusive) to end the search. - * @param pfn Functor used for locating `target` in each string. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New integer column with character position values. + * where length = number of characters in the input string + * if forward = true: + * return start iff (start <= length), otherwise return -1 + * if forward = false: + * return stop iff (0 <= stop <= length), otherwise return length */ -template -std::unique_ptr find_fn(strings_column_view const& strings, +template +struct empty_target_fn { + column_device_view const d_strings; + size_type const start; + size_type const stop; + + __device__ size_type operator()(size_type idx) const + { + if (d_strings.is_null(idx)) { return -1; } + auto d_str = d_strings.element(idx); + + // common case shortcut + if (forward && start == 0) { return 0; } + + auto const length = d_str.length(); + if (start > length) { return -1; } + if constexpr (forward) { return start; } + + return (stop < 0) || (stop > length) ? length : stop; + } +}; + +/** + * @brief String per warp function for find/rfind + */ +template +__global__ void finder_warp_parallel_fn(column_device_view const d_strings, + string_view const d_target, + size_type const start, + size_type const stop, + size_type* d_results) +{ + size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + + if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; } + + auto const str_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + + if (d_strings.is_null(str_idx)) { return; } + + // initialize the output for the atomicMin/Max + if (lane_idx == 0) { d_results[str_idx] = forward ? std::numeric_limits::max() : -1; } + __syncwarp(); + + auto const d_str = d_strings.element(str_idx); + + auto const [begin, left_over] = bytes_to_character_position(d_str, start); + auto const start_char_pos = start - left_over; // keep track of character position + + auto const end = [d_str, start, stop, begin = begin] { + if (stop < 0) { return d_str.size_bytes(); } + if (stop <= start) { return begin; } + // we count from `begin` instead of recounting from the beginning of the string + return begin + std::get<0>(bytes_to_character_position( + string_view(d_str.data() + begin, d_str.size_bytes() - begin), stop - start)); + }(); + + // each thread compares the target with the thread's individual starting byte + size_type position = forward ? std::numeric_limits::max() : -1; + for (auto itr = begin + lane_idx; itr + d_target.size_bytes() <= end; + itr += cudf::detail::warp_size) { + if (d_target.compare(d_str.data() + itr, d_target.size_bytes()) == 0) { + position = itr; + if (forward) break; + } + } + + // find stores the minimum position while rfind stores the maximum position + // note that this was slightly faster than using cub::WarpReduce + forward ? atomicMin(d_results + str_idx, position) : atomicMax(d_results + str_idx, position); + __syncwarp(); + + if (lane_idx == 0) { + // the final result needs to be fixed up convert max() to -1 + // and a byte position to a character position + auto const result = d_results[str_idx]; + d_results[str_idx] = + ((result < std::numeric_limits::max()) && (result >= begin)) + ? start_char_pos + characters_in_string(d_str.data() + begin, result - begin) + : -1; + } +} + +template +std::unique_ptr find_fn(strings_column_view const& input, string_scalar const& target, size_type start, size_type stop, - FindFunction& pfn, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(target.is_valid(stream), "Parameter target must be valid."); CUDF_EXPECTS(start >= 0, "Parameter start must be positive integer or zero."); if ((stop > 0) && (start > stop)) CUDF_FAIL("Parameter start must be less than stop."); - // - auto d_target = string_view(target.data(), target.size()); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; - auto strings_count = strings.size(); + + auto d_target = string_view(target.data(), target.size()); + auto d_strings = column_device_view::create(input.parent(), stream); + // create output column - auto results = make_numeric_column(data_type{type_id::INT32}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + auto results = make_numeric_column(data_type{type_to_id()}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); - auto results_view = results->mutable_view(); - auto d_results = results_view.data(); - // set the position values by evaluating the passed function - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - [d_strings, pfn, d_target, start, stop] __device__(size_type idx) { - int32_t position = -1; - if (!d_strings.is_null(idx)) - position = static_cast( - pfn(d_strings.element(idx), d_target, start, stop)); - return position; - }); - results->set_null_count(strings.null_count()); + // if input is empty or all-null then we are done + if (input.size() == input.null_count()) { return results; } + + auto d_results = results->mutable_view().data(); + + if (d_target.empty()) { + // special logic for empty target results + thrust::transform(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + d_results, + empty_target_fn{*d_strings, start, stop}); + } else if ((input.chars_size() / (input.size() - input.null_count())) > + AVG_CHAR_BYTES_THRESHOLD) { + // warp-per-string runs faster for longer strings (but not shorter ones) + constexpr int block_size = 256; + cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + finder_warp_parallel_fn + <<>>( + *d_strings, d_target, start, stop, d_results); + } else { + // string-per-thread function + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + d_results, + finder_fn{*d_strings, d_target, start, stop}); + } + + results->set_null_count(input.null_count()); return results; } - } // namespace -std::unique_ptr find(strings_column_view const& strings, +std::unique_ptr find(strings_column_view const& input, string_scalar const& target, size_type start, size_type stop, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto pfn = [] __device__( - string_view d_string, string_view d_target, size_type start, size_type stop) { - size_type length = d_string.length(); - if (d_target.empty()) return start > length ? -1 : start; - size_type begin = (start > length) ? length : start; - size_type end = (stop < 0) || (stop > length) ? length : stop; - return d_string.find(d_target, begin, end - begin); - }; - - return find_fn(strings, target, start, stop, pfn, stream, mr); + return find_fn(input, target, start, stop, stream, mr); } -std::unique_ptr rfind(strings_column_view const& strings, +std::unique_ptr rfind(strings_column_view const& input, string_scalar const& target, size_type start, size_type stop, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto pfn = [] __device__( - string_view d_string, string_view d_target, size_type start, size_type stop) { - size_type length = d_string.length(); - size_type begin = (start > length) ? length : start; - size_type end = (stop < 0) || (stop > length) ? length : stop; - if (d_target.empty()) return start > length ? -1 : end; - return d_string.rfind(d_target, begin, end - begin); - }; - - return find_fn(strings, target, start, stop, pfn, stream, mr); + return find_fn(input, target, start, stop, stream, mr); } } // namespace detail @@ -167,17 +271,6 @@ std::unique_ptr rfind(strings_column_view const& strings, namespace detail { namespace { -/** - * @brief Threshold to decide on using string or warp parallel functions. - * - * If the average byte length of a string in a column exceeds this value then - * the warp-parallel `contains_warp_fn` function is used. - * Otherwise, the string-parallel function in `contains_fn` is used. - * - * This is only used for the scalar version of `contains()` right now. - */ -constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; - /** * @brief Check if `d_target` appears in a row in `d_strings`. * @@ -370,7 +463,8 @@ std::unique_ptr contains(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { // use warp parallel when the average string width is greater than the threshold - if (!input.is_empty() && ((input.chars_size() / input.size()) > AVG_CHAR_BYTES_THRESHOLD)) { + if ((input.null_count() < input.size()) && + ((input.chars_size() / input.size()) > AVG_CHAR_BYTES_THRESHOLD)) { return contains_warp_parallel(input, target, stream, mr); } diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index bd336540e0c..824fc7fe349 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -38,33 +38,54 @@ TEST_F(StringsFindTest, Find) auto strings_view = cudf::strings_column_view(strings); { - cudf::test::fixed_width_column_wrapper expected({1, 4, -1, -1, 1, -1}, - {1, 1, 0, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected({1, 4, -1, -1, 1, -1}, + {1, 1, 0, 1, 1, 1}); auto results = cudf::strings::find(strings_view, cudf::string_scalar("é")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); results = cudf::strings::rfind(strings_view, cudf::string_scalar("é")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - cudf::test::fixed_width_column_wrapper expected({3, -1, -1, 0, -1, -1}, - {1, 1, 0, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected({3, -1, -1, 0, -1, -1}, + {1, 1, 0, 1, 1, 1}); auto results = cudf::strings::rfind(strings_view, cudf::string_scalar("l")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - cudf::test::fixed_width_column_wrapper expected({0, 0, 0, 0, 0, 0}, - {1, 1, 0, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected({0, 0, 0, 0, 0, 0}, + {1, 1, 0, 1, 1, 1}); auto results = cudf::strings::find(strings_view, cudf::string_scalar("")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - cudf::test::fixed_width_column_wrapper expected({5, 5, 0, 5, 12, 0}, - {1, 1, 0, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected({5, 5, 0, 5, 12, 0}, + {1, 1, 0, 1, 1, 1}); auto results = cudf::strings::rfind(strings_view, cudf::string_scalar("")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } +TEST_F(StringsFindTest, FindLongStrings) +{ + cudf::test::strings_column_wrapper input( + {"Héllo, there world and goodbye", + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving", + "the following code snippet demonstrates how to use search for values in an ordered range", + "it returns the last position where value could be inserted without violating the ordering", + "algorithms execution is parallelized as determined by an execution policy. t", + "he this is a continuation of previous row to make sure string boundaries are honored", + ""}); + auto view = cudf::strings_column_view(input); + auto results = cudf::strings::find(view, cudf::string_scalar("the")); + auto expected = + cudf::test::fixed_width_column_wrapper({7, 28, 0, 11, -1, -1, -1}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + results = cudf::strings::rfind(view, cudf::string_scalar("the")); + expected = cudf::test::fixed_width_column_wrapper({7, 48, 0, 77, -1, -1, -1}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); +} + TEST_F(StringsFindTest, Contains) { cudf::test::strings_column_wrapper strings({"Héllo", "thesé", "", "lease", "tést strings", ""}, @@ -229,9 +250,9 @@ TEST_F(StringsFindTest, AllEmpty) std::vector h_strings{"", "", "", "", ""}; cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); - std::vector h_expected32(h_strings.size(), -1); - cudf::test::fixed_width_column_wrapper expected32(h_expected32.begin(), - h_expected32.end()); + std::vector h_expected32(h_strings.size(), -1); + cudf::test::fixed_width_column_wrapper expected32(h_expected32.begin(), + h_expected32.end()); std::vector h_expected8(h_strings.size(), 0); cudf::test::fixed_width_column_wrapper expected8(h_expected8.begin(), h_expected8.end()); @@ -264,8 +285,8 @@ TEST_F(StringsFindTest, AllNull) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - std::vector h_expected32(h_strings.size(), -1); - cudf::test::fixed_width_column_wrapper expected32( + std::vector h_expected32(h_strings.size(), -1); + cudf::test::fixed_width_column_wrapper expected32( h_expected32.begin(), h_expected32.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); @@ -324,29 +345,31 @@ TEST_P(FindParmsTest, Find) auto strings_view = cudf::strings_column_view(strings); { auto results = cudf::strings::find(strings_view, cudf::string_scalar("e"), position); - std::vector h_expected; + std::vector h_expected; for (auto itr = h_strings.begin(); itr != h_strings.end(); ++itr) - h_expected.push_back((int32_t)(*itr).find("e", position)); - cudf::test::fixed_width_column_wrapper expected(h_expected.begin(), h_expected.end()); + h_expected.push_back(static_cast((*itr).find("e", position))); + cudf::test::fixed_width_column_wrapper expected(h_expected.begin(), + h_expected.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { auto results = cudf::strings::rfind(strings_view, cudf::string_scalar("e"), 0, position + 1); - std::vector h_expected; + std::vector h_expected; for (auto itr = h_strings.begin(); itr != h_strings.end(); ++itr) - h_expected.push_back((int32_t)(*itr).rfind("e", position)); - cudf::test::fixed_width_column_wrapper expected(h_expected.begin(), h_expected.end()); + h_expected.push_back(static_cast((*itr).rfind("e", position))); + cudf::test::fixed_width_column_wrapper expected(h_expected.begin(), + h_expected.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto begin = static_cast(position); + auto begin = static_cast(position); auto results = cudf::strings::find(strings_view, cudf::string_scalar(""), begin); - cudf::test::fixed_width_column_wrapper expected( + cudf::test::fixed_width_column_wrapper expected( {begin, (begin > 0 ? -1 : 0), begin, begin, begin}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto end = static_cast(position + 1); + auto end = static_cast(position + 1); results = cudf::strings::rfind(strings_view, cudf::string_scalar(""), 0, end); - cudf::test::fixed_width_column_wrapper rexpected({end, 0, end, end, end}); + cudf::test::fixed_width_column_wrapper rexpected({end, 0, end, end, end}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, rexpected); } }