diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index f5ca75d025f..5f2e6946b8b 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -66,7 +66,7 @@ static void generate_bench_args(benchmark::internal::Benchmark* b) int const row_mult = 8; int const min_rowlen = 1 << 5; int const max_rowlen = 1 << 13; - int const len_mult = 4; + int const len_mult = 2; for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) { for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { // avoid generating combinations that exceed the cudf column limit diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 34ab17d367b..fc4e3d57cfb 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -63,6 +63,32 @@ __device__ inline size_type characters_in_string(const char* str, size_type byte #endif } +/** + * @brief Count the bytes to a specified character position + * + * Returns the number of bytes and any left over position value. + * The returned position is > 0 if the given position would read past + * the end of the input string. + * + * @param d_str Input string to count bytes within + * @param pos Character position to count to + * @return The number of bytes and the left over non-counted position value + */ +__device__ inline std::pair bytes_to_character_position(string_view d_str, + size_type pos) +{ + size_type bytes = 0; + auto ptr = d_str.data(); + auto const end_ptr = ptr + d_str.size_bytes(); + while ((pos > 0) && (ptr < end_ptr)) { + auto const width = strings::detail::bytes_in_utf8_byte(static_cast(*ptr)); + if (width) { --pos; } + bytes += width; + ++ptr; + } + return {bytes, pos}; +} + /** * @brief string value for sentinel which is used in min, max reduction * operators @@ -266,16 +292,8 @@ __device__ inline char_utf8 string_view::operator[](size_type pos) const __device__ inline size_type string_view::byte_offset(size_type pos) const { - size_type offset = 0; - const char* sptr = _data; - const char* eptr = sptr + _bytes; if (length() == size_bytes()) return pos; - while ((pos > 0) && (sptr < eptr)) { - size_type charbytes = strings::detail::bytes_in_utf8_byte(static_cast(*sptr++)); - if (charbytes) --pos; - offset += charbytes; - } - return offset; + return std::get<0>(strings::detail::bytes_to_character_position(*this, pos)); } __device__ inline int string_view::compare(const string_view& in) const diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index e6384d5d6e1..5e6a273958c 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,103 +41,207 @@ namespace cudf { namespace strings { namespace detail { namespace { + /** - * @brief Utility to return integer column indicating the position of - * target string within each string in a strings column. + * @brief Threshold to decide on using string or warp parallel functions. * - * Null string entries return corresponding null output column entries. + * If the average byte length of a string in a column exceeds this value then + * a warp-parallel function is used. * - * @tparam FindFunction Returns integer character position value given a string and target. + * Note that this value is shared by find, rfind, and contains functions. + */ +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; + +/** + * @brief Find function handles a string per thread + */ +template +struct finder_fn { + column_device_view const d_strings; + string_view const d_target; + size_type const start; + size_type const stop; + + __device__ size_type operator()(size_type idx) const + { + if (d_strings.is_null(idx)) { return -1; } + auto d_str = d_strings.element(idx); + + auto const length = d_str.length(); + auto const begin = (start > length) ? length : start; + auto const end = (stop < 0) || (stop > length) ? length : stop; + return forward ? d_str.find(d_target, begin, end - begin) + : d_str.rfind(d_target, begin, end - begin); + } +}; + +/** + * @brief Special logic handles an empty target for find/rfind * - * @param strings Strings column to search for target. - * @param target String to search for in each string in the strings column. - * @param start First character position to start the search. - * @param stop Last character position (exclusive) to end the search. - * @param pfn Functor used for locating `target` in each string. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New integer column with character position values. + * where length = number of characters in the input string + * if forward = true: + * return start iff (start <= length), otherwise return -1 + * if forward = false: + * return stop iff (0 <= stop <= length), otherwise return length */ -template -std::unique_ptr find_fn(strings_column_view const& strings, +template +struct empty_target_fn { + column_device_view const d_strings; + size_type const start; + size_type const stop; + + __device__ size_type operator()(size_type idx) const + { + if (d_strings.is_null(idx)) { return -1; } + auto d_str = d_strings.element(idx); + + // common case shortcut + if (forward && start == 0) { return 0; } + + auto const length = d_str.length(); + if (start > length) { return -1; } + if constexpr (forward) { return start; } + + return (stop < 0) || (stop > length) ? length : stop; + } +}; + +/** + * @brief String per warp function for find/rfind + */ +template +__global__ void finder_warp_parallel_fn(column_device_view const d_strings, + string_view const d_target, + size_type const start, + size_type const stop, + size_type* d_results) +{ + size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + + if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; } + + auto const str_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + + if (d_strings.is_null(str_idx)) { return; } + + // initialize the output for the atomicMin/Max + if (lane_idx == 0) { d_results[str_idx] = forward ? std::numeric_limits::max() : -1; } + __syncwarp(); + + auto const d_str = d_strings.element(str_idx); + + auto const [begin, left_over] = bytes_to_character_position(d_str, start); + auto const start_char_pos = start - left_over; // keep track of character position + + auto const end = [d_str, start, stop, begin = begin] { + if (stop < 0) { return d_str.size_bytes(); } + if (stop <= start) { return begin; } + // we count from `begin` instead of recounting from the beginning of the string + return begin + std::get<0>(bytes_to_character_position( + string_view(d_str.data() + begin, d_str.size_bytes() - begin), stop - start)); + }(); + + // each thread compares the target with the thread's individual starting byte + size_type position = forward ? std::numeric_limits::max() : -1; + for (auto itr = begin + lane_idx; itr + d_target.size_bytes() <= end; + itr += cudf::detail::warp_size) { + if (d_target.compare(d_str.data() + itr, d_target.size_bytes()) == 0) { + position = itr; + if (forward) break; + } + } + + // find stores the minimum position while rfind stores the maximum position + // note that this was slightly faster than using cub::WarpReduce + forward ? atomicMin(d_results + str_idx, position) : atomicMax(d_results + str_idx, position); + __syncwarp(); + + if (lane_idx == 0) { + // the final result needs to be fixed up convert max() to -1 + // and a byte position to a character position + auto const result = d_results[str_idx]; + d_results[str_idx] = + ((result < std::numeric_limits::max()) && (result >= begin)) + ? start_char_pos + characters_in_string(d_str.data() + begin, result - begin) + : -1; + } +} + +template +std::unique_ptr find_fn(strings_column_view const& input, string_scalar const& target, size_type start, size_type stop, - FindFunction& pfn, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(target.is_valid(stream), "Parameter target must be valid."); CUDF_EXPECTS(start >= 0, "Parameter start must be positive integer or zero."); if ((stop > 0) && (start > stop)) CUDF_FAIL("Parameter start must be less than stop."); - // - auto d_target = string_view(target.data(), target.size()); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; - auto strings_count = strings.size(); + + auto d_target = string_view(target.data(), target.size()); + auto d_strings = column_device_view::create(input.parent(), stream); + // create output column - auto results = make_numeric_column(data_type{type_id::INT32}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + auto results = make_numeric_column(data_type{type_to_id()}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); - auto results_view = results->mutable_view(); - auto d_results = results_view.data(); - // set the position values by evaluating the passed function - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - [d_strings, pfn, d_target, start, stop] __device__(size_type idx) { - int32_t position = -1; - if (!d_strings.is_null(idx)) - position = static_cast( - pfn(d_strings.element(idx), d_target, start, stop)); - return position; - }); - results->set_null_count(strings.null_count()); + // if input is empty or all-null then we are done + if (input.size() == input.null_count()) { return results; } + + auto d_results = results->mutable_view().data(); + + if (d_target.empty()) { + // special logic for empty target results + thrust::transform(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + d_results, + empty_target_fn{*d_strings, start, stop}); + } else if ((input.chars_size() / (input.size() - input.null_count())) > + AVG_CHAR_BYTES_THRESHOLD) { + // warp-per-string runs faster for longer strings (but not shorter ones) + constexpr int block_size = 256; + cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + finder_warp_parallel_fn + <<>>( + *d_strings, d_target, start, stop, d_results); + } else { + // string-per-thread function + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + d_results, + finder_fn{*d_strings, d_target, start, stop}); + } + + results->set_null_count(input.null_count()); return results; } - } // namespace -std::unique_ptr find(strings_column_view const& strings, +std::unique_ptr find(strings_column_view const& input, string_scalar const& target, size_type start, size_type stop, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto pfn = [] __device__( - string_view d_string, string_view d_target, size_type start, size_type stop) { - size_type length = d_string.length(); - if (d_target.empty()) return start > length ? -1 : start; - size_type begin = (start > length) ? length : start; - size_type end = (stop < 0) || (stop > length) ? length : stop; - return d_string.find(d_target, begin, end - begin); - }; - - return find_fn(strings, target, start, stop, pfn, stream, mr); + return find_fn(input, target, start, stop, stream, mr); } -std::unique_ptr rfind(strings_column_view const& strings, +std::unique_ptr rfind(strings_column_view const& input, string_scalar const& target, size_type start, size_type stop, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto pfn = [] __device__( - string_view d_string, string_view d_target, size_type start, size_type stop) { - size_type length = d_string.length(); - size_type begin = (start > length) ? length : start; - size_type end = (stop < 0) || (stop > length) ? length : stop; - if (d_target.empty()) return start > length ? -1 : end; - return d_string.rfind(d_target, begin, end - begin); - }; - - return find_fn(strings, target, start, stop, pfn, stream, mr); + return find_fn(input, target, start, stop, stream, mr); } } // namespace detail @@ -167,17 +271,6 @@ std::unique_ptr rfind(strings_column_view const& strings, namespace detail { namespace { -/** - * @brief Threshold to decide on using string or warp parallel functions. - * - * If the average byte length of a string in a column exceeds this value then - * the warp-parallel `contains_warp_fn` function is used. - * Otherwise, the string-parallel function in `contains_fn` is used. - * - * This is only used for the scalar version of `contains()` right now. - */ -constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; - /** * @brief Check if `d_target` appears in a row in `d_strings`. * @@ -370,7 +463,8 @@ std::unique_ptr contains(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { // use warp parallel when the average string width is greater than the threshold - if (!input.is_empty() && ((input.chars_size() / input.size()) > AVG_CHAR_BYTES_THRESHOLD)) { + if ((input.null_count() < input.size()) && + ((input.chars_size() / input.size()) > AVG_CHAR_BYTES_THRESHOLD)) { return contains_warp_parallel(input, target, stream, mr); } diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index bd336540e0c..824fc7fe349 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -38,33 +38,54 @@ TEST_F(StringsFindTest, Find) auto strings_view = cudf::strings_column_view(strings); { - cudf::test::fixed_width_column_wrapper expected({1, 4, -1, -1, 1, -1}, - {1, 1, 0, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected({1, 4, -1, -1, 1, -1}, + {1, 1, 0, 1, 1, 1}); auto results = cudf::strings::find(strings_view, cudf::string_scalar("é")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); results = cudf::strings::rfind(strings_view, cudf::string_scalar("é")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - cudf::test::fixed_width_column_wrapper expected({3, -1, -1, 0, -1, -1}, - {1, 1, 0, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected({3, -1, -1, 0, -1, -1}, + {1, 1, 0, 1, 1, 1}); auto results = cudf::strings::rfind(strings_view, cudf::string_scalar("l")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - cudf::test::fixed_width_column_wrapper expected({0, 0, 0, 0, 0, 0}, - {1, 1, 0, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected({0, 0, 0, 0, 0, 0}, + {1, 1, 0, 1, 1, 1}); auto results = cudf::strings::find(strings_view, cudf::string_scalar("")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - cudf::test::fixed_width_column_wrapper expected({5, 5, 0, 5, 12, 0}, - {1, 1, 0, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected({5, 5, 0, 5, 12, 0}, + {1, 1, 0, 1, 1, 1}); auto results = cudf::strings::rfind(strings_view, cudf::string_scalar("")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } +TEST_F(StringsFindTest, FindLongStrings) +{ + cudf::test::strings_column_wrapper input( + {"Héllo, there world and goodbye", + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving", + "the following code snippet demonstrates how to use search for values in an ordered range", + "it returns the last position where value could be inserted without violating the ordering", + "algorithms execution is parallelized as determined by an execution policy. t", + "he this is a continuation of previous row to make sure string boundaries are honored", + ""}); + auto view = cudf::strings_column_view(input); + auto results = cudf::strings::find(view, cudf::string_scalar("the")); + auto expected = + cudf::test::fixed_width_column_wrapper({7, 28, 0, 11, -1, -1, -1}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + results = cudf::strings::rfind(view, cudf::string_scalar("the")); + expected = cudf::test::fixed_width_column_wrapper({7, 48, 0, 77, -1, -1, -1}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); +} + TEST_F(StringsFindTest, Contains) { cudf::test::strings_column_wrapper strings({"Héllo", "thesé", "", "lease", "tést strings", ""}, @@ -229,9 +250,9 @@ TEST_F(StringsFindTest, AllEmpty) std::vector h_strings{"", "", "", "", ""}; cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); - std::vector h_expected32(h_strings.size(), -1); - cudf::test::fixed_width_column_wrapper expected32(h_expected32.begin(), - h_expected32.end()); + std::vector h_expected32(h_strings.size(), -1); + cudf::test::fixed_width_column_wrapper expected32(h_expected32.begin(), + h_expected32.end()); std::vector h_expected8(h_strings.size(), 0); cudf::test::fixed_width_column_wrapper expected8(h_expected8.begin(), h_expected8.end()); @@ -264,8 +285,8 @@ TEST_F(StringsFindTest, AllNull) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - std::vector h_expected32(h_strings.size(), -1); - cudf::test::fixed_width_column_wrapper expected32( + std::vector h_expected32(h_strings.size(), -1); + cudf::test::fixed_width_column_wrapper expected32( h_expected32.begin(), h_expected32.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); @@ -324,29 +345,31 @@ TEST_P(FindParmsTest, Find) auto strings_view = cudf::strings_column_view(strings); { auto results = cudf::strings::find(strings_view, cudf::string_scalar("e"), position); - std::vector h_expected; + std::vector h_expected; for (auto itr = h_strings.begin(); itr != h_strings.end(); ++itr) - h_expected.push_back((int32_t)(*itr).find("e", position)); - cudf::test::fixed_width_column_wrapper expected(h_expected.begin(), h_expected.end()); + h_expected.push_back(static_cast((*itr).find("e", position))); + cudf::test::fixed_width_column_wrapper expected(h_expected.begin(), + h_expected.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { auto results = cudf::strings::rfind(strings_view, cudf::string_scalar("e"), 0, position + 1); - std::vector h_expected; + std::vector h_expected; for (auto itr = h_strings.begin(); itr != h_strings.end(); ++itr) - h_expected.push_back((int32_t)(*itr).rfind("e", position)); - cudf::test::fixed_width_column_wrapper expected(h_expected.begin(), h_expected.end()); + h_expected.push_back(static_cast((*itr).rfind("e", position))); + cudf::test::fixed_width_column_wrapper expected(h_expected.begin(), + h_expected.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto begin = static_cast(position); + auto begin = static_cast(position); auto results = cudf::strings::find(strings_view, cudf::string_scalar(""), begin); - cudf::test::fixed_width_column_wrapper expected( + cudf::test::fixed_width_column_wrapper expected( {begin, (begin > 0 ? -1 : 0), begin, begin, begin}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto end = static_cast(position + 1); + auto end = static_cast(position + 1); results = cudf::strings::rfind(strings_view, cudf::string_scalar(""), 0, end); - cudf::test::fixed_width_column_wrapper rexpected({end, 0, end, end, end}); + cudf::test::fixed_width_column_wrapper rexpected({end, 0, end, end, end}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, rexpected); } }