From 7da233b279bf84a501e9c2e3041cbc6fb335e610 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 21 Feb 2023 08:12:54 -0500
Subject: [PATCH] Rework logic in cudf::strings::split_record to improve
 performance (#12729)

Updates the `cudf::strings::split_record` logic to match the more optimized
code in `cudf::strings::split`. The optimized code performs much better for
longer strings (>64 bytes) by parallelizing over the character bytes to find
delimiters before determining split tokens. This led to refactoring the code
so that both APIs can share the optimized code.

Also fixes a bug found when using overlapped delimiters. Additional tests
were added for multi-byte delimiters which can overlap and span multiple
adjacent strings.

Closes #12694

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Yunsong Wang (https://github.com/PointKernel)
  - https://github.com/nvdbaranec

URL: https://github.com/rapidsai/cudf/pull/12729
---
 cpp/benchmarks/string/split.cpp       |  14 +-
 cpp/src/strings/split/split.cu        | 448 ++------------------------
 cpp/src/strings/split/split.cuh       | 403 +++++++++++++++++++++++
 cpp/src/strings/split/split_record.cu | 168 +++-------
 cpp/tests/strings/split_tests.cpp     |  78 ++++-
 5 files changed, 565 insertions(+), 546 deletions(-)
 create mode 100644 cpp/src/strings/split/split.cuh

diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp
index 0f005c462cc..1b3f4190680 100644
--- a/cpp/benchmarks/string/split.cpp
+++ b/cpp/benchmarks/string/split.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -57,12 +57,12 @@ static void BM_split(benchmark::State& state, split_type rt)
 
 static void generate_bench_args(benchmark::internal::Benchmark* b)
 {
-  int const min_rows   = 1 << 12;
-  int const max_rows   = 1 << 24;
-  int const row_mult   = 8;
-  int const min_rowlen = 1 << 5;
-  int const max_rowlen = 1 << 13;
-  int const len_mult   = 4;
+  int constexpr min_rows   = 1 << 12;
+  int constexpr max_rows   = 1 << 24;
+  int constexpr row_mult   = 8;
+  int constexpr min_rowlen = 1 << 5;
+  int constexpr max_rowlen = 1 << 13;
+  int constexpr len_mult   = 2;
   for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) {
     for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) {
       // avoid generating combinations that exceed the cudf column limit
diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu
index c11d7ad47f9..18599fb568a 100644
--- a/cpp/src/strings/split/split.cu
+++ b/cpp/src/strings/split/split.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "split.cuh"
+
 #include
 #include
 #include
@@ -31,14 +33,10 @@
 #include
 #include
 #include
-#include
-#include
-#include
 #include
 #include
 #include
 #include
-#include
 #include
 #include
 
@@ -46,321 +44,8 @@ namespace cudf {
 namespace strings {
 namespace detail {
-using string_index_pair = thrust::pair<const char*, size_type>;
-
 namespace {
 
-/**
- * @brief Base class for delimiter-based tokenizers.
- * - * These are common methods used by both split and rsplit tokenizer functors. - */ -struct base_split_tokenizer { - __device__ const char* get_base_ptr() const - { - return d_strings.child(strings_column_view::chars_column_index).data(); - } - - __device__ string_view const get_string(size_type idx) const - { - return d_strings.element(idx); - } - - __device__ bool is_valid(size_type idx) const { return d_strings.is_valid(idx); } - - /** - * @brief Initialize token elements for all strings. - * - * The process_tokens() only handles creating tokens for strings that contain - * delimiters. This function will initialize the output tokens for all - * strings by assigning null entries for null and empty strings and the - * string itself for strings with no delimiters. - * - * The tokens are placed in output order so that all tokens for each output - * column are stored consecutively in `d_all_tokens`. - * - * @param idx Index of string in column - * @param column_count Number of columns in output - * @param d_all_tokens Tokens vector for all strings - */ - __device__ void init_tokens(size_type idx, - size_type column_count, - string_index_pair* d_all_tokens) const - { - auto d_tokens = d_all_tokens + idx; - if (is_valid(idx)) { - auto d_str = get_string(idx); - *d_tokens = string_index_pair{d_str.data(), d_str.size_bytes()}; - --column_count; - d_tokens += d_strings.size(); - } - // this is like fill() but output needs to be strided - for (size_type col = 0; col < column_count; ++col) - d_tokens[d_strings.size() * col] = string_index_pair{nullptr, 0}; - } - - base_split_tokenizer(column_device_view const& d_strings, - string_view const& d_delimiter, - size_type max_tokens) - : d_strings(d_strings), d_delimiter(d_delimiter), max_tokens(max_tokens) - { - } - - protected: - column_device_view const d_strings; // strings to split - string_view const d_delimiter; // delimiter for split - size_type max_tokens; -}; - -/** - * @brief The tokenizer functions for split(). - * - * The methods here count delimiters, tokens, and output token elements - * for each string in a strings column. - */ -struct split_tokenizer_fn : base_split_tokenizer { - /** - * @brief This will create tokens around each delimiter honoring the string boundaries - * in which the delimiter resides. - * - * Each token is placed in `d_all_tokens` so they align consecutively - * with other tokens for the same output column. - * That is, `d_tokens[col * strings_count + string_index]` is the token at column `col` - * for string at `string_index`. 
- * - * @param idx Index of the delimiter in the chars column - * @param d_token_counts Token counts for each string - * @param d_positions The beginning byte position of each delimiter - * @param positions_count Number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_all_tokens All output tokens for the strings column - */ - __device__ void process_tokens(size_type idx, - size_type const* d_token_counts, - size_type const* d_positions, - size_type positions_count, - size_type const* d_indexes, - string_index_pair* d_all_tokens) const - { - size_type str_idx = d_indexes[idx]; - if ((idx > 0) && d_indexes[idx - 1] == str_idx) - return; // the first delimiter for the string rules them all - --str_idx; // all of these are off by 1 from the upper_bound call - size_type token_count = d_token_counts[str_idx]; // max_tokens already included - const char* const base_ptr = get_base_ptr(); // d_positions values are based on this ptr - // this string's tokens output - auto d_tokens = d_all_tokens + str_idx; - // this string - const string_view d_str = get_string(str_idx); - const char* str_ptr = d_str.data(); // beginning of the string - const char* const str_end_ptr = str_ptr + d_str.size_bytes(); // end of the string - // build the index-pair of each token for this string - for (size_type col = 0; col < token_count; ++col) { - auto next_delim = ((idx + col) < positions_count) // boundary check for delims in last string - ? (base_ptr + d_positions[idx + col]) // start of next delimiter - : str_end_ptr; // or end of this string - auto eptr = (next_delim < str_end_ptr) // make sure delimiter is inside this string - && (col + 1 < token_count) // and this is not the last token - ? next_delim - : str_end_ptr; - // store the token into the output vector - d_tokens[col * d_strings.size()] = - string_index_pair{str_ptr, static_cast(eptr - str_ptr)}; - // point past this delimiter - str_ptr = eptr + d_delimiter.size_bytes(); - } - } - - /** - * @brief Returns `true` if the byte at `idx` is the start of the delimiter. - * - * @param idx Index of a byte in the chars column. - * @param d_offsets Offsets values to locate the chars ranges. - * @param chars_bytes Total number of characters to process. - * @return true if delimiter is found starting at position `idx` - */ - __device__ bool is_delimiter(size_type idx, // chars index - int32_t const* d_offsets, - size_type chars_bytes) const - { - auto d_chars = get_base_ptr() + d_offsets[0]; - if (idx + d_delimiter.size_bytes() > chars_bytes) return false; - return d_delimiter.compare(d_chars + idx, d_delimiter.size_bytes()) == 0; - } - - /** - * @brief This counts the tokens for strings that contain delimiters. 
- * - * @param idx Index of a delimiter - * @param d_positions Start positions of all the delimiters - * @param positions_count The number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_counts The token counts for all the strings - */ - __device__ void count_tokens(size_type idx, // delimiter index - size_type const* d_positions, - size_type positions_count, - size_type const* d_indexes, - size_type* d_counts) const - { - size_type str_idx = d_indexes[idx]; - if ((idx > 0) && d_indexes[idx - 1] == str_idx) - return; // first delimiter found handles all of them for this string - auto const delim_length = d_delimiter.size_bytes(); - string_view const d_str = get_string(str_idx - 1); - const char* const base_ptr = get_base_ptr(); - size_type delim_count = 0; // re-count delimiters to compute the token-count - size_type last_pos = d_positions[idx] - delim_length; - while ((idx < positions_count) && (d_indexes[idx] == str_idx)) { - // make sure the whole delimiter is inside the string before counting it - auto d_pos = d_positions[idx]; - if (((base_ptr + d_pos + delim_length - 1) < (d_str.data() + d_str.size_bytes())) && - ((d_pos - last_pos) >= delim_length)) { - ++delim_count; // only count if the delimiter fits - last_pos = d_pos; // overlapping delimiters are ignored too - } - ++idx; - } - // the number of tokens is delim_count+1 but capped to max_tokens - d_counts[str_idx - 1] = - ((max_tokens > 0) && (delim_count + 1 > max_tokens)) ? max_tokens : delim_count + 1; - } - - split_tokenizer_fn(column_device_view const& d_strings, - string_view const& d_delimiter, - size_type max_tokens) - : base_split_tokenizer(d_strings, d_delimiter, max_tokens) - { - } -}; - -/** - * @brief The tokenizer functions for split(). - * - * The methods here count delimiters, tokens, and output token elements - * for each string in a strings column. - * - * Same as split_tokenizer_fn except tokens are counted from the end of each string. - */ -struct rsplit_tokenizer_fn : base_split_tokenizer { - /** - * @brief This will create tokens around each delimiter honoring the string boundaries - * in which the delimiter resides. - * - * The tokens are processed from the end of each string so the `max_tokens` - * is honored correctly. - * - * Each token is placed in `d_all_tokens` so they align consecutively - * with other tokens for the same output column. - * That is, `d_tokens[col * strings_count + string_index]` is the token at column `col` - * for string at `string_index`. 
- * - * @param idx Index of the delimiter in the chars column - * @param d_token_counts Token counts for each string - * @param d_positions The ending byte position of each delimiter - * @param positions_count Number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_all_tokens All output tokens for the strings column - */ - __device__ void process_tokens(size_type idx, // delimiter position index - size_type const* d_token_counts, // token counts for each string - size_type const* d_positions, // end of each delimiter - size_type positions_count, // total number of delimiters - size_type const* d_indexes, // string indices for each delimiter - string_index_pair* d_all_tokens) const - { - size_type str_idx = d_indexes[idx]; - if ((idx + 1 < positions_count) && d_indexes[idx + 1] == str_idx) - return; // the last delimiter for the string rules them all - --str_idx; // all of these are off by 1 from the upper_bound call - size_type token_count = d_token_counts[str_idx]; // max_tokens already included - const char* const base_ptr = get_base_ptr(); // d_positions values are based on this ptr - // this string's tokens output - auto d_tokens = d_all_tokens + str_idx; - // this string - const string_view d_str = get_string(str_idx); - const char* const str_begin_ptr = d_str.data(); // beginning of the string - const char* str_ptr = str_begin_ptr + d_str.size_bytes(); // end of the string - // build the index-pair of each token for this string - for (size_type col = 0; col < token_count; ++col) { - auto prev_delim = (idx >= col) // boundary check for delims in first string - ? (base_ptr + d_positions[idx - col] + 1) // end of prev delimiter - : str_begin_ptr; // or the start of this string - auto sptr = (prev_delim > str_begin_ptr) // make sure delimiter is inside the string - && (col + 1 < token_count) // and this is not the last token - ? prev_delim - : str_begin_ptr; - // store the token into the output -- building the array backwards - d_tokens[d_strings.size() * (token_count - 1 - col)] = - string_index_pair{sptr, static_cast(str_ptr - sptr)}; - str_ptr = sptr - d_delimiter.size_bytes(); // get ready for the next prev token - } - } - - /** - * @brief Returns `true` if the byte at `idx` is the end of the delimiter. - * - * @param idx Index of a byte in the chars column. - * @param d_offsets Offsets values to locate the chars ranges. - * @return true if delimiter is found ending at position `idx` - */ - __device__ bool is_delimiter(size_type idx, int32_t const* d_offsets, size_type) const - { - auto delim_length = d_delimiter.size_bytes(); - if (idx < delim_length - 1) return false; - auto d_chars = get_base_ptr() + d_offsets[0]; - return d_delimiter.compare(d_chars + idx - (delim_length - 1), delim_length) == 0; - } - - /** - * @brief This counts the tokens for strings that contain delimiters. - * - * Token counting starts at the end of the string to honor the `max_tokens` - * appropriately. 
- * - * @param idx Index of a delimiter - * @param d_positions End positions of all the delimiters - * @param positions_count The number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_counts The token counts for all the strings - */ - __device__ void count_tokens(size_type idx, - size_type const* d_positions, - size_type positions_count, - size_type const* d_indexes, - size_type* d_counts) const - { - size_type str_idx = d_indexes[idx]; // 1-based string index created by upper_bound() - if ((idx > 0) && d_indexes[idx - 1] == str_idx) - return; // first delimiter found handles all of them for this string - auto const delim_length = d_delimiter.size_bytes(); - const string_view d_str = get_string(str_idx - 1); // -1 for 0-based index - const char* const base_ptr = get_base_ptr(); - size_type delim_count = 0; - size_type last_pos = d_positions[idx] - delim_length; - while ((idx < positions_count) && (d_indexes[idx] == str_idx)) { - // make sure the whole delimiter is inside the string before counting it - auto d_pos = d_positions[idx]; - if (((base_ptr + d_pos + 1 - delim_length) >= d_str.data()) && - ((d_pos - last_pos) >= delim_length)) { - ++delim_count; // only count if the delimiter fits - last_pos = d_pos; // overlapping delimiters are also ignored - } - ++idx; - } - // the number of tokens is delim_count+1 but capped to max_tokens - d_counts[str_idx - 1] = - ((max_tokens > 0) && (delim_count + 1 > max_tokens)) ? max_tokens : delim_count + 1; - } - - rsplit_tokenizer_fn(column_device_view const& d_strings, - string_view const& d_delimiter, - size_type max_tokens) - : base_split_tokenizer(d_strings, d_delimiter, max_tokens) - { - } -}; - /** * @brief Generic split function called by split() and rsplit(). * @@ -423,125 +108,42 @@ struct rsplit_tokenizer_fn : base_split_tokenizer { * @return table of columns for the output of the split */ template -std::unique_ptr split_fn(strings_column_view const& strings_column, +std::unique_ptr
split_fn(strings_column_view const& input,
                                Tokenizer tokenizer,
                                rmm::cuda_stream_view stream,
                                rmm::mr::device_memory_resource* mr)
 {
   std::vector<std::unique_ptr<column>> results;
-  auto const strings_count = strings_column.size();
-  if (strings_count == 0) {
-    results.push_back(make_empty_column(type_id::STRING));
+  if (input.size() == input.null_count()) {
+    results.push_back(std::make_unique<column>(input.parent(), stream, mr));
     return std::make_unique<table>
(std::move(results)); } - auto d_offsets = strings_column.offsets_begin(); - auto const chars_bytes = - cudf::detail::get_value( - strings_column.offsets(), strings_column.offset() + strings_count, stream) - - cudf::detail::get_value(strings_column.offsets(), strings_column.offset(), stream); + // builds the offsets and the vector of all tokens + auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr); + auto const d_offsets = offsets->view().template data(); + auto const d_tokens = tokens.data(); - // count the number of delimiters in the entire column - auto const delimiter_count = - thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { - return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); - }); - - // create vector of every delimiter position in the chars column - rmm::device_uvector delimiter_positions(delimiter_count, stream); - auto d_positions = delimiter_positions.data(); - auto copy_end = thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - delimiter_positions.begin(), - [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { - return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); - }); - - // create vector of string indices for each delimiter - rmm::device_uvector string_indices(delimiter_count, stream); // these will - auto d_string_indices = string_indices.data(); // be strings that only contain delimiters - thrust::upper_bound(rmm::exec_policy(stream), - d_offsets, - d_offsets + strings_count, - delimiter_positions.begin(), - copy_end, - string_indices.begin()); - - // compute the number of tokens per string - rmm::device_uvector token_counts(strings_count, stream); - auto d_token_counts = token_counts.data(); - // first, initialize token counts for strings without delimiters in them - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_token_counts, - [tokenizer] __device__(size_type idx) { - // null are 0, all others 1 - return static_cast(tokenizer.is_valid(idx)); - }); - - // now compute the number of tokens in each string - thrust::for_each_n( + // compute the maximum number of tokens for any string + auto const columns_count = thrust::transform_reduce( rmm::exec_policy(stream), thrust::make_counting_iterator(0), - delimiter_count, - [tokenizer, d_positions, delimiter_count, d_string_indices, d_token_counts] __device__( - size_type idx) { - tokenizer.count_tokens(idx, d_positions, delimiter_count, d_string_indices, d_token_counts); - }); - - // the columns_count is the maximum number of tokens for any string - auto const columns_count = thrust::reduce( - rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{}); - // boundary case: if no columns, return one null column (custrings issue #119) - if (columns_count == 0) { - results.push_back(std::make_unique( - data_type{type_id::STRING}, - strings_count, - rmm::device_buffer{0, stream, mr}, // no data - cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), - strings_count)); - } + thrust::make_counting_iterator(input.size()), + [d_offsets] __device__(auto idx) -> size_type { return d_offsets[idx + 1] - d_offsets[idx]; }, + 0, + thrust::maximum{}); - // create working area to hold all token positions - rmm::device_uvector tokens(columns_count 
* strings_count, stream);
-  string_index_pair* d_tokens = tokens.data();
-  // initialize the token positions
-  // -- accounts for nulls, empty, and strings with no delimiter in them
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<size_type>(0),
-                     strings_count,
-                     [tokenizer, columns_count, d_tokens] __device__(size_type idx) {
-                       tokenizer.init_tokens(idx, columns_count, d_tokens);
-                     });
-
-  // get the positions for every token using the delimiter positions
-  thrust::for_each_n(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<size_type>(0),
-    delimiter_count,
-    [tokenizer,
-     d_token_counts,
-     d_positions,
-     delimiter_count,
-     d_string_indices,
-     d_tokens] __device__(size_type idx) {
-      tokenizer.process_tokens(
-        idx, d_token_counts, d_positions, delimiter_count, d_string_indices, d_tokens);
-    });
-
-  // Create each column.
-  // - Each pair points to the strings for that column for each row.
-  // - Create the strings column from the vector using the strings factory.
+  // build strings columns for each token position
   for (size_type col = 0; col < columns_count; ++col) {
-    auto column_tokens = d_tokens + (col * strings_count);
-    results.emplace_back(
-      make_strings_column(column_tokens, column_tokens + strings_count, stream, mr));
+    auto itr = cudf::detail::make_counting_transform_iterator(
+      0, [d_tokens, d_offsets, col] __device__(size_type idx) {
+        auto const offset      = d_offsets[idx];
+        auto const token_count = d_offsets[idx + 1] - offset;
+        return (col < token_count) ? d_tokens[offset + col] : string_index_pair{nullptr, 0};
+      });
+    results.emplace_back(make_strings_column(itr, itr + input.size(), stream, mr));
   }
+
   return std::make_unique<table>
(std::move(results)); } diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh new file mode 100644 index 00000000000..41213dac58b --- /dev/null +++ b/cpp/src/strings/split/split.cuh @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace cudf::strings::detail { + +/** + * @brief Base class for delimiter-based tokenizers + * + * These are common methods used by both split and rsplit tokenizer functors. + * + * The Derived class is required to implement the `process_tokens` function. + */ +template +struct base_split_tokenizer { + __device__ char const* get_base_ptr() const + { + return d_strings.child(strings_column_view::chars_column_index).data(); + } + + __device__ string_view const get_string(size_type idx) const + { + return d_strings.element(idx); + } + + __device__ bool is_valid(size_type idx) const { return d_strings.is_valid(idx); } + + /** + * @brief Returns `true` if the byte at `idx` is the start of the delimiter + * + * @param idx Index of a byte in the chars column + * @param d_offsets Offsets values to locate the chars ranges + * @param chars_bytes Total number of characters to process + * @return true if delimiter is found starting at position `idx` + */ + __device__ bool is_delimiter(size_type idx, + size_type const* d_offsets, + size_type chars_bytes) const + { + auto const d_chars = get_base_ptr() + d_offsets[0]; + if (idx + d_delimiter.size_bytes() > chars_bytes) { return false; } + return d_delimiter.compare(d_chars + idx, d_delimiter.size_bytes()) == 0; + } + + /** + * @brief This counts the tokens for strings that contain delimiters + * + * Counting tokens is the same regardless if counting from the left + * or from the right. This logic counts from the left which is simpler. + * The count will be truncated appropriately to the max_tokens value. 
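+ *
+ * For example, with delimiter "::" the string ":::c:::" has "::" matches at
+ * byte offsets {0,1,4,5}; the overlap rule (d_pos - last_pos >= delim_size)
+ * keeps only offsets {0,4}, so the count is 3 tokens: "", ":c", and ":"
+ * (cf. the MultiByteDelimiters test in this PR).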
+ * + * @param idx Index of input string + * @param d_positions Start positions of all the delimiters + * @param d_delimiter_offsets Offsets per string to delimiters in d_positions + */ + __device__ size_type count_tokens(size_type idx, + size_type const* d_positions, + size_type const* d_delimiter_offsets) const + { + if (!is_valid(idx)) { return 0; } + + auto const delim_size = d_delimiter.size_bytes(); + auto const d_str = get_string(idx); + auto const d_str_end = d_str.data() + d_str.size_bytes(); + auto const base_ptr = get_base_ptr() + delim_size - 1; + auto const delimiters = + cudf::device_span(d_positions + d_delimiter_offsets[idx], + d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]); + + size_type token_count = 1; // all strings will have at least one token + size_type last_pos = delimiters[0] - delim_size; + for (auto d_pos : delimiters) { + // delimiter must fit in string && overlapping delimiters are ignored + if (((base_ptr + d_pos) < d_str_end) && ((d_pos - last_pos) >= delim_size)) { + ++token_count; + last_pos = d_pos; + } + } + // number of tokens is capped to max_tokens + return ((max_tokens > 0) && (token_count > max_tokens)) ? max_tokens : token_count; + } + + /** + * @brief This will create tokens around each delimiter honoring the string boundaries + * in which the delimiter resides + * + * Each token is placed in `d_all_tokens` so they align consecutively + * with other tokens for the same output column. + * + * The actual token extraction is performed in the subclass process_tokens() function. + * + * @param idx Index of the string to tokenize + * @param d_tokens_offsets Token offsets for each string + * @param d_positions The beginning byte position of each delimiter + * @param d_delimiter_offsets Offsets to d_positions to each delimiter set per string + * @param d_all_tokens All output tokens for the strings column + */ + __device__ void get_tokens(size_type idx, + size_type const* d_tokens_offsets, + size_type const* d_positions, + size_type const* d_delimiter_offsets, + string_index_pair* d_all_tokens) const + { + auto const d_tokens = // this string's tokens output + cudf::device_span(d_all_tokens + d_tokens_offsets[idx], + d_tokens_offsets[idx + 1] - d_tokens_offsets[idx]); + + if (!is_valid(idx)) { return; } + + auto const d_str = get_string(idx); + + // max_tokens already included in token counts + if (d_tokens.size() == 1) { + d_tokens[0] = string_index_pair{d_str.data(), d_str.size_bytes()}; + return; + } + + auto const delimiters = + cudf::device_span(d_positions + d_delimiter_offsets[idx], + d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]); + + auto& derived = static_cast(*this); + derived.process_tokens(d_str, delimiters, d_tokens); + } + + base_split_tokenizer(column_device_view const& d_strings, + string_view const& d_delimiter, + size_type max_tokens) + : d_strings(d_strings), d_delimiter(d_delimiter), max_tokens(max_tokens) + { + } + + protected: + column_device_view const d_strings; // strings to split + string_view const d_delimiter; // delimiter for split + size_type max_tokens; // maximum number of tokens to identify +}; + +/** + * @brief The tokenizer functions for forward splitting + */ +struct split_tokenizer_fn : base_split_tokenizer { + /** + * @brief This will create tokens around each delimiter honoring the string boundaries + * + * The tokens are processed from the beginning of each string ignoring overlapping + * delimiters and honoring the `max_tokens` value. 
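+ *
+ * For example, splitting "w:::x" on "::" finds matches at byte offsets
+ * {1,2}; the match at offset 2 overlaps the one already consumed at offset 1
+ * and is skipped, so the resulting tokens are "w" and ":x"
+ * (cf. the MultiByteDelimiters test in this PR).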
+ * + * @param d_str String to tokenize + * @param d_delimiters Positions of delimiters for this string + * @param d_tokens Output vector to store tokens for this string + */ + __device__ void process_tokens(string_view const d_str, + device_span d_delimiters, + device_span d_tokens) const + { + auto const base_ptr = get_base_ptr(); // d_positions values based on this + auto str_ptr = d_str.data(); + auto const str_end = str_ptr + d_str.size_bytes(); // end of the string + auto const token_count = static_cast(d_tokens.size()); + auto const delim_size = d_delimiter.size_bytes(); + + // build the index-pair of each token for this string + size_type token_idx = 0; + for (auto d_pos : d_delimiters) { + auto const next_delim = base_ptr + d_pos; + if (next_delim < str_ptr || ((next_delim + delim_size) > str_end)) { continue; } + auto const end_ptr = (token_idx + 1 < token_count) ? next_delim : str_end; + + // store the token into the output vector + d_tokens[token_idx++] = + string_index_pair{str_ptr, static_cast(thrust::distance(str_ptr, end_ptr))}; + + // setup for next token + str_ptr = end_ptr + delim_size; + } + // include anything leftover + if (token_idx < token_count) { + d_tokens[token_idx] = + string_index_pair{str_ptr, static_cast(thrust::distance(str_ptr, str_end))}; + } + } + + split_tokenizer_fn(column_device_view const& d_strings, + string_view const& d_delimiter, + size_type max_tokens) + : base_split_tokenizer(d_strings, d_delimiter, max_tokens) + { + } +}; + +/** + * @brief The tokenizer functions for backwards splitting + * + * Same as split_tokenizer_fn except delimiters are searched from the end of each string. + */ +struct rsplit_tokenizer_fn : base_split_tokenizer { + /** + * @brief This will create tokens around each delimiter honoring the string boundaries + * + * The tokens are processed from the end of each string ignoring overlapping + * delimiters and honoring the `max_tokens` value. + * + * @param d_str String to tokenize + * @param d_delimiters Positions of delimiters for this string + * @param d_tokens Output vector to store tokens for this string + */ + __device__ void process_tokens(string_view const d_str, + device_span d_delimiters, + device_span d_tokens) const + { + auto const base_ptr = get_base_ptr(); // d_positions values are based on this ptr + auto const str_begin = d_str.data(); // beginning of the string + auto const token_count = static_cast(d_tokens.size()); + auto const delim_count = static_cast(d_delimiters.size()); + auto const delim_size = d_delimiter.size_bytes(); + + // build the index-pair of each token for this string + auto str_ptr = str_begin + d_str.size_bytes(); + size_type token_idx = 0; + for (auto d = delim_count - 1; d >= 0; --d) { // read right-to-left + auto const prev_delim = base_ptr + d_delimiters[d] + delim_size; + if (prev_delim > str_ptr || ((prev_delim - delim_size) < str_begin)) { continue; } + auto const start_ptr = (token_idx + 1 < token_count) ? prev_delim : str_begin; + + // store the token into the output vector right-to-left + d_tokens[token_count - token_idx - 1] = + string_index_pair{start_ptr, static_cast(thrust::distance(start_ptr, str_ptr))}; + + // setup for next token + str_ptr = start_ptr - delim_size; + ++token_idx; + } + // include anything leftover (rightover?) 
+ if (token_idx < token_count) { + d_tokens[0] = + string_index_pair{str_begin, static_cast(thrust::distance(str_begin, str_ptr))}; + } + } + + rsplit_tokenizer_fn(column_device_view const& d_strings, + string_view const& d_delimiter, + size_type max_tokens) + : base_split_tokenizer(d_strings, d_delimiter, max_tokens) + { + } +}; + +/** + * @brief Helper function used by split/rsplit and split_record/rsplit_record + * + * This function returns all the token/split positions within the input column as processed by + * the given tokenizer. It also returns the offsets for each set of tokens identified per string. + * + * @tparam Tokenizer Type of the tokenizer object + * + * @param input The input column of strings to split + * @param tokenizer Object used for counting and identifying delimiters and tokens + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned objects' device memory. + */ +template +std::pair, rmm::device_uvector> split_helper( + strings_column_view const& input, + Tokenizer tokenizer, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const strings_count = input.size(); + auto const chars_bytes = + cudf::detail::get_value(input.offsets(), input.offset() + strings_count, stream) - + cudf::detail::get_value(input.offsets(), input.offset(), stream); + + auto d_offsets = input.offsets_begin(); + + // count the number of delimiters in the entire column + auto const delimiter_count = + thrust::count_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { + return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); + }); + // Create a vector of every delimiter position in the chars column. + // These may include overlapping or otherwise out-of-bounds delimiters which + // will be resolved during token processing. 
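+  // For example, with delimiter "}:{" the adjacent rows "{a=1}:{b=2}:" and
+  // "{c=3}" produce a byte-level match that spans the row boundary; it is
+  // collected here and later rejected by the tokenizer's bounds checks.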
+ auto delimiter_positions = rmm::device_uvector(delimiter_count, stream); + auto d_positions = delimiter_positions.data(); + auto const copy_end = + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + delimiter_positions.begin(), + [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { + return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); + }); + + // create a vector of offsets to each string's delimiter set within delimiter_positions + auto const delimiter_offsets = [&] { + // first, create a vector of string indices for each delimiter + auto string_indices = rmm::device_uvector(delimiter_count, stream); + thrust::upper_bound(rmm::exec_policy(stream), + d_offsets, + d_offsets + strings_count, + delimiter_positions.begin(), + copy_end, + string_indices.begin()); + + // compute delimiter offsets per string + auto delimiter_offsets = rmm::device_uvector(strings_count + 1, stream); + auto d_delimiter_offsets = delimiter_offsets.data(); + + // memset to zero-out the delimiter counts for any null-entries or strings with no delimiters + CUDF_CUDA_TRY(cudaMemsetAsync( + d_delimiter_offsets, 0, delimiter_offsets.size() * sizeof(size_type), stream.value())); + + // next, count the number of delimiters per string + auto d_string_indices = string_indices.data(); // identifies strings with delimiters only + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + delimiter_count, + [d_string_indices, d_delimiter_offsets] __device__(size_type idx) { + auto const str_idx = d_string_indices[idx] - 1; + atomicAdd(d_delimiter_offsets + str_idx, 1); + }); + // finally, convert the delimiter counts into offsets + thrust::exclusive_scan(rmm::exec_policy(stream), + delimiter_offsets.begin(), + delimiter_offsets.end(), + delimiter_offsets.begin()); + return delimiter_offsets; + }(); + auto const d_delimiter_offsets = delimiter_offsets.data(); + + // compute the number of tokens per string + auto token_counts = rmm::device_uvector(strings_count, stream); + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + token_counts.begin(), + [tokenizer, d_positions, d_delimiter_offsets] __device__(size_type idx) -> size_type { + return tokenizer.count_tokens(idx, d_positions, d_delimiter_offsets); + }); + + // create offsets from the counts for return to the caller + auto offsets = std::get<0>( + cudf::detail::make_offsets_child_column(token_counts.begin(), token_counts.end(), stream, mr)); + auto const total_tokens = + cudf::detail::get_value(offsets->view(), strings_count, stream); + auto const d_tokens_offsets = offsets->view().data(); + + // build a vector of all the token positions for all the strings + auto tokens = rmm::device_uvector(total_tokens, stream); + auto d_tokens = tokens.data(); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + strings_count, + [tokenizer, d_tokens_offsets, d_positions, d_delimiter_offsets, d_tokens] __device__( + size_type idx) { + tokenizer.get_tokens(idx, d_tokens_offsets, d_positions, d_delimiter_offsets, d_tokens); + }); + + return std::make_pair(std::move(offsets), std::move(tokens)); +} + +} // namespace cudf::strings::detail diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index d935ad0b1da..5b79fdefb5a 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ 
-1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "split.cuh" + #include #include #include @@ -23,14 +25,12 @@ #include #include #include -#include #include #include #include #include -#include #include #include @@ -38,108 +38,43 @@ namespace cudf { namespace strings { namespace detail { -using string_index_pair = thrust::pair; - namespace { -enum class Dir { FORWARD, BACKWARD }; - -/** - * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. - * - * The number of tokens is the same regardless if counting from the beginning - * or the end of the string. - */ -struct token_counter_fn { - column_device_view const d_strings; // strings to split - string_view const d_delimiter; // delimiter for split - size_type const max_tokens = std::numeric_limits::max(); - - __device__ size_type operator()(size_type idx) const - { - if (d_strings.is_null(idx)) { return 0; } - - auto const d_str = d_strings.element(idx); - size_type token_count = 0; - size_type start_pos = 0; - while (token_count < max_tokens - 1) { - auto const delimiter_pos = d_str.find(d_delimiter, start_pos); - if (delimiter_pos == string_view::npos) break; - token_count++; - start_pos = delimiter_pos + d_delimiter.length(); - } - return token_count + 1; // always at least one token - } -}; - -/** - * @brief Identify the tokens from the `idx'th` string element of `d_strings`. - */ -template -struct token_reader_fn { - column_device_view const d_strings; // strings to split - string_view const d_delimiter; // delimiter for split - int32_t* d_token_offsets{}; // for locating tokens in d_tokens - string_index_pair* d_tokens{}; - - __device__ string_index_pair resolve_token(string_view const& d_str, - size_type start_pos, - size_type end_pos, - size_type delimiter_pos) const - { - if (dir == Dir::FORWARD) { - auto const byte_offset = d_str.byte_offset(start_pos); - return string_index_pair{d_str.data() + byte_offset, - d_str.byte_offset(delimiter_pos) - byte_offset}; - } else { - auto const byte_offset = d_str.byte_offset(delimiter_pos + d_delimiter.length()); - return string_index_pair{d_str.data() + byte_offset, - d_str.byte_offset(end_pos) - byte_offset}; - } +template +std::unique_ptr split_record_fn(strings_column_view const& input, + Tokenizer tokenizer, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.is_empty()) { return make_empty_column(type_id::LIST); } + if (input.size() == input.null_count()) { + auto offsets = std::make_unique(input.offsets(), stream, mr); + auto results = make_empty_column(type_id::STRING); + return make_lists_column(input.size(), + std::move(offsets), + std::move(results), + input.null_count(), + copy_bitmask(input.parent(), stream, mr), + stream, + mr); } - __device__ void operator()(size_type idx) - { - if (d_strings.is_null(idx)) { return; } + // builds the offsets and the vector of all tokens + auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr); - auto const token_offset = d_token_offsets[idx]; - auto const token_count = d_token_offsets[idx + 1] - token_offset; - auto d_result = d_tokens + token_offset; - auto const d_str = d_strings.element(idx); - if (d_str.empty()) { - // Pandas str.split("") for non-whitespace delimiter is an empty string - *d_result = 
string_index_pair{"", 0}; - return; - } + // build a strings column from the tokens + auto strings_child = make_strings_column(tokens.begin(), tokens.end(), stream, mr); - size_type token_idx = 0; - size_type start_pos = 0; // updates only if moving forward - size_type end_pos = d_str.length(); // updates only if moving backward - while (token_idx < token_count - 1) { - auto const delimiter_pos = dir == Dir::FORWARD ? d_str.find(d_delimiter, start_pos) - : d_str.rfind(d_delimiter, start_pos, end_pos); - if (delimiter_pos == string_view::npos) break; - auto const token = resolve_token(d_str, start_pos, end_pos, delimiter_pos); - if (dir == Dir::FORWARD) { - d_result[token_idx] = token; - start_pos = delimiter_pos + d_delimiter.length(); - } else { - d_result[token_count - 1 - token_idx] = token; - end_pos = delimiter_pos; - } - token_idx++; - } + return make_lists_column(input.size(), + std::move(offsets), + std::move(strings_child), + input.null_count(), + copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} - // set last token to remainder of the string - if (dir == Dir::FORWARD) { - auto const offset_bytes = d_str.byte_offset(start_pos); - d_result[token_idx] = - string_index_pair{d_str.data() + offset_bytes, d_str.byte_offset(end_pos) - offset_bytes}; - } else { - d_result[0] = string_index_pair{d_str.data(), d_str.byte_offset(end_pos)}; - } - } -}; +enum class Dir { FORWARD, BACKWARD }; /** * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. @@ -196,7 +131,7 @@ struct whitespace_token_reader_fn { whitespace_string_tokenizer tokenizer(d_str, dir != Dir::FORWARD); size_type token_idx = 0; position_pair token{0, 0}; - if (dir == Dir::FORWARD) { + if constexpr (dir == Dir::FORWARD) { while (tokenizer.next_token() && (token_idx < token_count)) { token = tokenizer.get_token(); d_result[token_idx++] = @@ -224,11 +159,11 @@ struct whitespace_token_reader_fn { // The output is one list item per string template -std::unique_ptr split_record_fn(strings_column_view const& strings, - TokenCounter counter, - TokenReader reader, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr whitespace_split_record_fn(strings_column_view const& strings, + TokenCounter counter, + TokenReader reader, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // create offsets column by counting the number of tokens per string auto strings_count = strings.size(); @@ -244,7 +179,7 @@ std::unique_ptr split_record_fn(strings_column_view const& strings, rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); // last entry is the total number of tokens to be generated - auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); + auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); // split each string into an array of index-pair values rmm::device_uvector tokens(total_tokens, stream); reader.d_token_offsets = d_offsets; @@ -277,18 +212,21 @@ std::unique_ptr split_record(strings_column_view const& strings, auto d_strings_column_ptr = column_device_view::create(strings.parent(), stream); if (delimiter.size() == 0) { - return split_record_fn(strings, - whitespace_token_counter_fn{*d_strings_column_ptr, max_tokens}, - whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens}, - stream, - mr); + return whitespace_split_record_fn( + strings, + whitespace_token_counter_fn{*d_strings_column_ptr, max_tokens}, + 
whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens}, + stream, + mr); } else { string_view d_delimiter(delimiter.data(), delimiter.size()); - return split_record_fn(strings, - token_counter_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, - token_reader_fn{*d_strings_column_ptr, d_delimiter}, - stream, - mr); + if (dir == Dir::FORWARD) { + return split_record_fn( + strings, split_tokenizer_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, stream, mr); + } else { + return split_record_fn( + strings, rsplit_tokenizer_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, stream, mr); + } } } diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index 73d5adab427..714c1ad416a 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -308,6 +308,82 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } +TEST_F(StringsSplitTest, MultiByteDelimiters) +{ + // Overlapping delimiters + auto input = + cudf::test::strings_column_wrapper({"u::", "w:::x", "y::::z", "::a", ":::b", ":::c:::"}); + auto view = cudf::strings_column_view(input); + using LCW = cudf::test::lists_column_wrapper; + { + auto result = cudf::strings::split_record(view, cudf::string_scalar("::")); + auto expected_left = LCW({LCW{"u", ""}, + LCW{"w", ":x"}, + LCW{"y", "", "z"}, + LCW{"", "a"}, + LCW{"", ":b"}, + LCW{"", ":c", ":"}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_left); + result = cudf::strings::rsplit_record(view, cudf::string_scalar("::")); + auto expected_right = LCW({LCW{"u", ""}, + LCW{"w:", "x"}, + LCW{"y", "", "z"}, + LCW{"", "a"}, + LCW{":", "b"}, + LCW{":", "c:", ""}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_right); + } + { + auto result = cudf::strings::split(view, cudf::string_scalar("::")); + + auto c0 = cudf::test::strings_column_wrapper({"u", "w", "y", "", "", ""}); + auto c1 = cudf::test::strings_column_wrapper({"", ":x", "", "a", ":b", ":c"}); + auto c2 = cudf::test::strings_column_wrapper({"", "", "z", "", "", ":"}, {0, 0, 1, 0, 0, 1}); + std::vector> expected_columns; + expected_columns.push_back(c0.release()); + expected_columns.push_back(c1.release()); + expected_columns.push_back(c2.release()); + auto expected_left = std::make_unique(std::move(expected_columns)); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected_left); + + result = cudf::strings::rsplit(view, cudf::string_scalar("::")); + + c0 = cudf::test::strings_column_wrapper({"u", "w:", "y", "", ":", ":"}); + c1 = cudf::test::strings_column_wrapper({"", "x", "", "a", "b", "c:"}); + c2 = cudf::test::strings_column_wrapper({"", "", "z", "", "", ""}, {0, 0, 1, 0, 0, 1}); + expected_columns.push_back(c0.release()); + expected_columns.push_back(c1.release()); + expected_columns.push_back(c2.release()); + auto expected_right = std::make_unique(std::move(expected_columns)); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected_right); + } + + // Delimiters that span across adjacent strings + input = cudf::test::strings_column_wrapper({"{a=1}:{b=2}:", "{c=3}", ":{}:{}"}); + view = cudf::strings_column_view(input); + { + auto result = cudf::strings::split_record(view, cudf::string_scalar("}:{")); + auto expected = 
LCW({LCW{"{a=1", "b=2}:"}, LCW{"{c=3}"}, LCW{":{", "}"}});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+    result = cudf::strings::rsplit_record(view, cudf::string_scalar("}:{"));
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+  }
+  {
+    auto result = cudf::strings::split(view, cudf::string_scalar("}:{"));
+
+    auto c0 = cudf::test::strings_column_wrapper({"{a=1", "{c=3}", ":{"});
+    auto c1 = cudf::test::strings_column_wrapper({"b=2}:", "", "}"}, {1, 0, 1});
+    std::vector<std::unique_ptr<cudf::column>> expected_columns;
+    expected_columns.push_back(c0.release());
+    expected_columns.push_back(c1.release());
+    auto expected = std::make_unique<cudf::table>(std::move(expected_columns));
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected);
+
+    result = cudf::strings::rsplit(view, cudf::string_scalar("}:{"));
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected);
+  }
+}
+
 TEST_F(StringsSplitTest, SplitRegex)
 {
   std::vector<const char*> h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""};
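For quick reference, a minimal host-side sketch of the overlap behavior these
tests pin down (hypothetical demo code, not part of the patch; it assumes only
the public cudf and cudf_test headers shown, and the expected rows mirror the
MultiByteDelimiters test above):

    #include <cudf/scalar/scalar.hpp>
    #include <cudf/strings/split/split.hpp>
    #include <cudf/strings/strings_column_view.hpp>
    #include <cudf_test/column_wrapper.hpp>

    void overlap_demo()
    {
      // Overlapping "::" matches are consumed left-to-right, so "y::::z"
      // yields three tokens rather than five.
      auto const input  = cudf::test::strings_column_wrapper({"y::::z", ":::c:::"});
      auto const view   = cudf::strings_column_view(input);
      auto const result = cudf::strings::split_record(view, cudf::string_scalar("::"));
      // row 0 -> ["y", "", "z"]; row 1 -> ["", ":c", ":"]
    }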