From db3d6633cc5051cf8f665f43b991e043fbc21eac Mon Sep 17 00:00:00 2001
From: davidwendt
Date: Tue, 14 Jul 2020 11:08:51 -0400
Subject: [PATCH 1/8] change split-record to return list column

---
 cpp/include/cudf/strings/split/split.hpp |  83 ++--
 cpp/src/strings/split/split_record.cu    | 550 +++++++----------------
 cpp/src/strings/split/split_utils.cuh    | 118 +++++
 cpp/tests/strings/split_tests.cpp        | 329 ++++----------
 4 files changed, 413 insertions(+), 667 deletions(-)
 create mode 100644 cpp/src/strings/split/split_utils.cuh

diff --git a/cpp/include/cudf/strings/split/split.hpp b/cpp/include/cudf/strings/split/split.hpp
index 371048287ca..4abd7ea54c8 100644
--- a/cpp/include/cudf/strings/split/split.hpp
+++ b/cpp/include/cudf/strings/split/split.hpp
@@ -82,82 +82,69 @@ std::unique_ptr rsplit(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

 /**
- * @brief The result(s) of a `contiguous_(r)split_record`
+ * @brief Splits individual string elements into a list of tokens.
  *
- * Each column_view resulting from a split operation performed by
- * contiguous_split_record will be returned wrapped in a
- * `contiguous_split_record_result`. The column data addresses stored in the
- * column_view objects are not owned by top level cudf::column objects. The
- * backing memory is instead owned by the `all_data` field and in one contiguous
- * block.
+ * Each element generates an array of tokens that are stored in a
+ * resulting list column.
  *
- * The user is responsible for assuring that the `column_views` or any derived
- * objects do not outlive the memory owned by `all_data`
- */
-struct contiguous_split_record_result {
-  std::vector column_views;
-  std::unique_ptr all_data;
-};
-
-/**
- * @brief Splits each element of the input column to a column of tokens storing
- * the resulting columns in a single contiguous block of memory.
+ * The number of elements in the output list will be the same as the number of
+ * elements in the input column. Each individual list item will contain the
+ * tokens for that row. The resulting number of tokens in each row can vary
+ * from 0 to `maxsplit+1`.
  *
- * This function splits each element in the input column to a column of tokens.
- * The number of columns in the output vector will be the same as the number of
- * elements in the input column. The column length will coincide with the
- * number of tokens; the resulting columns wrapped in the returned object may
- * have different sizes.
+ * The `delimiter` is searched within each string from beginning to end
+ * and splitting stops when either `maxsplit` or the end of the string is reached.
  *
- * Splitting a null string element will result in an empty output column.
+ * A null string element will result in a null list item for that row.
  *
- * @throws cudf:logic_error if `delimiter` is invalid.
+ * @throw cudf::logic_error if `delimiter` is invalid.
  *
  * @param strings A column of string elements to be split.
- * @param delimiter UTF-8 encoded string indicating the split points in each
- * string.
+ * @param delimiter The string to identify split points in each string.
  * Default of empty string indicates split on whitespace.
  * @param maxsplit Maximum number of splits to perform.
  * Default of -1 indicates all possible splits on each string.
  * @param mr Device memory resource used to allocate the returned result's device memory.
- * @return contiguous_split_record_result New vector of strings column_view
- * objects
- * (each column_view element of the vector holds splits from a string
- * element of the input column).
+ * @return List column of strings
+ * Each vector of the list column holds splits from a single row
+ * element of the input column.
  */
-contiguous_split_record_result contiguous_split_record(
+std::unique_ptr split_record(
   strings_column_view const& strings,
   string_scalar const& delimiter = string_scalar(""),
   size_type maxsplit = -1,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

 /**
- * @brief Splits each element of the input column from the end to a column of
- * tokens storing the resulting columns in a single contiguous block of memory.
+ * @brief Splits individual string elements into a list of tokens starting
+ * from the end of each string.
+ *
+ * Each element generates an array of tokens that are stored in a
+ * resulting list column.
+ *
+ * The number of elements in the output list will be the same as the number of
+ * elements in the input column. Each individual list item will contain the
+ * tokens for that row. The resulting number of tokens in each row can vary
+ * from 0 to `maxsplit+1`.
  *
- * This function splits each element in the input column to a column of tokens.
- * The number of columns in the output vector will be the same as the number of
- * elements in the input column. The column length will coincide with the
- * number of tokens; the resulting columns wrapped in the returned object may
- * have different sizes.
+ * The `delimiter` is searched from end to beginning within each string
+ * and splitting stops when either `maxsplit` or the end of the string is reached.
  *
- * Splitting a null string element will result in an empty output column.
+ * A null string element will result in a null list item for that row.
  *
- * @throws cudf:logic_error if `delimiter` is invalid.
+ * @throw cudf::logic_error if `delimiter` is invalid.
  *
  * @param strings A column of string elements to be split.
- * @param delimiter UTF-8 encoded string indicating the split points in each
- * string.
+ * @param delimiter The string to identify split points in each string.
  * Default of empty string indicates split on whitespace.
  * @param maxsplit Maximum number of splits to perform.
  * Default of -1 indicates all possible splits on each string.
  * @param mr Device memory resource used to allocate the returned result's device memory.
- * @return contiguous_split_record_result New vector of strings column_view
- * objects
- * (each column_view element of the vector holds splits from a string
- * element of the input column).
+ * @return List column of strings
+ * Each vector of the list column holds splits from a single row
+ * element of the input column.
*/ -contiguous_split_record_result contiguous_rsplit_record( +std::unique_ptr rsplit_record( strings_column_view const& strings, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index 4a069906f80..19ea2503ca3 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -16,125 +16,81 @@ #include #include +#include +#include #include -#include +#include #include #include #include +#include +#include #include -#include namespace cudf { namespace strings { namespace detail { -namespace { -// align all column size allocations to this boundary so that all output column buffers -// start at that alignment. -static constexpr size_type split_align = 64; +using string_index_pair = thrust::pair; -__device__ size_type compute_memory_size(size_type token_count, size_type token_size_sum) -{ - return cudf::detail::round_up_pow2(token_size_sum, split_align) + - cudf::detail::round_up_pow2((token_count + 1) * static_cast(sizeof(size_type)), - split_align); -} - -struct copy_info { - size_type idx{}; - size_type token_count{}; - size_type token_size_sum{}; - void* memory_ptr{}; -}; +namespace { enum class Dir { FORWARD, BACKWARD }; /** - * @brief Compute the number of tokens, the total byte sizes of the tokens, and - * required memory size for the `idx'th` string element of `d_strings`. + * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. */ template -struct token_reader_fn { +struct token_counter_fn { column_device_view const d_strings; // strings to split string_view const d_delimiter; // delimiter for split size_type const max_tokens = std::numeric_limits::max(); - bool const has_validity = false; - template - __device__ size_type compute_token_char_bytes(string_view const& d_str, - size_type start_pos, - size_type end_pos, - size_type delimiter_pos) const + __device__ size_type operator()(size_type idx) const { - if (last) { - return dir == Dir::FORWARD ? d_str.byte_offset(end_pos) - d_str.byte_offset(start_pos) - : d_str.byte_offset(end_pos); - } else { - return dir == Dir::FORWARD ? d_str.byte_offset(delimiter_pos) - d_str.byte_offset(start_pos) - : d_str.byte_offset(end_pos) - - d_str.byte_offset(delimiter_pos + d_delimiter.length()); - } - } - - // returns a tuple of token count, sum of token sizes in bytes, and required - // memory block size - __device__ thrust::tuple operator()(size_type idx) const - { - if (has_validity && d_strings.is_null(idx)) { - return thrust::make_tuple(0, 0, 0); - } + if (d_strings.is_null(idx)) { return 0; } - auto const d_str = d_strings.element(idx); - size_type token_count = 0; - size_type token_size_sum = 0; - size_type start_pos = 0; // updates only if moving forward - auto end_pos = d_str.length(); // updates only if moving backward + auto const d_str = d_strings.element(idx); + size_type token_count = 0; + size_type start_pos = 0; // updates only if moving forward + size_type end_pos = d_str.length(); // updates only if moving backward while (token_count < max_tokens - 1) { auto const delimiter_pos = dir == Dir::FORWARD ? 
d_str.find(d_delimiter, start_pos) : d_str.rfind(d_delimiter, start_pos, end_pos); - if (delimiter_pos != -1) { - token_count++; - token_size_sum += compute_token_char_bytes(d_str, start_pos, end_pos, delimiter_pos); - if (dir == Dir::FORWARD) { - start_pos = delimiter_pos + d_delimiter.length(); - } else { - end_pos = delimiter_pos; - } - } else { - break; - } + if (delimiter_pos < 0) break; + token_count++; + if (dir == Dir::FORWARD) + start_pos = delimiter_pos + d_delimiter.length(); + else + end_pos = delimiter_pos; } - token_count++; - token_size_sum += compute_token_char_bytes(d_str, start_pos, end_pos, -1); - - auto const memory_size = compute_memory_size(token_count, token_size_sum); - - return thrust::make_tuple( - token_count, token_size_sum, memory_size); + return token_count + 1; // always at least one token } }; /** - * @brief Copy the tokens from the `idx'th` string element of `d_strings` to - * the contiguous memory buffer. + * @brief Identify the tokens from the `idx'th` string element of `d_strings`. */ template -struct token_copier_fn { +struct token_reader_fn { column_device_view const d_strings; // strings to split string_view const d_delimiter; // delimiter for split - bool const has_validity = false; + int32_t* d_token_offsets{}; // for locating tokens in d_tokens + string_index_pair* d_tokens{}; template - __device__ thrust::pair compute_src_byte_offset_and_token_char_bytes( - string_view const& d_str, size_type start_pos, size_type end_pos, size_type delimiter_pos) const + __device__ string_index_pair resolve_token(string_view const& d_str, + size_type start_pos, + size_type end_pos, + size_type delimiter_pos) const { if (last) { auto const src_byte_offset = dir == Dir::FORWARD ? d_str.byte_offset(start_pos) : 0; auto const token_char_bytes = dir == Dir::FORWARD ? d_str.byte_offset(end_pos) - src_byte_offset : d_str.byte_offset(end_pos); - return thrust::make_pair(src_byte_offset, token_char_bytes); + return string_index_pair{d_str.data() + src_byte_offset, token_char_bytes}; } else { auto const src_byte_offset = dir == Dir::FORWARD ? d_str.byte_offset(start_pos) @@ -142,123 +98,71 @@ struct token_copier_fn { auto const token_char_bytes = dir == Dir::FORWARD ? 
d_str.byte_offset(delimiter_pos) - src_byte_offset : d_str.byte_offset(end_pos) - src_byte_offset; - return thrust::make_pair(src_byte_offset, token_char_bytes); + return string_index_pair{d_str.data() + src_byte_offset, token_char_bytes}; } } - __device__ void operator()(copy_info const info) const + __device__ void operator()(size_type idx) { - if (info.token_count == 0) { return; } - - auto memory_ptr = static_cast(info.memory_ptr); - - auto const char_buf_size = cudf::detail::round_up_pow2(info.token_size_sum, split_align); - auto const char_buf_ptr = memory_ptr; - memory_ptr += char_buf_size; - auto const offset_buf_ptr = reinterpret_cast(memory_ptr); + if (d_strings.is_null(idx)) { return; } + + auto const token_offset = d_token_offsets[idx]; + auto const token_count = d_token_offsets[idx + 1] - token_offset; + auto d_result = d_tokens + token_offset; + auto const d_str = d_strings.element(idx); + if (d_str.empty()) { + *d_result = string_index_pair{"", 0}; + return; + } - auto const d_str = d_strings.element(info.idx); - size_type token_idx = 0; - size_type char_bytes_copied = 0; - size_type start_pos = 0; // updates only if moving forward - auto end_pos = d_str.length(); // updates only if moving backward - while (token_idx < info.token_count - 1) { + size_type token_idx = 0; + size_type start_pos = 0; // updates only if moving forward + size_type end_pos = d_str.length(); // updates only if moving backward + while (token_idx < token_count - 1) { auto const delimiter_pos = dir == Dir::FORWARD ? d_str.find(d_delimiter, start_pos) : d_str.rfind(d_delimiter, start_pos, end_pos); - if (delimiter_pos != -1) { - auto const offset_size_pair = compute_src_byte_offset_and_token_char_bytes( - d_str, start_pos, end_pos, delimiter_pos); - if (dir == Dir::FORWARD) { - thrust::copy(thrust::seq, - d_str.data() + offset_size_pair.first, - d_str.data() + offset_size_pair.first + offset_size_pair.second, - char_buf_ptr + char_bytes_copied); - offset_buf_ptr[token_idx] = char_bytes_copied; - } else { - auto const char_buf_offset = - info.token_size_sum - char_bytes_copied - offset_size_pair.second; - thrust::copy(thrust::seq, - d_str.data() + offset_size_pair.first, - d_str.data() + offset_size_pair.first + offset_size_pair.second, - char_buf_ptr + char_buf_offset); - offset_buf_ptr[info.token_count - 1 - token_idx] = char_buf_offset; - } - token_idx++; - char_bytes_copied += offset_size_pair.second; - if (dir == Dir::FORWARD) { - start_pos = delimiter_pos + d_delimiter.length(); - } else { - end_pos = delimiter_pos; - } - } else { - break; - } + if (delimiter_pos < 0) break; + auto const token = resolve_token(d_str, start_pos, end_pos, delimiter_pos); + if (dir == Dir::FORWARD) + d_result[token_idx] = token; + else + d_result[token_count - 1 - token_idx] = token; + + token_idx++; + if (dir == Dir::FORWARD) + start_pos = delimiter_pos + d_delimiter.length(); + else + end_pos = delimiter_pos; } - auto const offset_size_pair = - compute_src_byte_offset_and_token_char_bytes(d_str, start_pos, end_pos, -1); - if (dir == Dir::FORWARD) { - thrust::copy(thrust::seq, - d_str.data() + offset_size_pair.first, - d_str.data() + offset_size_pair.first + offset_size_pair.second, - char_buf_ptr + char_bytes_copied); - offset_buf_ptr[token_idx] = char_bytes_copied; - } else { - thrust::copy(thrust::seq, d_str.data(), d_str.data() + offset_size_pair.second, char_buf_ptr); - offset_buf_ptr[0] = 0; - } - offset_buf_ptr[info.token_count] = info.token_size_sum; + auto const last_token = resolve_token(d_str, start_pos, 
end_pos, -1); + if (dir == Dir::FORWARD) + d_result[token_idx] = last_token; + else + d_result[0] = last_token; } }; /** - * @brief Compute the number of tokens, the total byte sizes of the tokens, and - * required memory size for the `idx'th` string element of `d_strings`. + * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. */ -template -struct whitespace_token_reader_fn { +struct whitespace_token_counter_fn { column_device_view const d_strings; // strings to split size_type const max_tokens = std::numeric_limits::max(); - bool const has_validity = false; - template - __device__ size_type compute_token_char_bytes(string_view const& d_str, - size_type cur_pos, - size_type to_token_pos) const + __device__ size_type operator()(size_type idx) const { - if (last) { - return dir == Dir::FORWARD - ? d_str.byte_offset(d_str.length()) - d_str.byte_offset(to_token_pos) - : d_str.byte_offset(to_token_pos + 1) - d_str.byte_offset(0); - } else { - return dir == Dir::FORWARD - ? d_str.byte_offset(cur_pos) - d_str.byte_offset(to_token_pos) - : d_str.byte_offset(to_token_pos + 1) - d_str.byte_offset(cur_pos + 1); - } - } + if (d_strings.is_null(idx)) { return 0; } - __device__ thrust::tuple operator()(size_type idx) const - { - if (has_validity && d_strings.is_null(idx)) { - return thrust::make_tuple(0, 0, 0); - } - - auto const d_str = d_strings.element(idx); - size_type token_count = 0; - size_type token_size_sum = 0; - auto spaces = true; - auto reached_max_tokens = false; - size_type to_token_pos = 0; - for (size_type i = 0; i < d_str.length(); ++i) { - auto const cur_pos = dir == Dir::FORWARD ? i : d_str.length() - 1 - i; - auto const ch = d_str[cur_pos]; + auto const d_str = d_strings.element(idx); + size_type token_count = 0; + auto spaces = true; + auto reached_max_tokens = false; + for (auto ch : d_str) { if (spaces != (ch <= ' ')) { - if (spaces) { // from whitespace(s) to a new token - to_token_pos = cur_pos; - } else { // from a token to whitespace(s) + if (!spaces) { if (token_count < max_tokens - 1) { token_count++; - token_size_sum += compute_token_char_bytes(d_str, cur_pos, to_token_pos); } else { reached_max_tokens = true; break; @@ -267,217 +171,105 @@ struct whitespace_token_reader_fn { spaces = !spaces; } } - if (reached_max_tokens || !spaces) { - token_count++; - token_size_sum += compute_token_char_bytes(d_str, -1, to_token_pos); - } - - if (token_count == 0) { // note that pandas.Series.str.split("", pat=" ") - // returns one token (i.e. "") while - // pandas.Series.str.split("") returns 0 token. - return thrust::make_tuple(0, 0, 0); - } - - auto const memory_size = compute_memory_size(token_count, token_size_sum); - - return thrust::make_tuple( - token_count, token_size_sum, memory_size); + // pandas.Series.str.split("") returns 0 tokens. + if (reached_max_tokens || !spaces) token_count++; + return token_count; } }; /** - * @brief Copy the tokens from the `idx'th` string element of `d_strings` to - * the contiguous memory buffer. + * @brief Identify the tokens from the `idx'th` string element of `d_strings`. 
*/ template -struct whitespace_token_copier_fn { +struct whitespace_token_reader_fn { column_device_view const d_strings; // strings to split - bool const has_validity = false; + size_type const max_tokens{}; + int32_t* d_token_offsets{}; + string_index_pair* d_tokens{}; - template - __device__ thrust::pair compute_src_byte_offset_and_token_char_bytes( - string_view const& d_str, - size_type cur_pos, - size_type to_token_pos, - size_type remaining_bytes) const + __device__ void operator()(size_type idx) { - if (last) { - auto const token_char_bytes = remaining_bytes; - auto const src_byte_offset = dir == Dir::FORWARD - ? d_str.byte_offset(to_token_pos) - : d_str.byte_offset(to_token_pos + 1) - token_char_bytes; - return thrust::make_pair(src_byte_offset, token_char_bytes); + auto const token_offset = d_token_offsets[idx]; + auto const token_count = d_token_offsets[idx + 1] - token_offset; + if (token_count == 0) { return; } + auto d_result = d_tokens + token_offset; + + auto const d_str = d_strings.element(idx); + whitespace_string_tokenizer tokenizer(d_str, dir != Dir::FORWARD); + size_type token_idx = 0; + string_view last_token{}; + if (dir == Dir::FORWARD) { + while (tokenizer.next_token() && (token_idx < token_count)) { + last_token = tokenizer.get_token(); + d_result[token_idx++] = string_index_pair{last_token.data(), last_token.size_bytes()}; + } + if (token_count == max_tokens) { + d_result[token_idx - 1] = string_index_pair{ + last_token.data(), + static_cast(d_str.data() + d_str.size_bytes() - last_token.data())}; + } } else { - auto const src_byte_offset = - dir == Dir::FORWARD ? d_str.byte_offset(to_token_pos) : d_str.byte_offset(cur_pos + 1); - auto const token_char_bytes = dir == Dir::FORWARD - ? d_str.byte_offset(cur_pos) - src_byte_offset - : d_str.byte_offset(to_token_pos + 1) - src_byte_offset; - return thrust::make_pair(src_byte_offset, token_char_bytes); - } - } - - __device__ void operator()(copy_info const info) const - { - if (info.token_count == 0) { return; } - - auto memory_ptr = static_cast(info.memory_ptr); - - auto const char_buf_size = cudf::detail::round_up_pow2(info.token_size_sum, split_align); - auto const char_buf_ptr = memory_ptr; - memory_ptr += char_buf_size; - auto const offset_buf_ptr = reinterpret_cast(memory_ptr); - - auto const d_str = d_strings.element(info.idx); - size_type token_idx = 0; - size_type char_bytes_copied = 0; - auto spaces = true; - size_type to_token_pos = 0; - for (size_type i = 0; i < d_str.length(); ++i) { - auto const cur_pos = dir == Dir::FORWARD ? 
i : d_str.length() - 1 - i; - auto const ch = d_str[cur_pos]; - if (spaces != (ch <= ' ')) { - if (spaces) { // from whitespace(s) to a new token - to_token_pos = cur_pos; - } else { // from a token to whitespace(s) - if (token_idx < info.token_count - 1) { - auto const offset_size_pair = compute_src_byte_offset_and_token_char_bytes( - d_str, cur_pos, to_token_pos, info.token_size_sum - char_bytes_copied); - if (dir == Dir::FORWARD) { - thrust::copy(thrust::seq, - d_str.data() + offset_size_pair.first, - d_str.data() + offset_size_pair.first + offset_size_pair.second, - char_buf_ptr + char_bytes_copied); - offset_buf_ptr[token_idx] = char_bytes_copied; - } else { - auto const char_buf_offset = - info.token_size_sum - char_bytes_copied - offset_size_pair.second; - thrust::copy(thrust::seq, - d_str.data() + offset_size_pair.first, - d_str.data() + offset_size_pair.first + offset_size_pair.second, - char_buf_ptr + char_buf_offset); - offset_buf_ptr[info.token_count - 1 - token_idx] = char_buf_offset; - } - token_idx++; - char_bytes_copied += offset_size_pair.second; - } else { - break; - } - } - spaces = !spaces; + while (tokenizer.prev_token() && (token_idx < token_count)) { + last_token = tokenizer.get_token(); + d_result[token_count - 1 - token_idx] = + string_index_pair{last_token.data(), last_token.size_bytes()}; + ++token_idx; } - } - if (token_idx < info.token_count) { - auto const offset_size_pair = compute_src_byte_offset_and_token_char_bytes( - d_str, -1, to_token_pos, info.token_size_sum - char_bytes_copied); - if (dir == Dir::FORWARD) { - thrust::copy(thrust::seq, - d_str.data() + offset_size_pair.first, - d_str.data() + offset_size_pair.first + offset_size_pair.second, - char_buf_ptr + char_bytes_copied); - offset_buf_ptr[token_idx] = char_bytes_copied; - } else { - thrust::copy(thrust::seq, - d_str.data() + offset_size_pair.first, - d_str.data() + offset_size_pair.first + offset_size_pair.second, - char_buf_ptr); - offset_buf_ptr[0] = 0; + if (token_count == max_tokens) { + --token_idx; + d_result[token_count - 1 - token_idx] = string_index_pair{ + d_str.data(), + static_cast(last_token.data() + last_token.size_bytes() - d_str.data())}; } } - offset_buf_ptr[info.token_count] = info.token_size_sum; } }; -// Generic split function used by split_record and rsplit_record -template -contiguous_split_record_result contiguous_split_record_fn(strings_column_view const& strings, - TokenReader reader, - TokenCopier copier, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) -{ - // read each string element of the input column to count the number of tokens - // and compute the memory offsets +} // namespace +// The output is one list item per string +template +std::unique_ptr split_record_fn(strings_column_view const& strings, + TokenCounter counter, + TokenReader reader, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream) +{ + // create offsets column by counting the number of tokens per string auto strings_count = strings.size(); - rmm::device_vector d_token_counts(strings_count); - rmm::device_vector d_token_size_sums(strings_count); - rmm::device_vector d_memory_offsets(strings_count + 1); - + auto offsets = make_numeric_column( + data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets->mutable_view().data(); thrust::transform(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), - thrust::make_zip_iterator(thrust::make_tuple( - 
d_token_counts.begin(), d_token_size_sums.begin(), d_memory_offsets.begin())), - reader); - - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), - d_memory_offsets.begin(), - d_memory_offsets.end(), - d_memory_offsets.begin()); - - // allocate and copy - - thrust::host_vector h_token_counts = d_token_counts; - thrust::host_vector h_token_size_sums = d_token_size_sums; - thrust::host_vector h_memory_offsets = d_memory_offsets; - - auto memory_size = h_memory_offsets.back(); - auto all_data_ptr = std::make_unique(memory_size, stream, mr); - - auto d_all_data_ptr = reinterpret_cast(all_data_ptr->data()); - auto d_token_counts_ptr = d_token_counts.data().get(); - auto d_memory_offsets_ptr = d_memory_offsets.data().get(); - auto d_token_size_sums_ptr = d_token_size_sums.data().get(); - auto copy_info_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [d_all_data_ptr, d_token_counts_ptr, d_memory_offsets_ptr, d_token_size_sums_ptr] __device__( - auto i) { - return copy_info{i, - d_token_counts_ptr[i], - d_token_size_sums_ptr[i], - d_all_data_ptr + d_memory_offsets_ptr[i]}; - }); - - thrust::for_each( - rmm::exec_policy(stream)->on(stream), copy_info_begin, copy_info_begin + strings_count, copier); - - // update column_view objects - - std::vector column_views{}; - for (size_type i = 0; i < strings_count; ++i) { - if (h_token_counts[i] == 0) { - column_views.emplace_back(strings.parent().type(), 0, nullptr); - } else { - auto memory_ptr = d_all_data_ptr + h_memory_offsets[i]; - auto char_buf_size = cudf::util::round_up_safe(h_token_size_sums[i], split_align); - - auto char_buf_ptr = memory_ptr; - memory_ptr += char_buf_size; - auto offset_buf_ptr = reinterpret_cast(memory_ptr); - - column_views.emplace_back( - strings.parent().type(), - h_token_counts[i], - nullptr, - nullptr, - UNKNOWN_NULL_COUNT, - 0, - std::vector{ - column_view(strings.offsets().type(), h_token_counts[i] + 1, offset_buf_ptr), - column_view(strings.chars().type(), h_token_size_sums[i], char_buf_ptr)}); - } - } - - CUDA_TRY(cudaStreamSynchronize(stream)); - - return contiguous_split_record_result{std::move(column_views), std::move(all_data_ptr)}; + d_offsets, + counter); + thrust::exclusive_scan( + rmm::exec_policy(stream)->on(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + + // last entry is the total number of tokens to be generated + auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); + // split each string into an array of index-pair values + rmm::device_vector tokens(total_tokens); + reader.d_token_offsets = d_offsets; + reader.d_tokens = tokens.data().get(); + thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(0), + strings_count, + reader); + // convert the index-pairs into one big strings column + auto strings_output = make_strings_column(tokens.begin(), tokens.end(), mr, stream); + // create a lists column using the offsets and the strings columns + return make_lists_column(strings_count, + std::move(offsets), + std::move(strings_output), + strings.null_count(), + copy_bitmask(strings.parent(), stream, mr)); } -} // namespace - template -contiguous_split_record_result contiguous_split_record( +std::unique_ptr split_record( strings_column_view const& strings, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, @@ -488,24 +280,21 @@ contiguous_split_record_result contiguous_split_record( // makes consistent with Pandas size_type max_tokens = maxsplit > 0 ? 
maxsplit + 1 : std::numeric_limits::max(); - auto has_validity = strings.parent().nullable(); auto d_strings_column_ptr = column_device_view::create(strings.parent(), stream); if (delimiter.size() == 0) { - return contiguous_split_record_fn( - strings, - whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens, has_validity}, - whitespace_token_copier_fn{*d_strings_column_ptr, has_validity}, - mr, - stream); + return split_record_fn(strings, + whitespace_token_counter_fn{*d_strings_column_ptr, max_tokens}, + whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens}, + mr, + stream); } else { string_view d_delimiter(delimiter.data(), delimiter.size()); - return contiguous_split_record_fn( - strings, - token_reader_fn{*d_strings_column_ptr, d_delimiter, max_tokens, has_validity}, - token_copier_fn{*d_strings_column_ptr, d_delimiter, has_validity}, - mr, - stream); + return split_record_fn(strings, + token_counter_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, + token_reader_fn{*d_strings_column_ptr, d_delimiter}, + mr, + stream); } } @@ -513,23 +302,22 @@ contiguous_split_record_result contiguous_split_record( // external APIs -contiguous_split_record_result contiguous_split_record(strings_column_view const& strings, - string_scalar const& delimiter, - size_type maxsplit, - rmm::mr::device_memory_resource* mr) +std::unique_ptr split_record(strings_column_view const& strings, + string_scalar const& delimiter, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contiguous_split_record(strings, delimiter, maxsplit, mr, 0); + return detail::split_record(strings, delimiter, maxsplit, mr, 0); } -contiguous_split_record_result contiguous_rsplit_record(strings_column_view const& strings, - string_scalar const& delimiter, - size_type maxsplit, - rmm::mr::device_memory_resource* mr) +std::unique_ptr rsplit_record(strings_column_view const& strings, + string_scalar const& delimiter, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contiguous_split_record( - strings, delimiter, maxsplit, mr, 0); + return detail::split_record(strings, delimiter, maxsplit, mr, 0); } } // namespace strings diff --git a/cpp/src/strings/split/split_utils.cuh b/cpp/src/strings/split/split_utils.cuh new file mode 100644 index 00000000000..5c9f98f273a --- /dev/null +++ b/cpp/src/strings/split/split_utils.cuh @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace cudf { +namespace strings { +namespace detail { + +/** + * @brief Instantiated for each string to manage navigating tokens from + * the beginning or the end of that string. + */ +struct whitespace_string_tokenizer { + /** + * @brief Identifies the position range of the next token in the given + * string at the specified iterator position. + * + * Tokens are delimited by one or more whitespace characters. 
+ * + * @return true if a token has been found + */ + __device__ bool next_token() + { + if (itr != d_str.begin()) { // skip these 2 lines the first time through + ++itr; + start_position = itr.byte_offset(); // end_position + 1; + } + if (start_position >= d_str.size_bytes()) return false; + // continue search for the next token + end_position = d_str.size_bytes(); + for (; itr < d_str.end(); ++itr) { + if (spaces == (*itr <= ' ')) { + if (spaces) + start_position = (itr + 1).byte_offset(); + else + end_position = (itr + 1).byte_offset(); + continue; + } + spaces = !spaces; + if (spaces) { + end_position = itr.byte_offset(); + break; + } + } + return start_position < end_position; + } + + /** + * @brief Identifies the position range of the previous token in the given + * string at the specified iterator position. + * + * Tokens are delimited by one or more whitespace characters. + * + * @return true if a token has been found + */ + __device__ bool prev_token() + { + end_position = start_position - 1; + --itr; + if (end_position <= 0) return false; + // continue search for the next token + start_position = 0; + for (; itr >= d_str.begin(); --itr) { + if (spaces == (*itr <= ' ')) { + if (spaces) + end_position = itr.byte_offset(); + else + start_position = itr.byte_offset(); + continue; + } + spaces = !spaces; + if (spaces) { + start_position = (itr + 1).byte_offset(); + break; + } + } + return start_position < end_position; + } + + __device__ string_view get_token() const + { + return string_view{d_str.data() + start_position, end_position - start_position}; + } + + __device__ whitespace_string_tokenizer(string_view const& d_str, bool reverse = false) + : d_str{d_str}, + spaces(true), + start_position{reverse ? d_str.size_bytes() + 1 : 0}, + end_position{d_str.size_bytes()}, + itr{reverse ? 
d_str.end() : d_str.begin()} + { + } + + private: + string_view const d_str; + bool spaces; // true if current position is whitespace + cudf::string_view::const_iterator itr; + size_type start_position; + size_type end_position; +}; + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index ebb1e1e78f7..2958b2892f3 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -275,7 +275,7 @@ TEST_F(StringsSplitTest, AllNullsCase) EXPECT_TRUE(column.null_count() == column.size()); } -TEST_F(StringsSplitTest, ContiguousSplitRecord) +TEST_F(StringsSplitTest, SplitRecord) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; cudf::test::strings_column_wrapper strings( @@ -283,34 +283,17 @@ TEST_F(StringsSplitTest, ContiguousSplitRecord) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view strings_view(strings); - - std::vector h_expected1{"", "Héllo", "thesé"}; - cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end()); - std::vector h_expected2{}; - cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); - std::vector h_expected3{"are", "some", "", ""}; - cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end()); - std::vector h_expected4{"tést", "String"}; - cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end()); - std::vector h_expected5{""}; - cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end()); - - std::vector> expected_columns; - expected_columns.push_back(expected1.release()); - expected_columns.push_back(expected2.release()); - expected_columns.push_back(expected3.release()); - expected_columns.push_back(expected4.release()); - expected_columns.push_back(expected5.release()); - - auto result = cudf::strings::contiguous_split_record(strings_view, cudf::string_scalar(" ")); - EXPECT_TRUE(result.column_views.size() == expected_columns.size()); - for (size_t i = 0; i < result.column_views.size(); ++i) { - cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]); - } + auto result = + cudf::strings::split_record(cudf::strings_column_view(strings), cudf::string_scalar(" ")); + cudf::lists_column_view lcv(result->view()); + cudf::test::strings_column_wrapper expected( + {"", "Héllo", "thesé", "are", "some", "", "", "tést", "String", ""}); + cudf::test::fixed_width_column_wrapper offsets({0, 3, 3, 7, 9, 10}); + cudf::test::expect_columns_equal(lcv.child(), expected); + cudf::test::expect_columns_equal(lcv.offsets(), offsets); } -TEST_F(StringsSplitTest, ContiguousSplitRecordWithMaxSplit) +TEST_F(StringsSplitTest, SplitRecordWithMaxSplit) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; cudf::test::strings_column_wrapper strings( @@ -318,34 +301,18 @@ TEST_F(StringsSplitTest, ContiguousSplitRecordWithMaxSplit) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view strings_view(strings); - - std::vector h_expected1{"", "Héllo thesé"}; - cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end()); - std::vector h_expected2{}; - cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); - std::vector h_expected3{"are", "some 
"}; - cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end()); - std::vector h_expected4{"tést", "String"}; - cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end()); - std::vector h_expected5{""}; - cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end()); + auto result = + cudf::strings::split_record(cudf::strings_column_view(strings), cudf::string_scalar(" "), 1); - std::vector> expected_columns; - expected_columns.push_back(expected1.release()); - expected_columns.push_back(expected2.release()); - expected_columns.push_back(expected3.release()); - expected_columns.push_back(expected4.release()); - expected_columns.push_back(expected5.release()); - - auto result = cudf::strings::contiguous_split_record(strings_view, cudf::string_scalar(" "), 1); - EXPECT_TRUE(result.column_views.size() == expected_columns.size()); - for (size_t i = 0; i < result.column_views.size(); ++i) { - cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]); - } + cudf::lists_column_view lcv(result->view()); + cudf::test::strings_column_wrapper expected( + {"", "Héllo thesé", "are", "some ", "tést", "String", ""}); + cudf::test::fixed_width_column_wrapper offsets({0, 2, 2, 4, 6, 7}); + cudf::test::expect_columns_equal(lcv.child(), expected); + cudf::test::expect_columns_equal(lcv.offsets(), offsets); } -TEST_F(StringsSplitTest, ContiguousSplitRecordWhitespace) +TEST_F(StringsSplitTest, SplitRecordWhitespace) { std::vector h_strings{ " Héllo thesé", nullptr, "are\tsome ", "tést\nString", " "}; @@ -354,34 +321,15 @@ TEST_F(StringsSplitTest, ContiguousSplitRecordWhitespace) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view strings_view(strings); - - std::vector h_expected1{"Héllo", "thesé"}; - cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end()); - std::vector h_expected2{}; - cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); - std::vector h_expected3{"are", "some"}; - cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end()); - std::vector h_expected4{"tést", "String"}; - cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end()); - std::vector h_expected5{}; - cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end()); - - std::vector> expected_columns; - expected_columns.push_back(expected1.release()); - expected_columns.push_back(expected2.release()); - expected_columns.push_back(expected3.release()); - expected_columns.push_back(expected4.release()); - expected_columns.push_back(expected5.release()); - - auto result = cudf::strings::contiguous_split_record(strings_view); - EXPECT_TRUE(result.column_views.size() == expected_columns.size()); - for (size_t i = 0; i < result.column_views.size(); ++i) { - cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]); - } + auto result = cudf::strings::split_record(cudf::strings_column_view(strings)); + cudf::lists_column_view lcv(result->view()); + cudf::test::strings_column_wrapper expected({"Héllo", "thesé", "are", "some", "tést", "String"}); + cudf::test::fixed_width_column_wrapper offsets({0, 2, 2, 4, 6, 6}); + cudf::test::expect_columns_equal(lcv.child(), expected); + cudf::test::expect_columns_equal(lcv.offsets(), offsets); } -TEST_F(StringsSplitTest, ContiguousSplitRecordWhitespaceWithMaxSplit) 
+TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) { std::vector h_strings{ " Héllo thesé ", nullptr, "are\tsome ", "tést\nString", " "}; @@ -390,34 +338,17 @@ TEST_F(StringsSplitTest, ContiguousSplitRecordWhitespaceWithMaxSplit) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view strings_view(strings); - - std::vector h_expected1{"Héllo", "thesé "}; - cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end()); - std::vector h_expected2{}; - cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); - std::vector h_expected3{"are", "some "}; - cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end()); - std::vector h_expected4{"tést", "String"}; - cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end()); - std::vector h_expected5{}; - cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end()); - - std::vector> expected_columns; - expected_columns.push_back(expected1.release()); - expected_columns.push_back(expected2.release()); - expected_columns.push_back(expected3.release()); - expected_columns.push_back(expected4.release()); - expected_columns.push_back(expected5.release()); - - auto result = cudf::strings::contiguous_split_record(strings_view, cudf::string_scalar(""), 1); - EXPECT_TRUE(result.column_views.size() == expected_columns.size()); - for (size_t i = 0; i < result.column_views.size(); ++i) { - cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]); - } + auto result = + cudf::strings::split_record(cudf::strings_column_view(strings), cudf::string_scalar(""), 1); + cudf::lists_column_view lcv(result->view()); + cudf::test::strings_column_wrapper expected( + {"Héllo", "thesé ", "are", "some ", "tést", "String"}); + cudf::test::fixed_width_column_wrapper offsets({0, 2, 2, 4, 6, 6}); + cudf::test::expect_columns_equal(lcv.child(), expected); + cudf::test::expect_columns_equal(lcv.offsets(), offsets); } -TEST_F(StringsSplitTest, ContiguousRSplitRecord) +TEST_F(StringsSplitTest, RSplitRecord) { std::vector h_strings{ "héllo", nullptr, "a_bc_déf", "a__bc", "_ab_cd", "ab_cd_", "", " a b ", " a bbb c"}; @@ -426,46 +357,31 @@ TEST_F(StringsSplitTest, ContiguousRSplitRecord) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view strings_view(strings); - - std::vector h_expected1{"héllo"}; - cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end()); - std::vector h_expected2{}; - cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); - std::vector h_expected3{"a", "bc", "déf"}; - cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end()); - std::vector h_expected4{"a", "", "bc"}; - cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end()); - std::vector h_expected5{"", "ab", "cd"}; - cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end()); - std::vector h_expected6{"ab", "cd", ""}; - cudf::test::strings_column_wrapper expected6(h_expected6.begin(), h_expected6.end()); - std::vector h_expected7{""}; - cudf::test::strings_column_wrapper expected7(h_expected7.begin(), h_expected7.end()); - std::vector h_expected8{" a b "}; - cudf::test::strings_column_wrapper expected8(h_expected8.begin(), h_expected8.end()); - std::vector 
h_expected9{" a bbb c"}; - cudf::test::strings_column_wrapper expected9(h_expected9.begin(), h_expected9.end()); - - std::vector> expected_columns; - expected_columns.push_back(expected1.release()); - expected_columns.push_back(expected2.release()); - expected_columns.push_back(expected3.release()); - expected_columns.push_back(expected4.release()); - expected_columns.push_back(expected5.release()); - expected_columns.push_back(expected6.release()); - expected_columns.push_back(expected7.release()); - expected_columns.push_back(expected8.release()); - expected_columns.push_back(expected9.release()); - - auto result = cudf::strings::contiguous_rsplit_record(strings_view, cudf::string_scalar("_")); - EXPECT_TRUE(result.column_views.size() == expected_columns.size()); - for (size_t i = 0; i < result.column_views.size(); i++) { - cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]); - } + cudf::test::strings_column_wrapper expected({"héllo", + "a", + "bc", + "déf", + "a", + "", + "bc", + "", + "ab", + "cd", + "ab", + "cd", + "", + "", + " a b ", + " a bbb c"}); + cudf::test::fixed_width_column_wrapper offsets({0, 1, 1, 4, 7, 10, 13, 14, 15, 16}); + auto result = + cudf::strings::rsplit_record(cudf::strings_column_view(strings), cudf::string_scalar("_")); + cudf::lists_column_view lcv(result->view()); + cudf::test::expect_columns_equal(lcv.child(), expected); + cudf::test::expect_columns_equal(lcv.offsets(), offsets); } -TEST_F(StringsSplitTest, ContiguousRSplitRecordWithMaxSplit) +TEST_F(StringsSplitTest, RSplitRecordWithMaxSplit) { std::vector h_strings{"héllo", nullptr, @@ -481,46 +397,20 @@ TEST_F(StringsSplitTest, ContiguousRSplitRecordWithMaxSplit) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view strings_view(strings); + cudf::test::strings_column_wrapper expected( + {"héllo", "a", "bc", "déf", "___a", "", "bc", "_ab", "cd", "", + "ab", "cd", "", "", " a b _", "", "", "_", "", " a bbb c"}); + cudf::test::fixed_width_column_wrapper offsets({0, 1, 1, 4, 7, 10, 13, 14, 17, 20}); - std::vector h_expected1{"héllo"}; - cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end()); - std::vector h_expected2{}; - cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); - std::vector h_expected3{"a", "bc", "déf"}; - cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end()); - std::vector h_expected4{"___a", "", "bc"}; - cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end()); - std::vector h_expected5{"_ab", "cd", ""}; - cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end()); - std::vector h_expected6{"ab", "cd", ""}; - cudf::test::strings_column_wrapper expected6(h_expected6.begin(), h_expected6.end()); - std::vector h_expected7{""}; - cudf::test::strings_column_wrapper expected7(h_expected7.begin(), h_expected7.end()); - std::vector h_expected8{" a b _", "", ""}; - cudf::test::strings_column_wrapper expected8(h_expected8.begin(), h_expected8.end()); - std::vector h_expected9{"_", "", " a bbb c"}; - cudf::test::strings_column_wrapper expected9(h_expected9.begin(), h_expected9.end()); + auto result = + cudf::strings::rsplit_record(cudf::strings_column_view(strings), cudf::string_scalar("_"), 2); - std::vector> expected_columns; - expected_columns.push_back(expected1.release()); - expected_columns.push_back(expected2.release()); - 
expected_columns.push_back(expected3.release()); - expected_columns.push_back(expected4.release()); - expected_columns.push_back(expected5.release()); - expected_columns.push_back(expected6.release()); - expected_columns.push_back(expected7.release()); - expected_columns.push_back(expected8.release()); - expected_columns.push_back(expected9.release()); - - auto result = cudf::strings::contiguous_rsplit_record(strings_view, cudf::string_scalar("_"), 2); - EXPECT_TRUE(result.column_views.size() == expected_columns.size()); - for (size_t i = 0; i < result.column_views.size(); i++) { - cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]); - } + cudf::lists_column_view lcv(result->view()); + cudf::test::expect_columns_equal(lcv.child(), expected); + cudf::test::expect_columns_equal(lcv.offsets(), offsets); } -TEST_F(StringsSplitTest, ContiguousRSplitRecordWhitespace) +TEST_F(StringsSplitTest, RSplitRecordWhitespace) { std::vector h_strings{"héllo", nullptr, "a_bc_déf", "", " a\tb ", " a\r bbb c"}; cudf::test::strings_column_wrapper strings( @@ -528,37 +418,17 @@ TEST_F(StringsSplitTest, ContiguousRSplitRecordWhitespace) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view strings_view(strings); + cudf::test::strings_column_wrapper expected({"héllo", "a_bc_déf", "a", "b", "a", "bbb", "c"}); + cudf::test::fixed_width_column_wrapper offsets({0, 1, 1, 2, 2, 4, 7}); - std::vector h_expected1{"héllo"}; - cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end()); - std::vector h_expected2{}; - cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); - std::vector h_expected3{"a_bc_déf"}; - cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end()); - std::vector h_expected4{}; - cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end()); - std::vector h_expected5{"a", "b"}; - cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end()); - std::vector h_expected6{"a", "bbb", "c"}; - cudf::test::strings_column_wrapper expected6(h_expected6.begin(), h_expected6.end()); + auto result = cudf::strings::rsplit_record(cudf::strings_column_view(strings)); - std::vector> expected_columns; - expected_columns.push_back(expected1.release()); - expected_columns.push_back(expected2.release()); - expected_columns.push_back(expected3.release()); - expected_columns.push_back(expected4.release()); - expected_columns.push_back(expected5.release()); - expected_columns.push_back(expected6.release()); - auto result = cudf::strings::contiguous_rsplit_record(strings_view); - - EXPECT_TRUE(result.column_views.size() == expected_columns.size()); - for (size_t i = 4; i < 5; i++) { - cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]); - } + cudf::lists_column_view lcv(result->view()); + cudf::test::expect_columns_equal(lcv.child(), expected); + cudf::test::expect_columns_equal(lcv.offsets(), offsets); } -TEST_F(StringsSplitTest, ContiguousRSplitRecordWhitespaceWithMaxSplit) +TEST_F(StringsSplitTest, RSplitRecordWhitespaceWithMaxSplit) { std::vector h_strings{ " héllo Asher ", nullptr, " a_bc_déf ", "", " a\tb ", " a\r bbb c"}; @@ -567,44 +437,27 @@ TEST_F(StringsSplitTest, ContiguousRSplitRecordWhitespaceWithMaxSplit) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view 
strings_view(strings); - - std::vector h_expected1{" héllo", "Asher"}; - cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end()); - std::vector h_expected2{}; - cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); - std::vector h_expected3{"a_bc_déf"}; - cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end()); - std::vector h_expected4{}; - cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end()); - std::vector h_expected5{" a", "b"}; - cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end()); - std::vector h_expected6{" a\r bbb", "c"}; - cudf::test::strings_column_wrapper expected6(h_expected6.begin(), h_expected6.end()); - - std::vector> expected_columns; - expected_columns.push_back(expected1.release()); - expected_columns.push_back(expected2.release()); - expected_columns.push_back(expected3.release()); - expected_columns.push_back(expected4.release()); - expected_columns.push_back(expected5.release()); - expected_columns.push_back(expected6.release()); - auto result = cudf::strings::contiguous_rsplit_record(strings_view, cudf::string_scalar(""), 1); - - EXPECT_TRUE(result.column_views.size() == expected_columns.size()); - for (size_t i = 4; i < 5; i++) { - cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]); - } + cudf::test::strings_column_wrapper expected( + {" héllo", "Asher", "a_bc_déf", " a", "b", " a\r bbb", "c"}); + cudf::test::fixed_width_column_wrapper offsets({0, 2, 2, 3, 3, 5, 7}); + + auto result = + cudf::strings::rsplit_record(cudf::strings_column_view(strings), cudf::string_scalar(""), 1); + cudf::lists_column_view lcv(result->view()); + cudf::test::print(lcv.offsets()); + cudf::test::print(lcv.child()); + cudf::test::expect_columns_equal(lcv.child(), expected); + cudf::test::expect_columns_equal(lcv.offsets(), offsets); } -TEST_F(StringsSplitTest, ContiguousSplitRecordZeroSizeStringsColumns) +TEST_F(StringsSplitTest, SplitRecordZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto split_record_result = cudf::strings::contiguous_split_record(zero_size_strings_column); - EXPECT_TRUE(split_record_result.column_views.size() == 0); - auto rsplit_record_result = cudf::strings::contiguous_rsplit_record(zero_size_strings_column); - EXPECT_TRUE(rsplit_record_result.column_views.size() == 0); + auto split_record_result = cudf::strings::split_record(zero_size_strings_column); + EXPECT_TRUE(split_record_result->size() == 0); + auto rsplit_record_result = cudf::strings::rsplit_record(zero_size_strings_column); + EXPECT_TRUE(rsplit_record_result->size() == 0); } TEST_F(StringsSplitTest, Partition) From c3dad9e81299fb2a72112660538cef668921c607 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Tue, 14 Jul 2020 11:23:31 -0400 Subject: [PATCH 2/8] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9228e873d4b..fce7556420f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -116,6 +116,7 @@ - PR #5662 Make Java ColumnVector(long nativePointer) constructor public - PR #5679 Use `pickle5` to test older Python versions - PR #5684 Use `pickle5` in `Serializable` (when available) +- PR #5687 Change strings::split_record to return a lists column ## Bug Fixes From bc62892e7c67334bedf1b372d6c1764f8a855bd0 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Tue, 14 Jul 2020 13:24:25 
-0400 Subject: [PATCH 3/8] remove test print lines --- cpp/tests/strings/split_tests.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index 2958b2892f3..95756e2fd33 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -444,8 +444,6 @@ TEST_F(StringsSplitTest, RSplitRecordWhitespaceWithMaxSplit) auto result = cudf::strings::rsplit_record(cudf::strings_column_view(strings), cudf::string_scalar(""), 1); cudf::lists_column_view lcv(result->view()); - cudf::test::print(lcv.offsets()); - cudf::test::print(lcv.child()); cudf::test::expect_columns_equal(lcv.child(), expected); cudf::test::expect_columns_equal(lcv.offsets(), offsets); } From a6b8f3614c52030800ccff661d2f69de1aa54ba9 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Tue, 14 Jul 2020 13:24:56 -0400 Subject: [PATCH 4/8] refactor whitespace tokenize utility from split() --- cpp/src/strings/split/split.cu | 100 ++------------------------ cpp/src/strings/split/split_record.cu | 20 +++--- cpp/src/strings/split/split_utils.cuh | 7 +- 3 files changed, 16 insertions(+), 111 deletions(-) diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index 89d09d56517..3d7d902551f 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include // upper_bound() #include // copy_if() @@ -34,8 +35,8 @@ namespace cudf { namespace strings { namespace detail { + using string_index_pair = thrust::pair; -using position_pair = thrust::pair; namespace { @@ -582,99 +583,6 @@ struct base_whitespace_split_tokenizer { size_type max_tokens; // maximum number of tokens }; -/** - * @brief Instantiated for each string to manage navigating tokens from - * the beginning or the end of that string. - */ -struct whitespace_string_tokenizer { - /** - * @brief Identifies the position range of the next token in the given - * string at the specified iterator position. - * - * Tokens are delimited by one or more whitespace characters. - * - * @return true if a token has been found - */ - __device__ bool next_token() - { - if (itr != d_str.begin()) { // skip these 2 lines the first time through - start_position = end_position + 1; - ++itr; - } - if (start_position >= d_str.length()) return false; - // continue search for the next token - end_position = d_str.length(); - for (; itr < d_str.end(); ++itr) { - if (spaces == (*itr <= ' ')) { - if (spaces) - start_position = itr.position() + 1; - else - end_position = itr.position() + 1; - continue; - } - spaces = !spaces; - if (spaces) { - end_position = itr.position(); - break; - } - } - return start_position < end_position; - } - - /** - * @brief Identifies the position range of the previous token in the given - * string at the specified iterator position. - * - * Tokens are delimited by one or more whitespace characters. 
- * - * @return true if a token has been found - */ - __device__ bool prev_token() - { - end_position = start_position - 1; - --itr; - if (end_position <= 0) return false; - // continue search for the next token - start_position = 0; - for (; itr >= d_str.begin(); --itr) { - if (spaces == (*itr <= ' ')) { - if (spaces) - end_position = itr.position(); - else - start_position = itr.position(); - continue; - } - spaces = !spaces; - if (spaces) { - start_position = itr.position() + 1; - break; - } - } - return start_position < end_position; - } - - __device__ position_pair token_byte_positions() - { - return position_pair{d_str.byte_offset(start_position), d_str.byte_offset(end_position)}; - } - - __device__ whitespace_string_tokenizer(string_view const& d_str, bool reverse = false) - : d_str{d_str}, - spaces(true), - start_position{reverse ? d_str.length() + 1 : 0}, - end_position{d_str.length()}, - itr{reverse ? d_str.end() : d_str.begin()} - { - } - - private: - string_view const d_str; - bool spaces; // true if current position is whitespace - cudf::string_view::const_iterator itr; - size_type start_position; - size_type end_position; -}; - /** * @brief The tokenizer functions for split() with whitespace. * @@ -709,7 +617,7 @@ struct whitespace_split_tokenizer_fn : base_whitespace_split_tokenizer { size_type token_idx = 0; position_pair token{0, 0}; while (tokenizer.next_token() && (token_idx < token_count)) { - token = tokenizer.token_byte_positions(); + token = tokenizer.get_token(); d_tokens[d_strings.size() * (token_idx++)] = string_index_pair{d_str.data() + token.first, (token.second - token.first)}; } @@ -760,7 +668,7 @@ struct whitespace_rsplit_tokenizer_fn : base_whitespace_split_tokenizer { size_type token_idx = 0; position_pair token{0, 0}; while (tokenizer.prev_token() && (token_idx < token_count)) { - token = tokenizer.token_byte_positions(); + token = tokenizer.get_token(); d_tokens[d_strings.size() * (token_count - 1 - token_idx)] = string_index_pair{d_str.data() + token.first, (token.second - token.first)}; ++token_idx; diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index 19ea2503ca3..c0d515b9d56 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -197,29 +197,27 @@ struct whitespace_token_reader_fn { auto const d_str = d_strings.element(idx); whitespace_string_tokenizer tokenizer(d_str, dir != Dir::FORWARD); size_type token_idx = 0; - string_view last_token{}; + position_pair token{0, 0}; if (dir == Dir::FORWARD) { while (tokenizer.next_token() && (token_idx < token_count)) { - last_token = tokenizer.get_token(); - d_result[token_idx++] = string_index_pair{last_token.data(), last_token.size_bytes()}; + token = tokenizer.get_token(); + d_result[token_idx++] = + string_index_pair{d_str.data() + token.first, token.second - token.first}; } if (token_count == max_tokens) { - d_result[token_idx - 1] = string_index_pair{ - last_token.data(), - static_cast(d_str.data() + d_str.size_bytes() - last_token.data())}; + d_result[token_idx - 1] = + string_index_pair{d_str.data() + token.first, d_str.size_bytes() - token.first}; } } else { while (tokenizer.prev_token() && (token_idx < token_count)) { - last_token = tokenizer.get_token(); + token = tokenizer.get_token(); d_result[token_count - 1 - token_idx] = - string_index_pair{last_token.data(), last_token.size_bytes()}; + string_index_pair{d_str.data() + token.first, token.second - token.first}; ++token_idx; } if (token_count == max_tokens) { 
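        // Token count was capped at max_tokens: widen the leftmost stored token so it
        // spans from the start of the string through the end of the last token found,
        // keeping the unsplit remainder in the first list element (mirroring
        // Pandas-style rsplit with a maxsplit limit).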
--token_idx; - d_result[token_count - 1 - token_idx] = string_index_pair{ - d_str.data(), - static_cast(last_token.data() + last_token.size_bytes() - d_str.data())}; + d_result[token_count - 1 - token_idx] = string_index_pair{d_str.data(), token.second}; } } } diff --git a/cpp/src/strings/split/split_utils.cuh b/cpp/src/strings/split/split_utils.cuh index 5c9f98f273a..a6afd1bef10 100644 --- a/cpp/src/strings/split/split_utils.cuh +++ b/cpp/src/strings/split/split_utils.cuh @@ -20,6 +20,8 @@ namespace cudf { namespace strings { namespace detail { +using position_pair = thrust::pair; + /** * @brief Instantiated for each string to manage navigating tokens from * the beginning or the end of that string. @@ -91,10 +93,7 @@ struct whitespace_string_tokenizer { return start_position < end_position; } - __device__ string_view get_token() const - { - return string_view{d_str.data() + start_position, end_position - start_position}; - } + __device__ position_pair get_token() const { return position_pair{start_position, end_position}; } __device__ whitespace_string_tokenizer(string_view const& d_str, bool reverse = false) : d_str{d_str}, From 2d880a125ab11ba5808b4b7c8f918d0c0b13ee91 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Tue, 14 Jul 2020 13:45:13 -0400 Subject: [PATCH 5/8] remove unneeded template case --- cpp/src/strings/split/split_record.cu | 68 ++++++++++++--------------- 1 file changed, 29 insertions(+), 39 deletions(-) diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index c0d515b9d56..d41fc01d31c 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -40,8 +40,10 @@ enum class Dir { FORWARD, BACKWARD }; /** * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. + * + * The number of tokens is the same regardless if counting from the beginning + * or the end of the string. */ -template struct token_counter_fn { column_device_view const d_strings; // strings to split string_view const d_delimiter; // delimiter for split @@ -53,17 +55,12 @@ struct token_counter_fn { auto const d_str = d_strings.element(idx); size_type token_count = 0; - size_type start_pos = 0; // updates only if moving forward - size_type end_pos = d_str.length(); // updates only if moving backward + size_type start_pos = 0; while (token_count < max_tokens - 1) { - auto const delimiter_pos = dir == Dir::FORWARD ? d_str.find(d_delimiter, start_pos) - : d_str.rfind(d_delimiter, start_pos, end_pos); + auto const delimiter_pos = d_str.find(d_delimiter, start_pos); if (delimiter_pos < 0) break; token_count++; - if (dir == Dir::FORWARD) - start_pos = delimiter_pos + d_delimiter.length(); - else - end_pos = delimiter_pos; + start_pos = delimiter_pos + d_delimiter.length(); } return token_count + 1; // always at least one token } @@ -79,27 +76,18 @@ struct token_reader_fn { int32_t* d_token_offsets{}; // for locating tokens in d_tokens string_index_pair* d_tokens{}; - template __device__ string_index_pair resolve_token(string_view const& d_str, size_type start_pos, size_type end_pos, size_type delimiter_pos) const { - if (last) { - auto const src_byte_offset = dir == Dir::FORWARD ? d_str.byte_offset(start_pos) : 0; - auto const token_char_bytes = dir == Dir::FORWARD - ? d_str.byte_offset(end_pos) - src_byte_offset - : d_str.byte_offset(end_pos); - return string_index_pair{d_str.data() + src_byte_offset, token_char_bytes}; - } else { - auto const src_byte_offset = dir == Dir::FORWARD - ? 
d_str.byte_offset(start_pos) - : d_str.byte_offset(delimiter_pos + d_delimiter.length()); - auto const token_char_bytes = dir == Dir::FORWARD - ? d_str.byte_offset(delimiter_pos) - src_byte_offset - : d_str.byte_offset(end_pos) - src_byte_offset; - return string_index_pair{d_str.data() + src_byte_offset, token_char_bytes}; - } + auto const src_byte_offset = dir == Dir::FORWARD + ? d_str.byte_offset(start_pos) + : d_str.byte_offset(delimiter_pos + d_delimiter.length()); + auto const token_char_bytes = dir == Dir::FORWARD + ? d_str.byte_offset(delimiter_pos) - src_byte_offset + : d_str.byte_offset(end_pos) - src_byte_offset; + return string_index_pair{d_str.data() + src_byte_offset, token_char_bytes}; } __device__ void operator()(size_type idx) @@ -111,6 +99,7 @@ struct token_reader_fn { auto d_result = d_tokens + token_offset; auto const d_str = d_strings.element(idx); if (d_str.empty()) { + // Pandas str.split("") for non-whitespace delimiter is an empty string *d_result = string_index_pair{"", 0}; return; } @@ -122,24 +111,25 @@ struct token_reader_fn { auto const delimiter_pos = dir == Dir::FORWARD ? d_str.find(d_delimiter, start_pos) : d_str.rfind(d_delimiter, start_pos, end_pos); if (delimiter_pos < 0) break; - auto const token = resolve_token(d_str, start_pos, end_pos, delimiter_pos); - if (dir == Dir::FORWARD) + auto const token = resolve_token(d_str, start_pos, end_pos, delimiter_pos); + if (dir == Dir::FORWARD) { d_result[token_idx] = token; - else + start_pos = delimiter_pos + d_delimiter.length(); + } else { d_result[token_count - 1 - token_idx] = token; - + end_pos = delimiter_pos; + } token_idx++; - if (dir == Dir::FORWARD) - start_pos = delimiter_pos + d_delimiter.length(); - else - end_pos = delimiter_pos; } - auto const last_token = resolve_token(d_str, start_pos, end_pos, -1); - if (dir == Dir::FORWARD) - d_result[token_idx] = last_token; - else - d_result[0] = last_token; + // set last token to remainder of the string + if (dir == Dir::FORWARD) { + auto const offset_bytes = d_str.byte_offset(start_pos); + d_result[token_idx] = + string_index_pair{d_str.data() + offset_bytes, d_str.byte_offset(end_pos) - offset_bytes}; + } else { + d_result[0] = string_index_pair{d_str.data(), d_str.byte_offset(end_pos)}; + } } }; @@ -289,7 +279,7 @@ std::unique_ptr split_record( } else { string_view d_delimiter(delimiter.data(), delimiter.size()); return split_record_fn(strings, - token_counter_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, + token_counter_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, token_reader_fn{*d_strings_column_ptr, d_delimiter}, mr, stream); From 154ff73ca88550e8e3126946ba667db6686a4fce Mon Sep 17 00:00:00 2001 From: davidwendt Date: Tue, 14 Jul 2020 16:34:56 -0400 Subject: [PATCH 6/8] simplify resolve-token logic --- cpp/src/strings/split/split_record.cu | 31 ++++++++++++++------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index d41fc01d31c..7d0aee57bd5 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -81,13 +81,15 @@ struct token_reader_fn { size_type end_pos, size_type delimiter_pos) const { - auto const src_byte_offset = dir == Dir::FORWARD - ? d_str.byte_offset(start_pos) - : d_str.byte_offset(delimiter_pos + d_delimiter.length()); - auto const token_char_bytes = dir == Dir::FORWARD - ? 
d_str.byte_offset(delimiter_pos) - src_byte_offset - : d_str.byte_offset(end_pos) - src_byte_offset; - return string_index_pair{d_str.data() + src_byte_offset, token_char_bytes}; + if (dir == Dir::FORWARD) { + auto const byte_offset = d_str.byte_offset(start_pos); + return string_index_pair{d_str.data() + byte_offset, + d_str.byte_offset(delimiter_pos) - byte_offset}; + } else { + auto const byte_offset = d_str.byte_offset(delimiter_pos + d_delimiter.length()); + return string_index_pair{d_str.data() + byte_offset, + d_str.byte_offset(end_pos) - byte_offset}; + } } __device__ void operator()(size_type idx) @@ -194,10 +196,8 @@ struct whitespace_token_reader_fn { d_result[token_idx++] = string_index_pair{d_str.data() + token.first, token.second - token.first}; } - if (token_count == max_tokens) { - d_result[token_idx - 1] = - string_index_pair{d_str.data() + token.first, d_str.size_bytes() - token.first}; - } + --token_idx; + token.second = d_str.size_bytes() - token.first; } else { while (tokenizer.prev_token() && (token_idx < token_count)) { token = tokenizer.get_token(); @@ -205,11 +205,12 @@ struct whitespace_token_reader_fn { string_index_pair{d_str.data() + token.first, token.second - token.first}; ++token_idx; } - if (token_count == max_tokens) { - --token_idx; - d_result[token_count - 1 - token_idx] = string_index_pair{d_str.data(), token.second}; - } + token_idx = token_count - token_idx; // token_count - 1 - (token_idx-1) + token.first = 0; } + // reset last token only if we hit the max + if (token_count == max_tokens) + d_result[token_idx] = string_index_pair{d_str.data() + token.first, token.second}; } }; From b7f26ddeef5988186272feaa7e20ccbf1a27b31d Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 15 Jul 2020 11:07:39 -0400 Subject: [PATCH 7/8] add examples in the doxygen comments --- cpp/include/cudf/strings/split/split.hpp | 116 +++++++++++++++++++---- 1 file changed, 98 insertions(+), 18 deletions(-) diff --git a/cpp/include/cudf/strings/split/split.hpp b/cpp/include/cudf/strings/split/split.hpp index 4abd7ea54c8..87e423236e9 100644 --- a/cpp/include/cudf/strings/split/split.hpp +++ b/cpp/include/cudf/strings/split/split.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -82,19 +82,57 @@ std::unique_ptr
rsplit( rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); /** - * @brief Splits individual strings elements in to a list of tokens. + * @brief Splits individual strings elements into a list of strings. * - * Each element generates an array of tokens that are stored in a - * resulting list column. + * Each element generates an array of strings that are stored in an output + * lists column. * - * The number of elements in the output list will be the same as the number of + * The number of elements in the output column will be the same as the number of * elements in the input column. Each individual list item will contain the - * tokens for that row. The resulting number of tokens in each row can vary - * from 0 to `maxsplit+1`. + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. * * The `delimiter` is searched within each string from beginning to end * and splitting stops when either `maxsplit` or the end of the string is reached. * + * If a delimiter is not whitespace and occurs adjacent to another delimiter, + * an empty string is produced for that split occurrence. Likewise, a non-whitespace + * delimiter produces an empty string if it appears at the beginning or the end + * of a string. + * + * @code{.pseudo} + * s = ["a_bc_def_g", "a__bc", "_ab_cd", "ab_cd_"] + * s1 = split_record(s, "_") + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * s2 = split_record(s, "_", 1) + * s2 is a lists column of strings: + * [ ["a", "bc_def_g"], + * ["a", "_bc"], + * ["", "ab_cd"], + * ["ab", "cd_"] ] + * @endcode + * + * A whitespace delimiter produces no empty strings. + * @code{.pseudo} + * s = ["a bc def", "a bc", " ab cd", "ab cd "] + * s1 = split_record(s, "") + * s1 is a lists column of strings: + * [ ["a", "bc", "def"], + * ["a", "bc"], + * ["ab", "cd"], + * ["ab", "cd"] ] + * s2 = split_record(s, "", 1) + * s2 is a lists column of strings: + * [ ["a", "bc def"], + * ["a", "bc"], + * ["ab", "cd"], + * ["ab", "cd "] ] + * @endcode + * * A null string element will result in a null list item for that row. * * @throw cudf:logic_error if `delimiter` is invalid. @@ -105,8 +143,8 @@ std::unique_ptr
rsplit( * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. * @param mr Device memory resource used to allocate the returned result's device memory. - * @return List column of strings - * Each vector of the list column holds splits from a single row + * @return Lists column of strings + * Each vector of the lists column holds splits from a single row * element of the input column. */ std::unique_ptr split_record( @@ -116,19 +154,61 @@ std::unique_ptr split_record( rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); /** - * @brief Splits individual strings elements in to a list of tokens starting + * @brief Splits individual strings elements into a list of strings starting * from the end of each string. * - * Each element generates an array of tokens that are stored in a - * resulting list column. + * Each element generates an array of strings that are stored in an output + * lists column. * - * The number of elements in the output list will be the same as the number of + * The number of elements in the output column will be the same as the number of * elements in the input column. Each individual list item will contain the - * tokens for that row. The resulting number of tokens in each row can vary - * from 0 to `maxsplit+1`. + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. * * The `delimiter` is searched from end to beginning within each string - * and splitting stops when either `maxsplit` or the end of the string is reached. + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * If a delimiter is not whitespace and occurs adjacent to another delimiter, + * an empty string is produced for that split occurrence. Likewise, a non-whitespace + * delimiter produces an empty string if it appears at the beginning or the end + * of a string. + * + * Note that `rsplit_record` and `split_record` produce equivalent results for + * the default `maxsplit` value. + * + * @code{.pseudo} + * s = ["a_bc_def_g", "a__bc", "_ab_cd", "ab_cd_"] + * s1 = rsplit_record(s, "_") + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * s2 = rsplit_record(s, "_", 1) + * s2 is a lists column of strings: + * [ ["a_bc_def", "g"], + * ["a_", "bc"], + * ["_ab", "cd"], + * ["ab_cd", ""] ] + * @endcode + * + * A whitespace delimiter produces no empty strings. + * @code{.pseudo} + * s = ["a bc def", "a bc", " ab cd", "ab cd "] + * s1 = rsplit_record(s, "") + * s1 is a lists column of strings: + * [ ["a", "bc", "def"], + * ["a", "bc"], + * ["ab", "cd"], + * ["ab", "cd"] ] + * s2 = rsplit_record(s, "", 1) + * s2 is a lists column of strings: + * [ ["a bc", "def"], + * ["a", "bc"], + * [" ab", "cd"], + * ["ab", "cd"] ] + * @endcode * * A null string element will result in a null list item for that row. * @@ -140,8 +220,8 @@ std::unique_ptr split_record( * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. * @param mr Device memory resource used to allocate the returned result's device memory. - * @return List column of strings - * Each vector of the list column holds splits from a single row + * @return Lists column of strings + * Each vector of the lists column holds splits from a single row * element of the input column. 
*/ std::unique_ptr rsplit_record( From ca7212d5625e7c4ea51e8286e8a65830798cf4a8 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Mon, 20 Jul 2020 13:16:38 -0400 Subject: [PATCH 8/8] use lists_column_wrapper to create expected gtests results --- cpp/tests/strings/split_tests.cpp | 172 +++++++++++++----------------- 1 file changed, 77 insertions(+), 95 deletions(-) diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index 95756e2fd33..ffb875d330f 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -278,107 +278,89 @@ TEST_F(StringsSplitTest, AllNullsCase) TEST_F(StringsSplitTest, SplitRecord) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); auto result = cudf::strings::split_record(cudf::strings_column_view(strings), cudf::string_scalar(" ")); - cudf::lists_column_view lcv(result->view()); - cudf::test::strings_column_wrapper expected( - {"", "Héllo", "thesé", "are", "some", "", "", "tést", "String", ""}); - cudf::test::fixed_width_column_wrapper offsets({0, 3, 3, 7, 9, 10}); - cudf::test::expect_columns_equal(lcv.child(), expected); - cudf::test::expect_columns_equal(lcv.offsets(), offsets); + using LCW = cudf::test::lists_column_wrapper; + LCW expected( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", "", ""}, LCW{"tést", "String"}, LCW{""}}, + validity); + cudf::test::expect_columns_equal(result->view(), expected); } TEST_F(StringsSplitTest, SplitRecordWithMaxSplit) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); auto result = cudf::strings::split_record(cudf::strings_column_view(strings), cudf::string_scalar(" "), 1); - cudf::lists_column_view lcv(result->view()); - cudf::test::strings_column_wrapper expected( - {"", "Héllo thesé", "are", "some ", "tést", "String", ""}); - cudf::test::fixed_width_column_wrapper offsets({0, 2, 2, 4, 6, 7}); - cudf::test::expect_columns_equal(lcv.child(), expected); - cudf::test::expect_columns_equal(lcv.offsets(), offsets); + using LCW = cudf::test::lists_column_wrapper; + LCW expected( + {LCW{"", "Héllo thesé"}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{""}}, + validity); + cudf::test::expect_columns_equal(result->view(), expected); } TEST_F(StringsSplitTest, SplitRecordWhitespace) { std::vector h_strings{ " Héllo thesé", nullptr, "are\tsome ", "tést\nString", " "}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), 
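    // validity: a row is non-null iff its host string pointer is non-null;
    // null input rows become null list rows in the split_record result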
validity); auto result = cudf::strings::split_record(cudf::strings_column_view(strings)); - cudf::lists_column_view lcv(result->view()); - cudf::test::strings_column_wrapper expected({"Héllo", "thesé", "are", "some", "tést", "String"}); - cudf::test::fixed_width_column_wrapper offsets({0, 2, 2, 4, 6, 6}); - cudf::test::expect_columns_equal(lcv.child(), expected); - cudf::test::expect_columns_equal(lcv.offsets(), offsets); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{"Héllo", "thesé"}, LCW{}, LCW{"are", "some"}, LCW{"tést", "String"}, LCW{}}, + validity); + cudf::test::expect_columns_equal(result->view(), expected); } TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) { std::vector h_strings{ " Héllo thesé ", nullptr, "are\tsome ", "tést\nString", " "}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); auto result = cudf::strings::split_record(cudf::strings_column_view(strings), cudf::string_scalar(""), 1); - cudf::lists_column_view lcv(result->view()); - cudf::test::strings_column_wrapper expected( - {"Héllo", "thesé ", "are", "some ", "tést", "String"}); - cudf::test::fixed_width_column_wrapper offsets({0, 2, 2, 4, 6, 6}); - cudf::test::expect_columns_equal(lcv.child(), expected); - cudf::test::expect_columns_equal(lcv.offsets(), offsets); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{"Héllo", "thesé "}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{}}, + validity); + cudf::test::expect_columns_equal(result->view(), expected); } TEST_F(StringsSplitTest, RSplitRecord) { std::vector h_strings{ "héllo", nullptr, "a_bc_déf", "a__bc", "_ab_cd", "ab_cd_", "", " a b ", " a bbb c"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - - cudf::test::strings_column_wrapper expected({"héllo", - "a", - "bc", - "déf", - "a", - "", - "bc", - "", - "ab", - "cd", - "ab", - "cd", - "", - "", - " a b ", - " a bbb c"}); - cudf::test::fixed_width_column_wrapper offsets({0, 1, 1, 4, 7, 10, 13, 14, 15, 16}); + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{"héllo"}, + LCW{}, + LCW{"a", "bc", "déf"}, + LCW{"a", "", "bc"}, + LCW{"", "ab", "cd"}, + LCW{"ab", "cd", ""}, + LCW{""}, + LCW{" a b "}, + LCW{" a bbb c"}}, + validity); auto result = cudf::strings::rsplit_record(cudf::strings_column_view(strings), cudf::string_scalar("_")); - cudf::lists_column_view lcv(result->view()); - cudf::test::expect_columns_equal(lcv.child(), expected); - cudf::test::expect_columns_equal(lcv.offsets(), offsets); + cudf::test::expect_columns_equal(result->view(), expected); } TEST_F(StringsSplitTest, RSplitRecordWithMaxSplit) @@ -392,60 +374,60 @@ TEST_F(StringsSplitTest, RSplitRecordWithMaxSplit) "", " a b ___", "___ a bbb c"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return 
str != nullptr; })); - - cudf::test::strings_column_wrapper expected( - {"héllo", "a", "bc", "déf", "___a", "", "bc", "_ab", "cd", "", - "ab", "cd", "", "", " a b _", "", "", "_", "", " a bbb c"}); - cudf::test::fixed_width_column_wrapper offsets({0, 1, 1, 4, 7, 10, 13, 14, 17, 20}); + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{"héllo"}, + LCW{}, + LCW{"a", "bc", "déf"}, + LCW{"___a", "", "bc"}, + LCW{"_ab", "cd", ""}, + LCW{"ab", "cd", ""}, + LCW{""}, + LCW{" a b _", "", ""}, + LCW{"_", "", " a bbb c"}}, + validity); auto result = cudf::strings::rsplit_record(cudf::strings_column_view(strings), cudf::string_scalar("_"), 2); - cudf::lists_column_view lcv(result->view()); - cudf::test::expect_columns_equal(lcv.child(), expected); - cudf::test::expect_columns_equal(lcv.offsets(), offsets); + cudf::test::expect_columns_equal(result->view(), expected); } TEST_F(StringsSplitTest, RSplitRecordWhitespace) { std::vector h_strings{"héllo", nullptr, "a_bc_déf", "", " a\tb ", " a\r bbb c"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); - cudf::test::strings_column_wrapper expected({"héllo", "a_bc_déf", "a", "b", "a", "bbb", "c"}); - cudf::test::fixed_width_column_wrapper offsets({0, 1, 1, 2, 2, 4, 7}); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{"héllo"}, LCW{}, LCW{"a_bc_déf"}, LCW{}, LCW{"a", "b"}, LCW{"a", "bbb", "c"}}, + validity); auto result = cudf::strings::rsplit_record(cudf::strings_column_view(strings)); - cudf::lists_column_view lcv(result->view()); - cudf::test::expect_columns_equal(lcv.child(), expected); - cudf::test::expect_columns_equal(lcv.offsets(), offsets); + cudf::test::expect_columns_equal(result->view(), expected); } TEST_F(StringsSplitTest, RSplitRecordWhitespaceWithMaxSplit) { std::vector h_strings{ " héllo Asher ", nullptr, " a_bc_déf ", "", " a\tb ", " a\r bbb c"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); - cudf::test::strings_column_wrapper expected( - {" héllo", "Asher", "a_bc_déf", " a", "b", " a\r bbb", "c"}); - cudf::test::fixed_width_column_wrapper offsets({0, 2, 2, 3, 3, 5, 7}); + using LCW = cudf::test::lists_column_wrapper; + LCW expected( + {LCW{" héllo", "Asher"}, LCW{}, LCW{"a_bc_déf"}, LCW{}, LCW{" a", "b"}, LCW{" a\r bbb", "c"}}, + validity); auto result = cudf::strings::rsplit_record(cudf::strings_column_view(strings), cudf::string_scalar(""), 1); - cudf::lists_column_view lcv(result->view()); - cudf::test::expect_columns_equal(lcv.child(), expected); - cudf::test::expect_columns_equal(lcv.offsets(), offsets); + cudf::test::expect_columns_equal(result->view(), expected); } TEST_F(StringsSplitTest, SplitRecordZeroSizeStringsColumns)
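To close, the lists_column_wrapper pattern used throughout these refactored tests reduces to a minimal form like the sketch below; this is an editorial illustration only, the test name and column contents are hypothetical, and it assumes the cudf test utilities already included by this file:

TEST_F(StringsSplitTest, SplitRecordMinimalSketch)
{
  cudf::test::strings_column_wrapper input({"a_b", "c"});
  auto result = cudf::strings::split_record(cudf::strings_column_view(input),
                                            cudf::string_scalar("_"));

  using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
  LCW expected({LCW{"a", "b"}, LCW{"c"}});  // one list row per input row
  cudf::test::expect_columns_equal(result->view(), expected);
}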