diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2a637cd306f..78a10ddd302 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -123,6 +123,7 @@
- PR #5662 Make Java ColumnVector(long nativePointer) constructor public
- PR #5679 Use `pickle5` to test older Python versions
- PR #5684 Use `pickle5` in `Serializable` (when available)
+- PR #5687 Change strings::split_record to return a lists column
- PR #5708 Add support for `dummy_na` in `get_dummies`
- PR #5709 Update java build to help cu-spacial with java bindings
- PR #5713 Remove old NVTX utilities
diff --git a/cpp/include/cudf/strings/split/split.hpp b/cpp/include/cudf/strings/split/split.hpp
index 371048287ca..87e423236e9 100644
--- a/cpp/include/cudf/strings/split/split.hpp
+++ b/cpp/include/cudf/strings/split/split.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -82,82 +82,149 @@ std::unique_ptr
rsplit(
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
/**
- * @brief The result(s) of a `contiguous_(r)split_record`
- *
- * Each column_view resulting from a split operation performed by
- * contiguous_split_record will be returned wrapped in a
- * `contiguous_split_record_result`. The column data addresses stored in the
- * column_view objects are not owned by top level cudf::column objects. The
- * backing memory is instead owned by the `all_data` field and in one contiguous
- * block.
- *
- * The user is responsible for assuring that the `column_views` or any derived
- * objects do not outlive the memory owned by `all_data`
- */
-struct contiguous_split_record_result {
- std::vector column_views;
- std::unique_ptr all_data;
-};
-
-/**
- * @brief Splits each element of the input column to a column of tokens storing
- * the resulting columns in a single contiguous block of memory.
- *
- * This function splits each element in the input column to a column of tokens.
- * The number of columns in the output vector will be the same as the number of
- * elements in the input column. The column length will coincide with the
- * number of tokens; the resulting columns wrapped in the returned object may
- * have different sizes.
- *
- * Splitting a null string element will result in an empty output column.
- *
- * @throws cudf:logic_error if `delimiter` is invalid.
+ * @brief Splits each string element into a list of strings.
+ *
+ * Each element generates an array of strings that are stored in an output
+ * lists column.
+ *
+ * The number of elements in the output column will be the same as the number of
+ * elements in the input column. Each individual list item will contain the
+ * new strings for that row. The resulting number of strings in each row can vary
+ * from 0 to `maxsplit + 1`.
+ *
+ * The `delimiter` is searched within each string from beginning to end
+ * and splitting stops when either `maxsplit` or the end of the string is reached.
+ *
+ * If a delimiter is not whitespace and occurs adjacent to another delimiter,
+ * an empty string is produced for that split occurrence. Likewise, a non-whitespace
+ * delimiter produces an empty string if it appears at the beginning or the end
+ * of a string.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc_def_g", "a__bc", "_ab_cd", "ab_cd_"]
+ * s1 = split_record(s, "_")
+ * s1 is a lists column of strings:
+ * [ ["a", "bc", "def", "g"],
+ * ["a", "", "bc"],
+ * ["", "ab", "cd"],
+ * ["ab", "cd", ""] ]
+ * s2 = split_record(s, "_", 1)
+ * s2 is a lists column of strings:
+ * [ ["a", "bc_def_g"],
+ * ["a", "_bc"],
+ * ["", "ab_cd"],
+ * ["ab", "cd_"] ]
+ * @endcode
+ *
+ * A whitespace delimiter produces no empty strings.
+ * @code{.pseudo}
+ * s = ["a bc def", "a bc", " ab cd", "ab cd "]
+ * s1 = split_record(s, "")
+ * s1 is a lists column of strings:
+ * [ ["a", "bc", "def"],
+ * ["a", "bc"],
+ * ["ab", "cd"],
+ * ["ab", "cd"] ]
+ * s2 = split_record(s, "", 1)
+ * s2 is a lists column of strings:
+ * [ ["a", "bc def"],
+ * ["a", "bc"],
+ * ["ab", "cd"],
+ * ["ab", "cd "] ]
+ * @endcode
+ *
+ * A null string element will result in a null list item for that row.
+ *
+ * @throw cudf::logic_error if `delimiter` is invalid.
*
* @param strings A column of string elements to be splitted.
- * @param delimiter UTF-8 encoded string indicating the split points in each
- * string.
+ * @param delimiter The string to identify split points in each string.
* Default of empty string indicates split on whitespace.
* @param maxsplit Maximum number of splits to perform.
* Default of -1 indicates all possible splits on each string.
* @param mr Device memory resource used to allocate the returned result's device memory.
- * @return contiguous_split_record_result New vector of strings column_view
- * objects
- * (each column_view element of the vector holds splits from a string
- * element of the input column).
+ * @return Lists column of strings.
+ * Each list in the lists column holds the splits from a single row
+ * of the input column.
*/
-contiguous_split_record_result contiguous_split_record(
+std::unique_ptr split_record(
strings_column_view const& strings,
string_scalar const& delimiter = string_scalar(""),
size_type maxsplit = -1,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
/**
- * @brief Splits each element of the input column from the end to a column of
- * tokens storing the resulting columns in a single contiguous block of memory.
- *
- * This function splits each element in the input column to a column of tokens.
- * The number of columns in the output vector will be the same as the number of
- * elements in the input column. The column length will coincide with the
- * number of tokens; the resulting columns wrapped in the returned object may
- * have different sizes.
- *
- * Splitting a null string element will result in an empty output column.
- *
- * @throws cudf:logic_error if `delimiter` is invalid.
+ * @brief Splits each string element into a list of strings starting
+ * from the end of each string.
+ *
+ * Each element generates an array of strings that are stored in an output
+ * lists column.
+ *
+ * The number of elements in the output column will be the same as the number of
+ * elements in the input column. Each individual list item will contain the
+ * new strings for that row. The resulting number of strings in each row can vary
+ * from 0 to `maxsplit + 1`.
+ *
+ * The `delimiter` is searched from end to beginning within each string
+ * and splitting stops when either `maxsplit` or the beginning of the string
+ * is reached.
+ *
+ * If a delimiter is not whitespace and occurs adjacent to another delimiter,
+ * an empty string is produced for that split occurrence. Likewise, a non-whitespace
+ * delimiter produces an empty string if it appears at the beginning or the end
+ * of a string.
+ *
+ * Note that `rsplit_record` and `split_record` produce equivalent results for
+ * the default `maxsplit` value.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc_def_g", "a__bc", "_ab_cd", "ab_cd_"]
+ * s1 = rsplit_record(s, "_")
+ * s1 is a lists column of strings:
+ * [ ["a", "bc", "def", "g"],
+ * ["a", "", "bc"],
+ * ["", "ab", "cd"],
+ * ["ab", "cd", ""] ]
+ * s2 = rsplit_record(s, "_", 1)
+ * s2 is a lists column of strings:
+ * [ ["a_bc_def", "g"],
+ * ["a_", "bc"],
+ * ["_ab", "cd"],
+ * ["ab_cd", ""] ]
+ * @endcode
+ *
+ * A whitespace delimiter produces no empty strings.
+ * @code{.pseudo}
+ * s = ["a bc def", "a bc", " ab cd", "ab cd "]
+ * s1 = rsplit_record(s, "")
+ * s1 is a lists column of strings:
+ * [ ["a", "bc", "def"],
+ * ["a", "bc"],
+ * ["ab", "cd"],
+ * ["ab", "cd"] ]
+ * s2 = rsplit_record(s, "", 1)
+ * s2 is a lists column of strings:
+ * [ ["a bc", "def"],
+ * ["a", "bc"],
+ * [" ab", "cd"],
+ * ["ab", "cd"] ]
+ * @endcode
+ *
+ * A null string element will result in a null list item for that row.
+ *
+ * @throw cudf::logic_error if `delimiter` is invalid.
*
* @param strings A column of string elements to be splitted.
- * @param delimiter UTF-8 encoded string indicating the split points in each
- * string.
+ * @param delimiter The string to identify split points in each string.
* Default of empty string indicates split on whitespace.
* @param maxsplit Maximum number of splits to perform.
* Default of -1 indicates all possible splits on each string.
* @param mr Device memory resource used to allocate the returned result's device memory.
- * @return contiguous_split_record_result New vector of strings column_view
- * objects
- * (each column_view element of the vector holds splits from a string
- * element of the input column).
+ * @return Lists column of strings.
+ * Each list in the lists column holds the splits from a single row
+ * of the input column.
*/
-contiguous_split_record_result contiguous_rsplit_record(
+std::unique_ptr rsplit_record(
strings_column_view const& strings,
string_scalar const& delimiter = string_scalar(""),
size_type maxsplit = -1,
diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu
index 89d09d56517..3d7d902551f 100644
--- a/cpp/src/strings/split/split.cu
+++ b/cpp/src/strings/split/split.cu
@@ -24,6 +24,7 @@
#include
#include
#include
+#include
#include // upper_bound()
#include // copy_if()
@@ -34,8 +35,8 @@
namespace cudf {
namespace strings {
namespace detail {
+
using string_index_pair = thrust::pair;
-using position_pair = thrust::pair;
namespace {
@@ -582,99 +583,6 @@ struct base_whitespace_split_tokenizer {
size_type max_tokens; // maximum number of tokens
};
-/**
- * @brief Instantiated for each string to manage navigating tokens from
- * the beginning or the end of that string.
- */
-struct whitespace_string_tokenizer {
- /**
- * @brief Identifies the position range of the next token in the given
- * string at the specified iterator position.
- *
- * Tokens are delimited by one or more whitespace characters.
- *
- * @return true if a token has been found
- */
- __device__ bool next_token()
- {
- if (itr != d_str.begin()) { // skip these 2 lines the first time through
- start_position = end_position + 1;
- ++itr;
- }
- if (start_position >= d_str.length()) return false;
- // continue search for the next token
- end_position = d_str.length();
- for (; itr < d_str.end(); ++itr) {
- if (spaces == (*itr <= ' ')) {
- if (spaces)
- start_position = itr.position() + 1;
- else
- end_position = itr.position() + 1;
- continue;
- }
- spaces = !spaces;
- if (spaces) {
- end_position = itr.position();
- break;
- }
- }
- return start_position < end_position;
- }
-
- /**
- * @brief Identifies the position range of the previous token in the given
- * string at the specified iterator position.
- *
- * Tokens are delimited by one or more whitespace characters.
- *
- * @return true if a token has been found
- */
- __device__ bool prev_token()
- {
- end_position = start_position - 1;
- --itr;
- if (end_position <= 0) return false;
- // continue search for the next token
- start_position = 0;
- for (; itr >= d_str.begin(); --itr) {
- if (spaces == (*itr <= ' ')) {
- if (spaces)
- end_position = itr.position();
- else
- start_position = itr.position();
- continue;
- }
- spaces = !spaces;
- if (spaces) {
- start_position = itr.position() + 1;
- break;
- }
- }
- return start_position < end_position;
- }
-
- __device__ position_pair token_byte_positions()
- {
- return position_pair{d_str.byte_offset(start_position), d_str.byte_offset(end_position)};
- }
-
- __device__ whitespace_string_tokenizer(string_view const& d_str, bool reverse = false)
- : d_str{d_str},
- spaces(true),
- start_position{reverse ? d_str.length() + 1 : 0},
- end_position{d_str.length()},
- itr{reverse ? d_str.end() : d_str.begin()}
- {
- }
-
- private:
- string_view const d_str;
- bool spaces; // true if current position is whitespace
- cudf::string_view::const_iterator itr;
- size_type start_position;
- size_type end_position;
-};
-
/**
* @brief The tokenizer functions for split() with whitespace.
*
@@ -709,7 +617,7 @@ struct whitespace_split_tokenizer_fn : base_whitespace_split_tokenizer {
size_type token_idx = 0;
position_pair token{0, 0};
while (tokenizer.next_token() && (token_idx < token_count)) {
- token = tokenizer.token_byte_positions();
+ token = tokenizer.get_token();
d_tokens[d_strings.size() * (token_idx++)] =
string_index_pair{d_str.data() + token.first, (token.second - token.first)};
}
@@ -760,7 +668,7 @@ struct whitespace_rsplit_tokenizer_fn : base_whitespace_split_tokenizer {
size_type token_idx = 0;
position_pair token{0, 0};
while (tokenizer.prev_token() && (token_idx < token_count)) {
- token = tokenizer.token_byte_positions();
+ token = tokenizer.get_token();
d_tokens[d_strings.size() * (token_count - 1 - token_idx)] =
string_index_pair{d_str.data() + token.first, (token.second - token.first)};
++token_idx;
diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu
index 4a069906f80..7d0aee57bd5 100644
--- a/cpp/src/strings/split/split_record.cu
+++ b/cpp/src/strings/split/split_record.cu
@@ -16,249 +16,145 @@
#include
#include
+#include
+#include
#include
-#include
+#include
#include
#include
#include
+#include
+#include
#include
-#include
namespace cudf {
namespace strings {
namespace detail {
-namespace {
-// align all column size allocations to this boundary so that all output column buffers
-// start at that alignment.
-static constexpr size_type split_align = 64;
+using string_index_pair = thrust::pair;
-__device__ size_type compute_memory_size(size_type token_count, size_type token_size_sum)
-{
- return cudf::detail::round_up_pow2(token_size_sum, split_align) +
- cudf::detail::round_up_pow2((token_count + 1) * static_cast(sizeof(size_type)),
- split_align);
-}
-
-struct copy_info {
- size_type idx{};
- size_type token_count{};
- size_type token_size_sum{};
- void* memory_ptr{};
-};
+namespace {
enum class Dir { FORWARD, BACKWARD };
/**
- * @brief Compute the number of tokens, the total byte sizes of the tokens, and
- * required memory size for the `idx'th` string element of `d_strings`.
+ * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`.
+ *
+ * The number of tokens is the same regardless of whether counting starts
+ * from the beginning or the end of the string.
*/
-template
-struct token_reader_fn {
+struct token_counter_fn {
column_device_view const d_strings; // strings to split
string_view const d_delimiter; // delimiter for split
size_type const max_tokens = std::numeric_limits::max();
- bool const has_validity = false;
- template
- __device__ size_type compute_token_char_bytes(string_view const& d_str,
- size_type start_pos,
- size_type end_pos,
- size_type delimiter_pos) const
+ __device__ size_type operator()(size_type idx) const
{
- if (last) {
- return dir == Dir::FORWARD ? d_str.byte_offset(end_pos) - d_str.byte_offset(start_pos)
- : d_str.byte_offset(end_pos);
- } else {
- return dir == Dir::FORWARD ? d_str.byte_offset(delimiter_pos) - d_str.byte_offset(start_pos)
- : d_str.byte_offset(end_pos) -
- d_str.byte_offset(delimiter_pos + d_delimiter.length());
- }
- }
+ if (d_strings.is_null(idx)) { return 0; }
- // returns a tuple of token count, sum of token sizes in bytes, and required
- // memory block size
- __device__ thrust::tuple operator()(size_type idx) const
- {
- if (has_validity && d_strings.is_null(idx)) {
- return thrust::make_tuple(0, 0, 0);
- }
-
- auto const d_str = d_strings.element(idx);
- size_type token_count = 0;
- size_type token_size_sum = 0;
- size_type start_pos = 0; // updates only if moving forward
- auto end_pos = d_str.length(); // updates only if moving backward
+ auto const d_str = d_strings.element(idx);
+ size_type token_count = 0;
+ size_type start_pos = 0;
while (token_count < max_tokens - 1) {
- auto const delimiter_pos = dir == Dir::FORWARD ? d_str.find(d_delimiter, start_pos)
- : d_str.rfind(d_delimiter, start_pos, end_pos);
- if (delimiter_pos != -1) {
- token_count++;
- token_size_sum += compute_token_char_bytes(d_str, start_pos, end_pos, delimiter_pos);
- if (dir == Dir::FORWARD) {
- start_pos = delimiter_pos + d_delimiter.length();
- } else {
- end_pos = delimiter_pos;
- }
- } else {
- break;
- }
+ auto const delimiter_pos = d_str.find(d_delimiter, start_pos);
+ if (delimiter_pos < 0) break;
+ token_count++;
+ start_pos = delimiter_pos + d_delimiter.length();
}
- token_count++;
- token_size_sum += compute_token_char_bytes(d_str, start_pos, end_pos, -1);
-
- auto const memory_size = compute_memory_size(token_count, token_size_sum);
-
- return thrust::make_tuple(
- token_count, token_size_sum, memory_size);
+ return token_count + 1; // always at least one token
}
};
/**
- * @brief Copy the tokens from the `idx'th` string element of `d_strings` to
- * the contiguous memory buffer.
+ * @brief Identify the tokens from the `idx'th` string element of `d_strings`.
*/
template
-struct token_copier_fn {
+struct token_reader_fn {
column_device_view const d_strings; // strings to split
string_view const d_delimiter; // delimiter for split
- bool const has_validity = false;
+ int32_t* d_token_offsets{}; // for locating tokens in d_tokens
+ string_index_pair* d_tokens{};
- template
- __device__ thrust::pair compute_src_byte_offset_and_token_char_bytes(
- string_view const& d_str, size_type start_pos, size_type end_pos, size_type delimiter_pos) const
+ __device__ string_index_pair resolve_token(string_view const& d_str,
+ size_type start_pos,
+ size_type end_pos,
+ size_type delimiter_pos) const
{
- if (last) {
- auto const src_byte_offset = dir == Dir::FORWARD ? d_str.byte_offset(start_pos) : 0;
- auto const token_char_bytes = dir == Dir::FORWARD
- ? d_str.byte_offset(end_pos) - src_byte_offset
- : d_str.byte_offset(end_pos);
- return thrust::make_pair(src_byte_offset, token_char_bytes);
+ if (dir == Dir::FORWARD) {
+ auto const byte_offset = d_str.byte_offset(start_pos);
+ return string_index_pair{d_str.data() + byte_offset,
+ d_str.byte_offset(delimiter_pos) - byte_offset};
} else {
- auto const src_byte_offset = dir == Dir::FORWARD
- ? d_str.byte_offset(start_pos)
- : d_str.byte_offset(delimiter_pos + d_delimiter.length());
- auto const token_char_bytes = dir == Dir::FORWARD
- ? d_str.byte_offset(delimiter_pos) - src_byte_offset
- : d_str.byte_offset(end_pos) - src_byte_offset;
- return thrust::make_pair(src_byte_offset, token_char_bytes);
+ auto const byte_offset = d_str.byte_offset(delimiter_pos + d_delimiter.length());
+ return string_index_pair{d_str.data() + byte_offset,
+ d_str.byte_offset(end_pos) - byte_offset};
}
}
- __device__ void operator()(copy_info const info) const
+ __device__ void operator()(size_type idx)
{
- if (info.token_count == 0) { return; }
-
- auto memory_ptr = static_cast(info.memory_ptr);
-
- auto const char_buf_size = cudf::detail::round_up_pow2(info.token_size_sum, split_align);
- auto const char_buf_ptr = memory_ptr;
- memory_ptr += char_buf_size;
- auto const offset_buf_ptr = reinterpret_cast(memory_ptr);
+ if (d_strings.is_null(idx)) { return; }
+
+ auto const token_offset = d_token_offsets[idx];
+ auto const token_count = d_token_offsets[idx + 1] - token_offset;
+ auto d_result = d_tokens + token_offset;
+ auto const d_str = d_strings.element(idx);
+ if (d_str.empty()) {
+ // Pandas str.split() of an empty string with a non-whitespace delimiter yields one empty string
+ *d_result = string_index_pair{"", 0};
+ return;
+ }
- auto const d_str = d_strings.element(info.idx);
- size_type token_idx = 0;
- size_type char_bytes_copied = 0;
- size_type start_pos = 0; // updates only if moving forward
- auto end_pos = d_str.length(); // updates only if moving backward
- while (token_idx < info.token_count - 1) {
+ size_type token_idx = 0;
+ size_type start_pos = 0; // updates only if moving forward
+ size_type end_pos = d_str.length(); // updates only if moving backward
+ while (token_idx < token_count - 1) {
auto const delimiter_pos = dir == Dir::FORWARD ? d_str.find(d_delimiter, start_pos)
: d_str.rfind(d_delimiter, start_pos, end_pos);
- if (delimiter_pos != -1) {
- auto const offset_size_pair = compute_src_byte_offset_and_token_char_bytes(
- d_str, start_pos, end_pos, delimiter_pos);
- if (dir == Dir::FORWARD) {
- thrust::copy(thrust::seq,
- d_str.data() + offset_size_pair.first,
- d_str.data() + offset_size_pair.first + offset_size_pair.second,
- char_buf_ptr + char_bytes_copied);
- offset_buf_ptr[token_idx] = char_bytes_copied;
- } else {
- auto const char_buf_offset =
- info.token_size_sum - char_bytes_copied - offset_size_pair.second;
- thrust::copy(thrust::seq,
- d_str.data() + offset_size_pair.first,
- d_str.data() + offset_size_pair.first + offset_size_pair.second,
- char_buf_ptr + char_buf_offset);
- offset_buf_ptr[info.token_count - 1 - token_idx] = char_buf_offset;
- }
- token_idx++;
- char_bytes_copied += offset_size_pair.second;
- if (dir == Dir::FORWARD) {
- start_pos = delimiter_pos + d_delimiter.length();
- } else {
- end_pos = delimiter_pos;
- }
+ if (delimiter_pos < 0) break;
+ auto const token = resolve_token(d_str, start_pos, end_pos, delimiter_pos);
+ if (dir == Dir::FORWARD) {
+ d_result[token_idx] = token;
+ start_pos = delimiter_pos + d_delimiter.length();
} else {
- break;
+ d_result[token_count - 1 - token_idx] = token;
+ end_pos = delimiter_pos;
}
+ token_idx++;
}
- auto const offset_size_pair =
- compute_src_byte_offset_and_token_char_bytes(d_str, start_pos, end_pos, -1);
+ // set last token to remainder of the string
if (dir == Dir::FORWARD) {
- thrust::copy(thrust::seq,
- d_str.data() + offset_size_pair.first,
- d_str.data() + offset_size_pair.first + offset_size_pair.second,
- char_buf_ptr + char_bytes_copied);
- offset_buf_ptr[token_idx] = char_bytes_copied;
+ auto const offset_bytes = d_str.byte_offset(start_pos);
+ d_result[token_idx] =
+ string_index_pair{d_str.data() + offset_bytes, d_str.byte_offset(end_pos) - offset_bytes};
} else {
- thrust::copy(thrust::seq, d_str.data(), d_str.data() + offset_size_pair.second, char_buf_ptr);
- offset_buf_ptr[0] = 0;
+ d_result[0] = string_index_pair{d_str.data(), d_str.byte_offset(end_pos)};
}
- offset_buf_ptr[info.token_count] = info.token_size_sum;
}
};
/**
- * @brief Compute the number of tokens, the total byte sizes of the tokens, and
- * required memory size for the `idx'th` string element of `d_strings`.
+ * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`.
*/
-template
-struct whitespace_token_reader_fn {
+struct whitespace_token_counter_fn {
column_device_view const d_strings; // strings to split
size_type const max_tokens = std::numeric_limits::max();
- bool const has_validity = false;
-
- template
- __device__ size_type compute_token_char_bytes(string_view const& d_str,
- size_type cur_pos,
- size_type to_token_pos) const
- {
- if (last) {
- return dir == Dir::FORWARD
- ? d_str.byte_offset(d_str.length()) - d_str.byte_offset(to_token_pos)
- : d_str.byte_offset(to_token_pos + 1) - d_str.byte_offset(0);
- } else {
- return dir == Dir::FORWARD
- ? d_str.byte_offset(cur_pos) - d_str.byte_offset(to_token_pos)
- : d_str.byte_offset(to_token_pos + 1) - d_str.byte_offset(cur_pos + 1);
- }
- }
- __device__ thrust::tuple operator()(size_type idx) const
+ __device__ size_type operator()(size_type idx) const
{
- if (has_validity && d_strings.is_null(idx)) {
- return thrust::make_tuple(0, 0, 0);
- }
+ if (d_strings.is_null(idx)) { return 0; }
- auto const d_str = d_strings.element(idx);
- size_type token_count = 0;
- size_type token_size_sum = 0;
- auto spaces = true;
- auto reached_max_tokens = false;
- size_type to_token_pos = 0;
- for (size_type i = 0; i < d_str.length(); ++i) {
- auto const cur_pos = dir == Dir::FORWARD ? i : d_str.length() - 1 - i;
- auto const ch = d_str[cur_pos];
+ auto const d_str = d_strings.element(idx);
+ size_type token_count = 0;
+ auto spaces = true;
+ auto reached_max_tokens = false;
+ for (auto ch : d_str) {
if (spaces != (ch <= ' ')) {
- if (spaces) { // from whitespace(s) to a new token
- to_token_pos = cur_pos;
- } else { // from a token to whitespace(s)
+ if (!spaces) {
if (token_count < max_tokens - 1) {
token_count++;
- token_size_sum += compute_token_char_bytes(d_str, cur_pos, to_token_pos);
} else {
reached_max_tokens = true;
break;
@@ -267,217 +163,102 @@ struct whitespace_token_reader_fn {
spaces = !spaces;
}
}
- if (reached_max_tokens || !spaces) {
- token_count++;
- token_size_sum += compute_token_char_bytes(d_str, -1, to_token_pos);
- }
-
- if (token_count == 0) { // note that pandas.Series.str.split("", pat=" ")
- // returns one token (i.e. "") while
- // pandas.Series.str.split("") returns 0 token.
- return thrust::make_tuple(0, 0, 0);
- }
-
- auto const memory_size = compute_memory_size(token_count, token_size_sum);
-
- return thrust::make_tuple(
- token_count, token_size_sum, memory_size);
+ // pandas.Series.str.split("") returns 0 tokens.
+ if (reached_max_tokens || !spaces) token_count++;
+ return token_count;
}
};
/**
- * @brief Copy the tokens from the `idx'th` string element of `d_strings` to
- * the contiguous memory buffer.
+ * @brief Identify the tokens from the `idx'th` string element of `d_strings`.
*/
template
-struct whitespace_token_copier_fn {
+struct whitespace_token_reader_fn {
column_device_view const d_strings; // strings to split
- bool const has_validity = false;
-
- template
- __device__ thrust::pair compute_src_byte_offset_and_token_char_bytes(
- string_view const& d_str,
- size_type cur_pos,
- size_type to_token_pos,
- size_type remaining_bytes) const
- {
- if (last) {
- auto const token_char_bytes = remaining_bytes;
- auto const src_byte_offset = dir == Dir::FORWARD
- ? d_str.byte_offset(to_token_pos)
- : d_str.byte_offset(to_token_pos + 1) - token_char_bytes;
- return thrust::make_pair(src_byte_offset, token_char_bytes);
- } else {
- auto const src_byte_offset =
- dir == Dir::FORWARD ? d_str.byte_offset(to_token_pos) : d_str.byte_offset(cur_pos + 1);
- auto const token_char_bytes = dir == Dir::FORWARD
- ? d_str.byte_offset(cur_pos) - src_byte_offset
- : d_str.byte_offset(to_token_pos + 1) - src_byte_offset;
- return thrust::make_pair(src_byte_offset, token_char_bytes);
- }
- }
+ size_type const max_tokens{};
+ int32_t* d_token_offsets{};
+ string_index_pair* d_tokens{};
- __device__ void operator()(copy_info const info) const
+ __device__ void operator()(size_type idx)
{
- if (info.token_count == 0) { return; }
-
- auto memory_ptr = static_cast(info.memory_ptr);
-
- auto const char_buf_size = cudf::detail::round_up_pow2(info.token_size_sum, split_align);
- auto const char_buf_ptr = memory_ptr;
- memory_ptr += char_buf_size;
- auto const offset_buf_ptr = reinterpret_cast(memory_ptr);
-
- auto const d_str = d_strings.element(info.idx);
- size_type token_idx = 0;
- size_type char_bytes_copied = 0;
- auto spaces = true;
- size_type to_token_pos = 0;
- for (size_type i = 0; i < d_str.length(); ++i) {
- auto const cur_pos = dir == Dir::FORWARD ? i : d_str.length() - 1 - i;
- auto const ch = d_str[cur_pos];
- if (spaces != (ch <= ' ')) {
- if (spaces) { // from whitespace(s) to a new token
- to_token_pos = cur_pos;
- } else { // from a token to whitespace(s)
- if (token_idx < info.token_count - 1) {
- auto const offset_size_pair = compute_src_byte_offset_and_token_char_bytes(
- d_str, cur_pos, to_token_pos, info.token_size_sum - char_bytes_copied);
- if (dir == Dir::FORWARD) {
- thrust::copy(thrust::seq,
- d_str.data() + offset_size_pair.first,
- d_str.data() + offset_size_pair.first + offset_size_pair.second,
- char_buf_ptr + char_bytes_copied);
- offset_buf_ptr[token_idx] = char_bytes_copied;
- } else {
- auto const char_buf_offset =
- info.token_size_sum - char_bytes_copied - offset_size_pair.second;
- thrust::copy(thrust::seq,
- d_str.data() + offset_size_pair.first,
- d_str.data() + offset_size_pair.first + offset_size_pair.second,
- char_buf_ptr + char_buf_offset);
- offset_buf_ptr[info.token_count - 1 - token_idx] = char_buf_offset;
- }
- token_idx++;
- char_bytes_copied += offset_size_pair.second;
- } else {
- break;
- }
- }
- spaces = !spaces;
+ auto const token_offset = d_token_offsets[idx];
+ auto const token_count = d_token_offsets[idx + 1] - token_offset;
+ if (token_count == 0) { return; }
+ auto d_result = d_tokens + token_offset;
+
+ auto const d_str = d_strings.element(idx);
+ whitespace_string_tokenizer tokenizer(d_str, dir != Dir::FORWARD);
+ size_type token_idx = 0;
+ position_pair token{0, 0};
+ if (dir == Dir::FORWARD) {
+ while (tokenizer.next_token() && (token_idx < token_count)) {
+ token = tokenizer.get_token();
+ d_result[token_idx++] =
+ string_index_pair{d_str.data() + token.first, token.second - token.first};
}
- }
- if (token_idx < info.token_count) {
- auto const offset_size_pair = compute_src_byte_offset_and_token_char_bytes(
- d_str, -1, to_token_pos, info.token_size_sum - char_bytes_copied);
- if (dir == Dir::FORWARD) {
- thrust::copy(thrust::seq,
- d_str.data() + offset_size_pair.first,
- d_str.data() + offset_size_pair.first + offset_size_pair.second,
- char_buf_ptr + char_bytes_copied);
- offset_buf_ptr[token_idx] = char_bytes_copied;
- } else {
- thrust::copy(thrust::seq,
- d_str.data() + offset_size_pair.first,
- d_str.data() + offset_size_pair.first + offset_size_pair.second,
- char_buf_ptr);
- offset_buf_ptr[0] = 0;
+ --token_idx;
+ token.second = d_str.size_bytes() - token.first;
+ } else {
+ while (tokenizer.prev_token() && (token_idx < token_count)) {
+ token = tokenizer.get_token();
+ d_result[token_count - 1 - token_idx] =
+ string_index_pair{d_str.data() + token.first, token.second - token.first};
+ ++token_idx;
}
+ token_idx = token_count - token_idx; // token_count - 1 - (token_idx-1)
+ token.first = 0;
}
- offset_buf_ptr[info.token_count] = info.token_size_sum;
+ // reset last token only if we hit the max
+ if (token_count == max_tokens)
+ d_result[token_idx] = string_index_pair{d_str.data() + token.first, token.second};
}
};
-// Generic split function used by split_record and rsplit_record
-template
-contiguous_split_record_result contiguous_split_record_fn(strings_column_view const& strings,
- TokenReader reader,
- TokenCopier copier,
- rmm::mr::device_memory_resource* mr,
- cudaStream_t stream)
-{
- // read each string element of the input column to count the number of tokens
- // and compute the memory offsets
+} // namespace
+/**
+ * @brief Splits every row of a strings column into tokens and returns the
+ * result as a lists column. The output is one list item per string.
+ *
+ * Steps performed:
+ * - `counter` is invoked once per row index and its results are written to the
+ *   first `strings_count` entries of an INT32 offsets column that holds
+ *   `strings_count + 1` entries and has no null mask.
+ * - An in-place exclusive scan converts the per-row counts into list offsets,
+ *   leaving the total number of tokens in the final entry. The final input
+ *   entry is never written by the transform; this is harmless because the
+ *   result of an exclusive scan does not depend on the last input element.
+ * - `reader` receives the offsets (`d_token_offsets`) and a device buffer of
+ *   `total_tokens` index-pairs (`d_tokens`), and is invoked once per row index
+ *   to fill that buffer.
+ * - The index-pairs are converted into one strings column, which becomes the
+ *   child of the returned lists column. The lists column reuses the input's
+ *   null count and a copy of the input's null mask.
+ *
+ * @param strings Strings column to split.
+ * @param counter Functor called with a row index; its result is stored as that
+ *                row's token count.
+ * @param reader Functor called with a row index after `d_token_offsets` and
+ *               `d_tokens` have been set on it.
+ * @param mr Memory resource passed to the offsets, strings and bitmask allocations.
+ * @param stream CUDA stream passed to the thrust calls and column factories.
+ * @return Lists column with `strings.size()` rows.
+ */
+template
+std::unique_ptr split_record_fn(strings_column_view const& strings,
+ TokenCounter counter,
+ TokenReader reader,
+ rmm::mr::device_memory_resource* mr,
+ cudaStream_t stream)
+{
+ // create offsets column by counting the number of tokens per string
auto strings_count = strings.size();
- rmm::device_vector d_token_counts(strings_count);
- rmm::device_vector d_token_size_sums(strings_count);
- rmm::device_vector d_memory_offsets(strings_count + 1);
-
+ auto offsets = make_numeric_column(
+ data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
+ auto d_offsets = offsets->mutable_view().data();
thrust::transform(rmm::exec_policy(stream)->on(stream),
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(strings_count),
- thrust::make_zip_iterator(thrust::make_tuple(
- d_token_counts.begin(), d_token_size_sums.begin(), d_memory_offsets.begin())),
- reader);
-
- thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream),
- d_memory_offsets.begin(),
- d_memory_offsets.end(),
- d_memory_offsets.begin());
-
- // allocate and copy
-
- thrust::host_vector h_token_counts = d_token_counts;
- thrust::host_vector h_token_size_sums = d_token_size_sums;
- thrust::host_vector h_memory_offsets = d_memory_offsets;
-
- auto memory_size = h_memory_offsets.back();
- auto all_data_ptr = std::make_unique(memory_size, stream, mr);
-
- auto d_all_data_ptr = reinterpret_cast(all_data_ptr->data());
- auto d_token_counts_ptr = d_token_counts.data().get();
- auto d_memory_offsets_ptr = d_memory_offsets.data().get();
- auto d_token_size_sums_ptr = d_token_size_sums.data().get();
- auto copy_info_begin = thrust::make_transform_iterator(
- thrust::make_counting_iterator(0),
- [d_all_data_ptr, d_token_counts_ptr, d_memory_offsets_ptr, d_token_size_sums_ptr] __device__(
- auto i) {
- return copy_info{i,
- d_token_counts_ptr[i],
- d_token_size_sums_ptr[i],
- d_all_data_ptr + d_memory_offsets_ptr[i]};
- });
-
- thrust::for_each(
- rmm::exec_policy(stream)->on(stream), copy_info_begin, copy_info_begin + strings_count, copier);
-
- // update column_view objects
-
- std::vector column_views{};
- for (size_type i = 0; i < strings_count; ++i) {
- if (h_token_counts[i] == 0) {
- column_views.emplace_back(strings.parent().type(), 0, nullptr);
- } else {
- auto memory_ptr = d_all_data_ptr + h_memory_offsets[i];
- auto char_buf_size = cudf::util::round_up_safe(h_token_size_sums[i], split_align);
-
- auto char_buf_ptr = memory_ptr;
- memory_ptr += char_buf_size;
- auto offset_buf_ptr = reinterpret_cast(memory_ptr);
-
- column_views.emplace_back(
- strings.parent().type(),
- h_token_counts[i],
- nullptr,
- nullptr,
- UNKNOWN_NULL_COUNT,
- 0,
- std::vector{
- column_view(strings.offsets().type(), h_token_counts[i] + 1, offset_buf_ptr),
- column_view(strings.chars().type(), h_token_size_sums[i], char_buf_ptr)});
- }
- }
-
- CUDA_TRY(cudaStreamSynchronize(stream));
-
- return contiguous_split_record_result{std::move(column_views), std::move(all_data_ptr)};
+ d_offsets,
+ counter); // writes entries [0, strings_count); entry [strings_count] stays unwritten
+ thrust::exclusive_scan(
+ rmm::exec_policy(stream)->on(stream), d_offsets, d_offsets + strings_count + 1, d_offsets);
+
+ // last entry is the total number of tokens to be generated
+ auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream);
+ // split each string into an array of index-pair values
+ rmm::device_vector tokens(total_tokens);
+ reader.d_token_offsets = d_offsets; // reader is a by-value copy, so patching it here is local
+ reader.d_tokens = tokens.data().get();
+ thrust::for_each_n(rmm::exec_policy(stream)->on(stream),
+ thrust::make_counting_iterator(0),
+ strings_count,
+ reader);
+ // convert the index-pairs into one big strings column
+ auto strings_output = make_strings_column(tokens.begin(), tokens.end(), mr, stream);
+ // create a lists column using the offsets and the strings columns
+ return make_lists_column(strings_count,
+ std::move(offsets),
+ std::move(strings_output),
+ strings.null_count(),
+ copy_bitmask(strings.parent(), stream, mr)); // null rows of the input stay null in the output
}
-} // namespace
-
template
-contiguous_split_record_result contiguous_split_record(
+std::unique_ptr split_record( // builds the functors for the chosen delimiter mode and calls split_record_fn
strings_column_view const& strings,
string_scalar const& delimiter = string_scalar(""),
size_type maxsplit = -1,
@@ -488,24 +269,21 @@ contiguous_split_record_result contiguous_split_record(
// makes consistent with Pandas
size_type max_tokens = maxsplit > 0 ? maxsplit + 1 : std::numeric_limits::max();
- auto has_validity = strings.parent().nullable();
auto d_strings_column_ptr = column_device_view::create(strings.parent(), stream);
if (delimiter.size() == 0) {
- return contiguous_split_record_fn(
- strings,
- whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens, has_validity},
- whitespace_token_copier_fn{*d_strings_column_ptr, has_validity},
- mr,
- stream);
+ return split_record_fn(strings, // empty delimiter: use the whitespace counter/reader functors
+ whitespace_token_counter_fn{*d_strings_column_ptr, max_tokens},
+ whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens},
+ mr,
+ stream);
} else {
string_view d_delimiter(delimiter.data(), delimiter.size());
- return contiguous_split_record_fn(
- strings,
- token_reader_fn{*d_strings_column_ptr, d_delimiter, max_tokens, has_validity},
- token_copier_fn{*d_strings_column_ptr, d_delimiter, has_validity},
- mr,
- stream);
+ return split_record_fn(strings, // non-empty delimiter: use the delimiter-based counter/reader functors
+ token_counter_fn{*d_strings_column_ptr, d_delimiter, max_tokens},
+ token_reader_fn{*d_strings_column_ptr, d_delimiter},
+ mr,
+ stream);
}
}
@@ -513,23 +291,22 @@ contiguous_split_record_result contiguous_split_record(
// external APIs
-contiguous_split_record_result contiguous_split_record(strings_column_view const& strings,
- string_scalar const& delimiter,
- size_type maxsplit,
- rmm::mr::device_memory_resource* mr)
+std::unique_ptr split_record(strings_column_view const& strings, // public entry point
+ string_scalar const& delimiter,
+ size_type maxsplit,
+ rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
- return detail::contiguous_split_record(strings, delimiter, maxsplit, mr, 0);
+ return detail::split_record(strings, delimiter, maxsplit, mr, 0); // trailing 0 is presumably the default CUDA stream - confirm against the detail signature
}
-contiguous_split_record_result contiguous_rsplit_record(strings_column_view const& strings,
- string_scalar const& delimiter,
- size_type maxsplit,
- rmm::mr::device_memory_resource* mr)
+std::unique_ptr rsplit_record(strings_column_view const& strings, // public entry point
+ string_scalar const& delimiter,
+ size_type maxsplit,
+ rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
- return detail::contiguous_split_record(
- strings, delimiter, maxsplit, mr, 0);
+ return detail::split_record(strings, delimiter, maxsplit, mr, 0); // NOTE(review): reads identically to split_record() as shown here; confirm the backward-direction template argument is present in the real source
}
} // namespace strings
diff --git a/cpp/src/strings/split/split_utils.cuh b/cpp/src/strings/split/split_utils.cuh
new file mode 100644
index 00000000000..a6afd1bef10
--- /dev/null
+++ b/cpp/src/strings/split/split_utils.cuh
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+namespace cudf {
+namespace strings {
+namespace detail {
+
+using position_pair = thrust::pair;
+
+/**
+ * @brief Instantiated for each string to manage navigating tokens from
+ * the beginning or the end of that string.
+ *
+ * A character counts as whitespace when its value is `<= ' '`. Token
+ * boundaries are tracked as byte offsets into the string; after
+ * `next_token()` or `prev_token()` returns true, `get_token()` returns them
+ * as a `{start, end}` pair in which `end` is one past the token's last byte.
+ *
+ * With `reverse == false` (the default) the iterator starts at the beginning
+ * of the string with `start_position == 0`, which is the state `next_token()`
+ * works from. With `reverse == true` the iterator starts at the end of the
+ * string with `start_position` one past the string size, which is the state
+ * `prev_token()` works from.
+ */
+struct whitespace_string_tokenizer {
+ /**
+ * @brief Identifies the position range of the next token in the given
+ * string at the specified iterator position.
+ *
+ * Tokens are delimited by one or more whitespace characters.
+ *
+ * Unless the iterator is still at the beginning of the string, the call
+ * first steps past the character at which the previous call stopped.
+ * Leading whitespace is skipped, and the token ends at the next whitespace
+ * character or at the end of the string.
+ *
+ * @return true if a token has been found
+ */
+ __device__ bool next_token()
+ {
+ if (itr != d_str.begin()) { // skip these 2 lines the first time through
+ ++itr;
+ start_position = itr.byte_offset(); // end_position + 1;
+ }
+ if (start_position >= d_str.size_bytes()) return false;
+ // continue search for the next token
+ end_position = d_str.size_bytes();
+ for (; itr < d_str.end(); ++itr) {
+ if (spaces == (*itr <= ' ')) { // character belongs to the current run (whitespace or token)
+ if (spaces)
+ start_position = (itr + 1).byte_offset(); // still in whitespace: token starts no earlier than the next character
+ else
+ end_position = (itr + 1).byte_offset(); // still in the token: extend its end past this character
+ continue;
+ }
+ spaces = !spaces; // run changed: whitespace -> token or token -> whitespace
+ if (spaces) { // just left a token, so this whitespace character marks its end
+ end_position = itr.byte_offset();
+ break;
+ }
+ }
+ return start_position < end_position;
+ }
+
+ /**
+ * @brief Identifies the position range of the previous token in the given
+ * string at the specified iterator position.
+ *
+ * Tokens are delimited by one or more whitespace characters.
+ *
+ * Each call first sets `end_position` to one byte before the current
+ * `start_position` and steps the iterator back one character, then walks
+ * toward the beginning of the string. Trailing whitespace is skipped, and
+ * the token starts just after the preceding whitespace character or at the
+ * beginning of the string.
+ *
+ * @return true if a token has been found
+ */
+ __device__ bool prev_token()
+ {
+ end_position = start_position - 1;
+ --itr;
+ if (end_position <= 0) return false;
+ // continue search for the previous token
+ start_position = 0;
+ for (; itr >= d_str.begin(); --itr) {
+ if (spaces == (*itr <= ' ')) { // character belongs to the current run (whitespace or token)
+ if (spaces)
+ end_position = itr.byte_offset(); // still in whitespace: token ends no later than this character
+ else
+ start_position = itr.byte_offset(); // still in the token: extend its start back to this character
+ continue;
+ }
+ spaces = !spaces; // run changed: whitespace -> token or token -> whitespace
+ if (spaces) { // just left a token, so it starts right after this whitespace character
+ start_position = (itr + 1).byte_offset();
+ break;
+ }
+ }
+ return start_position < end_position;
+ }
+
+ __device__ position_pair get_token() const { return position_pair{start_position, end_position}; } // byte range found by the last next_token()/prev_token()
+
+ __device__ whitespace_string_tokenizer(string_view const& d_str, bool reverse = false)
+ : d_str{d_str},
+ spaces(true), // scanning always begins in the whitespace state
+ start_position{reverse ? d_str.size_bytes() + 1 : 0}, // +1 so the first prev_token() computes end_position == size_bytes()
+ end_position{d_str.size_bytes()},
+ itr{reverse ? d_str.end() : d_str.begin()}
+ {
+ }
+
+ private:
+ string_view const d_str;
+ bool spaces; // true if current position is whitespace
+ cudf::string_view::const_iterator itr; // NOTE(review): declared before the positions, so it is initialized before them despite the constructor's list order
+ size_type start_position; // byte offset where the current token begins
+ size_type end_position; // byte offset one past the current token's last byte
+};
+
+} // namespace detail
+} // namespace strings
+} // namespace cudf
diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index ebb1e1e78f7..ffb875d330f 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -275,197 +275,95 @@ TEST_F(StringsSplitTest, AllNullsCase)
EXPECT_TRUE(column.null_count() == column.size());
}
-TEST_F(StringsSplitTest, ContiguousSplitRecord)
+TEST_F(StringsSplitTest, SplitRecord) // split_record with a " " delimiter and no maxsplit
{
std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""};
- cudf::test::strings_column_wrapper strings(
- h_strings.begin(),
- h_strings.end(),
- thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-
- cudf::strings_column_view strings_view(strings);
-
- std::vector h_expected1{"", "Héllo", "thesé"};
- cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end());
- std::vector h_expected2{};
- cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end());
- std::vector h_expected3{"are", "some", "", ""};
- cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end());
- std::vector h_expected4{"tést", "String"};
- cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end());
- std::vector h_expected5{""};
- cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end());
-
- std::vector> expected_columns;
- expected_columns.push_back(expected1.release());
- expected_columns.push_back(expected2.release());
- expected_columns.push_back(expected3.release());
- expected_columns.push_back(expected4.release());
- expected_columns.push_back(expected5.release());
-
- auto result = cudf::strings::contiguous_split_record(strings_view, cudf::string_scalar(" "));
- EXPECT_TRUE(result.column_views.size() == expected_columns.size());
- for (size_t i = 0; i < result.column_views.size(); ++i) {
- cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]);
- }
+ auto validity =
+ thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+ cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity);
+
+ auto result =
+ cudf::strings::split_record(cudf::strings_column_view(strings), cudf::string_scalar(" "));
+ using LCW = cudf::test::lists_column_wrapper;
+ LCW expected(
+ {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", "", ""}, LCW{"tést", "String"}, LCW{""}},
+ validity); // same validity as the input: the null row is expected to be a null list
+ cudf::test::expect_columns_equal(result->view(), expected);
}
-TEST_F(StringsSplitTest, ContiguousSplitRecordWithMaxSplit)
+TEST_F(StringsSplitTest, SplitRecordWithMaxSplit) // split_record with a " " delimiter and maxsplit 1: at most two tokens per row
{
std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""};
- cudf::test::strings_column_wrapper strings(
- h_strings.begin(),
- h_strings.end(),
- thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-
- cudf::strings_column_view strings_view(strings);
-
- std::vector h_expected1{"", "Héllo thesé"};
- cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end());
- std::vector h_expected2{};
- cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end());
- std::vector h_expected3{"are", "some "};
- cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end());
- std::vector h_expected4{"tést", "String"};
- cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end());
- std::vector h_expected5{""};
- cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end());
-
- std::vector> expected_columns;
- expected_columns.push_back(expected1.release());
- expected_columns.push_back(expected2.release());
- expected_columns.push_back(expected3.release());
- expected_columns.push_back(expected4.release());
- expected_columns.push_back(expected5.release());
-
- auto result = cudf::strings::contiguous_split_record(strings_view, cudf::string_scalar(" "), 1);
- EXPECT_TRUE(result.column_views.size() == expected_columns.size());
- for (size_t i = 0; i < result.column_views.size(); ++i) {
- cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]);
- }
+ auto validity =
+ thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+ cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity);
+
+ auto result =
+ cudf::strings::split_record(cudf::strings_column_view(strings), cudf::string_scalar(" "), 1);
+
+ using LCW = cudf::test::lists_column_wrapper;
+ LCW expected(
+ {LCW{"", "Héllo thesé"}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{""}},
+ validity); // same validity as the input: the null row is expected to be a null list
+ cudf::test::expect_columns_equal(result->view(), expected);
}
-TEST_F(StringsSplitTest, ContiguousSplitRecordWhitespace)
+TEST_F(StringsSplitTest, SplitRecordWhitespace) // split_record with default arguments (whitespace split)
{
std::vector h_strings{
" Héllo thesé", nullptr, "are\tsome ", "tést\nString", " "};
- cudf::test::strings_column_wrapper strings(
- h_strings.begin(),
- h_strings.end(),
- thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-
- cudf::strings_column_view strings_view(strings);
-
- std::vector h_expected1{"Héllo", "thesé"};
- cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end());
- std::vector h_expected2{};
- cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end());
- std::vector h_expected3{"are", "some"};
- cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end());
- std::vector h_expected4{"tést", "String"};
- cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end());
- std::vector h_expected5{};
- cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end());
-
- std::vector> expected_columns;
- expected_columns.push_back(expected1.release());
- expected_columns.push_back(expected2.release());
- expected_columns.push_back(expected3.release());
- expected_columns.push_back(expected4.release());
- expected_columns.push_back(expected5.release());
-
- auto result = cudf::strings::contiguous_split_record(strings_view);
- EXPECT_TRUE(result.column_views.size() == expected_columns.size());
- for (size_t i = 0; i < result.column_views.size(); ++i) {
- cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]);
- }
+ auto validity =
+ thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+ cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity);
+
+ auto result = cudf::strings::split_record(cudf::strings_column_view(strings));
+ using LCW = cudf::test::lists_column_wrapper;
+ LCW expected({LCW{"Héllo", "thesé"}, LCW{}, LCW{"are", "some"}, LCW{"tést", "String"}, LCW{}},
+ validity); // null row -> null list; the whitespace-only last row -> valid but empty list
+ cudf::test::expect_columns_equal(result->view(), expected);
}
-TEST_F(StringsSplitTest, ContiguousSplitRecordWhitespaceWithMaxSplit)
+TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) // split_record with an empty delimiter (whitespace split) and maxsplit 1
{
std::vector h_strings{
" Héllo thesé ", nullptr, "are\tsome ", "tést\nString", " "};
- cudf::test::strings_column_wrapper strings(
- h_strings.begin(),
- h_strings.end(),
- thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-
- cudf::strings_column_view strings_view(strings);
-
- std::vector h_expected1{"Héllo", "thesé "};
- cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end());
- std::vector h_expected2{};
- cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end());
- std::vector h_expected3{"are", "some "};
- cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end());
- std::vector h_expected4{"tést", "String"};
- cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end());
- std::vector h_expected5{};
- cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end());
-
- std::vector> expected_columns;
- expected_columns.push_back(expected1.release());
- expected_columns.push_back(expected2.release());
- expected_columns.push_back(expected3.release());
- expected_columns.push_back(expected4.release());
- expected_columns.push_back(expected5.release());
-
- auto result = cudf::strings::contiguous_split_record(strings_view, cudf::string_scalar(""), 1);
- EXPECT_TRUE(result.column_views.size() == expected_columns.size());
- for (size_t i = 0; i < result.column_views.size(); ++i) {
- cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]);
- }
+ auto validity =
+ thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+ cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity);
+
+ auto result =
+ cudf::strings::split_record(cudf::strings_column_view(strings), cudf::string_scalar(""), 1);
+ using LCW = cudf::test::lists_column_wrapper;
+ LCW expected({LCW{"Héllo", "thesé "}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{}},
+ validity); // null row -> null list; the whitespace-only last row -> valid but empty list
+ cudf::test::expect_columns_equal(result->view(), expected);
}
-TEST_F(StringsSplitTest, ContiguousRSplitRecord)
+TEST_F(StringsSplitTest, RSplitRecord) // rsplit_record with a "_" delimiter and no maxsplit
{
std::vector h_strings{
"héllo", nullptr, "a_bc_déf", "a__bc", "_ab_cd", "ab_cd_", "", " a b ", " a bbb c"};
- cudf::test::strings_column_wrapper strings(
- h_strings.begin(),
- h_strings.end(),
- thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-
- cudf::strings_column_view strings_view(strings);
-
- std::vector h_expected1{"héllo"};
- cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end());
- std::vector h_expected2{};
- cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end());
- std::vector h_expected3{"a", "bc", "déf"};
- cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end());
- std::vector h_expected4{"a", "", "bc"};
- cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end());
- std::vector h_expected5{"", "ab", "cd"};
- cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end());
- std::vector h_expected6{"ab", "cd", ""};
- cudf::test::strings_column_wrapper expected6(h_expected6.begin(), h_expected6.end());
- std::vector h_expected7{""};
- cudf::test::strings_column_wrapper expected7(h_expected7.begin(), h_expected7.end());
- std::vector h_expected8{" a b "};
- cudf::test::strings_column_wrapper expected8(h_expected8.begin(), h_expected8.end());
- std::vector h_expected9{" a bbb c"};
- cudf::test::strings_column_wrapper expected9(h_expected9.begin(), h_expected9.end());
-
- std::vector> expected_columns;
- expected_columns.push_back(expected1.release());
- expected_columns.push_back(expected2.release());
- expected_columns.push_back(expected3.release());
- expected_columns.push_back(expected4.release());
- expected_columns.push_back(expected5.release());
- expected_columns.push_back(expected6.release());
- expected_columns.push_back(expected7.release());
- expected_columns.push_back(expected8.release());
- expected_columns.push_back(expected9.release());
-
- auto result = cudf::strings::contiguous_rsplit_record(strings_view, cudf::string_scalar("_"));
- EXPECT_TRUE(result.column_views.size() == expected_columns.size());
- for (size_t i = 0; i < result.column_views.size(); i++) {
- cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]);
- }
+ auto validity =
+ thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+ cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity);
+
+ using LCW = cudf::test::lists_column_wrapper;
+ LCW expected({LCW{"héllo"},
+ LCW{},
+ LCW{"a", "bc", "déf"},
+ LCW{"a", "", "bc"},
+ LCW{"", "ab", "cd"},
+ LCW{"ab", "cd", ""},
+ LCW{""},
+ LCW{" a b "},
+ LCW{" a bbb c"}},
+ validity); // same validity as the input: the null row is expected to be a null list
+ auto result =
+ cudf::strings::rsplit_record(cudf::strings_column_view(strings), cudf::string_scalar("_"));
+ cudf::test::expect_columns_equal(result->view(), expected);
}
-TEST_F(StringsSplitTest, ContiguousRSplitRecordWithMaxSplit)
+TEST_F(StringsSplitTest, RSplitRecordWithMaxSplit) // rsplit_record with a "_" delimiter and maxsplit 2: at most three tokens per row
{
std::vector h_strings{"héllo",
nullptr,
@@ -476,135 +374,70 @@ TEST_F(StringsSplitTest, ContiguousRSplitRecordWithMaxSplit)
"",
" a b ___",
"___ a bbb c"};
- cudf::test::strings_column_wrapper strings(
- h_strings.begin(),
- h_strings.end(),
- thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-
- cudf::strings_column_view strings_view(strings);
-
- std::vector h_expected1{"héllo"};
- cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end());
- std::vector h_expected2{};
- cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end());
- std::vector h_expected3{"a", "bc", "déf"};
- cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end());
- std::vector h_expected4{"___a", "", "bc"};
- cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end());
- std::vector h_expected5{"_ab", "cd", ""};
- cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end());
- std::vector h_expected6{"ab", "cd", ""};
- cudf::test::strings_column_wrapper expected6(h_expected6.begin(), h_expected6.end());
- std::vector h_expected7{""};
- cudf::test::strings_column_wrapper expected7(h_expected7.begin(), h_expected7.end());
- std::vector h_expected8{" a b _", "", ""};
- cudf::test::strings_column_wrapper expected8(h_expected8.begin(), h_expected8.end());
- std::vector h_expected9{"_", "", " a bbb c"};
- cudf::test::strings_column_wrapper expected9(h_expected9.begin(), h_expected9.end());
-
- std::vector> expected_columns;
- expected_columns.push_back(expected1.release());
- expected_columns.push_back(expected2.release());
- expected_columns.push_back(expected3.release());
- expected_columns.push_back(expected4.release());
- expected_columns.push_back(expected5.release());
- expected_columns.push_back(expected6.release());
- expected_columns.push_back(expected7.release());
- expected_columns.push_back(expected8.release());
- expected_columns.push_back(expected9.release());
-
- auto result = cudf::strings::contiguous_rsplit_record(strings_view, cudf::string_scalar("_"), 2);
- EXPECT_TRUE(result.column_views.size() == expected_columns.size());
- for (size_t i = 0; i < result.column_views.size(); i++) {
- cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]);
- }
+ auto validity =
+ thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+ cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity);
+
+ using LCW = cudf::test::lists_column_wrapper;
+ LCW expected({LCW{"héllo"},
+ LCW{},
+ LCW{"a", "bc", "déf"},
+ LCW{"___a", "", "bc"},
+ LCW{"_ab", "cd", ""},
+ LCW{"ab", "cd", ""},
+ LCW{""},
+ LCW{" a b _", "", ""},
+ LCW{"_", "", " a bbb c"}},
+ validity); // same validity as the input: the null row is expected to be a null list
+
+ auto result =
+ cudf::strings::rsplit_record(cudf::strings_column_view(strings), cudf::string_scalar("_"), 2);
+
+ cudf::test::expect_columns_equal(result->view(), expected);
}
-TEST_F(StringsSplitTest, ContiguousRSplitRecordWhitespace)
+TEST_F(StringsSplitTest, RSplitRecordWhitespace) // rsplit_record with default arguments (whitespace split)
{
std::vector h_strings{"héllo", nullptr, "a_bc_déf", "", " a\tb ", " a\r bbb c"};
- cudf::test::strings_column_wrapper strings(
- h_strings.begin(),
- h_strings.end(),
- thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+ auto validity =
+ thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+ cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity);
- cudf::strings_column_view strings_view(strings);
+ using LCW = cudf::test::lists_column_wrapper;
+ LCW expected({LCW{"héllo"}, LCW{}, LCW{"a_bc_déf"}, LCW{}, LCW{"a", "b"}, LCW{"a", "bbb", "c"}},
+ validity); // null row -> null list; the empty-string row -> valid but empty list
- std::vector h_expected1{"héllo"};
- cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end());
- std::vector h_expected2{};
- cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end());
- std::vector h_expected3{"a_bc_déf"};
- cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end());
- std::vector h_expected4{};
- cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end());
- std::vector h_expected5{"a", "b"};
- cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end());
- std::vector h_expected6{"a", "bbb", "c"};
- cudf::test::strings_column_wrapper expected6(h_expected6.begin(), h_expected6.end());
+ auto result = cudf::strings::rsplit_record(cudf::strings_column_view(strings));
- std::vector> expected_columns;
- expected_columns.push_back(expected1.release());
- expected_columns.push_back(expected2.release());
- expected_columns.push_back(expected3.release());
- expected_columns.push_back(expected4.release());
- expected_columns.push_back(expected5.release());
- expected_columns.push_back(expected6.release());
- auto result = cudf::strings::contiguous_rsplit_record(strings_view);
-
- EXPECT_TRUE(result.column_views.size() == expected_columns.size());
- for (size_t i = 4; i < 5; i++) {
- cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]);
- }
+ cudf::test::expect_columns_equal(result->view(), expected); // compares every row, not just a single index
}
-TEST_F(StringsSplitTest, ContiguousRSplitRecordWhitespaceWithMaxSplit)
+TEST_F(StringsSplitTest, RSplitRecordWhitespaceWithMaxSplit) // rsplit_record with an empty delimiter (whitespace split) and maxsplit 1
{
std::vector h_strings{
" héllo Asher ", nullptr, " a_bc_déf ", "", " a\tb ", " a\r bbb c"};
- cudf::test::strings_column_wrapper strings(
- h_strings.begin(),
- h_strings.end(),
- thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-
- cudf::strings_column_view strings_view(strings);
-
- std::vector h_expected1{" héllo", "Asher"};
- cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end());
- std::vector h_expected2{};
- cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end());
- std::vector h_expected3{"a_bc_déf"};
- cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end());
- std::vector h_expected4{};
- cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end());
- std::vector h_expected5{" a", "b"};
- cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end());
- std::vector h_expected6{" a\r bbb", "c"};
- cudf::test::strings_column_wrapper expected6(h_expected6.begin(), h_expected6.end());
-
- std::vector> expected_columns;
- expected_columns.push_back(expected1.release());
- expected_columns.push_back(expected2.release());
- expected_columns.push_back(expected3.release());
- expected_columns.push_back(expected4.release());
- expected_columns.push_back(expected5.release());
- expected_columns.push_back(expected6.release());
- auto result = cudf::strings::contiguous_rsplit_record(strings_view, cudf::string_scalar(""), 1);
-
- EXPECT_TRUE(result.column_views.size() == expected_columns.size());
- for (size_t i = 4; i < 5; i++) {
- cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]);
- }
+ auto validity =
+ thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+ cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity);
+
+ using LCW = cudf::test::lists_column_wrapper;
+ LCW expected(
+ {LCW{" héllo", "Asher"}, LCW{}, LCW{"a_bc_déf"}, LCW{}, LCW{" a", "b"}, LCW{" a\r bbb", "c"}},
+ validity); // null row -> null list; the empty-string row -> valid but empty list
+
+ auto result =
+ cudf::strings::rsplit_record(cudf::strings_column_view(strings), cudf::string_scalar(""), 1);
+ cudf::test::expect_columns_equal(result->view(), expected); // compares every row, not just a single index
}
-TEST_F(StringsSplitTest, ContiguousSplitRecordZeroSizeStringsColumns)
+TEST_F(StringsSplitTest, SplitRecordZeroSizeStringsColumns) // a zero-row input must give a zero-row result from both split_record and rsplit_record
{
cudf::column_view zero_size_strings_column(
cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0);
- auto split_record_result = cudf::strings::contiguous_split_record(zero_size_strings_column);
- EXPECT_TRUE(split_record_result.column_views.size() == 0);
- auto rsplit_record_result = cudf::strings::contiguous_rsplit_record(zero_size_strings_column);
- EXPECT_TRUE(rsplit_record_result.column_views.size() == 0);
+ auto split_record_result = cudf::strings::split_record(zero_size_strings_column);
+ EXPECT_TRUE(split_record_result->size() == 0); // size() is now the row count of the returned lists column
+ auto rsplit_record_result = cudf::strings::rsplit_record(zero_size_strings_column);
+ EXPECT_TRUE(rsplit_record_result->size() == 0);
}
TEST_F(StringsSplitTest, Partition)