Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Change strings::split_record to return a lists column #5687

Merged
merged 13 commits into from
Jul 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@
- PR #5662 Make Java ColumnVector(long nativePointer) constructor public
- PR #5679 Use `pickle5` to test older Python versions
- PR #5684 Use `pickle5` in `Serializable` (when available)
- PR #5687 Change strings::split_record to return a lists column
- PR #5708 Add support for `dummy_na` in `get_dummies`
- PR #5709 Update java build to help cu-spacial with java bindings
- PR #5713 Remove old NVTX utilities
Expand Down
181 changes: 124 additions & 57 deletions cpp/include/cudf/strings/split/split.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -82,82 +82,149 @@ std::unique_ptr<table> rsplit(
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**
* @brief The result(s) of a `contiguous_(r)split_record`
*
* Each column_view resulting from a split operation performed by
* contiguous_split_record will be returned wrapped in a
* `contiguous_split_record_result`. The column data addresses stored in the
* column_view objects are not owned by top level cudf::column objects. The
* backing memory is instead owned by the `all_data` field and in one contiguous
* block.
*
* The user is responsible for assuring that the `column_views` or any derived
* objects do not outlive the memory owned by `all_data`
*/
struct contiguous_split_record_result {
// Non-owning views, one per split result. Their data addresses point into
// the memory held by `all_data`, so they must not outlive it.
std::vector<column_view> column_views;
// Owner of the single contiguous block of memory backing every entry of
// `column_views`.
std::unique_ptr<rmm::device_buffer> all_data;
};

/**
* @brief Splits each element of the input column to a column of tokens storing
* the resulting columns in a single contiguous block of memory.
*
* This function splits each element in the input column to a column of tokens.
* The number of columns in the output vector will be the same as the number of
* elements in the input column. The column length will coincide with the
* number of tokens; the resulting columns wrapped in the returned object may
* have different sizes.
*
* Splitting a null string element will result in an empty output column.
*
* @throws cudf::logic_error if `delimiter` is invalid.
* @brief Splits individual strings elements into a list of strings.
*
* Each element generates an array of strings that are stored in an output
* lists column.
*
* The number of elements in the output column will be the same as the number of
* elements in the input column. Each individual list item will contain the
* new strings for that row. The resulting number of strings in each row can vary
* from 0 to `maxsplit + 1`.
*
* The `delimiter` is searched within each string from beginning to end
* and splitting stops when either `maxsplit` or the end of the string is reached.
*
* If a delimiter is not whitespace and occurs adjacent to another delimiter,
* an empty string is produced for that split occurrence. Likewise, a non-whitespace
* delimiter produces an empty string if it appears at the beginning or the end
* of a string.
*
* @code{.pseudo}
* s = ["a_bc_def_g", "a__bc", "_ab_cd", "ab_cd_"]
* s1 = split_record(s, "_")
* s1 is a lists column of strings:
* [ ["a", "bc", "def", "g"],
* ["a", "", "bc"],
* ["", "ab", "cd"],
* ["ab", "cd", ""] ]
* s2 = split_record(s, "_", 1)
* s2 is a lists column of strings:
* [ ["a", "bc_def_g"],
* ["a", "_bc"],
* ["", "ab_cd"],
* ["ab", "cd_"] ]
* @endcode
*
* A whitespace delimiter produces no empty strings.
* @code{.pseudo}
* s = ["a bc def", "a bc", " ab cd", "ab cd "]
* s1 = split_record(s, "")
* s1 is a lists column of strings:
* [ ["a", "bc", "def"],
* ["a", "bc"],
* ["ab", "cd"],
* ["ab", "cd"] ]
* s2 = split_record(s, "", 1)
* s2 is a lists column of strings:
* [ ["a", "bc def"],
* ["a", "bc"],
* ["ab", "cd"],
* ["ab", "cd "] ]
* @endcode
*
* A null string element will result in a null list item for that row.
*
* @throw cudf::logic_error if `delimiter` is invalid.
*
* @param strings A column of string elements to be split.
* @param delimiter UTF-8 encoded string indicating the split points in each
* string.
* @param delimiter The string to identify split points in each string.
* Default of empty string indicates split on whitespace.
* @param maxsplit Maximum number of splits to perform.
* Default of -1 indicates all possible splits on each string.
* @param mr Device memory resource used to allocate the returned result's device memory.
* @return contiguous_split_record_result New vector of strings column_view
* objects
* (each column_view element of the vector holds splits from a string
* element of the input column).
* @return Lists column of strings
* Each vector of the lists column holds splits from a single row
* element of the input column.
*/
contiguous_split_record_result contiguous_split_record(
std::unique_ptr<column> split_record(
strings_column_view const& strings,
string_scalar const& delimiter = string_scalar(""),
size_type maxsplit = -1,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**
* @brief Splits each element of the input column from the end to a column of
* tokens storing the resulting columns in a single contiguous block of memory.
*
* This function splits each element in the input column to a column of tokens.
* The number of columns in the output vector will be the same as the number of
* elements in the input column. The column length will coincide with the
* number of tokens; the resulting columns wrapped in the returned object may
* have different sizes.
*
* Splitting a null string element will result in an empty output column.
*
* @throws cudf::logic_error if `delimiter` is invalid.
* @brief Splits individual strings elements into a list of strings starting
* from the end of each string.
*
* Each element generates an array of strings that are stored in an output
* lists column.
*
* The number of elements in the output column will be the same as the number of
* elements in the input column. Each individual list item will contain the
* new strings for that row. The resulting number of strings in each row can vary
* from 0 to `maxsplit + 1`.
*
* The `delimiter` is searched from end to beginning within each string
* and splitting stops when either `maxsplit` or the beginning of the string
* is reached.
*
* If a delimiter is not whitespace and occurs adjacent to another delimiter,
* an empty string is produced for that split occurrence. Likewise, a non-whitespace
* delimiter produces an empty string if it appears at the beginning or the end
* of a string.
*
* Note that `rsplit_record` and `split_record` produce equivalent results for
* the default `maxsplit` value.
*
* @code{.pseudo}
* s = ["a_bc_def_g", "a__bc", "_ab_cd", "ab_cd_"]
* s1 = rsplit_record(s, "_")
* s1 is a lists column of strings:
* [ ["a", "bc", "def", "g"],
* ["a", "", "bc"],
* ["", "ab", "cd"],
* ["ab", "cd", ""] ]
* s2 = rsplit_record(s, "_", 1)
* s2 is a lists column of strings:
* [ ["a_bc_def", "g"],
* ["a_", "bc"],
* ["_ab", "cd"],
* ["ab_cd", ""] ]
* @endcode
*
* A whitespace delimiter produces no empty strings.
* @code{.pseudo}
* s = ["a bc def", "a bc", " ab cd", "ab cd "]
* s1 = rsplit_record(s, "")
* s1 is a lists column of strings:
* [ ["a", "bc", "def"],
* ["a", "bc"],
* ["ab", "cd"],
* ["ab", "cd"] ]
* s2 = rsplit_record(s, "", 1)
* s2 is a lists column of strings:
* [ ["a bc", "def"],
* ["a", "bc"],
* [" ab", "cd"],
* ["ab", "cd"] ]
* @endcode
*
* A null string element will result in a null list item for that row.
*
* @throw cudf::logic_error if `delimiter` is invalid.
*
* @param strings A column of string elements to be split.
* @param delimiter UTF-8 encoded string indicating the split points in each
* string.
* @param delimiter The string to identify split points in each string.
* Default of empty string indicates split on whitespace.
* @param maxsplit Maximum number of splits to perform.
* Default of -1 indicates all possible splits on each string.
* @param mr Device memory resource used to allocate the returned result's device memory.
* @return contiguous_split_record_result New vector of strings column_view
* objects
* (each column_view element of the vector holds splits from a string
* element of the input column).
* @return Lists column of strings
* Each vector of the lists column holds splits from a single row
* element of the input column.
*/
contiguous_split_record_result contiguous_rsplit_record(
std::unique_ptr<column> rsplit_record(
strings_column_view const& strings,
string_scalar const& delimiter = string_scalar(""),
size_type maxsplit = -1,
Expand Down
100 changes: 4 additions & 96 deletions cpp/src/strings/split/split.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/error.hpp>
#include <strings/split/split_utils.cuh>

#include <thrust/binary_search.h> // upper_bound()
#include <thrust/copy.h> // copy_if()
Expand All @@ -34,8 +35,8 @@
namespace cudf {
namespace strings {
namespace detail {

using string_index_pair = thrust::pair<const char*, size_type>;
using position_pair = thrust::pair<size_type, size_type>;

namespace {

Expand Down Expand Up @@ -582,99 +583,6 @@ struct base_whitespace_split_tokenizer {
size_type max_tokens; // maximum number of tokens
};

/**
 * @brief Instantiated for each string to manage navigating tokens from
 * the beginning or the end of that string.
 *
 * A token is a run of non-whitespace characters; any character `<= ' '`
 * is treated as whitespace. The current token is tracked as the
 * half-open character-position range `[start_position, end_position)`.
 */
struct whitespace_string_tokenizer {
  /**
   * @brief Identifies the position range of the next token in the given
   * string at the specified iterator position.
   *
   * Tokens are delimited by one or more whitespace characters.
   *
   * @return true if a token has been found
   */
  __device__ bool next_token()
  {
    if (itr != d_str.begin()) {  // skip these 2 lines the first time through
      start_position = end_position + 1;
      ++itr;
    }
    if (start_position >= d_str.length()) return false;
    // continue search for the next token;
    // if no trailing whitespace is found the token runs to the end of the string
    end_position = d_str.length();
    for (; itr < d_str.end(); ++itr) {
      if (spaces == (*itr <= ' ')) {
        // still in the same kind of run (whitespace or token): slide the boundary
        if (spaces)
          start_position = itr.position() + 1;
        else
          end_position = itr.position() + 1;
        continue;
      }
      spaces = !spaces;
      if (spaces) {
        // transitioned from token to whitespace: the token ends here
        end_position = itr.position();
        break;
      }
    }
    return start_position < end_position;
  }

  /**
   * @brief Identifies the position range of the previous token in the given
   * string at the specified iterator position.
   *
   * Tokens are delimited by one or more whitespace characters.
   *
   * @return true if a token has been found
   */
  __device__ bool prev_token()
  {
    end_position = start_position - 1;
    --itr;
    if (end_position <= 0) return false;
    // continue search for the previous token;
    // if no leading whitespace is found the token starts at the beginning of the string
    start_position = 0;
    for (; itr >= d_str.begin(); --itr) {
      if (spaces == (*itr <= ' ')) {
        // still in the same kind of run (whitespace or token): slide the boundary
        if (spaces)
          end_position = itr.position();
        else
          start_position = itr.position();
        continue;
      }
      spaces = !spaces;
      if (spaces) {
        // transitioned from token to whitespace: the token starts just after it
        start_position = itr.position() + 1;
        break;
      }
    }
    return start_position < end_position;
  }

  /**
   * @brief Returns the current token range converted with
   * `string_view::byte_offset()`.
   *
   * @return Pair of (start, end) offsets for the current token
   */
  __device__ position_pair token_byte_positions() const
  {
    return position_pair{d_str.byte_offset(start_position), d_str.byte_offset(end_position)};
  }

  /**
   * @brief Creates a tokenizer over a single string.
   *
   * @param d_str The string to tokenize.
   * @param reverse If true, the tokenizer is positioned at the end of the
   *        string for use with `prev_token()`; otherwise at the beginning
   *        for use with `next_token()`.
   */
  __device__ whitespace_string_tokenizer(string_view const& d_str, bool reverse = false)
    // initializers are listed in member declaration order, which is the order
    // in which they actually run
    : d_str{d_str},
      spaces(true),
      itr{reverse ? d_str.end() : d_str.begin()},
      // length + 1 so that the first prev_token() call sets end_position to length
      start_position{reverse ? d_str.length() + 1 : 0},
      end_position{d_str.length()}
  {
  }

 private:
  string_view const d_str;
  bool spaces;  // true if current position is whitespace
  cudf::string_view::const_iterator itr;
  size_type start_position;  // character position where the current token begins
  size_type end_position;    // character position one past the end of the current token
};

/**
* @brief The tokenizer functions for split() with whitespace.
*
Expand Down Expand Up @@ -709,7 +617,7 @@ struct whitespace_split_tokenizer_fn : base_whitespace_split_tokenizer {
size_type token_idx = 0;
position_pair token{0, 0};
while (tokenizer.next_token() && (token_idx < token_count)) {
token = tokenizer.token_byte_positions();
token = tokenizer.get_token();
d_tokens[d_strings.size() * (token_idx++)] =
string_index_pair{d_str.data() + token.first, (token.second - token.first)};
}
Expand Down Expand Up @@ -760,7 +668,7 @@ struct whitespace_rsplit_tokenizer_fn : base_whitespace_split_tokenizer {
size_type token_idx = 0;
position_pair token{0, 0};
while (tokenizer.prev_token() && (token_idx < token_count)) {
token = tokenizer.token_byte_positions();
token = tokenizer.get_token();
d_tokens[d_strings.size() * (token_count - 1 - token_idx)] =
string_index_pair{d_str.data() + token.first, (token.second - token.first)};
++token_idx;
Expand Down
Loading