Skip to content

Commit

Permalink
Merge pull request #5687 from davidwendt/split-record-to-list
Browse files Browse the repository at this point in the history
[REVIEW] Change strings::split_record to return a lists column
  • Loading branch information
davidwendt authored Jul 21, 2020
2 parents ed84164 + ca7212d commit 3d287b9
Show file tree
Hide file tree
Showing 6 changed files with 529 additions and 826 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@
- PR #5662 Make Java ColumnVector(long nativePointer) constructor public
- PR #5679 Use `pickle5` to test older Python versions
- PR #5684 Use `pickle5` in `Serializable` (when available)
- PR #5687 Change strings::split_record to return a lists column
- PR #5708 Add support for `dummy_na` in `get_dummies`
- PR #5709 Update java build to help cu-spacial with java bindings
- PR #5713 Remove old NVTX utilities
Expand Down
181 changes: 124 additions & 57 deletions cpp/include/cudf/strings/split/split.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -82,82 +82,149 @@ std::unique_ptr<table> rsplit(
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**
* @brief The result(s) of a `contiguous_(r)split_record`
*
* Each column_view resulting from a split operation performed by
* contiguous_split_record will be returned wrapped in a
* `contiguous_split_record_result`. The column data addresses stored in the
* column_view objects are not owned by top level cudf::column objects. The
* backing memory is instead owned by the `all_data` field and in one contiguous
* block.
*
* The user is responsible for assuring that the `column_views` or any derived
* objects do not outlive the memory owned by `all_data`
*/
struct contiguous_split_record_result {
// Non-owning views, one per column produced by the split operation. Their
// data addresses point into the memory held by `all_data`, so they must not
// be used after `all_data` has been released.
std::vector<column_view> column_views;
// Owner of the single contiguous block of device memory that backs every
// entry of `column_views`.
std::unique_ptr<rmm::device_buffer> all_data;
};

/**
* @brief Splits each element of the input column to a column of tokens storing
* the resulting columns in a single contiguous block of memory.
*
* This function splits each element in the input column to a column of tokens.
* The number of columns in the output vector will be the same as the number of
* elements in the input column. The column length will coincide with the
* number of tokens; the resulting columns wrapped in the returned object may
* have different sizes.
*
* Splitting a null string element will result in an empty output column.
*
* @throws cudf:logic_error if `delimiter` is invalid.
* @brief Splits individual strings elements into a list of strings.
*
* Each element generates an array of strings that are stored in an output
* lists column.
*
* The number of elements in the output column will be the same as the number of
* elements in the input column. Each individual list item will contain the
* new strings for that row. The resulting number of strings in each row can vary
* from 0 to `maxsplit + 1`.
*
* The `delimiter` is searched within each string from beginning to end
* and splitting stops when either `maxsplit` or the end of the string is reached.
*
* If a delimiter is not whitespace and occurs adjacent to another delimiter,
* an empty string is produced for that split occurrence. Likewise, a non-whitespace
* delimiter produces an empty string if it appears at the beginning or the end
* of a string.
*
* @code{.pseudo}
* s = ["a_bc_def_g", "a__bc", "_ab_cd", "ab_cd_"]
* s1 = split_record(s, "_")
* s1 is a lists column of strings:
* [ ["a", "bc", "def", "g"],
* ["a", "", "bc"],
* ["", "ab", "cd"],
* ["ab", "cd", ""] ]
* s2 = split_record(s, "_", 1)
* s2 is a lists column of strings:
* [ ["a", "bc_def_g"],
* ["a", "_bc"],
* ["", "ab_cd"],
* ["ab", "cd_"] ]
* @endcode
*
* A whitespace delimiter produces no empty strings.
* @code{.pseudo}
* s = ["a bc def", "a bc", " ab cd", "ab cd "]
* s1 = split_record(s, "")
* s1 is a lists column of strings:
* [ ["a", "bc", "def"],
* ["a", "bc"],
* ["ab", "cd"],
* ["ab", "cd"] ]
* s2 = split_record(s, "", 1)
* s2 is a lists column of strings:
* [ ["a", "bc def"],
* ["a", "bc"],
* ["ab", "cd"],
* ["ab", "cd "] ]
* @endcode
*
* A null string element will result in a null list item for that row.
*
* @throw cudf::logic_error if `delimiter` is invalid.
*
* @param strings A column of string elements to be split.
* @param delimiter UTF-8 encoded string indicating the split points in each
* string.
* @param delimiter The string to identify split points in each string.
* Default of empty string indicates split on whitespace.
* @param maxsplit Maximum number of splits to perform.
* Default of -1 indicates all possible splits on each string.
* @param mr Device memory resource used to allocate the returned result's device memory.
* @return contiguous_split_record_result New vector of strings column_view
* objects
* (each column_view element of the vector holds splits from a string
* element of the input column).
* @return Lists column of strings
* Each vector of the lists column holds splits from a single row
* element of the input column.
*/
contiguous_split_record_result contiguous_split_record(
std::unique_ptr<column> split_record(
strings_column_view const& strings,
string_scalar const& delimiter = string_scalar(""),
size_type maxsplit = -1,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**
* @brief Splits each element of the input column from the end to a column of
* tokens storing the resulting columns in a single contiguous block of memory.
*
* This function splits each element in the input column to a column of tokens.
* The number of columns in the output vector will be the same as the number of
* elements in the input column. The column length will coincide with the
* number of tokens; the resulting columns wrapped in the returned object may
* have different sizes.
*
* Splitting a null string element will result in an empty output column.
*
* @throws cudf:logic_error if `delimiter` is invalid.
* @brief Splits individual strings elements into a list of strings starting
* from the end of each string.
*
* Each element generates an array of strings that are stored in an output
* lists column.
*
* The number of elements in the output column will be the same as the number of
* elements in the input column. Each individual list item will contain the
* new strings for that row. The resulting number of strings in each row can vary
* from 0 to `maxsplit + 1`.
*
* The `delimiter` is searched from end to beginning within each string
* and splitting stops when either `maxsplit` or the beginning of the string
* is reached.
*
* If a delimiter is not whitespace and occurs adjacent to another delimiter,
* an empty string is produced for that split occurrence. Likewise, a non-whitespace
* delimiter produces an empty string if it appears at the beginning or the end
* of a string.
*
* Note that `rsplit_record` and `split_record` produce equivalent results for
* the default `maxsplit` value.
*
* @code{.pseudo}
* s = ["a_bc_def_g", "a__bc", "_ab_cd", "ab_cd_"]
* s1 = rsplit_record(s, "_")
* s1 is a lists column of strings:
* [ ["a", "bc", "def", "g"],
* ["a", "", "bc"],
* ["", "ab", "cd"],
* ["ab", "cd", ""] ]
* s2 = rsplit_record(s, "_", 1)
* s2 is a lists column of strings:
* [ ["a_bc_def", "g"],
* ["a_", "bc"],
* ["_ab", "cd"],
* ["ab_cd", ""] ]
* @endcode
*
* A whitespace delimiter produces no empty strings.
* @code{.pseudo}
* s = ["a bc def", "a bc", " ab cd", "ab cd "]
* s1 = rsplit_record(s, "")
* s1 is a lists column of strings:
* [ ["a", "bc", "def"],
* ["a", "bc"],
* ["ab", "cd"],
* ["ab", "cd"] ]
* s2 = rsplit_record(s, "", 1)
* s2 is a lists column of strings:
* [ ["a bc", "def"],
* ["a", "bc"],
* [" ab", "cd"],
* ["ab", "cd"] ]
* @endcode
*
* A null string element will result in a null list item for that row.
*
* @throw cudf::logic_error if `delimiter` is invalid.
*
* @param strings A column of string elements to be split.
* @param delimiter UTF-8 encoded string indicating the split points in each
* string.
* @param delimiter The string to identify split points in each string.
* Default of empty string indicates split on whitespace.
* @param maxsplit Maximum number of splits to perform.
* Default of -1 indicates all possible splits on each string.
* @param mr Device memory resource used to allocate the returned result's device memory.
* @return contiguous_split_record_result New vector of strings column_view
* objects
* (each column_view element of the vector holds splits from a string
* element of the input column).
* @return Lists column of strings
* Each vector of the lists column holds splits from a single row
* element of the input column.
*/
contiguous_split_record_result contiguous_rsplit_record(
std::unique_ptr<column> rsplit_record(
strings_column_view const& strings,
string_scalar const& delimiter = string_scalar(""),
size_type maxsplit = -1,
Expand Down
100 changes: 4 additions & 96 deletions cpp/src/strings/split/split.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/error.hpp>
#include <strings/split/split_utils.cuh>

#include <thrust/binary_search.h> // upper_bound()
#include <thrust/copy.h> // copy_if()
Expand All @@ -34,8 +35,8 @@
namespace cudf {
namespace strings {
namespace detail {

using string_index_pair = thrust::pair<const char*, size_type>;
using position_pair = thrust::pair<size_type, size_type>;

namespace {

Expand Down Expand Up @@ -582,99 +583,6 @@ struct base_whitespace_split_tokenizer {
size_type max_tokens; // maximum number of tokens
};

/**
 * @brief Instantiated for each string to manage navigating tokens from
 * the beginning or the end of that string.
 *
 * A token is a run of characters that are not whitespace, where a character
 * counts as whitespace when it compares `<= ' '`. The tokenizer keeps the
 * range of the most recently found token as character positions
 * `[start_position, end_position)` and exposes that range as byte offsets
 * through `token_byte_positions()`.
 *
 * Use `next_token()` on an instance constructed with `reverse == false` and
 * `prev_token()` on an instance constructed with `reverse == true`; the
 * constructor seeds the iterator and positions differently for each direction.
 */
struct whitespace_string_tokenizer {
  /**
   * @brief Identifies the position range of the next token in the given
   * string at the specified iterator position.
   *
   * Tokens are delimited by one or more whitespace characters.
   *
   * @return true if a token has been found
   */
  __device__ bool next_token()
  {
    if (itr != d_str.begin()) {  // skip these 2 lines the first time through
      // resume just past the whitespace character that ended the last token
      start_position = end_position + 1;
      ++itr;
    }
    if (start_position >= d_str.length()) return false;
    // continue search for the next token
    end_position = d_str.length();  // default: the token runs to the end of the string
    for (; itr < d_str.end(); ++itr) {
      if (spaces == (*itr <= ' ')) {
        // still inside the same kind of run (whitespace or token)
        if (spaces)
          start_position = itr.position() + 1;  // token cannot start before here
        else
          end_position = itr.position() + 1;  // token extends through this character
        continue;
      }
      // transition between a whitespace run and a token run
      spaces = !spaces;
      if (spaces) {
        // token -> whitespace: the token ends right before this character
        end_position = itr.position();
        break;
      }
    }
    return start_position < end_position;
  }

  /**
   * @brief Identifies the position range of the previous token in the given
   * string at the specified iterator position.
   *
   * Tokens are delimited by one or more whitespace characters.
   *
   * @return true if a token has been found
   */
  __device__ bool prev_token()
  {
    // resume just before the whitespace character that preceded the last token
    end_position = start_position - 1;
    --itr;
    if (end_position <= 0) return false;
    // continue search for the previous token
    start_position = 0;  // default: the token runs back to the start of the string
    for (; itr >= d_str.begin(); --itr) {
      if (spaces == (*itr <= ' ')) {
        // still inside the same kind of run (whitespace or token)
        if (spaces)
          end_position = itr.position();  // token cannot end after here
        else
          start_position = itr.position();  // token extends back to this character
        continue;
      }
      // transition between a whitespace run and a token run
      spaces = !spaces;
      if (spaces) {
        // token -> whitespace (scanning backward): the token starts right after this character
        start_position = itr.position() + 1;
        break;
      }
    }
    return start_position < end_position;
  }

  /**
   * @brief Returns the range of the current token as a pair of byte offsets.
   *
   * The character positions recorded by the last successful `next_token()` or
   * `prev_token()` call are converted with `string_view::byte_offset`.
   *
   * @return `{begin, end}` byte offsets of the current token within the string
   */
  __device__ position_pair token_byte_positions() const
  {
    return position_pair{d_str.byte_offset(start_position), d_str.byte_offset(end_position)};
  }

  /**
   * @brief Creates a tokenizer over a single string.
   *
   * @param d_str   The string to tokenize.
   * @param reverse If true, the tokenizer is positioned at the end of the
   *                string for use with `prev_token()`; otherwise it is
   *                positioned at the beginning for use with `next_token()`.
   */
  __device__ whitespace_string_tokenizer(string_view const& d_str, bool reverse = false)
    // Initializers are listed in member declaration order, which is the order
    // in which C++ actually runs them.
    : d_str{d_str},
      spaces(true),
      itr{reverse ? d_str.end() : d_str.begin()},
      start_position{reverse ? d_str.length() + 1 : 0},
      end_position{d_str.length()}
  {
  }

 private:
  string_view const d_str;
  bool spaces;  // true if current position is whitespace
  cudf::string_view::const_iterator itr;
  size_type start_position;  // character position where the current token begins
  size_type end_position;    // character position one past the end of the current token
};

/**
* @brief The tokenizer functions for split() with whitespace.
*
Expand Down Expand Up @@ -709,7 +617,7 @@ struct whitespace_split_tokenizer_fn : base_whitespace_split_tokenizer {
size_type token_idx = 0;
position_pair token{0, 0};
while (tokenizer.next_token() && (token_idx < token_count)) {
token = tokenizer.token_byte_positions();
token = tokenizer.get_token();
d_tokens[d_strings.size() * (token_idx++)] =
string_index_pair{d_str.data() + token.first, (token.second - token.first)};
}
Expand Down Expand Up @@ -760,7 +668,7 @@ struct whitespace_rsplit_tokenizer_fn : base_whitespace_split_tokenizer {
size_type token_idx = 0;
position_pair token{0, 0};
while (tokenizer.prev_token() && (token_idx < token_count)) {
token = tokenizer.token_byte_positions();
token = tokenizer.get_token();
d_tokens[d_strings.size() * (token_count - 1 - token_idx)] =
string_index_pair{d_str.data() + token.first, (token.second - token.first)};
++token_idx;
Expand Down
Loading

0 comments on commit 3d287b9

Please sign in to comment.