Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Add filter_tokens nvtext API #5658

Merged
merged 20 commits into from
Jul 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
- PR #5612 Add `is_hex` strings API
- PR #5637 Parameterize Null comparator behaviour in Joins
- PR #5623 Add `is_ipv4` strings API
- PR #5658 Add `filter_tokens` nvtext API
- PR #5666 Add `filter_characters_of_type` strings API
- PR #5673 Always build and test with per-thread default stream enabled in the GPU CI build

Expand Down
47 changes: 47 additions & 0 deletions cpp/include/nvtext/replace.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,5 +87,52 @@ std::unique_ptr<cudf::column> replace_tokens(
cudf::string_scalar const& delimiter = cudf::string_scalar{""},
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**
* @brief Removes tokens whose lengths are less than a specified number of characters.
*
* Tokens identified in each string are removed from the corresponding output string.
* The removed tokens can be replaced by specifying a `replacement` string as well.
*
* The `delimiter` may be zero or more characters. If the `delimiter` is empty,
* whitespace (character code-point <= ' ') is used for identifying tokens.
* Also, any consecutive delimiters found in a string are ignored.
*
* @code{.pseudo}
* Example:
* s = ["this is me", "theme music"]
* result = filter_tokens(s,3)
* result is now ["this  ", "theme music"]
* @endcode
*
* Note the first string in `result` still retains the space delimiters.
*
* Example with a `replacement` string.
*
* @code{.pseudo}
* Example:
* s = ["this is me", "theme music"]
* result = filter_tokens(s,5,"---")
* result is now ["--- --- ---", "theme music"]
* @endcode
*
* The `replacement` string is allowed to be shorter than min_token_length.
*
* @throw cudf::logic_error if `delimiter` or `replacement` is invalid
*
* @param strings Strings column to filter.
* @param min_token_length The minimum number of characters to retain a token in the output string.
* @param replacement Optional replacement string to be used in place of removed tokens.
* @param delimiter Characters used to separate each string into tokens.
* The default of empty string will identify tokens using whitespace.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column with the filtered strings.
*/
std::unique_ptr<cudf::column> filter_tokens(
cudf::strings_column_view const& strings,
cudf::size_type min_token_length,
cudf::string_scalar const& replacement = cudf::string_scalar{""},
cudf::string_scalar const& delimiter = cudf::string_scalar{""},
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/** @} */ // end of group
} // namespace nvtext
240 changes: 176 additions & 64 deletions cpp/src/text/replace.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,78 +31,161 @@ namespace nvtext {
namespace detail {
namespace {

using strings_iterator = cudf::column_device_view::const_iterator<cudf::string_view>;
using replace_result = thrust::pair<bool, cudf::string_view>;

/**
* @brief Functor to replace tokens in each string.
*
* This tokenizes a string using the given d_delimiter and replaces any tokens that match
* a string in d_targets_begin/end with those from the d_replacements column.
* Strings with no matching tokens are left unchanged.
*
* This should be called first to compute the size of each output string and then a second
* time to fill in the allocated output buffer for each string.
*/
struct replace_tokens_fn {
struct base_token_replacer_fn {
cudf::column_device_view const d_strings; ///< strings to tokenize
strings_iterator d_targets_begin; ///< strings to search for
strings_iterator d_targets_end;
cudf::column_device_view const d_replacements; ///< replacement strings
cudf::string_view const d_delimiter; ///< delimiter characters for tokenizing
const int32_t* d_offsets{}; ///< for locating output string in d_chars
char* d_chars{}; ///< output buffer
cudf::string_view const d_delimiter; ///< delimiter characters for tokenizing
int32_t* d_offsets{}; ///< for locating output string in d_chars
char* d_chars{}; ///< output buffer

__device__ cudf::size_type operator()(cudf::size_type idx)
/**
* @brief Tokenizes each string and calls the provided `replacer` function
* for each token.
*
* @tparam ReplaceFn Should accept a `string_view` and return a `replace_result`
* @param idx Index of the current string to process
* @param replacer Function to call for each token to determine its replacement
*/
template <typename ReplaceFn>
__device__ void process_string(cudf::size_type idx, ReplaceFn replacer)
{
if (d_strings.is_null(idx)) return 0;
if (d_strings.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
return;
}

auto const d_str = d_strings.element<cudf::string_view>(idx);
auto const in_ptr = d_str.data();
auto out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr;
auto nbytes = d_str.size_bytes();
auto nbytes = d_str.size_bytes(); // count the output bytes
auto last_pos = cudf::size_type{0};
auto tokenizer = characters_tokenizer{d_str, d_delimiter};

// process each token
while (tokenizer.next_token()) {
auto const token_pos = tokenizer.token_byte_positions();
auto const token =
cudf::string_view{d_str.data() + token_pos.first, token_pos.second - token_pos.first};

// check if the token matches any of the targets
auto const found_itr = thrust::find(thrust::seq, d_targets_begin, d_targets_end, token);
if (found_itr != d_targets_end) { // match found
// retrieve the corresponding replacement string or
// if only one repl string, use that one for all targets
auto const d_repl = [&] {
auto const repl_idx = thrust::distance(d_targets_begin, found_itr);
return d_replacements.size() == 1 ? d_replacements.element<cudf::string_view>(0)
: d_replacements.element<cudf::string_view>(repl_idx);
}();

nbytes += d_repl.size_bytes() - token.size_bytes(); // total output bytes

// ask replacer if this token should be replaced
auto const result = replacer(token);
if (result.first) { // first == replace indicator, second == new string
auto d_replacement = result.second;
nbytes += d_replacement.size_bytes() - token.size_bytes();
if (out_ptr) {
// copy over string up to the token location
out_ptr = cudf::strings::detail::copy_and_increment(
out_ptr, in_ptr + last_pos, token_pos.first - last_pos);
// copy over replacement string
out_ptr = cudf::strings::detail::copy_string(out_ptr, d_repl);
out_ptr = cudf::strings::detail::copy_string(out_ptr, d_replacement);
last_pos = token_pos.second; // update last byte position for this string
}
}
}

// copy the remainder of the string bytes to the output buffer
if (out_ptr) memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos);
return nbytes;
// copy the remainder of the string's bytes to the output buffer
if (out_ptr)
memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos);
else
d_offsets[idx] = nbytes;
}
};

using strings_iterator = cudf::column_device_view::const_iterator<cudf::string_view>;

/**
 * @brief Functor that swaps target tokens for their replacement strings.
 *
 * Each string is tokenized by the base class using d_delimiter. Every token is
 * looked up in the range [d_targets_begin, d_targets_end); on a match the token
 * is substituted with the corresponding entry of d_replacements. Tokens that
 * match no target are reported as "do not replace".
 *
 * The base class `process_string` writes output sizes into d_offsets while
 * d_chars is null and copies output bytes once d_chars is set, so this functor
 * is intended to be invoked once for sizing and once more for filling.
 */
struct replace_tokens_fn : base_token_replacer_fn {
  strings_iterator d_targets_begin;               ///< first target string to match against
  strings_iterator d_targets_end;                 ///< one past the last target string
  cudf::column_device_view const d_replacements;  ///< replacement strings

  replace_tokens_fn(cudf::column_device_view const& d_strings,
                    cudf::string_view const& d_delimiter,
                    strings_iterator d_targets_begin,
                    strings_iterator d_targets_end,
                    cudf::column_device_view const& d_replacements)
    : base_token_replacer_fn{d_strings, d_delimiter},
      d_targets_begin{d_targets_begin},
      d_targets_end{d_targets_end},
      d_replacements{d_replacements}
  {
  }

  /**
   * @brief Decide whether a token is replaced and, if so, with which string.
   *
   * @param token Token candidate to look up among the targets.
   * @return Pair whose first member is true when the token matched a target and
   *         whose second member is then the string to substitute.
   */
  __device__ replace_result token_replacement(cudf::string_view const& token)
  {
    // sequential search of the targets for this token
    auto const match = thrust::find(thrust::seq, d_targets_begin, d_targets_end, token);

    // no target matched: leave the token alone
    if (match == d_targets_end) { return replace_result{false, cudf::string_view()}; }

    // a single replacement string is shared by every target
    if (d_replacements.size() == 1) {
      return replace_result{true, d_replacements.element<cudf::string_view>(0)};
    }

    // otherwise the replacement sits at the same position as the matched target
    auto const target_idx = thrust::distance(d_targets_begin, match);
    return replace_result{true, d_replacements.element<cudf::string_view>(target_idx)};
  }

  /**
   * @brief Process the string at row `idx`, consulting `token_replacement` for each token.
   *
   * @param idx Index of the string to process.
   */
  __device__ void operator()(cudf::size_type idx)
  {
    auto lookup = [this] __device__(cudf::string_view const& token) {
      return token_replacement(token);
    };
    process_string(idx, lookup);
  }
};

/**
 * @brief Functor that filters out short tokens from each string.
 *
 * Each string is tokenized by the base class using d_delimiter. Any token whose
 * length() is less than min_token_length is substituted with d_replacement;
 * all other tokens are kept as they are.
 *
 * The base class `process_string` writes output sizes into d_offsets while
 * d_chars is null and copies output bytes once d_chars is set, so this functor
 * is intended to be invoked once for sizing and once more for filling.
 */
struct remove_small_tokens_fn : base_token_replacer_fn {
  cudf::size_type min_token_length;       ///< tokens shorter than this are replaced
  cudf::string_view const d_replacement;  ///< string written in place of a short token

  remove_small_tokens_fn(cudf::column_device_view const& d_strings,
                         cudf::string_view const& d_delimiter,
                         cudf::size_type min_token_length,
                         cudf::string_view const& d_replacement)
    : base_token_replacer_fn{d_strings, d_delimiter},
      min_token_length{min_token_length},
      d_replacement{d_replacement}
  {
  }

  /**
   * @brief Process the string at row `idx`, replacing every token that is too short.
   *
   * @param idx Index of the string to process.
   */
  __device__ void operator()(cudf::size_type idx)
  {
    process_string(idx, [this] __device__(cudf::string_view const& token) {
      // the replacement is always supplied; the flag tells the caller whether to use it
      bool const too_short = token.length() < min_token_length;
      return replace_result{too_short, d_replacement};
    });
  }
};

} // namespace

// detail APIs

// zero or more character tokenizer
std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& strings,
cudf::strings_column_view const& targets,
cudf::strings_column_view const& replacements,
Expand All @@ -125,38 +208,57 @@ std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& st
auto replacements_column = cudf::column_device_view::create(replacements.parent(), stream);
cudf::string_view d_delimiter(delimiter.data(), delimiter.size());
replace_tokens_fn replacer{*strings_column,
d_delimiter,
targets_column->begin<cudf::string_view>(),
targets_column->end<cudf::string_view>(),
*replacements_column,
d_delimiter};
*replacements_column};

// copy null mask from input column
rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr);

// this utility calls replacer to build the offsets and chars columns
auto children = cudf::strings::detail::make_strings_children(
replacer, strings_count, strings.null_count(), mr, stream);

// return new strings column
return cudf::make_strings_column(strings_count,
std::move(children.first),
std::move(children.second),
strings.null_count(),
std::move(null_mask),
stream,
mr);
}

std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& strings,
cudf::size_type min_token_length,
cudf::string_scalar const& replacement,
cudf::string_scalar const& delimiter,
cudaStream_t stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(replacement.is_valid(), "Parameter replacement must be valid");
CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid");

cudf::size_type const strings_count = strings.size();
if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
davidwendt marked this conversation as resolved.
Show resolved Hide resolved

auto strings_column = cudf::column_device_view::create(strings.parent(), stream);
cudf::string_view d_replacement(replacement.data(), replacement.size());
cudf::string_view d_delimiter(delimiter.data(), delimiter.size());
remove_small_tokens_fn filterer{*strings_column, d_delimiter, min_token_length, d_replacement};

// copy null mask from input column
rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr);

// create offsets by calculating size of each string for output
auto offsets_transformer_itr =
thrust::make_transform_iterator(thrust::make_counting_iterator<int32_t>(0), replacer);
auto offsets_column = cudf::strings::detail::make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream);
replacer.d_offsets = offsets_column->view().data<int32_t>();

// build the chars column
cudf::size_type const bytes = thrust::device_pointer_cast(replacer.d_offsets)[strings_count];
auto chars_column = cudf::strings::detail::create_chars_child_column(
strings_count, strings.null_count(), bytes, mr, stream);
replacer.d_chars = chars_column->mutable_view().data<char>();

// copy tokens to the chars buffer
thrust::for_each_n(rmm::exec_policy(stream)->on(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
strings_count,
replacer);
chars_column->set_null_count(0); // reset null count for child column
// this utility calls filterer to build the offsets and chars columns
auto children = cudf::strings::detail::make_strings_children(
filterer, strings_count, strings.null_count(), mr, stream);

// return new strings column
return cudf::make_strings_column(strings_count,
std::move(offsets_column),
std::move(chars_column),
std::move(children.first),
std::move(children.second),
strings.null_count(),
std::move(null_mask),
stream,
Expand All @@ -177,4 +279,14 @@ std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& st
return detail::replace_tokens(strings, targets, replacements, delimiter, 0, mr);
}

/**
 * @brief Public entry point for filter_tokens.
 *
 * Opens a CUDF_FUNC_RANGE() for the call and forwards every argument to
 * detail::filter_tokens, passing stream 0.
 */
std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& strings,
                                            cudf::size_type min_token_length,
                                            cudf::string_scalar const& replacement,
                                            cudf::string_scalar const& delimiter,
                                            rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  cudaStream_t const default_stream = 0;
  auto filtered = detail::filter_tokens(
    strings, min_token_length, replacement, delimiter, default_stream, mr);
  return filtered;
}

} // namespace nvtext
Loading