Skip to content

Commit

Permalink
Merge pull request #5658 from davidwendt/fea-filter-tokens
Browse files Browse the repository at this point in the history
[REVIEW] Add filter_tokens nvtext API
  • Loading branch information
davidwendt authored Jul 16, 2020
2 parents e166da5 + 39a4409 commit c5c5dc7
Show file tree
Hide file tree
Showing 8 changed files with 420 additions and 65 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
- PR #5612 Add `is_hex` strings API
- PR #5637 Parameterize Null comparator behaviour in Joins
- PR #5623 Add `is_ipv4` strings API
- PR #5658 Add `filter_tokens` nvtext API
- PR #5666 Add `filter_characters_of_type` strings API
- PR #5673 Always build and test with per-thread default stream enabled in the GPU CI build

Expand Down
47 changes: 47 additions & 0 deletions cpp/include/nvtext/replace.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,5 +87,52 @@ std::unique_ptr<cudf::column> replace_tokens(
cudf::string_scalar const& delimiter = cudf::string_scalar{""},
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**
* @brief Removes tokens whose lengths are less than a specified number of characters.
*
* Each string is split into tokens and every token shorter than `min_token_length`
* characters is removed from the corresponding output string. Only the token itself
* is removed; the delimiters around it are kept.
* The removed tokens can be replaced by specifying a `replacement` string as well.
*
* The `delimiter` may be zero or more characters. If the `delimiter` is empty,
* whitespace (character code-point <= ' ') is used for identifying tokens.
* Also, any consecutive delimiters found in a string are ignored.
*
* @code{.pseudo}
* Example:
* s = ["this is me", "theme music"]
* result = filter_tokens(s,3)
* result is now ["this  ", "theme music"]
* @endcode
*
* Note the first string in `result` still retains both space delimiters:
* only the tokens "is" and "me" were removed.
*
* Example with a `replacement` string.
*
* @code{.pseudo}
* Example:
* s = ["this is me", "theme music"]
* result = filter_tokens(s,5,"---")
* result is now ["--- --- ---", "theme music"]
* @endcode
*
* The `replacement` string is allowed to be shorter than min_token_length.
*
* Null entries in `strings` produce null entries in the output.
*
* @throw cudf::logic_error if `delimiter` or `replacement` is invalid
*
* @param strings Strings column to filter.
* @param min_token_length The minimum number of characters to retain a token in the output string.
* @param replacement Optional replacement string to be used in place of removed tokens.
* The default of empty string removes the tokens without substituting anything.
* @param delimiter Characters used to separate each string into tokens.
* The default of empty string will identify tokens using whitespace.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column with the short tokens removed or replaced.
*/
std::unique_ptr<cudf::column> filter_tokens(
cudf::strings_column_view const& strings,
cudf::size_type min_token_length,
cudf::string_scalar const& replacement = cudf::string_scalar{""},
cudf::string_scalar const& delimiter = cudf::string_scalar{""},
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/** @} */ // end of group
} // namespace nvtext
240 changes: 176 additions & 64 deletions cpp/src/text/replace.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,78 +31,161 @@ namespace nvtext {
namespace detail {
namespace {

using strings_iterator = cudf::column_device_view::const_iterator<cudf::string_view>;
using replace_result = thrust::pair<bool, cudf::string_view>;

/**
* @brief Functor to replace tokens in each string.
*
* This tokenizes a string using the given d_delimiter and replaces any tokens that match
* a string in d_targets_begin/end with those from the d_replacements column.
* Strings with no matching tokens are left unchanged.
*
* This should be called first to compute the size of each output string and then a second
* time to fill in the allocated output buffer for each string.
*/
struct replace_tokens_fn {
struct base_token_replacer_fn {
cudf::column_device_view const d_strings; ///< strings to tokenize
strings_iterator d_targets_begin; ///< strings to search for
strings_iterator d_targets_end;
cudf::column_device_view const d_replacements; ///< replacement strings
cudf::string_view const d_delimiter; ///< delimiter characters for tokenizing
const int32_t* d_offsets{}; ///< for locating output string in d_chars
char* d_chars{}; ///< output buffer
cudf::string_view const d_delimiter; ///< delimiter characters for tokenizing
int32_t* d_offsets{}; ///< for locating output string in d_chars
char* d_chars{}; ///< output buffer

__device__ cudf::size_type operator()(cudf::size_type idx)
/**
* @brief Tokenizes each string and calls the provided `replacer` function
* for each token.
*
* @tparam ReplaceFn Should accept a `string_view` and return a `replace_result`
* @param idx Index of the current string to process
* @param replacer Function to call for each token to determine its replacement
*/
template <typename ReplaceFn>
__device__ void process_string(cudf::size_type idx, ReplaceFn replacer)
{
if (d_strings.is_null(idx)) return 0;
if (d_strings.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
return;
}

auto const d_str = d_strings.element<cudf::string_view>(idx);
auto const in_ptr = d_str.data();
auto out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr;
auto nbytes = d_str.size_bytes();
auto nbytes = d_str.size_bytes(); // count the output bytes
auto last_pos = cudf::size_type{0};
auto tokenizer = characters_tokenizer{d_str, d_delimiter};

// process each token
while (tokenizer.next_token()) {
auto const token_pos = tokenizer.token_byte_positions();
auto const token =
cudf::string_view{d_str.data() + token_pos.first, token_pos.second - token_pos.first};

// check if the token matches any of the targets
auto const found_itr = thrust::find(thrust::seq, d_targets_begin, d_targets_end, token);
if (found_itr != d_targets_end) { // match found
// retrieve the corresponding replacement string or
// if only one repl string, use that one for all targets
auto const d_repl = [&] {
auto const repl_idx = thrust::distance(d_targets_begin, found_itr);
return d_replacements.size() == 1 ? d_replacements.element<cudf::string_view>(0)
: d_replacements.element<cudf::string_view>(repl_idx);
}();

nbytes += d_repl.size_bytes() - token.size_bytes(); // total output bytes

// ask replacer if this token should be replaced
auto const result = replacer(token);
if (result.first) { // first == replace indicator, second == new string
auto d_replacement = result.second;
nbytes += d_replacement.size_bytes() - token.size_bytes();
if (out_ptr) {
// copy over string up to the token location
out_ptr = cudf::strings::detail::copy_and_increment(
out_ptr, in_ptr + last_pos, token_pos.first - last_pos);
// copy over replacement string
out_ptr = cudf::strings::detail::copy_string(out_ptr, d_repl);
out_ptr = cudf::strings::detail::copy_string(out_ptr, d_replacement);
last_pos = token_pos.second; // update last byte position for this string
}
}
}

// copy the remainder of the string bytes to the output buffer
if (out_ptr) memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos);
return nbytes;
// copy the remainder of the string's bytes to the output buffer
if (out_ptr)
memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos);
else
d_offsets[idx] = nbytes;
}
};

using strings_iterator = cudf::column_device_view::const_iterator<cudf::string_view>;

/**
 * @brief Token replacer that substitutes target tokens with replacement strings.
 *
 * The base class tokenizes each string on d_delimiter and hands every token to
 * token_replacement(). A token equal to one of the strings in
 * [d_targets_begin, d_targets_end) is swapped for the corresponding row of
 * d_replacements; when d_replacements holds a single row, that row is used for
 * every target. Tokens that match no target are left unchanged.
 *
 * This should be called first to compute the size of each output string and
 * then a second time to fill in the allocated output buffer for each string.
 */
struct replace_tokens_fn : base_token_replacer_fn {
  strings_iterator d_targets_begin;               ///< start of the strings to search for
  strings_iterator d_targets_end;                 ///< end of the strings to search for
  cudf::column_device_view const d_replacements;  ///< replacement strings

  replace_tokens_fn(cudf::column_device_view const& d_strings,
                    cudf::string_view const& d_delimiter,
                    strings_iterator d_targets_begin,
                    strings_iterator d_targets_end,
                    cudf::column_device_view const& d_replacements)
    : base_token_replacer_fn{d_strings, d_delimiter},
      d_targets_begin{d_targets_begin},
      d_targets_end{d_targets_end},
      d_replacements{d_replacements}
  {
  }

  /**
   * @brief Look up the replacement for a single token.
   *
   * @param token Token candidate to be replaced.
   * @return Pair whose first member says whether to replace the token and
   *         whose second member is the string to write in its place.
   */
  __device__ replace_result token_replacement(cudf::string_view const& token)
  {
    // sequential search of the targets for an exact match with this token
    auto const match = thrust::find(thrust::seq, d_targets_begin, d_targets_end, token);

    // no target matched: tell the caller to keep the token as it is
    if (match == d_targets_end) { return replace_result{false, cudf::string_view()}; }

    // a lone replacement string is shared by every target
    if (d_replacements.size() == 1) {
      return replace_result{true, d_replacements.element<cudf::string_view>(0)};
    }

    // otherwise the replacement sits at the same position as the matched target
    return replace_result{
      true, d_replacements.element<cudf::string_view>(thrust::distance(d_targets_begin, match))};
  }

  /**
   * @brief Process the string at row `idx`, replacing any tokens that match a target.
   *
   * @param idx Index of the string to process.
   */
  __device__ void operator()(cudf::size_type idx)
  {
    auto lookup = [this] __device__(cudf::string_view const& token) {
      return token_replacement(token);
    };
    process_string(idx, lookup);
  }
};

/**
 * @brief Token replacer that filters out short tokens.
 *
 * The base class tokenizes each string on d_delimiter; every token with fewer
 * than min_token_length characters is replaced by d_replacement. All other
 * tokens are kept as they are.
 *
 * This should be called first to compute the size of each output string and then
 * a second time to fill in the allocated output buffer for each string.
 */
struct remove_small_tokens_fn : base_token_replacer_fn {
  cudf::size_type min_token_length;       ///< tokens shorter than this are replaced
  cudf::string_view const d_replacement;  ///< string written in place of a short token

  remove_small_tokens_fn(cudf::column_device_view const& d_strings,
                         cudf::string_view const& d_delimiter,
                         cudf::size_type min_token_length,
                         cudf::string_view const& d_replacement)
    : base_token_replacer_fn{d_strings, d_delimiter},
      min_token_length{min_token_length},
      d_replacement{d_replacement}
  {
  }

  /**
   * @brief Process the string at row `idx`, replacing every token that is too short.
   *
   * @param idx Index of the string to process.
   */
  __device__ void operator()(cudf::size_type idx)
  {
    process_string(idx, [this] __device__(cudf::string_view const& token) {
      // length() is compared against the minimum; the replacement is the same for every token
      bool const too_short = token.length() < min_token_length;
      return replace_result{too_short, d_replacement};
    });
  }
};

} // namespace

// detail APIs

// zero or more character tokenizer
std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& strings,
cudf::strings_column_view const& targets,
cudf::strings_column_view const& replacements,
Expand All @@ -125,38 +208,57 @@ std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& st
auto replacements_column = cudf::column_device_view::create(replacements.parent(), stream);
cudf::string_view d_delimiter(delimiter.data(), delimiter.size());
replace_tokens_fn replacer{*strings_column,
d_delimiter,
targets_column->begin<cudf::string_view>(),
targets_column->end<cudf::string_view>(),
*replacements_column,
d_delimiter};
*replacements_column};

// copy null mask from input column
rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr);

// this utility calls replacer to build the offsets and chars columns
auto children = cudf::strings::detail::make_strings_children(
replacer, strings_count, strings.null_count(), mr, stream);

// return new strings column
return cudf::make_strings_column(strings_count,
std::move(children.first),
std::move(children.second),
strings.null_count(),
std::move(null_mask),
stream,
mr);
}

std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& strings,
cudf::size_type min_token_length,
cudf::string_scalar const& replacement,
cudf::string_scalar const& delimiter,
cudaStream_t stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(replacement.is_valid(), "Parameter replacement must be valid");
CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid");

cudf::size_type const strings_count = strings.size();
if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});

auto strings_column = cudf::column_device_view::create(strings.parent(), stream);
cudf::string_view d_replacement(replacement.data(), replacement.size());
cudf::string_view d_delimiter(delimiter.data(), delimiter.size());
remove_small_tokens_fn filterer{*strings_column, d_delimiter, min_token_length, d_replacement};

// copy null mask from input column
rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr);

// create offsets by calculating size of each string for output
auto offsets_transformer_itr =
thrust::make_transform_iterator(thrust::make_counting_iterator<int32_t>(0), replacer);
auto offsets_column = cudf::strings::detail::make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream);
replacer.d_offsets = offsets_column->view().data<int32_t>();

// build the chars column
cudf::size_type const bytes = thrust::device_pointer_cast(replacer.d_offsets)[strings_count];
auto chars_column = cudf::strings::detail::create_chars_child_column(
strings_count, strings.null_count(), bytes, mr, stream);
replacer.d_chars = chars_column->mutable_view().data<char>();

// copy tokens to the chars buffer
thrust::for_each_n(rmm::exec_policy(stream)->on(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
strings_count,
replacer);
chars_column->set_null_count(0); // reset null count for child column
// this utility calls filterer to build the offsets and chars columns
auto children = cudf::strings::detail::make_strings_children(
filterer, strings_count, strings.null_count(), mr, stream);

// return new strings column
return cudf::make_strings_column(strings_count,
std::move(offsets_column),
std::move(chars_column),
std::move(children.first),
std::move(children.second),
strings.null_count(),
std::move(null_mask),
stream,
Expand All @@ -177,4 +279,14 @@ std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& st
return detail::replace_tokens(strings, targets, replacements, delimiter, 0, mr);
}

// Public entry point: opens a profiling range for this call and forwards all
// arguments to detail::filter_tokens, passing 0 for its stream parameter.
std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& strings,
                                            cudf::size_type min_token_length,
                                            cudf::string_scalar const& replacement,
                                            cudf::string_scalar const& delimiter,
                                            rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  auto filtered =
    detail::filter_tokens(strings, min_token_length, replacement, delimiter, 0, mr);
  return filtered;
}

} // namespace nvtext
Loading

0 comments on commit c5c5dc7

Please sign in to comment.