rapidsai · davidwendt · Jul 16, 2020 · Jul 8, 2020 · Jul 8, 2020 · Jul 8, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -40,6 +40,7 @@
 - PR #5612 Add `is_hex` strings API
 - PR #5637 Parameterize Null comparator behaviour in Joins
 - PR #5623 Add `is_ipv4` strings API
+- PR #5658 Add `filter_tokens` nvtext API
 - PR #5666 Add `filter_characters_of_type` strings API
 - PR #5673 Always build and test with per-thread default stream enabled in the GPU CI build
 

@@ -87,5 +87,52 @@ std::unique_ptr<cudf::column> replace_tokens(
   cudf::string_scalar const& delimiter = cudf::string_scalar{""},
   rmm::mr::device_memory_resource* mr  = rmm::mr::get_default_resource());
 
+/**
+ * @brief Removes tokens whose lengths are less than a specified number of characters.
+ *
+ * Tokens identified in each string are removed from the corresponding output string.
+ * The removed tokens can be replaced by specifying a `replacement` string as well.
+ *
+ * The `delimiter` may be zero or more characters. If the `delimiter` is empty,
+ * whitespace (character code-point <= ' ') is used for identifying tokens.
+ * Also, any consecutive delimiters found in a string are ignored.
+ *
+ * @code{.pseudo}
+ * Example:
+ * s = ["this is me", "theme music"]
+ * result = filter_tokens(s,3)
+ * result is now ["this  ", "theme music"]
+ * @endcode
+ *
+ * Note the first string in `result` still retains the space delimiters.
+ *
+ * Example with a `replacement` string.
+ *
+ * @code{.pseudo}
+ * Example:
+ * s = ["this is me", "theme music"]
+ * result = filter_tokens(s,5,"---")
+ * result is now ["--- --- ---", "theme music"]
+ * @endcode
+ *
+ * The `replacement` string is allowed to be shorter than min_token_length.
+ *
+ * @throw cudf::logic_error if `delimiter` or `replacement` is invalid
+ *
+ * @param strings Strings column to filter.
+ * @param min_token_length The minimum number of characters to retain a token in the output string.
+ * @param replacement Optional replacement string to be used in place of removed tokens.
+ * @param delimiter Characters used to separate each string into tokens.
+ *                  The default of empty string will identify tokens using whitespace.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @return New strings column with the filtered strings.
+ */
+std::unique_ptr<cudf::column> filter_tokens(
+  cudf::strings_column_view const& strings,
+  cudf::size_type min_token_length,
+  cudf::string_scalar const& replacement = cudf::string_scalar{""},
+  cudf::string_scalar const& delimiter   = cudf::string_scalar{""},
+  rmm::mr::device_memory_resource* mr    = rmm::mr::get_default_resource());
+
 /** @} */  // end of group
 }  // namespace nvtext
@@ -31,78 +31,161 @@ namespace nvtext {
 namespace detail {
 namespace {
 
-using strings_iterator = cudf::column_device_view::const_iterator<cudf::string_view>;
+using replace_result = thrust::pair<bool, cudf::string_view>;
 
-/**
- * @brief Functor to replace tokens in each string.
- *
- * This tokenizes a string using the given d_delimiter and replaces any tokens that match
- * a string in d_targets_begin/end with those from the d_replacements column.
- * Strings with no matching tokens are left unchanged.
- *
- * This should be called first to compute the size of each output string and then a second
- * time to fill in the allocated output buffer for each string.
- */
-struct replace_tokens_fn {
+struct base_token_replacer_fn {
   cudf::column_device_view const d_strings;  ///< strings to tokenize
-  strings_iterator d_targets_begin;          ///< strings to search for
-  strings_iterator d_targets_end;
-  cudf::column_device_view const d_replacements;  ///< replacement strings
-  cudf::string_view const d_delimiter;            ///< delimiter characters for tokenizing
-  const int32_t* d_offsets{};                     ///< for locating output string in d_chars
-  char* d_chars{};                                ///< output buffer
+  cudf::string_view const d_delimiter;       ///< delimiter characters for tokenizing
+  int32_t* d_offsets{};                      ///< for locating output string in d_chars
+  char* d_chars{};                           ///< output buffer
 
-  __device__ cudf::size_type operator()(cudf::size_type idx)
+  /**
+   * @brief Tokenizes each string and calls the provided `replacer` function
+   * for each token.
+   *
+   * @tparam ReplaceFn Should accept a `string_view` and return a `replace_result`
+   * @param idx Index of the current string to process
+   * @param replacer Function to call for each token to determine its replacement
+   */
+  template <typename ReplaceFn>
+  __device__ void process_string(cudf::size_type idx, ReplaceFn replacer)
   {
-    if (d_strings.is_null(idx)) return 0;
+    if (d_strings.is_null(idx)) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
 
     auto const d_str  = d_strings.element<cudf::string_view>(idx);
     auto const in_ptr = d_str.data();
     auto out_ptr      = d_chars ? d_chars + d_offsets[idx] : nullptr;
-    auto nbytes       = d_str.size_bytes();
+    auto nbytes       = d_str.size_bytes();  // count the output bytes
     auto last_pos     = cudf::size_type{0};
     auto tokenizer    = characters_tokenizer{d_str, d_delimiter};
-
+    // process each token
     while (tokenizer.next_token()) {
       auto const token_pos = tokenizer.token_byte_positions();
       auto const token =
         cudf::string_view{d_str.data() + token_pos.first, token_pos.second - token_pos.first};
-
-      // check if the token matches any of the targets
-      auto const found_itr = thrust::find(thrust::seq, d_targets_begin, d_targets_end, token);
-      if (found_itr != d_targets_end) {  // match found
-        // retrieve the corresponding replacement string or
-        // if only one repl string, use that one for all targets
-        auto const d_repl = [&] {
-          auto const repl_idx = thrust::distance(d_targets_begin, found_itr);
-          return d_replacements.size() == 1 ? d_replacements.element<cudf::string_view>(0)
-                                            : d_replacements.element<cudf::string_view>(repl_idx);
-        }();
-
-        nbytes += d_repl.size_bytes() - token.size_bytes();  // total output bytes
-
+      // ask replacer if this token should be replaced
+      auto const result = replacer(token);
+      if (result.first) {  // first == replace indicator, second == new string
+        auto d_replacement = result.second;
+        nbytes += d_replacement.size_bytes() - token.size_bytes();
         if (out_ptr) {
           // copy over string up to the token location
           out_ptr = cudf::strings::detail::copy_and_increment(
             out_ptr, in_ptr + last_pos, token_pos.first - last_pos);
           // copy over replacement string
-          out_ptr  = cudf::strings::detail::copy_string(out_ptr, d_repl);
+          out_ptr  = cudf::strings::detail::copy_string(out_ptr, d_replacement);
           last_pos = token_pos.second;  // update last byte position for this string
         }
       }
     }
 
-    // copy the remainder of the string bytes to the output buffer
-    if (out_ptr) memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos);
-    return nbytes;
+    // copy the remainder of the string's bytes to the output buffer
+    if (out_ptr)
+      memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos);
+    else
+      d_offsets[idx] = nbytes;
+  }
+};
+
+using strings_iterator = cudf::column_device_view::const_iterator<cudf::string_view>;
+
+/**
+ * @brief Functor to replace tokens in each string.
+ *
+ * This tokenizes a string using the given d_delimiter and replaces any tokens that match
+ * a string in d_targets_begin/end with those from the d_replacements column.
+ * Strings with no matching tokens are left unchanged.
+ *
+ * This should be called first to compute the size of each output string and then a second
+ * time to fill in the allocated output buffer for each string.
+ */
+struct replace_tokens_fn : base_token_replacer_fn {
+  strings_iterator d_targets_begin;  ///< strings to search for
+  strings_iterator d_targets_end;
+  cudf::column_device_view const d_replacements;  ///< replacement strings
+
+  replace_tokens_fn(cudf::column_device_view const& d_strings,
+                    cudf::string_view const& d_delimiter,
+                    strings_iterator d_targets_begin,
+                    strings_iterator d_targets_end,
+                    cudf::column_device_view const& d_replacements)
+    : base_token_replacer_fn{d_strings, d_delimiter},
+      d_targets_begin{d_targets_begin},
+      d_targets_end{d_targets_end},
+      d_replacements{d_replacements}
+  {
+  }
+
+  /**
+   * @brief Return replacement string for the given token.
+   *
+   * @param token Token candidate to be replaced.
+   * @return Result pair specifying whether to replace the token and the new string
+   */
+  __device__ replace_result token_replacement(cudf::string_view const& token)
+  {
+    // check if the token matches any of the targets
+    auto const found_itr = thrust::find(thrust::seq, d_targets_begin, d_targets_end, token);
+    if (found_itr != d_targets_end) {  // match found
+      // retrieve the corresponding replacement string or
+      // if only one repl string, use that one for all targets
+      auto const d_repl = [&] {
+        auto const repl_idx = thrust::distance(d_targets_begin, found_itr);
+        return d_replacements.size() == 1 ? d_replacements.element<cudf::string_view>(0)
+                                          : d_replacements.element<cudf::string_view>(repl_idx);
+      }();
+      return replace_result{true, d_repl};
+    }
+    // otherwise, do not replace this token
+    return replace_result{false, cudf::string_view()};
+  }
+
+  __device__ void operator()(cudf::size_type idx)
+  {
+    process_string(
+      idx, [this] __device__(cudf::string_view const& token) { return token_replacement(token); });
+  }
+};
+
+/**
+ * @brief Functor to filter tokens in each string.
+ *
+ * This tokenizes a string using the given d_delimiter and replaces any tokens
+ * that are shorter than min_token_length with a replacement string.
+ *
+ * This should be called first to compute the size of each output string and then
+ * a second time to fill in the allocated output buffer for each string.
+ */
+struct remove_small_tokens_fn : base_token_replacer_fn {
+  cudf::size_type min_token_length;       ///< minimum size for found tokens
+  cudf::string_view const d_replacement;  ///< replacement string
+
+  remove_small_tokens_fn(cudf::column_device_view const& d_strings,
+                         cudf::string_view const& d_delimiter,
+                         cudf::size_type min_token_length,
+                         cudf::string_view const& d_replacement)
+    : base_token_replacer_fn{d_strings, d_delimiter},
+      min_token_length{min_token_length},
+      d_replacement{d_replacement}
+  {
+  }
+
+  __device__ void operator()(cudf::size_type idx)
+  {
+    auto replacer = [this] __device__(cudf::string_view const& token) {
+      return replace_result{token.length() < min_token_length, d_replacement};
+    };
+    process_string(idx, replacer);
   }
 };
 
 }  // namespace
 
 // detail APIs
 
-// zero or more character tokenizer
 std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& strings,
                                              cudf::strings_column_view const& targets,
                                              cudf::strings_column_view const& replacements,
@@ -125,38 +208,57 @@ std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& st
   auto replacements_column = cudf::column_device_view::create(replacements.parent(), stream);
   cudf::string_view d_delimiter(delimiter.data(), delimiter.size());
   replace_tokens_fn replacer{*strings_column,
+                             d_delimiter,
                              targets_column->begin<cudf::string_view>(),
                              targets_column->end<cudf::string_view>(),
-                             *replacements_column,
-                             d_delimiter};
+                             *replacements_column};
+
+  // copy null mask from input column
+  rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr);
+
+  // this utility calls replacer to build the offsets and chars columns
+  auto children = cudf::strings::detail::make_strings_children(
+    replacer, strings_count, strings.null_count(), mr, stream);
+
+  // return new strings column
+  return cudf::make_strings_column(strings_count,
+                                   std::move(children.first),
+                                   std::move(children.second),
+                                   strings.null_count(),
+                                   std::move(null_mask),
+                                   stream,
+                                   mr);
+}
+
+std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& strings,
+                                            cudf::size_type min_token_length,
+                                            cudf::string_scalar const& replacement,
+                                            cudf::string_scalar const& delimiter,
+                                            cudaStream_t stream,
+                                            rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(replacement.is_valid(), "Parameter replacement must be valid");
+  CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid");
+
+  cudf::size_type const strings_count = strings.size();
+  if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
+
+  auto strings_column = cudf::column_device_view::create(strings.parent(), stream);
+  cudf::string_view d_replacement(replacement.data(), replacement.size());
+  cudf::string_view d_delimiter(delimiter.data(), delimiter.size());
+  remove_small_tokens_fn filterer{*strings_column, d_delimiter, min_token_length, d_replacement};
 
   // copy null mask from input column
   rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr);
 
-  // create offsets by calculating size of each string for output
-  auto offsets_transformer_itr =
-    thrust::make_transform_iterator(thrust::make_counting_iterator<int32_t>(0), replacer);
-  auto offsets_column = cudf::strings::detail::make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream);
-  replacer.d_offsets = offsets_column->view().data<int32_t>();
-
-  // build the chars column
-  cudf::size_type const bytes = thrust::device_pointer_cast(replacer.d_offsets)[strings_count];
-  auto chars_column           = cudf::strings::detail::create_chars_child_column(
-    strings_count, strings.null_count(), bytes, mr, stream);
-  replacer.d_chars = chars_column->mutable_view().data<char>();
-
-  // copy tokens to the chars buffer
-  thrust::for_each_n(rmm::exec_policy(stream)->on(stream),
-                     thrust::make_counting_iterator<cudf::size_type>(0),
-                     strings_count,
-                     replacer);
-  chars_column->set_null_count(0);  // reset null count for child column
+  // this utility calls filterer to build the offsets and chars columns
+  auto children = cudf::strings::detail::make_strings_children(
+    filterer, strings_count, strings.null_count(), mr, stream);
 
   // return new strings column
   return cudf::make_strings_column(strings_count,
-                                   std::move(offsets_column),
-                                   std::move(chars_column),
+                                   std::move(children.first),
+                                   std::move(children.second),
                                    strings.null_count(),
                                    std::move(null_mask),
                                    stream,
@@ -177,4 +279,14 @@ std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& st
   return detail::replace_tokens(strings, targets, replacements, delimiter, 0, mr);
 }
 
+std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& strings,
+                                            cudf::size_type min_token_length,
+                                            cudf::string_scalar const& replacement,
+                                            cudf::string_scalar const& delimiter,
+                                            rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::filter_tokens(strings, min_token_length, replacement, delimiter, 0, mr);
+}
+
 }  // namespace nvtext