Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose stream parameter in public nvtext tokenize APIs #14317

Merged
merged 4 commits into from
Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 36 additions & 24 deletions cpp/include/nvtext/tokenize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,17 @@ namespace nvtext {
*
* All null row entries are ignored and the output contains all valid rows.
*
* @param strings Strings column tokenize.
* @param input Strings column to tokenize
* @param delimiter UTF-8 characters used to separate each string into tokens.
* The default of empty string will separate tokens using whitespace.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings columns of tokens
*/
std::unique_ptr<cudf::column> tokenize(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::string_scalar const& delimiter = cudf::string_scalar{""},
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -84,14 +86,16 @@ std::unique_ptr<cudf::column> tokenize(
*
* @throw cudf::logic_error if the delimiters column is empty or contains nulls.
*
* @param strings Strings column to tokenize.
* @param delimiters Strings used to separate individual strings into tokens.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param input Strings column to tokenize
* @param delimiters Strings used to separate individual strings into tokens
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings columns of tokens
*/
std::unique_ptr<cudf::column> tokenize(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::strings_column_view const& delimiters,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -112,15 +116,17 @@ std::unique_ptr<cudf::column> tokenize(
* All null row entries are ignored and the output contains all valid rows.
* The number of tokens for a null element is set to 0 in the output column.
*
* @param strings Strings column to use for this operation
* @param delimiter Strings used to separate each string into tokens;
* @param input Strings column to count tokens
* @param delimiter Strings used to separate each string into tokens.
* The default of empty string will separate tokens using whitespace.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column of token counts
*/
std::unique_ptr<cudf::column> count_tokens(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::string_scalar const& delimiter = cudf::string_scalar{""},
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -143,14 +149,16 @@ std::unique_ptr<cudf::column> count_tokens(
*
* @throw cudf::logic_error if the delimiters column is empty or contains nulls
*
* @param strings Strings column to use for this operation
* @param input Strings column to count tokens
* @param delimiters Strings used to separate each string into tokens
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column of token counts
*/
std::unique_ptr<cudf::column> count_tokens(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::strings_column_view const& delimiters,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -168,12 +176,14 @@ std::unique_ptr<cudf::column> count_tokens(
*
* All null row entries are ignored and the output contains all valid rows.
*
* @param strings Strings column to tokenize.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param input Strings column to tokenize
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings columns of tokens
*/
std::unique_ptr<cudf::column> character_tokenize(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -203,16 +213,18 @@ std::unique_ptr<cudf::column> character_tokenize(
* @throw cudf::logic_error if `row_indices.size() != strings.size()`
* @throw cudf::logic_error if `row_indices` contains nulls
*
* @param strings Strings column to detokenize.
* @param row_indices The relative output row index assigned for each token in the input column.
* @param separator String to append after concatenating each token to the proper output row.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param input Strings column to detokenize
* @param row_indices The relative output row index assigned for each token in the input column
* @param separator String to append after concatenating each token to the proper output row
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings columns of tokens
*/
std::unique_ptr<cudf::column> detokenize(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::column_view const& row_indices,
cudf::string_scalar const& separator = cudf::string_scalar(" "),
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down
7 changes: 4 additions & 3 deletions cpp/src/text/detokenize.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -169,13 +169,14 @@ std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& string

} // namespace detail

std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& input,
cudf::column_view const& row_indices,
cudf::string_scalar const& separator,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::detokenize(strings, row_indices, separator, cudf::get_default_stream(), mr);
return detail::detokenize(input, row_indices, separator, stream, mr);
}

} // namespace nvtext
25 changes: 15 additions & 10 deletions cpp/src/text/tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -232,43 +232,48 @@ std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const

// external APIs

std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& input,
cudf::string_scalar const& delimiter,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::tokenize(strings, delimiter, cudf::get_default_stream(), mr);
return detail::tokenize(input, delimiter, stream, mr);
}

std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& input,
cudf::strings_column_view const& delimiters,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::tokenize(strings, delimiters, cudf::get_default_stream(), mr);
return detail::tokenize(input, delimiters, stream, mr);
}

std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& input,
cudf::string_scalar const& delimiter,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::count_tokens(strings, delimiter, cudf::get_default_stream(), mr);
return detail::count_tokens(input, delimiter, stream, mr);
}

std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& input,
cudf::strings_column_view const& delimiters,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::count_tokens(strings, delimiters, cudf::get_default_stream(), mr);
return detail::count_tokens(input, delimiters, stream, mr);
}

std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::character_tokenize(strings, cudf::get_default_stream(), mr);
return detail::character_tokenize(input, stream, mr);
}

} // namespace nvtext
4 changes: 3 additions & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -649,7 +649,9 @@ ConfigureTest(
testing
)
ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing)
ConfigureTest(
STREAM_TEXT_TEST streams/text/ngrams_test.cpp streams/text/tokenize_test.cpp STREAM_MODE testing
)
ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing)

# ##################################################################################################
Expand Down
53 changes: 53 additions & 0 deletions cpp/tests/streams/text/tokenize_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <nvtext/tokenize.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/default_stream.hpp>

// Test fixture for verifying the stream-accepting nvtext tokenize APIs.
class TextTokenizeTest : public cudf::test::BaseFixture {};

// Exercise both tokenize/count_tokens overloads with an explicitly supplied CUDA stream.
TEST_F(TextTokenizeTest, Tokenize)
{
  auto const stream = cudf::test::get_default_stream();
  auto const input  = cudf::test::strings_column_wrapper({"the fox jumped", "over thé dog"});
  auto const view   = cudf::strings_column_view(input);

  // single-delimiter overloads
  auto const delimiter = cudf::string_scalar{" ", true, stream};
  nvtext::tokenize(view, delimiter, stream);
  nvtext::count_tokens(view, delimiter, stream);

  // multi-delimiter overloads
  auto const delimiters      = cudf::test::strings_column_wrapper({" ", "o", "é"});
  auto const delimiters_view = cudf::strings_column_view(delimiters);
  nvtext::tokenize(view, delimiters_view, stream);
  nvtext::count_tokens(view, delimiters_view, stream);
}
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved

// Exercise character_tokenize with an explicitly supplied CUDA stream.
TEST_F(TextTokenizeTest, CharacterTokenize)
{
  auto const input =
    cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"});
  auto const view = cudf::strings_column_view(input);
  nvtext::character_tokenize(view, cudf::test::get_default_stream());
}

// Exercise detokenize with an explicitly supplied CUDA stream.
TEST_F(TextTokenizeTest, Detokenize)
{
  auto const stream = cudf::test::get_default_stream();
  auto const input =
    cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"});
  // first three tokens map to output row 0, last three to output row 1
  auto const indices   = cudf::test::fixed_width_column_wrapper<int32_t>({0, 0, 0, 1, 1, 1});
  auto const separator = cudf::string_scalar{" ", true, stream};
  nvtext::detokenize(cudf::strings_column_view(input), indices, separator, stream);
}