From 52f7d5c7d5d340c3c9011beaa075babc98c1cc0b Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 27 Oct 2023 11:59:46 -0400 Subject: [PATCH 1/2] Expose stream parameter in public strings filter APIs (#14293) Add stream parameter to public APIs: - `cudf::strings::translate()` - `cudf::strings::filter_characters()` - `cudf::strings::filter_characters_of_type()` - `cudf::strings::all_characters_of_type()` - `cudf::strings::reverse()` Also cleaned up some of the doxygen comments. Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) - https://github.com/shrshi Approvers: - Bradley Dice (https://github.com/bdice) - https://github.com/shrshi URL: https://github.com/rapidsai/cudf/pull/14293 --- .../cudf/strings/char_types/char_types.hpp | 26 ++++--- cpp/include/cudf/strings/reverse.hpp | 4 +- cpp/include/cudf/strings/translate.hpp | 30 ++++---- cpp/src/strings/char_types/char_types.cu | 11 +-- cpp/src/strings/filter_chars.cu | 5 +- cpp/src/strings/reverse.cu | 3 +- cpp/src/strings/translate.cu | 5 +- cpp/tests/CMakeLists.txt | 2 + cpp/tests/streams/strings/filter_test.cpp | 77 +++++++++++++++++++ cpp/tests/streams/strings/reverse_test.cpp | 34 ++++++++ 10 files changed, 162 insertions(+), 35 deletions(-) create mode 100644 cpp/tests/streams/strings/filter_test.cpp create mode 100644 cpp/tests/streams/strings/reverse_test.cpp diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 8b6c434719a..c6db5dab08a 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -53,18 +53,20 @@ namespace strings { * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param types The character types to check in each string. + * @param input Strings instance for this operation + * @param types The character types to check in each string * @param verify_types Only verify against these character types. * Default `ALL_TYPES` means return `true` * iff all characters match `types`. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr all_characters_of_type( - strings_column_view const& strings, + strings_column_view const& input, string_character_types types, string_character_types verify_types = string_character_types::ALL_TYPES, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -96,20 +98,22 @@ std::unique_ptr all_characters_of_type( * @throw cudf::logic_error if neither or both `types_to_remove` and * `types_to_keep` are set to `ALL_TYPES`. * - * @param strings Strings instance for this operation. + * @param input Strings instance for this operation * @param types_to_remove The character types to check in each string. * Use `ALL_TYPES` here to specify `types_to_keep` instead. - * @param replacement The replacement character to use when removing characters. + * @param replacement The replacement character to use when removing characters * @param types_to_keep Default `ALL_TYPES` means all characters of * `types_to_remove` will be filtered. 
- * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with characters of the specified types removed or replaced */ std::unique_ptr filter_characters_of_type( - strings_column_view const& strings, + strings_column_view const& input, string_character_types types_to_remove, string_scalar const& replacement = string_scalar(""), string_character_types types_to_keep = string_character_types::ALL_TYPES, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/reverse.hpp b/cpp/include/cudf/strings/reverse.hpp index 26fb36a540e..4fc8fbf67c2 100644 --- a/cpp/include/cudf/strings/reverse.hpp +++ b/cpp/include/cudf/strings/reverse.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -42,10 +42,12 @@ namespace strings { * * @param input Strings column for this operation + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New strings column */ std::unique_ptr reverse( strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/translate.hpp b/cpp/include/cudf/strings/translate.hpp index 0cbf6b22029..4bd09352b09 100644 --- a/cpp/include/cudf/strings/translate.hpp +++ b/cpp/include/cudf/strings/translate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,14 +47,16 @@ namespace strings { * r is now ["AA", "", "cccc", "AcQ"] * @endcode * - * @param strings Strings instance for this operation. - * @param chars_table Table of UTF-8 character mappings. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with padded strings. 
+ * @param input Strings instance for this operation + * @param chars_table Table of UTF-8 character mappings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with translated strings */ std::unique_ptr translate( - strings_column_view const& strings, + strings_column_view const& input, std::vector> const& chars_table, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -87,19 +89,21 @@ enum class filter_type : bool { * * @throw cudf::logic_error if `replacement` is invalid * - * @param strings Strings instance for this operation. - * @param characters_to_filter Table of character ranges to filter on. + * @param input Strings instance for this operation + * @param characters_to_filter Table of character ranges to filter on * @param keep_characters If true, the `characters_to_filter` are retained and all other characters - * are removed. - * @param replacement Optional replacement string for each character removed. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with filtered strings. 
+ * are removed + * @param replacement Optional replacement string for each character removed + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with filtered strings */ std::unique_ptr filter_characters( - strings_column_view const& strings, + strings_column_view const& input, std::vector> characters_to_filter, filter_type keep_characters = filter_type::KEEP, string_scalar const& replacement = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 0c0ad0ad29e..35b0c0a2690 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -214,25 +214,26 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str // external API -std::unique_ptr all_characters_of_type(strings_column_view const& strings, +std::unique_ptr all_characters_of_type(strings_column_view const& input, string_character_types types, string_character_types verify_types, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::all_characters_of_type( - strings, types, verify_types, cudf::get_default_stream(), mr); + return detail::all_characters_of_type(input, types, verify_types, stream, mr); } -std::unique_ptr filter_characters_of_type(strings_column_view const& strings, +std::unique_ptr filter_characters_of_type(strings_column_view const& input, string_character_types types_to_remove, string_scalar const& replacement, string_character_types types_to_keep, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::filter_characters_of_type( - strings, 
types_to_remove, replacement, types_to_keep, cudf::get_default_stream(), mr); + input, types_to_remove, replacement, types_to_keep, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index 3e38b5fa775..9f95fedfe0b 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -154,15 +154,16 @@ std::unique_ptr filter_characters( * @copydoc cudf::strings::filter_characters */ std::unique_ptr filter_characters( - strings_column_view const& strings, + strings_column_view const& input, std::vector> characters_to_filter, filter_type keep_characters, string_scalar const& replacement, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::filter_characters( - strings, characters_to_filter, keep_characters, replacement, cudf::get_default_stream(), mr); + input, characters_to_filter, keep_characters, replacement, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/reverse.cu b/cpp/src/strings/reverse.cu index 090705ac25d..2855bdbb827 100644 --- a/cpp/src/strings/reverse.cu +++ b/cpp/src/strings/reverse.cu @@ -79,10 +79,11 @@ std::unique_ptr reverse(strings_column_view const& input, } // namespace detail std::unique_ptr reverse(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::reverse(input, cudf::get_default_stream(), mr); + return detail::reverse(input, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index e7b637c52f3..0ca5e103d3d 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -124,12 +124,13 @@ std::unique_ptr translate(strings_column_view const& strings, // external APIs -std::unique_ptr translate(strings_column_view const& strings, +std::unique_ptr translate(strings_column_view const& input, std::vector> const& chars_table, + 
rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::translate(strings, chars_table, cudf::get_default_stream(), mr); + return detail::translate(input, chars_table, stream, mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 95411668284..1259594dbc0 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -641,8 +641,10 @@ ConfigureTest( streams/strings/contains_test.cpp streams/strings/convert_test.cpp streams/strings/extract_test.cpp + streams/strings/filter_test.cpp streams/strings/find_test.cpp streams/strings/replace_test.cpp + streams/strings/reverse_test.cpp streams/strings/split_test.cpp streams/strings/strings_tests.cpp STREAM_MODE diff --git a/cpp/tests/streams/strings/filter_test.cpp b/cpp/tests/streams/strings/filter_test.cpp new file mode 100644 index 00000000000..3c44eb81380 --- /dev/null +++ b/cpp/tests/streams/strings/filter_test.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +#include +#include + +class StringsFilterTest : public cudf::test::BaseFixture {}; + +static std::pair make_entry(char const* from, char const* to) +{ + cudf::char_utf8 in = 0; + cudf::char_utf8 out = 0; + cudf::strings::detail::to_char_utf8(from, in); + if (to) cudf::strings::detail::to_char_utf8(to, out); + return std::pair(in, out); +} + +TEST_F(StringsFilterTest, Translate) +{ + auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"}); + auto view = cudf::strings_column_view(input); + + std::vector> translate_table{ + make_entry("b", 0), make_entry("a", "A"), make_entry(" ", "_")}; + cudf::strings::translate(view, translate_table, cudf::test::get_default_stream()); +} + +TEST_F(StringsFilterTest, Filter) +{ + auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"}); + auto view = cudf::strings_column_view(input); + + std::vector> filter_table{ + make_entry("b", 0), make_entry("a", "A"), make_entry(" ", "_")}; + + auto const repl = cudf::string_scalar("X", true, cudf::test::get_default_stream()); + auto const keep = cudf::strings::filter_type::KEEP; + cudf::strings::filter_characters( + view, filter_table, keep, repl, cudf::test::get_default_stream()); +} + +TEST_F(StringsFilterTest, FilterTypes) +{ + auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"}); + auto view = cudf::strings_column_view(input); + + auto const verify_types = + cudf::strings::string_character_types::LOWER | cudf::strings::string_character_types::UPPER; + auto const all_types = cudf::strings::string_character_types::ALL_TYPES; + cudf::strings::all_characters_of_type( + view, verify_types, all_types, cudf::test::get_default_stream()); + + auto const repl = cudf::string_scalar("X", true, cudf::test::get_default_stream()); + auto const space_types = cudf::strings::string_character_types::SPACE; + cudf::strings::filter_characters_of_type( + view, all_types, repl, 
space_types, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/reverse_test.cpp b/cpp/tests/streams/strings/reverse_test.cpp new file mode 100644 index 00000000000..83dcf24594e --- /dev/null +++ b/cpp/tests/streams/strings/reverse_test.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include +#include + +class StringsReverseTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsReverseTest, Reverse) +{ + auto input = cudf::test::strings_column_wrapper({"aBcdef", " ", "12345"}); + auto view = cudf::strings_column_view(input); + + cudf::strings::reverse(view, cudf::test::get_default_stream()); +} From 83746a408381f45eccd15971f8a901149dce743e Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 27 Oct 2023 13:23:40 -0400 Subject: [PATCH 2/2] Expose stream parameter in public nvtext tokenize APIs (#14317) Add stream parameter to public APIs: - `nvtext::tokenize()` (x2) - `nvtext::count_tokens()` (x2) - `nvtext::character_tokenize()` - `nvtext::detokenize()` Also cleaned up some of the doxygen comments and added stream gtests. 
Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/14317 --- cpp/include/nvtext/tokenize.hpp | 60 ++++++++++++++---------- cpp/src/text/detokenize.cu | 7 +-- cpp/src/text/tokenize.cu | 25 ++++++---- cpp/tests/CMakeLists.txt | 4 +- cpp/tests/streams/text/tokenize_test.cpp | 53 +++++++++++++++++++++ 5 files changed, 111 insertions(+), 38 deletions(-) create mode 100644 cpp/tests/streams/text/tokenize_test.cpp diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp index 44f8f44557c..107fefcc3bf 100644 --- a/cpp/include/nvtext/tokenize.hpp +++ b/cpp/include/nvtext/tokenize.hpp @@ -49,15 +49,17 @@ namespace nvtext { * * All null row entries are ignored and the output contains all valid rows. * - * @param strings Strings column tokenize. + * @param input Strings column to tokenize * @param delimiter UTF-8 characters used to separate each string into tokens. * The default of empty string will separate tokens using whitespace. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr tokenize( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -84,14 +86,16 @@ std::unique_ptr tokenize( * * @throw cudf::logic_error if the delimiters column is empty or contains nulls. 
* - * @param strings Strings column to tokenize. - * @param delimiters Strings used to separate individual strings into tokens. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param input Strings column to tokenize + * @param delimiters Strings used to separate individual strings into tokens + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr tokenize( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -112,15 +116,17 @@ std::unique_ptr tokenize( * All null row entries are ignored and the output contains all valid rows. * The number of tokens for a null element is set to 0 in the output column. * - * @param strings Strings column to use for this operation - * @param delimiter Strings used to separate each string into tokens; + * @param input Strings column to count tokens + * @param delimiter Strings used to separate each string into tokens. * The default of empty string will separate tokens using whitespace. 
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of token counts */ std::unique_ptr count_tokens( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -143,14 +149,16 @@ std::unique_ptr count_tokens( * * @throw cudf::logic_error if the delimiters column is empty or contains nulls * - * @param strings Strings column to use for this operation + * @param input Strings column to count tokens * @param delimiters Strings used to separate each string into tokens + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of token counts */ std::unique_ptr count_tokens( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -168,12 +176,14 @@ std::unique_ptr count_tokens( * * All null row entries are ignored and the output contains all valid rows. * - * @param strings Strings column to tokenize. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. 
+ * @param input Strings column to tokenize + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr character_tokenize( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -203,16 +213,18 @@ std::unique_ptr character_tokenize( * @throw cudf::logic_error if `row_indices.size() != strings.size()` * @throw cudf::logic_error if `row_indices` contains nulls * - * @param strings Strings column to detokenize. - * @param row_indices The relative output row index assigned for each token in the input column. - * @param separator String to append after concatenating each token to the proper output row. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. 
+ * @param input Strings column to detokenize + * @param row_indices The relative output row index assigned for each token in the input column + * @param separator String to append after concatenating each token to the proper output row + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr detokenize( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::column_view const& row_indices, cudf::string_scalar const& separator = cudf::string_scalar(" "), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index a17583cf649..38cb7dd6753 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -169,13 +169,14 @@ std::unique_ptr detokenize(cudf::strings_column_view const& string } // namespace detail -std::unique_ptr detokenize(cudf::strings_column_view const& strings, +std::unique_ptr detokenize(cudf::strings_column_view const& input, cudf::column_view const& row_indices, cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::detokenize(strings, row_indices, separator, cudf::get_default_stream(), mr); + return detail::detokenize(input, row_indices, separator, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index 16b9f25b802..87f6a61a533 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -232,43 +232,48 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const // external APIs -std::unique_ptr tokenize(cudf::strings_column_view const& strings, +std::unique_ptr tokenize(cudf::strings_column_view const& input, cudf::string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::tokenize(strings, delimiter, cudf::get_default_stream(), mr); + return detail::tokenize(input, delimiter, stream, mr); } -std::unique_ptr tokenize(cudf::strings_column_view const& strings, +std::unique_ptr tokenize(cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::tokenize(strings, delimiters, cudf::get_default_stream(), mr); + return detail::tokenize(input, delimiters, stream, mr); } -std::unique_ptr count_tokens(cudf::strings_column_view const& strings, +std::unique_ptr count_tokens(cudf::strings_column_view const& input, cudf::string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_tokens(strings, 
delimiter, cudf::get_default_stream(), mr); + return detail::count_tokens(input, delimiter, stream, mr); } -std::unique_ptr count_tokens(cudf::strings_column_view const& strings, +std::unique_ptr count_tokens(cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_tokens(strings, delimiters, cudf::get_default_stream(), mr); + return detail::count_tokens(input, delimiters, stream, mr); } -std::unique_ptr character_tokenize(cudf::strings_column_view const& strings, +std::unique_ptr character_tokenize(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::character_tokenize(strings, cudf::get_default_stream(), mr); + return detail::character_tokenize(input, stream, mr); } } // namespace nvtext diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 1259594dbc0..10937212bc1 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -651,7 +651,9 @@ ConfigureTest( testing ) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) +ConfigureTest( + STREAM_TEXT_TEST streams/text/ngrams_test.cpp streams/text/tokenize_test.cpp STREAM_MODE testing +) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) # ################################################################################################## diff --git a/cpp/tests/streams/text/tokenize_test.cpp b/cpp/tests/streams/text/tokenize_test.cpp new file mode 100644 index 00000000000..b281fbc2c0c --- /dev/null +++ b/cpp/tests/streams/text/tokenize_test.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +class TextTokenizeTest : public cudf::test::BaseFixture {}; + +TEST_F(TextTokenizeTest, Tokenize) +{ + auto const input = cudf::test::strings_column_wrapper({"the fox jumped", "over thé dog"}); + auto const view = cudf::strings_column_view(input); + auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()}; + nvtext::tokenize(view, delimiter, cudf::test::get_default_stream()); + nvtext::count_tokens(view, delimiter, cudf::test::get_default_stream()); + auto const delimiters = cudf::test::strings_column_wrapper({" ", "o", "é"}); + nvtext::tokenize(view, cudf::strings_column_view(delimiters), cudf::test::get_default_stream()); + nvtext::count_tokens( + view, cudf::strings_column_view(delimiters), cudf::test::get_default_stream()); +} + +TEST_F(TextTokenizeTest, CharacterTokenize) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + nvtext::character_tokenize(cudf::strings_column_view(input), cudf::test::get_default_stream()); +} + +TEST_F(TextTokenizeTest, Detokenize) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + auto const view = cudf::strings_column_view(input); + auto const indices = cudf::test::fixed_width_column_wrapper({0, 0, 0, 1, 1, 1}); + auto const separator = cudf::string_scalar{" ", true, 
cudf::test::get_default_stream()}; + nvtext::detokenize(view, indices, separator, cudf::test::get_default_stream()); +}