From 865c21e4262aff1d6f99fdb00b892e7521087ffa Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 25 Oct 2023 15:52:12 -0400 Subject: [PATCH] Expose stream parameter in public strings contains APIs (#14280) Add stream parameter to public APIs: - `cudf::strings::contains_re()` - `cudf::strings::matches_re()` - `cudf::strings::count_re()` - `cudf::strings::like()` (x2) - `cudf::strings::extract()` - `cudf::strings::extract_all_record()` Also cleaned up some of the doxygen comments. Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mark Harris (https://github.com/harrism) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14280 --- cpp/include/cudf/strings/contains.hpp | 32 ++++++++----- cpp/include/cudf/strings/extract.hpp | 12 +++-- cpp/src/strings/contains.cu | 15 +++--- cpp/src/strings/extract/extract.cu | 5 +- cpp/src/strings/extract/extract_all.cu | 5 +- cpp/src/strings/like.cu | 6 ++- cpp/tests/CMakeLists.txt | 2 + cpp/tests/streams/strings/contains_test.cpp | 52 +++++++++++++++++++++ cpp/tests/streams/strings/extract_test.cpp | 37 +++++++++++++++ 9 files changed, 139 insertions(+), 27 deletions(-) create mode 100644 cpp/tests/streams/strings/contains_test.cpp create mode 100644 cpp/tests/streams/strings/extract_test.cpp diff --git a/cpp/include/cudf/strings/contains.hpp b/cpp/include/cudf/strings/contains.hpp index 23c77cb60da..341c146df92 100644 --- a/cpp/include/cudf/strings/contains.hpp +++ b/cpp/include/cudf/strings/contains.hpp @@ -31,7 +31,7 @@ struct regex_program; * @addtogroup strings_contains * @{ * @file strings/contains.hpp - * @brief Strings APIs for regex contains, count, matches + * @brief Strings APIs for regex contains, count, matches, like */ /** @@ -50,14 +50,16 @@ struct regex_program; * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
* - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of boolean results for each string */ std::unique_ptr contains_re( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -76,14 +78,16 @@ std::unique_ptr contains_re( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of boolean results for each string */ std::unique_ptr matches_re( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -102,14 +106,16 @@ std::unique_ptr matches_re( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
* - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of match counts for each string */ std::unique_ptr count_re( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -146,8 +152,9 @@ std::unique_ptr count_re( * * @param input Strings instance for this operation * @param pattern Like pattern to match within each string - * @param escape_character Optional character specifies the escape prefix; - * default is no escape character + * @param escape_character Optional character specifies the escape prefix. + * Default is no escape character. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New boolean column */ @@ -155,6 +162,7 @@ std::unique_ptr like( strings_column_view const& input, string_scalar const& pattern, string_scalar const& escape_character = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -185,8 +193,9 @@ std::unique_ptr like( * * @param input Strings instance for this operation * @param patterns Like patterns to match within each corresponding string - * @param escape_character Optional character specifies the escape prefix; - * default is no escape character + * @param escape_character Optional character specifies the escape prefix. + * Default is no escape character. 
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New boolean column */ @@ -194,6 +203,7 @@ std::unique_ptr like( strings_column_view const& input, strings_column_view const& patterns, string_scalar const& escape_character = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp index 586cb1f3f26..a4db1ac46da 100644 --- a/cpp/include/cudf/strings/extract.hpp +++ b/cpp/include/cudf/strings/extract.hpp @@ -53,14 +53,16 @@ struct regex_program; * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return Columns of strings extracted from the input column */ std::unique_ptr extract( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -87,14 +89,16 @@ std::unique_ptr
extract( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate any returned device memory * @return Lists column containing strings extracted from the input column */ std::unique_ptr extract_all_record( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 22534870409..4383f358a33 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -123,28 +123,31 @@ std::unique_ptr count_re(strings_column_view const& input, // external APIs -std::unique_ptr contains_re(strings_column_view const& strings, +std::unique_ptr contains_re(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains_re(strings, prog, cudf::get_default_stream(), mr); + return detail::contains_re(input, prog, stream, mr); } -std::unique_ptr matches_re(strings_column_view const& strings, +std::unique_ptr matches_re(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::matches_re(strings, prog, cudf::get_default_stream(), mr); + return detail::matches_re(input, prog, stream, mr); } -std::unique_ptr count_re(strings_column_view const& strings, +std::unique_ptr count_re(strings_column_view const& input, regex_program const& prog, + 
rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_re(strings, prog, cudf::get_default_stream(), mr); + return detail::count_re(input, prog, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index 532053e750e..8edcd167e5c 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -131,12 +131,13 @@ std::unique_ptr
<table> extract(strings_column_view const& input, // external API -std::unique_ptr<table> extract(strings_column_view const& strings, +std::unique_ptr<table>
extract(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract(strings, prog, cudf::get_default_stream(), mr); + return detail::extract(input, prog, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index 8a2f8f0cbfc..0c0d4ae4fbf 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -164,12 +164,13 @@ std::unique_ptr extract_all_record(strings_column_view const& input, // external API -std::unique_ptr extract_all_record(strings_column_view const& strings, +std::unique_ptr extract_all_record(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_all_record(strings, prog, cudf::get_default_stream(), mr); + return detail::extract_all_record(input, prog, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/like.cu b/cpp/src/strings/like.cu index 5b91f295efb..93e00592ef2 100644 --- a/cpp/src/strings/like.cu +++ b/cpp/src/strings/like.cu @@ -185,19 +185,21 @@ std::unique_ptr like(strings_column_view const& input, std::unique_ptr like(strings_column_view const& input, string_scalar const& pattern, string_scalar const& escape_character, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::like(input, pattern, escape_character, cudf::get_default_stream(), mr); + return detail::like(input, pattern, escape_character, stream, mr); } std::unique_ptr like(strings_column_view const& input, strings_column_view const& patterns, string_scalar const& escape_character, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::like(input, patterns, escape_character, cudf::get_default_stream(), mr); + return 
detail::like(input, patterns, escape_character, stream, mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e7f4914fe05..95411668284 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -638,7 +638,9 @@ ConfigureTest( STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/combine_test.cpp + streams/strings/contains_test.cpp streams/strings/convert_test.cpp + streams/strings/extract_test.cpp streams/strings/find_test.cpp streams/strings/replace_test.cpp streams/strings/split_test.cpp diff --git a/cpp/tests/streams/strings/contains_test.cpp b/cpp/tests/streams/strings/contains_test.cpp new file mode 100644 index 00000000000..383d48abe1e --- /dev/null +++ b/cpp/tests/streams/strings/contains_test.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include + +class StringsContainsTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsContainsTest, Contains) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const pattern = std::string("[a-z]"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::contains_re(view, *prog, cudf::test::get_default_stream()); + cudf::strings::matches_re(view, *prog, cudf::test::get_default_stream()); + cudf::strings::count_re(view, *prog, cudf::test::get_default_stream()); +} + +TEST_F(StringsContainsTest, Like) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesés", "tést", ""}); + auto view = cudf::strings_column_view(input); + + auto const pattern = cudf::string_scalar("%és", true, cudf::test::get_default_stream()); + auto const escape = cudf::string_scalar("%", true, cudf::test::get_default_stream()); + cudf::strings::like(view, pattern, escape, cudf::test::get_default_stream()); + + auto const patterns = cudf::test::strings_column_wrapper({"H%", "t%s", "t", ""}); + cudf::strings::like( + view, cudf::strings_column_view(patterns), escape, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/extract_test.cpp b/cpp/tests/streams/strings/extract_test.cpp new file mode 100644 index 00000000000..06570fc5b38 --- /dev/null +++ b/cpp/tests/streams/strings/extract_test.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include + +class StringsExtractTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsExtractTest, Extract) +{ + auto input = cudf::test::strings_column_wrapper({"Joe Schmoe", "John Smith", "Jane Smith"}); + auto view = cudf::strings_column_view(input); + + auto const pattern = std::string("([A-Z][a-z]+)"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::extract(view, *prog, cudf::test::get_default_stream()); + cudf::strings::extract_all_record(view, *prog, cudf::test::get_default_stream()); +}