From 4da57497a8cd7607c10f713a4ae3831646ed5db5 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 4 Oct 2023 11:10:45 -0400 Subject: [PATCH 1/2] Expose stream parameter in public strings split/partition APIs --- cpp/include/cudf/strings/split/partition.hpp | 22 +++++---- cpp/include/cudf/strings/split/split_re.hpp | 8 ++++ cpp/src/strings/split/partition.cu | 10 ++-- cpp/src/strings/split/split_re.cu | 12 +++-- cpp/tests/CMakeLists.txt | 4 +- cpp/tests/streams/strings/split_test.cpp | 49 ++++++++++++++++++++ 6 files changed, 86 insertions(+), 19 deletions(-) create mode 100644 cpp/tests/streams/strings/split_test.cpp diff --git a/cpp/include/cudf/strings/split/partition.hpp b/cpp/include/cudf/strings/split/partition.hpp index 52ffb735eb7..25eedf1e86b 100644 --- a/cpp/include/cudf/strings/split/partition.hpp +++ b/cpp/include/cudf/strings/split/partition.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,15 +51,17 @@ namespace strings { * r[2] is ["cd","g_h"] * @endcode * - * @param strings Strings instance for this operation. + * @param input Strings instance for this operation * @param delimiter UTF-8 encoded string indicating where to split each string. * Default of empty string indicates split on whitespace. - * @param mr Device memory resource used to allocate the returned table's device memory. - * @return New table of strings columns. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory + * @return New table of strings columns */ std::unique_ptr partition( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& delimiter = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -83,15 +85,17 @@ std::unique_ptr
partition( * r[2] is ["cd","h"] * @endcode * - * @param strings Strings instance for this operation. + * @param input Strings instance for this operation * @param delimiter UTF-8 encoded string indicating where to split each string. * Default of empty string indicates split on whitespace. - * @param mr Device memory resource used to allocate the returned table's device memory. - * @return New strings columns. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory + * @return New strings columns */ std::unique_ptr
rpartition( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& delimiter = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index 14fcfaecdcd..a12b0a4ccd6 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -75,6 +75,7 @@ struct regex_program; * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned result's device memory * @return A table of columns of strings */ @@ -82,6 +83,7 @@ std::unique_ptr
split_re( strings_column_view const& input, regex_program const& prog, size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -129,6 +131,7 @@ std::unique_ptr
split_re( * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned result's device memory. * @return A table of columns of strings. */ @@ -136,6 +139,7 @@ std::unique_ptr
rsplit_re( strings_column_view const& input, regex_program const& prog, size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -185,6 +189,7 @@ std::unique_ptr
rsplit_re( * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned result's device memory * @return Lists column of strings. */ @@ -192,6 +197,7 @@ std::unique_ptr split_record_re( strings_column_view const& input, regex_program const& prog, size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -243,6 +249,7 @@ std::unique_ptr split_record_re( * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned result's device memory * @return Lists column of strings */ @@ -250,6 +257,7 @@ std::unique_ptr rsplit_record_re( strings_column_view const& input, regex_program const& prog, size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/split/partition.cu b/cpp/src/strings/split/partition.cu index 0c7d119ea38..16e6402cfef 100644 --- a/cpp/src/strings/split/partition.cu +++ b/cpp/src/strings/split/partition.cu @@ -239,20 +239,22 @@ std::unique_ptr
rpartition(strings_column_view const& strings, // external APIs -std::unique_ptr
partition(strings_column_view const& strings, +std::unique_ptr
partition(strings_column_view const& input, string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::partition(strings, delimiter, cudf::get_default_stream(), mr); + return detail::partition(input, delimiter, stream, mr); } -std::unique_ptr
rpartition(strings_column_view const& strings, +std::unique_ptr
rpartition(strings_column_view const& input, string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rpartition(strings, delimiter, cudf::get_default_stream(), mr); + return detail::rpartition(input, delimiter, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 3be5937297f..913aec79758 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -340,37 +340,41 @@ std::unique_ptr rsplit_record_re(strings_column_view const& input, std::unique_ptr
split_re(strings_column_view const& input, regex_program const& prog, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_re(input, prog, maxsplit, cudf::get_default_stream(), mr); + return detail::split_re(input, prog, maxsplit, stream, mr); } std::unique_ptr split_record_re(strings_column_view const& input, regex_program const& prog, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_record_re(input, prog, maxsplit, cudf::get_default_stream(), mr); + return detail::split_record_re(input, prog, maxsplit, stream, mr); } std::unique_ptr
rsplit_re(strings_column_view const& input, regex_program const& prog, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rsplit_re(input, prog, maxsplit, cudf::get_default_stream(), mr); + return detail::rsplit_re(input, prog, maxsplit, stream, mr); } std::unique_ptr rsplit_record_re(strings_column_view const& input, regex_program const& prog, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rsplit_record_re(input, prog, maxsplit, cudf::get_default_stream(), mr); + return detail::rsplit_record_re(input, prog, maxsplit, stream, mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 04939f3cd6d..28bf932d2b2 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -632,8 +632,8 @@ ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest( - STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE - testing + STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp + streams/strings/split_test.cpp STREAM_MODE testing ) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/strings/split_test.cpp b/cpp/tests/streams/strings/split_test.cpp new file mode 100644 index 00000000000..a5f67ad7524 --- /dev/null +++ b/cpp/tests/streams/strings/split_test.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +class StringsSplitTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsSplitTest, SplitPartition) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const delimiter = cudf::string_scalar("é", true, cudf::test::get_default_stream()); + cudf::strings::split(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::split_record(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit_record(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::partition(view, delimiter, cudf::test::get_default_stream()); + cudf::strings::rpartition(view, delimiter, cudf::test::get_default_stream()); + + auto const pattern = std::string("\\s"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::split_re(view, *prog, -1, cudf::test::get_default_stream()); + cudf::strings::split_record_re(view, *prog, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit_re(view, *prog, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit_record_re(view, *prog, -1, cudf::test::get_default_stream()); +} From d7cfbcba2fcb1abb2ce12122052a7030e331734e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 4 Oct 2023 17:21:51 -0400 Subject: [PATCH 2/2] fix doxygen consistency --- cpp/include/cudf/strings/split/split_re.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index a12b0a4ccd6..f1736cb7e0c 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -127,13 +127,13 @@ std::unique_ptr
split_re( * * @throw cudf::logic_error if `pattern` is empty. * - * @param input A column of string elements to be split. + * @param input A column of string elements to be split * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned result's device memory. - * @return A table of columns of strings. + * @param mr Device memory resource used to allocate the returned result's device memory + * @return A table of columns of strings */ std::unique_ptr
rsplit_re( strings_column_view const& input, @@ -191,7 +191,7 @@ std::unique_ptr
rsplit_re( * Default of -1 indicates all possible splits on each string. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned result's device memory - * @return Lists column of strings. + * @return Lists column of strings */ std::unique_ptr split_record_re( strings_column_view const& input,