From aa598bc28e6e2459ca6bcfa58f2056134e6591ea Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 11 Oct 2023 15:34:59 -0400 Subject: [PATCH] Expose stream parameter in public strings split/partition APIs (#14247) Follow on to PR #13997 which did not include all the split APIs or a stream test. Add stream parameter to public APIs: - `cudf::strings::partition()` - `cudf::strings::rpartition()` - `cudf::strings::split_re()` - `cudf::strings::rsplit_re()` - `cudf::strings::split_record_re()` - `cudf::strings::rsplit_record_re()` Also cleaned up some of the doxygen comments. Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mark Harris (https://github.com/harrism) - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14247 --- cpp/include/cudf/strings/split/partition.hpp | 22 +++++---- cpp/include/cudf/strings/split/split_re.hpp | 16 +++++-- cpp/src/strings/split/partition.cu | 10 ++-- cpp/src/strings/split/split_re.cu | 12 +++-- cpp/tests/CMakeLists.txt | 2 +- cpp/tests/streams/strings/split_test.cpp | 49 ++++++++++++++++++++ 6 files changed, 89 insertions(+), 22 deletions(-) create mode 100644 cpp/tests/streams/strings/split_test.cpp diff --git a/cpp/include/cudf/strings/split/partition.hpp b/cpp/include/cudf/strings/split/partition.hpp index 52ffb735eb7..25eedf1e86b 100644 --- a/cpp/include/cudf/strings/split/partition.hpp +++ b/cpp/include/cudf/strings/split/partition.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,15 +51,17 @@ namespace strings { * r[2] is ["cd","g_h"] * @endcode * - * @param strings Strings instance for this operation. 
+ * @param input Strings instance for this operation * @param delimiter UTF-8 encoded string indicating where to split each string. * Default of empty string indicates split on whitespace. - * @param mr Device memory resource used to allocate the returned table's device memory. - * @return New table of strings columns. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory + * @return New table of strings columns */ std::unique_ptr<table> partition( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& delimiter = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -83,15 +85,17 @@ std::unique_ptr<table>
partition( * r[2] is ["cd","h"] * @endcode * - * @param strings Strings instance for this operation. + * @param input Strings instance for this operation * @param delimiter UTF-8 encoded string indicating where to split each string. * Default of empty string indicates split on whitespace. - * @param mr Device memory resource used to allocate the returned table's device memory. - * @return New strings columns. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory + * @return New strings columns */ std::unique_ptr<table>
rpartition( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& delimiter = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index 14fcfaecdcd..f1736cb7e0c 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -75,6 +75,7 @@ struct regex_program; * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned result's device memory * @return A table of columns of strings */ @@ -82,6 +83,7 @@ std::unique_ptr<table>
split_re( strings_column_view const& input, regex_program const& prog, size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -125,17 +127,19 @@ std::unique_ptr<table>
split_re( * * @throw cudf::logic_error if `pattern` is empty. * - * @param input A column of string elements to be split. + * @param input A column of string elements to be split * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned result's device memory. - * @return A table of columns of strings. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned result's device memory + * @return A table of columns of strings */ std::unique_ptr<table>
rsplit_re( strings_column_view const& input, regex_program const& prog, size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -185,13 +189,15 @@ std::unique_ptr<table>
rsplit_re( * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned result's device memory - * @return Lists column of strings. + * @return Lists column of strings */ std::unique_ptr<column> split_record_re( strings_column_view const& input, regex_program const& prog, size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -243,6 +249,7 @@ std::unique_ptr<column> split_record_re( * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned result's device memory * @return Lists column of strings */ @@ -250,6 +257,7 @@ std::unique_ptr<column> rsplit_record_re( strings_column_view const& input, regex_program const& prog, size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/split/partition.cu b/cpp/src/strings/split/partition.cu index 0c7d119ea38..16e6402cfef 100644 --- a/cpp/src/strings/split/partition.cu +++ b/cpp/src/strings/split/partition.cu @@ -239,20 +239,22 @@ std::unique_ptr<table>
rpartition(strings_column_view const& strings, // external APIs -std::unique_ptr<table>
partition(strings_column_view const& strings, +std::unique_ptr<table>
partition(strings_column_view const& input, string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::partition(strings, delimiter, cudf::get_default_stream(), mr); + return detail::partition(input, delimiter, stream, mr); } -std::unique_ptr<table>
rpartition(strings_column_view const& strings, +std::unique_ptr<table>
rpartition(strings_column_view const& input, string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rpartition(strings, delimiter, cudf::get_default_stream(), mr); + return detail::rpartition(input, delimiter, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 3be5937297f..913aec79758 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -340,37 +340,41 @@ std::unique_ptr<column> rsplit_record_re(strings_column_view const& input, std::unique_ptr<table>
split_re(strings_column_view const& input, regex_program const& prog, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_re(input, prog, maxsplit, cudf::get_default_stream(), mr); + return detail::split_re(input, prog, maxsplit, stream, mr); } std::unique_ptr<column> split_record_re(strings_column_view const& input, regex_program const& prog, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_record_re(input, prog, maxsplit, cudf::get_default_stream(), mr); + return detail::split_record_re(input, prog, maxsplit, stream, mr); } std::unique_ptr<table>
rsplit_re(strings_column_view const& input, regex_program const& prog, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rsplit_re(input, prog, maxsplit, cudf::get_default_stream(), mr); + return detail::rsplit_re(input, prog, maxsplit, stream, mr); } std::unique_ptr<column> rsplit_record_re(strings_column_view const& input, regex_program const& prog, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rsplit_record_re(input, prog, maxsplit, cudf::get_default_stream(), mr); + return detail::rsplit_record_re(input, prog, maxsplit, stream, mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b15a6c41d39..4de18fceac1 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -634,7 +634,7 @@ ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest( STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp - streams/strings/strings_tests.cpp STREAM_MODE testing + streams/strings/split_test.cpp streams/strings/strings_tests.cpp STREAM_MODE testing ) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/strings/split_test.cpp b/cpp/tests/streams/strings/split_test.cpp new file mode 100644 index 00000000000..24247f6f79c --- /dev/null +++ b/cpp/tests/streams/strings/split_test.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cudf_test/base_fixture.hpp> +#include <cudf_test/column_wrapper.hpp> +#include <cudf_test/default_stream.hpp> + +#include <cudf/strings/regex/regex_program.hpp> +#include <cudf/strings/split/partition.hpp> +#include <cudf/strings/split/split.hpp> +#include <cudf/strings/split/split_re.hpp> + +#include <string> + +class StringsSplitTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsSplitTest, SplitPartition) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const delimiter = cudf::string_scalar("é", true, cudf::test::get_default_stream()); + cudf::strings::split(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::split_record(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit_record(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::partition(view, delimiter, cudf::test::get_default_stream()); + cudf::strings::rpartition(view, delimiter, cudf::test::get_default_stream()); + + auto const pattern = std::string("\\s"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::split_re(view, *prog, -1, cudf::test::get_default_stream()); + cudf::strings::split_record_re(view, *prog, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit_re(view, *prog, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit_record_re(view, *prog, -1, cudf::test::get_default_stream()); +}