From aa598bc28e6e2459ca6bcfa58f2056134e6591ea Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 11 Oct 2023 15:34:59 -0400
Subject: [PATCH] Expose stream parameter in public strings split/partition
APIs (#14247)
Follow-on to PR #13997, which did not include all the split APIs or a stream test.
Add stream parameter to public APIs:
- `cudf::strings::partition()`
- `cudf::strings::rpartition()`
- `cudf::strings::split_re()`
- `cudf::strings::rsplit_re()`
- `cudf::strings::split_record_re()`
- `cudf::strings::rsplit_record_re()`
Also cleaned up some of the doxygen comments.
Reference #13744
Authors:
- David Wendt (https://github.com/davidwendt)
Approvers:
- Mark Harris (https://github.com/harrism)
- Bradley Dice (https://github.com/bdice)
- Nghia Truong (https://github.com/ttnghia)
URL: https://github.com/rapidsai/cudf/pull/14247
---
cpp/include/cudf/strings/split/partition.hpp | 22 +++++----
cpp/include/cudf/strings/split/split_re.hpp | 16 +++++--
cpp/src/strings/split/partition.cu | 10 ++--
cpp/src/strings/split/split_re.cu | 12 +++--
cpp/tests/CMakeLists.txt | 2 +-
cpp/tests/streams/strings/split_test.cpp | 49 ++++++++++++++++++++
6 files changed, 89 insertions(+), 22 deletions(-)
create mode 100644 cpp/tests/streams/strings/split_test.cpp
diff --git a/cpp/include/cudf/strings/split/partition.hpp b/cpp/include/cudf/strings/split/partition.hpp
index 52ffb735eb7..25eedf1e86b 100644
--- a/cpp/include/cudf/strings/split/partition.hpp
+++ b/cpp/include/cudf/strings/split/partition.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -51,15 +51,17 @@ namespace strings {
* r[2] is ["cd","g_h"]
* @endcode
*
- * @param strings Strings instance for this operation.
+ * @param input Strings instance for this operation
* @param delimiter UTF-8 encoded string indicating where to split each string.
* Default of empty string indicates split on whitespace.
- * @param mr Device memory resource used to allocate the returned table's device memory.
- * @return New table of strings columns.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned table's device memory
+ * @return New table of strings columns
*/
std::unique_ptr<table> partition(
- strings_column_view const& strings,
+ strings_column_view const& input,
string_scalar const& delimiter = string_scalar(""),
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
/**
@@ -83,15 +85,17 @@ std::unique_ptr partition(
* r[2] is ["cd","h"]
* @endcode
*
- * @param strings Strings instance for this operation.
+ * @param input Strings instance for this operation
* @param delimiter UTF-8 encoded string indicating where to split each string.
* Default of empty string indicates split on whitespace.
- * @param mr Device memory resource used to allocate the returned table's device memory.
- * @return New strings columns.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned table's device memory
+ * @return New strings columns
*/
std::unique_ptr<table> rpartition(
- strings_column_view const& strings,
+ strings_column_view const& input,
string_scalar const& delimiter = string_scalar(""),
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
/** @} */ // end of doxygen group
diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
index 14fcfaecdcd..f1736cb7e0c 100644
--- a/cpp/include/cudf/strings/split/split_re.hpp
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -75,6 +75,7 @@ struct regex_program;
* @param prog Regex program instance
* @param maxsplit Maximum number of splits to perform.
* Default of -1 indicates all possible splits on each string.
+ * @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned result's device memory
* @return A table of columns of strings
*/
@@ -82,6 +83,7 @@ std::unique_ptr split_re(
strings_column_view const& input,
regex_program const& prog,
size_type maxsplit = -1,
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
/**
@@ -125,17 +127,19 @@ std::unique_ptr split_re(
*
* @throw cudf::logic_error if `pattern` is empty.
*
- * @param input A column of string elements to be split.
+ * @param input A column of string elements to be split
* @param prog Regex program instance
* @param maxsplit Maximum number of splits to perform.
* Default of -1 indicates all possible splits on each string.
- * @param mr Device memory resource used to allocate the returned result's device memory.
- * @return A table of columns of strings.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned result's device memory
+ * @return A table of columns of strings
*/
std::unique_ptr<table> rsplit_re(
strings_column_view const& input,
regex_program const& prog,
size_type maxsplit = -1,
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
/**
@@ -185,13 +189,15 @@ std::unique_ptr rsplit_re(
* @param prog Regex program instance
* @param maxsplit Maximum number of splits to perform.
* Default of -1 indicates all possible splits on each string.
+ * @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned result's device memory
- * @return Lists column of strings.
+ * @return Lists column of strings
*/
std::unique_ptr<column> split_record_re(
strings_column_view const& input,
regex_program const& prog,
size_type maxsplit = -1,
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
/**
@@ -243,6 +249,7 @@ std::unique_ptr split_record_re(
* @param prog Regex program instance
* @param maxsplit Maximum number of splits to perform.
* Default of -1 indicates all possible splits on each string.
+ * @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned result's device memory
* @return Lists column of strings
*/
@@ -250,6 +257,7 @@ std::unique_ptr rsplit_record_re(
strings_column_view const& input,
regex_program const& prog,
size_type maxsplit = -1,
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
/** @} */ // end of doxygen group
diff --git a/cpp/src/strings/split/partition.cu b/cpp/src/strings/split/partition.cu
index 0c7d119ea38..16e6402cfef 100644
--- a/cpp/src/strings/split/partition.cu
+++ b/cpp/src/strings/split/partition.cu
@@ -239,20 +239,22 @@ std::unique_ptr rpartition(strings_column_view const& strings,
// external APIs
-std::unique_ptr<table> partition(strings_column_view const& strings,
+std::unique_ptr<table> partition(strings_column_view const& input,
string_scalar const& delimiter,
+ rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
- return detail::partition(strings, delimiter, cudf::get_default_stream(), mr);
+ return detail::partition(input, delimiter, stream, mr);
}
-std::unique_ptr<table> rpartition(strings_column_view const& strings,
+std::unique_ptr<table> rpartition(strings_column_view const& input,
string_scalar const& delimiter,
+ rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
- return detail::rpartition(strings, delimiter, cudf::get_default_stream(), mr);
+ return detail::rpartition(input, delimiter, stream, mr);
}
} // namespace strings
diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index 3be5937297f..913aec79758 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -340,37 +340,41 @@ std::unique_ptr rsplit_record_re(strings_column_view const& input,
std::unique_ptr<table> split_re(strings_column_view const& input,
regex_program const& prog,
size_type maxsplit,
+ rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
- return detail::split_re(input, prog, maxsplit, cudf::get_default_stream(), mr);
+ return detail::split_re(input, prog, maxsplit, stream, mr);
}
std::unique_ptr<column> split_record_re(strings_column_view const& input,
regex_program const& prog,
size_type maxsplit,
+ rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
- return detail::split_record_re(input, prog, maxsplit, cudf::get_default_stream(), mr);
+ return detail::split_record_re(input, prog, maxsplit, stream, mr);
}
std::unique_ptr<table> rsplit_re(strings_column_view const& input,
regex_program const& prog,
size_type maxsplit,
+ rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
- return detail::rsplit_re(input, prog, maxsplit, cudf::get_default_stream(), mr);
+ return detail::rsplit_re(input, prog, maxsplit, stream, mr);
}
std::unique_ptr<column> rsplit_record_re(strings_column_view const& input,
regex_program const& prog,
size_type maxsplit,
+ rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
- return detail::rsplit_record_re(input, prog, maxsplit, cudf::get_default_stream(), mr);
+ return detail::rsplit_record_re(input, prog, maxsplit, stream, mr);
}
} // namespace strings
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index b15a6c41d39..4de18fceac1 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -634,7 +634,7 @@ ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing)
ConfigureTest(
STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp
- streams/strings/strings_tests.cpp STREAM_MODE testing
+ streams/strings/split_test.cpp streams/strings/strings_tests.cpp STREAM_MODE testing
)
ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/streams/strings/split_test.cpp b/cpp/tests/streams/strings/split_test.cpp
new file mode 100644
index 00000000000..24247f6f79c
--- /dev/null
+++ b/cpp/tests/streams/strings/split_test.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
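+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <cudf/strings/regex/regex_program.hpp>
+#include <cudf/strings/split/partition.hpp>
+#include <cudf/strings/split/split.hpp>
+#include <cudf/strings/split/split_re.hpp>
+
+#include <string>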
+
+class StringsSplitTest : public cudf::test::BaseFixture {};
+
+TEST_F(StringsSplitTest, SplitPartition)
+{
+ auto input = cudf::test::strings_column_wrapper({"Héllo thesé", "tést strings", ""});
+ auto view = cudf::strings_column_view(input);
+
+ auto const delimiter = cudf::string_scalar("é", true, cudf::test::get_default_stream());
+ cudf::strings::split(view, delimiter, -1, cudf::test::get_default_stream());
+ cudf::strings::rsplit(view, delimiter, -1, cudf::test::get_default_stream());
+ cudf::strings::split_record(view, delimiter, -1, cudf::test::get_default_stream());
+ cudf::strings::rsplit_record(view, delimiter, -1, cudf::test::get_default_stream());
+ cudf::strings::partition(view, delimiter, cudf::test::get_default_stream());
+ cudf::strings::rpartition(view, delimiter, cudf::test::get_default_stream());
+
+ auto const pattern = std::string("\\s");
+ auto const prog = cudf::strings::regex_program::create(pattern);
+ cudf::strings::split_re(view, *prog, -1, cudf::test::get_default_stream());
+ cudf::strings::split_record_re(view, *prog, -1, cudf::test::get_default_stream());
+ cudf::strings::rsplit_re(view, *prog, -1, cudf::test::get_default_stream());
+ cudf::strings::rsplit_record_re(view, *prog, -1, cudf::test::get_default_stream());
+}