Skip to content

Commit

Permalink
Expose stream parameter in public strings contains APIs (#14280)
Browse files Browse the repository at this point in the history
Add stream parameter to public APIs:

- `cudf::strings::contains_re()`
- `cudf::strings::matches_re()`
- `cudf::strings::count_re()`
- `cudf::strings::like()` (x2)
- `cudf::strings::extract()`
- `cudf::strings::extract_all_record()`

Also cleaned up some of the doxygen comments.

Reference #13744

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Bradley Dice (https://github.com/bdice)

URL: #14280
  • Loading branch information
davidwendt authored Oct 25, 2023
1 parent 91aeec8 commit 865c21e
Show file tree
Hide file tree
Showing 9 changed files with 139 additions and 27 deletions.
32 changes: 21 additions & 11 deletions cpp/include/cudf/strings/contains.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ struct regex_program;
* @addtogroup strings_contains
* @{
* @file strings/contains.hpp
* @brief Strings APIs for regex contains, count, matches
* @brief Strings APIs for regex contains, count, matches, like
*/

/**
Expand All @@ -50,14 +50,16 @@ struct regex_program;
*
* See the @ref md_regex "Regex Features" page for details on patterns supported by this API.
*
* @param strings Strings instance for this operation
* @param input Strings instance for this operation
* @param prog Regex program instance
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column of boolean results for each string
*/
std::unique_ptr<column> contains_re(
strings_column_view const& strings,
strings_column_view const& input,
regex_program const& prog,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -76,14 +78,16 @@ std::unique_ptr<column> contains_re(
*
* See the @ref md_regex "Regex Features" page for details on patterns supported by this API.
*
* @param strings Strings instance for this operation
* @param input Strings instance for this operation
* @param prog Regex program instance
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column of boolean results for each string
*/
std::unique_ptr<column> matches_re(
strings_column_view const& strings,
strings_column_view const& input,
regex_program const& prog,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -102,14 +106,16 @@ std::unique_ptr<column> matches_re(
*
* See the @ref md_regex "Regex Features" page for details on patterns supported by this API.
*
* @param strings Strings instance for this operation
* @param input Strings instance for this operation
* @param prog Regex program instance
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column of match counts for each string
*/
std::unique_ptr<column> count_re(
strings_column_view const& strings,
strings_column_view const& input,
regex_program const& prog,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -146,15 +152,17 @@ std::unique_ptr<column> count_re(
*
* @param input Strings instance for this operation
* @param pattern Like pattern to match within each string
* @param escape_character Optional character specifies the escape prefix;
* default is no escape character
* @param escape_character Optional character that specifies the escape prefix.
* Default is no escape character.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New boolean column
*/
std::unique_ptr<column> like(
strings_column_view const& input,
string_scalar const& pattern,
string_scalar const& escape_character = string_scalar(""),
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -185,15 +193,17 @@ std::unique_ptr<column> like(
*
* @param input Strings instance for this operation
* @param patterns Like patterns to match within each corresponding string
* @param escape_character Optional character specifies the escape prefix;
* default is no escape character
* @param escape_character Optional character that specifies the escape prefix.
* Default is no escape character.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New boolean column
*/
std::unique_ptr<column> like(
strings_column_view const& input,
strings_column_view const& patterns,
string_scalar const& escape_character = string_scalar(""),
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
Expand Down
12 changes: 8 additions & 4 deletions cpp/include/cudf/strings/extract.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,16 @@ struct regex_program;
*
* See the @ref md_regex "Regex Features" page for details on patterns supported by this API.
*
* @param strings Strings instance for this operation
* @param input Strings instance for this operation
* @param prog Regex program instance
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table's device memory
* @return Columns of strings extracted from the input column
*/
std::unique_ptr<table> extract(
strings_column_view const& strings,
strings_column_view const& input,
regex_program const& prog,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -87,14 +89,16 @@ std::unique_ptr<table> extract(
*
* See the @ref md_regex "Regex Features" page for details on patterns supported by this API.
*
* @param strings Strings instance for this operation
* @param input Strings instance for this operation
* @param prog Regex program instance
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate any returned device memory
* @return Lists column containing strings extracted from the input column
*/
std::unique_ptr<column> extract_all_record(
strings_column_view const& strings,
strings_column_view const& input,
regex_program const& prog,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
Expand Down
15 changes: 9 additions & 6 deletions cpp/src/strings/contains.cu
Original file line number Diff line number Diff line change
Expand Up @@ -123,28 +123,31 @@ std::unique_ptr<column> count_re(strings_column_view const& input,

// external APIs

std::unique_ptr<column> contains_re(strings_column_view const& strings,
std::unique_ptr<column> contains_re(strings_column_view const& input,
regex_program const& prog,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::contains_re(strings, prog, cudf::get_default_stream(), mr);
return detail::contains_re(input, prog, stream, mr);
}

std::unique_ptr<column> matches_re(strings_column_view const& strings,
std::unique_ptr<column> matches_re(strings_column_view const& input,
regex_program const& prog,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::matches_re(strings, prog, cudf::get_default_stream(), mr);
return detail::matches_re(input, prog, stream, mr);
}

std::unique_ptr<column> count_re(strings_column_view const& strings,
std::unique_ptr<column> count_re(strings_column_view const& input,
regex_program const& prog,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::count_re(strings, prog, cudf::get_default_stream(), mr);
return detail::count_re(input, prog, stream, mr);
}

} // namespace strings
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/strings/extract/extract.cu
Original file line number Diff line number Diff line change
Expand Up @@ -131,12 +131,13 @@ std::unique_ptr<table> extract(strings_column_view const& input,

// external API

std::unique_ptr<table> extract(strings_column_view const& strings,
std::unique_ptr<table> extract(strings_column_view const& input,
regex_program const& prog,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::extract(strings, prog, cudf::get_default_stream(), mr);
return detail::extract(input, prog, stream, mr);
}

} // namespace strings
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/strings/extract/extract_all.cu
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,13 @@ std::unique_ptr<column> extract_all_record(strings_column_view const& input,

// external API

std::unique_ptr<column> extract_all_record(strings_column_view const& strings,
std::unique_ptr<column> extract_all_record(strings_column_view const& input,
regex_program const& prog,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::extract_all_record(strings, prog, cudf::get_default_stream(), mr);
return detail::extract_all_record(input, prog, stream, mr);
}

} // namespace strings
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/strings/like.cu
Original file line number Diff line number Diff line change
Expand Up @@ -185,19 +185,21 @@ std::unique_ptr<column> like(strings_column_view const& input,
std::unique_ptr<column> like(strings_column_view const& input,
string_scalar const& pattern,
string_scalar const& escape_character,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::like(input, pattern, escape_character, cudf::get_default_stream(), mr);
return detail::like(input, pattern, escape_character, stream, mr);
}

std::unique_ptr<column> like(strings_column_view const& input,
strings_column_view const& patterns,
string_scalar const& escape_character,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::like(input, patterns, escape_character, cudf::get_default_stream(), mr);
return detail::like(input, patterns, escape_character, stream, mr);
}

} // namespace strings
Expand Down
2 changes: 2 additions & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,9 @@ ConfigureTest(
STREAM_STRINGS_TEST
streams/strings/case_test.cpp
streams/strings/combine_test.cpp
streams/strings/contains_test.cpp
streams/strings/convert_test.cpp
streams/strings/extract_test.cpp
streams/strings/find_test.cpp
streams/strings/replace_test.cpp
streams/strings/split_test.cpp
Expand Down
52 changes: 52 additions & 0 deletions cpp/tests/streams/strings/contains_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/default_stream.hpp>

#include <cudf/strings/contains.hpp>
#include <cudf/strings/regex/regex_program.hpp>

#include <string>

// Fixture for the strings contains/like tests in this file. It adds no members or
// setup of its own beyond what cudf::test::BaseFixture provides.
class StringsContainsTest : public cudf::test::BaseFixture {};

// Calls each regex-based API (contains_re, matches_re, count_re) once, passing the
// stream explicitly. The returned columns are discarded, so this test makes no
// assertion about the match results themselves.
TEST_F(StringsContainsTest, Contains)
{
  auto const stream = cudf::test::get_default_stream();

  auto strings_col  = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést strings", ""});
  auto strings_view = cudf::strings_column_view(strings_col);

  // Build the regex program once and reuse it for all three calls.
  auto const regex = cudf::strings::regex_program::create(std::string("[a-z]"));

  cudf::strings::contains_re(strings_view, *regex, stream);
  cudf::strings::matches_re(strings_view, *regex, stream);
  cudf::strings::count_re(strings_view, *regex, stream);
}

// Calls both like() overloads (scalar pattern and per-row patterns column) once,
// passing the stream explicitly. The returned columns are discarded, so this test
// makes no assertion about the match results themselves.
TEST_F(StringsContainsTest, Like)
{
  auto const stream = cudf::test::get_default_stream();

  auto strings_col  = cudf::test::strings_column_wrapper({"Héllo", "thesés", "tést", ""});
  auto strings_view = cudf::strings_column_view(strings_col);

  // Overload taking a single scalar pattern; the scalars are created on the same stream.
  auto const scalar_pattern = cudf::string_scalar("%és", true, stream);
  auto const escape_scalar  = cudf::string_scalar("%", true, stream);
  cudf::strings::like(strings_view, scalar_pattern, escape_scalar, stream);

  // Overload taking one pattern per input row.
  auto const patterns_col  = cudf::test::strings_column_wrapper({"H%", "t%s", "t", ""});
  auto const patterns_view = cudf::strings_column_view(patterns_col);
  cudf::strings::like(strings_view, patterns_view, escape_scalar, stream);
}
37 changes: 37 additions & 0 deletions cpp/tests/streams/strings/extract_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/default_stream.hpp>

#include <cudf/strings/extract.hpp>
#include <cudf/strings/regex/regex_program.hpp>

#include <string>

// Fixture for the strings extract tests in this file. It adds no members or setup
// of its own beyond what cudf::test::BaseFixture provides.
class StringsExtractTest : public cudf::test::BaseFixture {};

// Calls extract() and extract_all_record() once each, passing the stream explicitly.
// The returned table/column are discarded, so this test makes no assertion about the
// extracted strings themselves.
TEST_F(StringsExtractTest, Extract)
{
  auto const stream = cudf::test::get_default_stream();

  auto names_col  = cudf::test::strings_column_wrapper({"Joe Schmoe", "John Smith", "Jane Smith"});
  auto names_view = cudf::strings_column_view(names_col);

  // Build the regex program once and reuse it for both calls.
  auto const regex = cudf::strings::regex_program::create(std::string("([A-Z][a-z]+)"));

  cudf::strings::extract(names_view, *regex, stream);
  cudf::strings::extract_all_record(names_view, *regex, stream);
}

0 comments on commit 865c21e

Please sign in to comment.