Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose stream parameter in public nvtext tokenize APIs #14317

Merged
merged 4 commits into from
Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 36 additions & 24 deletions cpp/include/nvtext/tokenize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,17 @@ namespace nvtext {
*
* All null row entries are ignored and the output contains all valid rows.
*
* @param strings Strings column tokenize.
* @param input Strings column to tokenize
* @param delimiter UTF-8 characters used to separate each string into tokens.
* The default of empty string will separate tokens using whitespace.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings columns of tokens
*/
std::unique_ptr<cudf::column> tokenize(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::string_scalar const& delimiter = cudf::string_scalar{""},
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -84,14 +86,16 @@ std::unique_ptr<cudf::column> tokenize(
*
* @throw cudf::logic_error if the delimiters column is empty or contains nulls.
*
* @param strings Strings column to tokenize.
* @param delimiters Strings used to separate individual strings into tokens.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param input Strings column to tokenize
* @param delimiters Strings used to separate individual strings into tokens
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings columns of tokens
*/
std::unique_ptr<cudf::column> tokenize(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::strings_column_view const& delimiters,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -112,15 +116,17 @@ std::unique_ptr<cudf::column> tokenize(
* All null row entries are ignored and the output contains all valid rows.
* The number of tokens for a null element is set to 0 in the output column.
*
* @param strings Strings column to use for this operation
* @param delimiter Strings used to separate each string into tokens;
* @param input Strings column to count tokens
* @param delimiter Strings used to separate each string into tokens.
* The default of empty string will separate tokens using whitespace.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column of token counts
*/
std::unique_ptr<cudf::column> count_tokens(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::string_scalar const& delimiter = cudf::string_scalar{""},
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -143,14 +149,16 @@ std::unique_ptr<cudf::column> count_tokens(
*
* @throw cudf::logic_error if the delimiters column is empty or contains nulls
*
* @param strings Strings column to use for this operation
* @param input Strings column to count tokens
* @param delimiters Strings used to separate each string into tokens
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column of token counts
*/
std::unique_ptr<cudf::column> count_tokens(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::strings_column_view const& delimiters,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -168,12 +176,14 @@ std::unique_ptr<cudf::column> count_tokens(
*
* All null row entries are ignored and the output contains all valid rows.
*
* @param strings Strings column to tokenize.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param input Strings column to tokenize
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings columns of tokens
*/
std::unique_ptr<cudf::column> character_tokenize(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -203,16 +213,18 @@ std::unique_ptr<cudf::column> character_tokenize(
* @throw cudf::logic_error if `row_indices.size() != strings.size()`
* @throw cudf::logic_error if `row_indices` contains nulls
*
* @param strings Strings column to detokenize.
* @param row_indices The relative output row index assigned for each token in the input column.
* @param separator String to append after concatenating each token to the proper output row.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param input Strings column to detokenize
* @param row_indices The relative output row index assigned for each token in the input column
* @param separator String to append after concatenating each token to the proper output row
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings columns of tokens
*/
std::unique_ptr<cudf::column> detokenize(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::column_view const& row_indices,
cudf::string_scalar const& separator = cudf::string_scalar(" "),
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down
7 changes: 4 additions & 3 deletions cpp/src/text/detokenize.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -169,13 +169,14 @@ std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& string

} // namespace detail

std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& input,
cudf::column_view const& row_indices,
cudf::string_scalar const& separator,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::detokenize(strings, row_indices, separator, cudf::get_default_stream(), mr);
return detail::detokenize(input, row_indices, separator, stream, mr);
}

} // namespace nvtext
25 changes: 15 additions & 10 deletions cpp/src/text/tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -232,43 +232,48 @@ std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const

// external APIs

std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& input,
cudf::string_scalar const& delimiter,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::tokenize(strings, delimiter, cudf::get_default_stream(), mr);
return detail::tokenize(input, delimiter, stream, mr);
}

std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& input,
cudf::strings_column_view const& delimiters,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::tokenize(strings, delimiters, cudf::get_default_stream(), mr);
return detail::tokenize(input, delimiters, stream, mr);
}

std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& input,
cudf::string_scalar const& delimiter,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::count_tokens(strings, delimiter, cudf::get_default_stream(), mr);
return detail::count_tokens(input, delimiter, stream, mr);
}

std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& input,
cudf::strings_column_view const& delimiters,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::count_tokens(strings, delimiters, cudf::get_default_stream(), mr);
return detail::count_tokens(input, delimiters, stream, mr);
}

std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::character_tokenize(strings, cudf::get_default_stream(), mr);
return detail::character_tokenize(input, stream, mr);
}

} // namespace nvtext
4 changes: 3 additions & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -649,7 +649,9 @@ ConfigureTest(
testing
)
ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing)
ConfigureTest(
STREAM_TEXT_TEST streams/text/ngrams_test.cpp streams/text/tokenize_test.cpp STREAM_MODE testing
)
ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing)

# ##################################################################################################
Expand Down
53 changes: 53 additions & 0 deletions cpp/tests/streams/text/tokenize_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <nvtext/tokenize.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/default_stream.hpp>

// Test fixture for verifying the stream-accepting nvtext tokenize APIs.
class TextTokenizeTest : public cudf::test::BaseFixture {};

// Exercise both tokenize/count_tokens overloads with an explicitly supplied CUDA stream.
TEST_F(TextTokenizeTest, Tokenize)
{
  auto const stream = cudf::test::get_default_stream();
  auto const input  = cudf::test::strings_column_wrapper({"the fox jumped", "over thé dog"});
  auto const view   = cudf::strings_column_view(input);

  // single-delimiter overloads
  auto const delimiter = cudf::string_scalar{" ", true, stream};
  nvtext::tokenize(view, delimiter, stream);
  nvtext::count_tokens(view, delimiter, stream);

  // multi-delimiter overloads
  auto const delimiters      = cudf::test::strings_column_wrapper({" ", "o", "é"});
  auto const delimiters_view = cudf::strings_column_view(delimiters);
  nvtext::tokenize(view, delimiters_view, stream);
  nvtext::count_tokens(view, delimiters_view, stream);
}
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved

// Exercise character_tokenize with an explicitly supplied CUDA stream.
TEST_F(TextTokenizeTest, CharacterTokenize)
{
  auto const input =
    cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"});
  auto const view = cudf::strings_column_view(input);
  nvtext::character_tokenize(view, cudf::test::get_default_stream());
}

// Exercise detokenize with an explicitly supplied CUDA stream.
TEST_F(TextTokenizeTest, Detokenize)
{
  auto const stream = cudf::test::get_default_stream();
  auto const input =
    cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"});
  // first three tokens map to output row 0, last three to output row 1
  auto const indices   = cudf::test::fixed_width_column_wrapper<int32_t>({0, 0, 0, 1, 1, 1});
  auto const separator = cudf::string_scalar{" ", true, stream};
  nvtext::detokenize(cudf::strings_column_view(input), indices, separator, stream);
}