Skip to content

Commit

Permalink
fix merge conflict
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Oct 27, 2023
2 parents c3999ac + 83746a4 commit f9a0f03
Show file tree
Hide file tree
Showing 14 changed files with 272 additions and 73 deletions.
26 changes: 15 additions & 11 deletions cpp/include/cudf/strings/char_types/char_types.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -53,18 +53,20 @@ namespace strings {
*
* Any null row results in a null entry for that row in the output column.
*
* @param strings Strings instance for this operation.
* @param types The character types to check in each string.
* @param input Strings instance for this operation
* @param types The character types to check in each string
* @param verify_types Only verify against these character types.
* Default `ALL_TYPES` means return `true`
* iff all characters match `types`.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column of boolean results for each string.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column of boolean results for each string
*/
std::unique_ptr<column> all_characters_of_type(
strings_column_view const& strings,
strings_column_view const& input,
string_character_types types,
string_character_types verify_types = string_character_types::ALL_TYPES,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -96,20 +98,22 @@ std::unique_ptr<column> all_characters_of_type(
* @throw cudf::logic_error if neither or both `types_to_remove` and
* `types_to_keep` are set to `ALL_TYPES`.
*
* @param strings Strings instance for this operation.
* @param input Strings instance for this operation
* @param types_to_remove The character types to check in each string.
* Use `ALL_TYPES` here to specify `types_to_keep` instead.
* @param replacement The replacement character to use when removing characters.
* @param replacement The replacement character to use when removing characters
* @param types_to_keep Default `ALL_TYPES` means all characters of
* `types_to_remove` will be filtered.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column of boolean results for each string.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column with the specified character types filtered out or replaced
*/
std::unique_ptr<column> filter_characters_of_type(
strings_column_view const& strings,
strings_column_view const& input,
string_character_types types_to_remove,
string_scalar const& replacement = string_scalar(""),
string_character_types types_to_keep = string_character_types::ALL_TYPES,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
Expand Down
4 changes: 3 additions & 1 deletion cpp/include/cudf/strings/reverse.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -42,10 +42,12 @@ namespace strings {
*
* @param input Strings column for this operation
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column
*/
std::unique_ptr<column> reverse(
strings_column_view const& input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
Expand Down
30 changes: 17 additions & 13 deletions cpp/include/cudf/strings/translate.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -47,14 +47,16 @@ namespace strings {
* r is now ["AA", "", "cccc", "AcQ"]
* @endcode
*
* @param strings Strings instance for this operation.
* @param chars_table Table of UTF-8 character mappings.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column with padded strings.
* @param input Strings instance for this operation
* @param chars_table Table of UTF-8 character mappings
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column with translated strings
*/
std::unique_ptr<column> translate(
strings_column_view const& strings,
strings_column_view const& input,
std::vector<std::pair<char_utf8, char_utf8>> const& chars_table,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -87,19 +89,21 @@ enum class filter_type : bool {
*
* @throw cudf::logic_error if `replacement` is invalid
*
* @param strings Strings instance for this operation.
* @param characters_to_filter Table of character ranges to filter on.
* @param input Strings instance for this operation
* @param characters_to_filter Table of character ranges to filter on
* @param keep_characters If true, the `characters_to_filter` are retained and all other characters
* are removed.
* @param replacement Optional replacement string for each character removed.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column with filtered strings.
* are removed
* @param replacement Optional replacement string for each character removed
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column with filtered strings
*/
std::unique_ptr<column> filter_characters(
strings_column_view const& strings,
strings_column_view const& input,
std::vector<std::pair<cudf::char_utf8, cudf::char_utf8>> characters_to_filter,
filter_type keep_characters = filter_type::KEEP,
string_scalar const& replacement = string_scalar(""),
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
Expand Down
60 changes: 36 additions & 24 deletions cpp/include/nvtext/tokenize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,17 @@ namespace nvtext {
*
* All null row entries are ignored and the output contains all valid rows.
*
* @param strings Strings column tokenize.
* @param input Strings column to tokenize
* @param delimiter UTF-8 characters used to separate each string into tokens.
* The default of empty string will separate tokens using whitespace.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column of tokens
*/
std::unique_ptr<cudf::column> tokenize(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::string_scalar const& delimiter = cudf::string_scalar{""},
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -84,14 +86,16 @@ std::unique_ptr<cudf::column> tokenize(
*
* @throw cudf::logic_error if the delimiters column is empty or contains nulls.
*
* @param strings Strings column to tokenize.
* @param delimiters Strings used to separate individual strings into tokens.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param input Strings column to tokenize
* @param delimiters Strings used to separate individual strings into tokens
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column of tokens
*/
std::unique_ptr<cudf::column> tokenize(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::strings_column_view const& delimiters,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -112,15 +116,17 @@ std::unique_ptr<cudf::column> tokenize(
* All null row entries are ignored and the output contains all valid rows.
* The number of tokens for a null element is set to 0 in the output column.
*
* @param strings Strings column to use for this operation
* @param delimiter Strings used to separate each string into tokens;
* @param input Strings column to count tokens
* @param delimiter Strings used to separate each string into tokens.
* The default of empty string will separate tokens using whitespace.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column of token counts
*/
std::unique_ptr<cudf::column> count_tokens(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::string_scalar const& delimiter = cudf::string_scalar{""},
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -143,14 +149,16 @@ std::unique_ptr<cudf::column> count_tokens(
*
* @throw cudf::logic_error if the delimiters column is empty or contains nulls
*
* @param strings Strings column to use for this operation
* @param input Strings column to count tokens
* @param delimiters Strings used to separate each string into tokens
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column of token counts
*/
std::unique_ptr<cudf::column> count_tokens(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::strings_column_view const& delimiters,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -168,12 +176,14 @@ std::unique_ptr<cudf::column> count_tokens(
*
* All null row entries are ignored and the output contains all valid rows.
*
* @param strings Strings column to tokenize.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param input Strings column to tokenize
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column of tokens
*/
std::unique_ptr<cudf::column> character_tokenize(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -203,16 +213,18 @@ std::unique_ptr<cudf::column> character_tokenize(
* @throw cudf::logic_error if `row_indices.size() != input.size()`
* @throw cudf::logic_error if `row_indices` contains nulls
*
* @param strings Strings column to detokenize.
* @param row_indices The relative output row index assigned for each token in the input column.
* @param separator String to append after concatenating each token to the proper output row.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param input Strings column to detokenize
* @param row_indices The relative output row index assigned for each token in the input column
* @param separator String to append after concatenating each token to the proper output row
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column of detokenized strings
*/
std::unique_ptr<cudf::column> detokenize(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::column_view const& row_indices,
cudf::string_scalar const& separator = cudf::string_scalar(" "),
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down
11 changes: 6 additions & 5 deletions cpp/src/strings/char_types/char_types.cu
Original file line number Diff line number Diff line change
Expand Up @@ -214,25 +214,26 @@ std::unique_ptr<column> filter_characters_of_type(strings_column_view const& str

// external API

std::unique_ptr<column> all_characters_of_type(strings_column_view const& strings,
std::unique_ptr<column> all_characters_of_type(strings_column_view const& input,
string_character_types types,
string_character_types verify_types,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::all_characters_of_type(
strings, types, verify_types, cudf::get_default_stream(), mr);
return detail::all_characters_of_type(input, types, verify_types, stream, mr);
}

std::unique_ptr<column> filter_characters_of_type(strings_column_view const& strings,
std::unique_ptr<column> filter_characters_of_type(strings_column_view const& input,
string_character_types types_to_remove,
string_scalar const& replacement,
string_character_types types_to_keep,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::filter_characters_of_type(
strings, types_to_remove, replacement, types_to_keep, cudf::get_default_stream(), mr);
input, types_to_remove, replacement, types_to_keep, stream, mr);
}

} // namespace strings
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/strings/filter_chars.cu
Original file line number Diff line number Diff line change
Expand Up @@ -154,15 +154,16 @@ std::unique_ptr<column> filter_characters(
* @copydoc cudf::strings::filter_characters
*/
std::unique_ptr<column> filter_characters(
strings_column_view const& strings,
strings_column_view const& input,
std::vector<std::pair<cudf::char_utf8, cudf::char_utf8>> characters_to_filter,
filter_type keep_characters,
string_scalar const& replacement,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::filter_characters(
strings, characters_to_filter, keep_characters, replacement, cudf::get_default_stream(), mr);
input, characters_to_filter, keep_characters, replacement, stream, mr);
}

} // namespace strings
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/strings/reverse.cu
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,11 @@ std::unique_ptr<column> reverse(strings_column_view const& input,
} // namespace detail

std::unique_ptr<column> reverse(strings_column_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::reverse(input, cudf::get_default_stream(), mr);
return detail::reverse(input, stream, mr);
}

} // namespace strings
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/strings/translate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,13 @@ std::unique_ptr<column> translate(strings_column_view const& strings,

// external APIs

std::unique_ptr<column> translate(strings_column_view const& strings,
std::unique_ptr<column> translate(strings_column_view const& input,
std::vector<std::pair<uint32_t, uint32_t>> const& chars_table,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::translate(strings, chars_table, cudf::get_default_stream(), mr);
return detail::translate(input, chars_table, stream, mr);
}

} // namespace strings
Expand Down
7 changes: 4 additions & 3 deletions cpp/src/text/detokenize.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -169,13 +169,14 @@ std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& string

} // namespace detail

std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& input,
cudf::column_view const& row_indices,
cudf::string_scalar const& separator,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::detokenize(strings, row_indices, separator, cudf::get_default_stream(), mr);
return detail::detokenize(input, row_indices, separator, stream, mr);
}

} // namespace nvtext
Loading

0 comments on commit f9a0f03

Please sign in to comment.