Performance improvement for nvtext tokenize/token functions #13480

Merged: 31 commits merged into branch-23.08 from davidwendt:nvtext-perf-tokenize on Jun 29, 2023
Commits (31)
63bde3b
Performance improvement for nvtext tokenize for long strings
davidwendt May 31, 2023
6a6f668
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 2, 2023
1f99968
fix perf of normalize spaces
davidwendt Jun 2, 2023
bd2c072
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 2, 2023
1dbe89a
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 6, 2023
b06a871
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 7, 2023
764f154
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 7, 2023
05a9c55
update comments, style; name const value
davidwendt Jun 7, 2023
d743154
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 8, 2023
a7d8826
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 8, 2023
408ad51
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 9, 2023
a68711d
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 12, 2023
031b666
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 12, 2023
d023e40
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 13, 2023
51954c1
remove unneeded prefetch logic from normalize spaces
davidwendt Jun 13, 2023
7c89985
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 13, 2023
50e75f1
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 14, 2023
d59e695
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 20, 2023
09b67cf
Merge branch 'nvtext-perf-tokenize' of github.com:davidwendt/cudf int…
davidwendt Jun 20, 2023
541440f
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 21, 2023
e1e6709
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 21, 2023
12b75c5
Merge branch 'nvtext-perf-tokenize' of github.com:davidwendt/cudf int…
davidwendt Jun 22, 2023
c5357c6
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 22, 2023
241d7d3
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 23, 2023
75a8c39
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 23, 2023
bf03f1a
Merge branch 'nvtext-perf-tokenize' of github.com:davidwendt/cudf int…
davidwendt Jun 23, 2023
58701e7
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 24, 2023
946ca1c
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 26, 2023
95787ff
fix next-token boundary check
davidwendt Jun 26, 2023
4bdac42
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 26, 2023
48288fe
Merge branch 'branch-23.08' into nvtext-perf-tokenize
davidwendt Jun 28, 2023
20 changes: 11 additions & 9 deletions cpp/src/text/normalize.cu
@@ -58,7 +58,7 @@ namespace {
  */
 struct normalize_spaces_fn {
   cudf::column_device_view const d_strings;  // strings to normalize
-  int32_t* d_offsets{};          // offsets into d_buffer
+  cudf::size_type* d_offsets{};  // offsets into d_chars
Review comment (Contributor):
> Have we deprecated offset_type entirely? Has it been removed?

davidwendt (Contributor, Author), Jun 26, 2023:
> Not yet. I can look into doing that in a separate PR.
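For context on the exchange above: as of this era of the codebase both aliases reduce to the same 32-bit integer (worth verifying against cudf/types.hpp), so the member-type change in this hunk is about intent, not layout. Roughly:

#include <cstdint>

// Approximate aliases from cudf/types.hpp at the time (illustrative, not a
// verbatim copy of the header).
namespace cudf {
using size_type   = int32_t;  // row/element index and count type
using offset_type = int32_t;  // child-column offset type; the deprecation
                              // discussed above was deferred to a separate PR
}  // namespace cudf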

   char* d_chars{};               // output buffer for characters
 
   __device__ void operator()(cudf::size_type idx)
@@ -70,24 +70,26 @@ struct normalize_spaces_fn {
     cudf::string_view const single_space(" ", 1);
     auto const d_str = d_strings.element<cudf::string_view>(idx);
     char* buffer = d_chars ? d_chars + d_offsets[idx] : nullptr;
-    char* optr = buffer;  // running output pointer
-    int32_t nbytes = 0;   // holds the number of bytes per output string
+    char* optr = buffer;  // running output pointer
+
+    cudf::size_type nbytes = 0;  // holds the number of bytes per output string
 
     // create a tokenizer for this string with whitespace delimiter (default)
     characters_tokenizer tokenizer(d_str);
 
     // this will retrieve tokens automatically skipping runs of whitespace
     while (tokenizer.next_token()) {
       auto const token_pos = tokenizer.token_byte_positions();
-      nbytes += token_pos.second - token_pos.first + 1;  // token size plus a single space
+      auto const token =
+        cudf::string_view(d_str.data() + token_pos.first, token_pos.second - token_pos.first);
       if (optr) {
-        cudf::string_view const token(d_str.data() + token_pos.first,
-                                      token_pos.second - token_pos.first);
-        if (optr != buffer)  // prepend space unless we are at the beginning
-          optr = cudf::strings::detail::copy_string(optr, single_space);
+        // prepend space unless we are at the beginning
+        if (optr != buffer) { optr = cudf::strings::detail::copy_string(optr, single_space); }
         // write token to output buffer
-        optr = cudf::strings::detail::copy_string(optr, token);
+        thrust::copy_n(thrust::seq, token.data(), token.size_bytes(), optr);
+        optr += token.size_bytes();
       }
+      nbytes += token.size_bytes() + 1;  // token size plus a single space
     }
     // remove trailing space
     if (!d_chars) d_offsets[idx] = (nbytes > 0) ? nbytes - 1 : 0;
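A note on what this hunk preserves: normalize_spaces_fn is one functor run in two passes. When d_chars is null, only nbytes is tallied into d_offsets; after those sizes are scanned into offsets, the pass with d_chars set writes the bytes. Below is a standalone CPU sketch of that size-then-write idiom, for illustration only; the actual code drives the functor through cudf's internal strings factories rather than anything shown here.

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>

// Per-row transform: collapse whitespace runs to single spaces
// (a CPU stand-in for what normalize_spaces_fn computes per string).
std::string normalize(std::string const& s)
{
  std::istringstream in(s);
  std::string word, out;
  while (in >> word) {
    if (!out.empty()) { out += ' '; }
    out += word;
  }
  return out;
}

// Pass 1 records per-row output sizes, a scan turns sizes into offsets,
// and pass 2 writes every row into one contiguous character buffer.
std::vector<char> build_chars(std::vector<std::string> const& rows, std::vector<int>& offsets)
{
  offsets.assign(rows.size() + 1, 0);
  for (std::size_t i = 0; i < rows.size(); ++i) {  // pass 1: sizes only
    offsets[i + 1] = static_cast<int>(normalize(rows[i]).size());
  }
  std::partial_sum(offsets.begin(), offsets.end(), offsets.begin());  // sizes -> offsets
  std::vector<char> chars(offsets.back());
  for (std::size_t i = 0; i < rows.size(); ++i) {  // pass 2: write at offsets[i]
    auto const out = normalize(rows[i]);
    std::copy(out.begin(), out.end(), chars.begin() + offsets[i]);
  }
  return chars;
}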
8 changes: 5 additions & 3 deletions cpp/src/text/tokenize.cu
@@ -14,6 +14,11 @@
  * limitations under the License.
  */
 
+#include <text/utilities/tokenize_ops.cuh>
+
+#include <nvtext/detail/tokenize.hpp>
+#include <nvtext/tokenize.hpp>
+
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
@@ -24,9 +29,6 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
-#include <nvtext/detail/tokenize.hpp>
-#include <nvtext/tokenize.hpp>
-#include <text/utilities/tokenize_ops.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
48 changes: 27 additions & 21 deletions cpp/src/text/utilities/tokenize_ops.cuh
@@ -15,6 +15,7 @@
  */
 
 #include <cudf/column/column_device_view.cuh>
+#include <cudf/strings/detail/utf8.hpp>
 #include <cudf/strings/string_view.cuh>
 
 #include <thrust/execution_policy.h>
@@ -50,7 +51,7 @@ struct characters_tokenizer {
     : d_str{d_str},
       d_delimiter{d_delimiter},
       spaces{true},
-      itr{d_str.begin()},
+      current_position{0},
       start_position(0),
       end_position(d_str.size_bytes())
   {
@@ -64,7 +65,7 @@ struct characters_tokenizer {
    * @param chr The character to test.
    * @return true if the character is a delimiter
    */
-  __device__ bool is_delimiter(cudf::char_utf8 chr)
+  __device__ bool is_delimiter(cudf::char_utf8 chr) const
   {
     return d_delimiter.empty() ? (chr <= ' ') :  // whitespace check
                                  thrust::any_of(thrust::seq,
@@ -78,7 +79,7 @@ struct characters_tokenizer {
    * string at the specified iterator position.
    *
    * For empty delimiter, whitespace code-point is checked.
-   * Starting at the given iterator (itr) position, a token
+   * Starting at the current_position, a token
    * start position is identified when a delimiter is
    * not found. Once found, the end position is identified
    * when a delimiter or the end of the string is found.
@@ -87,27 +88,32 @@ struct characters_tokenizer {
    */
   __device__ bool next_token()
   {
-    if (itr != d_str.begin()) {  // skip these 2 lines the first time through
-      ++itr;
-      start_position = itr.byte_offset();
+    auto const src_ptr = d_str.data();
+    if (current_position != 0) {  // skip these 2 lines the first time through
+      current_position += cudf::strings::detail::bytes_in_char_utf8(src_ptr[current_position]);
+      start_position = current_position;
     }
-    if (start_position >= d_str.size_bytes()) return false;
+    if (start_position >= d_str.size_bytes()) { return false; }
     // continue search for the next token
     end_position = d_str.size_bytes();
-    for (; itr != d_str.end(); ++itr) {
-      cudf::char_utf8 ch = *itr;
+    while (current_position < d_str.size_bytes()) {
+      cudf::char_utf8 ch   = 0;
+      auto const chr_width = cudf::strings::detail::to_char_utf8(src_ptr + current_position, ch);
       if (spaces == is_delimiter(ch)) {
-        if (spaces)
-          start_position = (itr + 1).byte_offset();
-        else
-          end_position = (itr + 1).byte_offset();
+        current_position += chr_width;
+        if (spaces) {
+          start_position = current_position;
+        } else {
+          end_position = current_position;
+        }
         continue;
       }
       spaces = !spaces;
       if (spaces) {
-        end_position = itr.byte_offset();
+        end_position = current_position;
         break;
       }
+      current_position += chr_width;
     }
     return start_position < end_position;
   }
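The heart of this change: the old loop advanced a cudf::string_view::const_iterator, where comparing against d_str.end() can involve counting characters in the string; the new loop walks raw byte offsets and reads each character's width from its lead byte, so the hot path stays in plain integer arithmetic for strings of any length. A simplified standalone sketch of the same byte-offset idiom (CPU illustration; the device code above uses cudf::strings::detail::to_char_utf8 and bytes_in_char_utf8 plus the spaces toggle, which this sketch flattens into two loops):

#include <cstdio>
#include <cstring>
#include <utility>
#include <vector>

// Width of a UTF-8 character from its lead byte (illustrative; cudf's
// equivalents live in cudf/strings/detail/utf8.hpp).
static int utf8_char_width(unsigned char lead)
{
  if (lead < 0x80) { return 1; }            // ASCII
  if ((lead & 0xE0) == 0xC0) { return 2; }  // 110xxxxx
  if ((lead & 0xF0) == 0xE0) { return 3; }  // 1110xxxx
  if ((lead & 0xF8) == 0xF0) { return 4; }  // 11110xxx
  return 1;                                 // continuation/invalid: advance one byte
}

// Collect [start, end) byte ranges of whitespace-delimited tokens by
// walking byte offsets, never materializing a character iterator.
static std::vector<std::pair<int, int>> tokenize_bytes(char const* str, int size_bytes)
{
  std::vector<std::pair<int, int>> tokens;
  int pos = 0;
  while (pos < size_bytes) {
    auto const byte = static_cast<unsigned char>(str[pos]);
    if (byte <= ' ') {  // whitespace delimiter: skip it
      pos += utf8_char_width(byte);
      continue;
    }
    int const start = pos;  // token start found; scan to its end
    while (pos < size_bytes && static_cast<unsigned char>(str[pos]) > ' ') {
      pos += utf8_char_width(static_cast<unsigned char>(str[pos]));
    }
    tokens.emplace_back(start, pos);
  }
  return tokens;
}

int main()
{
  char const* s = "  the quick\tbrown  fox ";
  for (auto const& [b, e] : tokenize_bytes(s, static_cast<int>(std::strlen(s)))) {
    std::printf("token '%.*s' at [%d,%d)\n", e - b, s + b, b, e);
  }
  return 0;
}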
@@ -118,18 +124,18 @@ struct characters_tokenizer {
    *
    * @return Byte positions of the current token.
    */
-  __device__ position_pair token_byte_positions()
+  __device__ position_pair token_byte_positions() const
   {
     return position_pair{start_position, end_position};
   }
 
  private:
-  cudf::string_view const d_str;          ///< string to tokenize
-  cudf::string_view const d_delimiter;    ///< delimiter characters
-  bool spaces;                            ///< true if current position is delimiter
-  cudf::string_view::const_iterator itr;  ///< current position in d_str
-  cudf::size_type start_position;         ///< starting character position of token found
-  cudf::size_type end_position;           ///< ending character position (excl) of token found
+  cudf::string_view const d_str;        ///< string to tokenize
+  cudf::string_view const d_delimiter;  ///< delimiter characters
+  bool spaces;                          ///< true if current position is delimiter
+  cudf::size_type current_position;     ///< current position in d_str
+  cudf::size_type start_position;       ///< starting byte position of token found
+  cudf::size_type end_position;         ///< ending byte position (exclusive) of token found
 };

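Finally, for orientation, this is roughly how the callers in cpp/src/text/tokenize.cu drive the tokenizer: construct it per string, loop next_token(), and read token_byte_positions(). The sketch below is a plausible token-counting functor assuming only the interface shown above; the delimiter-taking constructor and the null handling are inferred from context, and the real functors carry more state.

// Hypothetical device functor counting tokens per string, in the style of
// the nvtext kernels; assumes characters_tokenizer's constructor accepts
// an optional delimiter, as the "(default)" comment above suggests.
struct token_counter_fn {
  cudf::column_device_view const d_strings;
  cudf::string_view const d_delimiter;

  __device__ cudf::size_type operator()(cudf::size_type idx) const
  {
    if (d_strings.is_null(idx)) { return 0; }
    auto const d_str = d_strings.element<cudf::string_view>(idx);
    characters_tokenizer tokenizer(d_str, d_delimiter);
    cudf::size_type count = 0;
    while (tokenizer.next_token()) { ++count; }  // each call advances by byte offsets
    return count;
  }
};

A caller would typically run this through thrust::transform over a counting iterator to fill a column of per-row token counts, then use those counts to size the token output.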