From e7f9db81eb698ba9ecdc999529e62c39f387d49d Mon Sep 17 00:00:00 2001
From: David Wendt
Date: Thu, 18 Jan 2024 14:03:11 -0500
Subject: [PATCH 1/3] Use offsetalator in nvtext tokenize functions

---
 cpp/src/text/tokenize.cu                | 56 +++++++++++--------------
 cpp/src/text/utilities/tokenize_ops.cuh | 20 ++++-----
 2 files changed, 35 insertions(+), 41 deletions(-)

diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu
index c43b9dda56c..97896f20f4f 100644
--- a/cpp/src/text/tokenize.cu
+++ b/cpp/src/text/tokenize.cu
@@ -22,8 +22,8 @@
 #include
 #include
 #include
-#include
 #include
+#include
 #include
 #include
 #include
@@ -38,7 +38,6 @@
 #include
 #include
 #include
-#include
 #include

 namespace nvtext {
@@ -80,18 +79,17 @@ std::unique_ptr<cudf::column> tokenize_fn(cudf::size_type strings_count,
     token_count_fn(strings_count, tokenizer, stream, rmm::mr::get_current_device_resource());
   auto d_token_counts = token_counts->view();
   // create token-index offsets from the counts
-  rmm::device_uvector<cudf::size_type> token_offsets(strings_count + 1, stream);
-  thrust::inclusive_scan(rmm::exec_policy(stream),
-                         d_token_counts.template begin<cudf::size_type>(),
-                         d_token_counts.template end<cudf::size_type>(),
-                         token_offsets.begin() + 1);
-  token_offsets.set_element_to_zero_async(0, stream);
-  auto const total_tokens = token_offsets.back_element(stream);
-  // build a list of pointers to each token
+  auto [token_offsets, total_tokens] =
+    cudf::detail::make_offsets_child_column(d_token_counts.template begin<cudf::size_type>(),
+                                            d_token_counts.template end<cudf::size_type>(),
+                                            stream,
+                                            rmm::mr::get_current_device_resource());
+  // build a list of pointers to each token
   rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
   // now go get the tokens
-  tokenizer.d_offsets = token_offsets.data();
-  tokenizer.d_tokens  = tokens.data();
+  tokenizer.d_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(token_offsets->view());
+  tokenizer.d_tokens = tokens.data();
   thrust::for_each_n(rmm::exec_policy(stream),
                      thrust::make_counting_iterator<cudf::size_type>(0),
                      strings_count,
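
For reference, the sizes-to-offsets step that cudf::detail::make_offsets_child_column performs here (and which the removed lines did by hand with an inclusive_scan plus a zeroed first element) can be sketched standalone in Thrust. A minimal sketch with made-up counts, not libcudf code:

    #include <thrust/device_vector.h>
    #include <thrust/scan.h>
    #include <cstdio>

    int main()
    {
      // Per-string token counts, as token_count_fn would produce them.
      thrust::device_vector<int> counts(3);
      counts[0] = 3; counts[1] = 2; counts[2] = 4;

      // One extra slot: offsets[i] is where string i's tokens begin and
      // the last element holds the grand total.
      thrust::device_vector<int> offsets(counts.size() + 1, 0);
      thrust::inclusive_scan(counts.begin(), counts.end(), offsets.begin() + 1);

      int const total = offsets.back();  // 3 + 2 + 4 = 9
      std::printf("total tokens = %d\n", total);
      return 0;
    }

The libcudf helper additionally returns the offsets as a column paired with the total count, which is why the back_element/set_element_to_zero_async bookkeeping above becomes unnecessary.
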
@@ -178,8 +176,8 @@ std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const
   }

   auto offsets = strings_column.offsets();
-  auto offset      = cudf::detail::get_value<cudf::size_type>(offsets, strings_column.offset(), stream);
-  auto chars_bytes = cudf::detail::get_value<cudf::size_type>(
+  auto offset = cudf::strings::detail::get_offset_value(offsets, strings_column.offset(), stream);
+  auto chars_bytes = cudf::strings::detail::get_offset_value(
                        offsets, strings_column.offset() + strings_count, stream) -
                      offset;
   auto d_chars =

   // create output offsets column
   // -- conditionally copy a counting iterator where
   //    the first byte of each character is located
-  auto offsets_column =
-    cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()},
-                              num_characters + 1,
-                              cudf::mask_state::UNALLOCATED,
-                              stream,
-                              mr);
-  auto d_new_offsets = offsets_column->mutable_view().begin<cudf::size_type>();
-  thrust::copy_if(
-    rmm::exec_policy(stream),
-    thrust::counting_iterator<cudf::size_type>(0),
-    thrust::counting_iterator<cudf::size_type>(chars_bytes + 1),
+  auto offsets_column = cudf::make_numeric_column(
+    offsets.type(), num_characters + 1, cudf::mask_state::UNALLOCATED, stream, mr);
+  auto d_new_offsets =
+    cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view());
+  cudf::detail::copy_if_safe(
+    thrust::counting_iterator<cudf::size_type>(0),
+    thrust::counting_iterator<cudf::size_type>(chars_bytes + 1),
     d_new_offsets,
     [d_chars, chars_bytes] __device__(auto idx) {
       // this will also set the final value to the size chars_bytes
       return idx < chars_bytes ? cudf::strings::detail::is_begin_utf8_char(d_chars[idx]) : true;
-    });
+    },
+    stream);

-  // create the output chars column -- just a copy of the input's chars column
-  cudf::column_view chars_view(
-    cudf::data_type{cudf::type_id::INT8}, chars_bytes, d_chars, nullptr, 0);
-  auto chars_column = std::make_unique<cudf::column>(chars_view, stream, mr);
+  // create the output chars buffer -- just a copy of the input's chars
+  rmm::device_uvector<char> output_chars(chars_bytes, stream, mr);
+  thrust::copy(rmm::exec_policy(stream), d_chars, d_chars + chars_bytes, output_chars.data());

   // return new strings column
   return cudf::make_strings_column(
-    num_characters, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{});
+    num_characters, std::move(offsets_column), output_chars.release(), 0, rmm::device_buffer{});
 }

 } // namespace detail

diff --git a/cpp/src/text/utilities/tokenize_ops.cuh b/cpp/src/text/utilities/tokenize_ops.cuh
index a84e94a6924..0901dc37e56 100644
--- a/cpp/src/text/utilities/tokenize_ops.cuh
+++ b/cpp/src/text/utilities/tokenize_ops.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -147,10 +147,10 @@ struct characters_tokenizer {
  * positions into the d_tokens vector.
  */
 struct strings_tokenizer {
-  cudf::column_device_view const d_strings;  ///< strings to tokenize
-  cudf::string_view const d_delimiter;       ///< delimiter characters to tokenize around
-  cudf::size_type* d_offsets{};              ///< offsets into the d_tokens vector for each string
-  string_index_pair* d_tokens{};             ///< token positions in device memory
+  cudf::column_device_view const d_strings;    ///< strings to tokenize
+  cudf::string_view const d_delimiter;         ///< delimiter characters to tokenize around
+  cudf::detail::input_offsetalator d_offsets;  ///< offsets into the d_tokens vector for each string
+  string_index_pair* d_tokens{};               ///< token positions in device memory

   /**
    * @brief Identifies the token positions within each string.
    *
    * @param idx Index of the string to tokenize in the d_strings column.
    */
   __device__ void operator()(cudf::size_type idx)
@@ -191,11 +191,11 @@ using delimiterator = cudf::column_device_view::const_iterator<cudf::string_view>;
 struct multi_delimiter_strings_tokenizer {
-  cudf::column_device_view const d_strings;  ///< strings column to tokenize
-  delimiterator delimiters_begin;            ///< first delimiter
-  delimiterator delimiters_end;              ///< last delimiter
-  cudf::size_type* d_offsets{};              ///< offsets into the d_tokens output vector
-  string_index_pair* d_tokens{};             ///< token positions found for each string
+  cudf::column_device_view const d_strings;    ///< strings column to tokenize
+  delimiterator delimiters_begin;              ///< first delimiter
+  delimiterator delimiters_end;                ///< last delimiter
+  cudf::detail::input_offsetalator d_offsets;  ///< offsets into the d_tokens output vector
+  string_index_pair* d_tokens{};               ///< token positions found for each string
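
A note on the iterator type these patches thread through the device functors: cudf::detail::input_offsetalator reads an offsets buffer whether it stores 32-bit or 64-bit values, always yielding a 64-bit index. A conceptual sketch only, assuming nothing about libcudf's real implementation:

    #include <cstdint>
    #include <cstddef>

    // Conceptual stand-in for an input offsetalator: one type that can
    // walk either an int32_t or an int64_t offsets buffer, normalizing
    // reads to int64_t. Functors like strings_tokenizer can then hold a
    // single member type instead of a raw cudf::size_type const*.
    struct offsets_reader {
      void const* data;  // INT32 or INT64 offsets buffer
      bool is_int64;     // true when the buffer holds int64_t values

      __host__ __device__ int64_t operator[](std::size_t idx) const
      {
        return is_int64 ? static_cast<int64_t const*>(data)[idx]
                        : static_cast<int64_t>(static_cast<int32_t const*>(data)[idx]);
      }
    };

This is also why patch 2 below adds a static_cast when differencing two offsets: the subtraction happens in the wide type and is narrowed back to cudf::size_type only where the count is known to fit.
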
From: David Wendt
Date: Thu, 18 Jan 2024 17:07:07 -0500
Subject: [PATCH 2/3] add ngrams_tokenizer

---
 cpp/src/text/ngrams_tokenize.cu | 97 +++++++++++++--------------------
 1 file changed, 39 insertions(+), 58 deletions(-)

diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu
index bc5cd04eac6..99ea85a7a0d 100644
--- a/cpp/src/text/ngrams_tokenize.cu
+++ b/cpp/src/text/ngrams_tokenize.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -37,12 +38,9 @@
 #include
 #include
 #include
-#include
 #include
-#include

 namespace nvtext {
 namespace detail {
 namespace {
@@ -60,10 +58,10 @@ namespace {
  * member.
  */
 struct string_tokens_positions_fn {
-  cudf::column_device_view const d_strings;  // strings to tokenize
-  cudf::string_view const d_delimiter;       // delimiter to tokenize around
-  cudf::size_type const* d_token_offsets;    // offsets into the d_token_positions for each string
-  position_pair* d_token_positions;          // token positions in each string
+  cudf::column_device_view const d_strings;          // strings to tokenize
+  cudf::string_view const d_delimiter;               // delimiter to tokenize around
+  cudf::detail::input_offsetalator d_token_offsets;  // offsets of d_token_positions for each string
+  position_pair* d_token_positions;                  // token positions in each string

   __device__ void operator()(cudf::size_type idx)
   {
@@ -95,12 +93,12 @@ struct ngram_builder_fn {
   cudf::column_device_view const d_strings;  // strings to generate ngrams from
   cudf::string_view const d_separator;       // separator to place between them 'grams
   cudf::size_type const ngrams;              // ngram number to generate (2=bi-gram, 3=tri-gram)
-  cudf::size_type const* d_token_offsets;    // offsets for token position for each string
-  position_pair const* d_token_positions;    // token positions for each string
-  cudf::size_type const* d_chars_offsets{};  // offsets for each string's ngrams
-  char* d_chars{};                           // write ngram strings to here
-  cudf::size_type const* d_ngram_offsets{};  // offsets for sizes of each string's ngrams
-  cudf::size_type* d_ngram_sizes{};          // write ngram sizes to here
+  cudf::detail::input_offsetalator d_token_offsets;    // offsets for token position for each string
+  position_pair const* d_token_positions;              // token positions for each string
+  cudf::detail::input_offsetalator d_chars_offsets{};  // offsets for each string's ngrams
+  char* d_chars{};                                     // write ngram strings to here
+  cudf::detail::input_offsetalator d_ngram_offsets{};  // offsets for sizes of each string's ngrams
+  cudf::size_type* d_ngram_sizes{};                    // write ngram sizes to here

   __device__ cudf::size_type operator()(cudf::size_type idx)
   {
@@ -165,16 +163,12 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
   // first, get the number of tokens per string to get the token-offsets
   // Ex. token-counts = [3,2]; token-offsets = [0,3,5]
-  rmm::device_uvector<cudf::size_type> token_offsets(strings_count + 1, stream);
-  auto d_token_offsets = token_offsets.data();
-  thrust::transform_inclusive_scan(rmm::exec_policy(stream),
-                                   thrust::make_counting_iterator<cudf::size_type>(0),
-                                   thrust::make_counting_iterator<cudf::size_type>(strings_count),
-                                   d_token_offsets + 1,
-                                   strings_tokenizer{d_strings, d_delimiter},
-                                   thrust::plus<cudf::size_type>());
-  token_offsets.set_element_to_zero_async(0, stream);
-  auto const total_tokens = token_offsets.back_element(stream);  // Ex. 5 tokens
+  auto const count_itr =
+    cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{d_strings, d_delimiter});
+  auto [token_offsets, total_tokens] = cudf::detail::make_offsets_child_column(
+    count_itr, count_itr + strings_count, stream, rmm::mr::get_current_device_resource());
+  auto d_token_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(token_offsets->view());

   // get the token positions (in bytes) per string
   // Ex. start/end pairs: [(0,1),(2,4),(5,8), (0,2),(3,4)]
   rmm::device_uvector<position_pair> token_positions(total_tokens, stream);
   thrust::for_each_n(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator<cudf::size_type>(0),
     strings_count,
     string_tokens_positions_fn{d_strings, d_delimiter, d_token_offsets, token_positions.data()});
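
The counting-transform-iterator pattern above feeds per-string counts to make_offsets_child_column lazily, so no intermediate counts vector is allocated before the offsets scan. A standalone sketch with a hypothetical count functor standing in for strings_tokenizer:

    #include <thrust/execution_policy.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/reduce.h>
    #include <cstdio>

    // Hypothetical stand-in for strings_tokenizer: yields a token count
    // for row idx on demand, without materializing a counts array.
    struct count_fn {
      __host__ __device__ int operator()(int idx) const { return idx % 3 + 1; }
    };

    int main()
    {
      auto counts =
        thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), count_fn{});
      // A scan-based offsets helper can consume this lazy sequence directly;
      // reducing it here just shows it behaves like an ordinary range.
      int const total = thrust::reduce(thrust::device, counts, counts + 5);
      std::printf("total = %d\n", total);  // 1+2+3+1+2 = 9
      return 0;
    }
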
   // compute the number of ngrams per string to get the total number of ngrams to generate
   // Ex. ngram-counts = [2,1]; ngram-offsets = [0,2,3]; total = 3 bigrams
-  rmm::device_uvector<cudf::size_type> ngram_offsets(strings_count + 1, stream);
-  auto d_ngram_offsets = ngram_offsets.data();
-  thrust::transform_inclusive_scan(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<cudf::size_type>(0),
-    thrust::make_counting_iterator<cudf::size_type>(strings_count),
-    d_ngram_offsets + 1,
+  auto const ngram_counts = cudf::detail::make_counting_transform_iterator(
+    0,
     cuda::proclaim_return_type<cudf::size_type>(
       [d_token_offsets, ngrams] __device__(cudf::size_type idx) {
-        auto token_count = d_token_offsets[idx + 1] - d_token_offsets[idx];
+        auto token_count =
+          static_cast<cudf::size_type>(d_token_offsets[idx + 1] - d_token_offsets[idx]);
         return (token_count >= ngrams) ? token_count - ngrams + 1 : 0;
-      }),
-    thrust::plus<cudf::size_type>{});
-  ngram_offsets.set_element_to_zero_async(0, stream);
-  auto const total_ngrams = ngram_offsets.back_element(stream);
+      }));
+  auto [ngram_offsets, total_ngrams] = cudf::detail::make_offsets_child_column(
+    ngram_counts, ngram_counts + strings_count, stream, rmm::mr::get_current_device_resource());
+  auto d_ngram_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(ngram_offsets->view());

   // Compute the total size of the ngrams for each string (not for each ngram)
   // Ex. 2 bigrams in 1st string total to 10 bytes; 1 bigram in 2nd string is 4 bytes
   // ngrams for each string.
   // Ex. bigram for first string produces 2 bigrams ("a_bb","bb_ccc") which
   //     is built in memory like this: "a_bbbb_ccc"
-  rmm::device_uvector<cudf::size_type> chars_offsets(strings_count + 1, stream);
-  // First compute the output sizes for each string (this is not the final output result)
-  thrust::transform(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<cudf::size_type>(0),
-    thrust::make_counting_iterator<cudf::size_type>(strings_count),
-    chars_offsets.begin(),
-    ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions});
-  // Convert the sizes to offsets
-  auto const output_chars_size = cudf::detail::sizes_to_offsets(
-    chars_offsets.begin(), chars_offsets.end(), chars_offsets.begin(), stream);
-  CUDF_EXPECTS(
-    output_chars_size <= static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
-    "Size of output exceeds the column size limit",
-    std::overflow_error);
+
+  // First compute the output sizes for each string (this is not the final output result)
+  auto const sizes_itr = cudf::detail::make_counting_transform_iterator(
+    0, ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions});
+  auto [chars_offsets, output_chars_size] = cudf::detail::make_offsets_child_column(
+    sizes_itr, sizes_itr + strings_count, stream, rmm::mr::get_current_device_resource());
+  auto d_chars_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(chars_offsets->view());

   // This will contain the size in bytes of each ngram to generate
   rmm::device_uvector<cudf::size_type> ngram_sizes(total_ngrams, stream);

   // build output chars column
-  auto chars_column = cudf::strings::detail::create_chars_child_column(
-    static_cast<cudf::size_type>(output_chars_size), stream, mr);
-  auto d_chars = chars_column->mutable_view().data<char>();
+  rmm::device_uvector<char> chars(output_chars_size, stream, mr);
+  auto d_chars = chars.data();
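
The per-string ngram count used in the device lambda above is easy to check on the host. A small sketch reproducing the worked example from the comments:

    #include <cstdio>

    // A string with t tokens yields max(t - n + 1, 0) ngrams of size n,
    // the same expression the device lambda computes.
    int ngram_count(int token_count, int ngrams)
    {
      return (token_count >= ngrams) ? token_count - ngrams + 1 : 0;
    }

    int main()
    {
      // Matches the comment above: token-counts [3,2] with bigrams (n=2)
      // give ngram-counts [2,1], hence ngram-offsets [0,2,3] and 3 bigrams.
      std::printf("%d %d\n", ngram_count(3, 2), ngram_count(2, 2));  // 2 1
      return 0;
    }
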
   // Generate the ngrams into the chars column data buffer.
   // The ngram_builder_fn functor also fills the ngram_sizes vector with the
   // size of each ngram.
@@ -246,18 +229,16 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
                        ngrams,
                        d_token_offsets,
                        d_token_positions,
-                       chars_offsets.data(),
+                       d_chars_offsets,
                        d_chars,
                        d_ngram_offsets,
                        ngram_sizes.data()});

   // build the offsets column -- converting the ngram sizes into offsets
   auto offsets_column = std::get<0>(
     cudf::detail::make_offsets_child_column(ngram_sizes.begin(), ngram_sizes.end(), stream, mr));
-  chars_column->set_null_count(0);
-  offsets_column->set_null_count(0);

   // create the output strings column
   return make_strings_column(
-    total_ngrams, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{});
+    total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{});
 }

 } // namespace detail

From 703ee0a9b4e73389ee68d8eecb4b3e193658c6e7 Mon Sep 17 00:00:00 2001
From: David Wendt
Date: Thu, 18 Jan 2024 17:20:32 -0500
Subject: [PATCH 3/3] add tokenize_with_vocabulary

---
 cpp/src/text/vocabulary_tokenize.cu | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu
index a9e8d4d9a24..86c61cc4241 100644
--- a/cpp/src/text/vocabulary_tokenize.cu
+++ b/cpp/src/text/vocabulary_tokenize.cu
@@ -21,14 +21,15 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -299,7 +300,7 @@ struct vocabulary_tokenizer_fn {
   cudf::string_view const d_delimiter;
   MapRefType d_map;
   cudf::size_type const default_id;
-  cudf::size_type const* d_offsets;
+  cudf::detail::input_offsetalator d_offsets;
   cudf::size_type* d_results;

   __device__ void operator()(cudf::size_type idx) const
@@ -380,7 +381,7 @@ std::unique_ptr<cudf::column> tokenize_with_vocabulary(cudf::strings_column_view
   auto tokens = cudf::make_numeric_column(
     output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr);
   auto d_tokens  = tokens->mutable_view().data<cudf::size_type>();
-  auto d_offsets = token_offsets->view().data<cudf::size_type>();
+  auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(token_offsets->view());
   vocabulary_tokenizer_fn<decltype(map_ref)> tokenizer{
     *d_strings, d_delimiter, map_ref, default_id, d_offsets, d_tokens};
   thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), tokenizer);
@@ -396,11 +397,11 @@ std::unique_ptr<cudf::column> tokenize_with_vocabulary(cudf::strings_column_view
   // longer strings perform better with warp-parallel approach
   auto const first_offset = (input.offset() == 0) ? 0
-                            : cudf::detail::get_value<cudf::size_type>(
+                            : cudf::strings::detail::get_offset_value(
                                 input.offsets(), input.offset(), stream);
   auto const last_offset  = (input.offset() == 0 && input.size() == input.offsets().size() - 1)
                              ? input.chars_size(stream)
-                             : cudf::detail::get_value<cudf::size_type>(
+                             : cudf::strings::detail::get_offset_value(
                                  input.offsets(), input.size() + input.offset(), stream);
   auto const chars_size    = last_offset - first_offset;
   auto const d_input_chars = input.chars_begin(stream) + first_offset;
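
The first_offset/last_offset logic in this final hunk exists to handle sliced input columns; get_offset_value just reads one element of the offsets child regardless of its width. A host-side sketch of the arithmetic with made-up numbers:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
      // Hypothetical offsets for a 5-row strings column: row i occupies
      // bytes [offsets[i], offsets[i+1]) of the chars buffer.
      int64_t const offsets[] = {0, 4, 9, 15, 22, 30};
      int const offset = 2;  // column sliced to rows [2, 5)
      int const size   = 3;

      int64_t const first_offset = offsets[offset];         //  9
      int64_t const last_offset  = offsets[offset + size];  // 30
      std::printf("chars_size = %lld\n",
                  static_cast<long long>(last_offset - first_offset));  // 21
      return 0;
    }
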