From cd6d25065c0b1606b30ecf6b5fbdbb46760004d5 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 30 Nov 2022 11:10:44 -0500 Subject: [PATCH 1/7] Rework nvtext::detokenize to use indexalator for row indices --- cpp/src/text/detokenize.cu | 121 +++++++++++++------------------------ 1 file changed, 41 insertions(+), 80 deletions(-) diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index 76994dd0da4..3b40a6e051b 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -19,9 +19,9 @@ #include #include #include -#include +#include #include -#include +#include #include #include #include @@ -29,17 +29,13 @@ #include #include #include -#include -#include #include #include #include #include -#include #include -#include namespace nvtext { namespace detail { @@ -55,10 +51,10 @@ struct detokenizer_fn { int32_t const* d_row_map; // indices sorted by output row cudf::size_type const* d_token_offsets; // to each input token array cudf::string_view const d_separator; // append after each token - int32_t const* d_offsets{}; // offsets to output buffer d_chars + cudf::size_type* d_offsets{}; // offsets to output buffer d_chars char* d_chars{}; // output buffer for characters - __device__ cudf::size_type operator()(cudf::size_type idx) + __device__ void operator()(cudf::size_type idx) { auto const offset = d_token_offsets[idx]; auto d_tokens = d_row_map + offset; @@ -78,61 +74,41 @@ struct detokenizer_fn { nbytes += d_separator.size_bytes(); } } - return (nbytes > 0) ? (nbytes - d_separator.size_bytes()) : 0; + if (!d_chars) { d_offsets[idx] = (nbytes > 0) ? (nbytes - d_separator.size_bytes()) : 0; } } }; -/** - * @brief Identifies indexes where the row value changes. - */ -template struct index_changed_fn { - IndexType const* d_rows; - int32_t const* d_row_map; - __device__ bool operator()(cudf::size_type idx) + cudf::detail::input_indexalator const d_rows; + cudf::size_type const* d_row_map; + __device__ bool operator()(cudf::size_type idx) const { return (idx == 0) || (d_rows[d_row_map[idx]] != d_rows[d_row_map[idx - 1]]); } }; -/** - * @brief This is a type-dispatch function to convert the row indices - * into token offsets. - */ -struct token_row_offsets_fn { - cudf::column_view const row_indices; - cudf::column_view const sorted_indices; - cudf::size_type const tokens_counts; - - template ()>* = nullptr> - std::unique_ptr> operator()( - rmm::cuda_stream_view stream) const - { - index_changed_fn pfn{row_indices.data(), sorted_indices.template data()}; - auto const output_count = - thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(tokens_counts), - pfn); - auto tokens_offsets = - std::make_unique>(output_count + 1, stream); - thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(tokens_counts), - tokens_offsets->begin(), - pfn); - // set the last element to the total number of tokens - tokens_offsets->set_element(output_count, tokens_counts, stream); - return tokens_offsets; - } - - // non-integral types throw an exception - template ()>* = nullptr> - std::unique_ptr> operator()(Args&&...) const - { - CUDF_FAIL("The detokenize indices parameter must be an integer type."); - } -}; +rmm::device_uvector create_token_row_offsets(cudf::column_view row_indices, + cudf::column_view sorted_indices, + cudf::size_type tokens_counts, + rmm::cuda_stream_view stream) +{ + index_changed_fn pfn{cudf::detail::indexalator_factory::make_input_iterator(row_indices), + sorted_indices.data()}; + auto const output_count = + thrust::count_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(tokens_counts), + pfn); + auto tokens_offsets = rmm::device_uvector(output_count + 1, stream); + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(tokens_counts), + tokens_offsets.begin(), + pfn); + // set the last element to the total number of tokens + tokens_offsets.set_element(output_count, tokens_counts, stream); + return tokens_offsets; +} } // namespace @@ -155,42 +131,27 @@ std::unique_ptr detokenize(cudf::strings_column_view const& string return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); auto strings_column = cudf::column_device_view::create(strings.parent(), stream); - // the indices may not be in order so we need to sort them - auto sorted_rows = cudf::stable_sorted_order(cudf::table_view({row_indices})); + // the indices may not be in order so we need to build a sorted map + auto sorted_rows = cudf::detail::stable_sorted_order( + cudf::table_view({row_indices}), {}, {}, stream, rmm::mr::get_current_device_resource()); auto const d_row_map = sorted_rows->view().data(); // create offsets for the tokens for each output string auto tokens_offsets = - cudf::type_dispatcher(row_indices.type(), - token_row_offsets_fn{row_indices, sorted_rows->view(), tokens_counts}, - stream); - auto const output_count = tokens_offsets->size() - 1; // number of output strings + create_token_row_offsets(row_indices, sorted_rows->view(), tokens_counts, stream); + auto const output_count = tokens_offsets.size() - 1; // number of output strings - // create output strings offsets by calculating the size of each output string cudf::string_view const d_separator(separator.data(), separator.size()); - auto offsets_transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - detokenizer_fn{*strings_column, d_row_map, tokens_offsets->data(), d_separator}); - auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + output_count, stream, mr); - auto d_offsets = offsets_column->view().data(); - - // build the chars column - append each source token to the appropriate output row - cudf::size_type const total_bytes = - cudf::detail::get_value(offsets_column->view(), output_count, stream); - auto chars_column = cudf::strings::detail::create_chars_child_column(total_bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), + + auto children = cudf::strings::detail::make_strings_children( + detokenizer_fn{*strings_column, d_row_map, tokens_offsets.data(), d_separator}, output_count, - detokenizer_fn{ - *strings_column, d_row_map, tokens_offsets->data(), d_separator, d_offsets, d_chars}); - chars_column->set_null_count(0); + stream, + mr); // make the output strings column from the offsets and chars column return cudf::make_strings_column( - output_count, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + output_count, std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); } } // namespace detail From c92d89f25864abc2de9e8a8c0317b1fe139cdb15 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 1 Dec 2022 08:48:02 -0500 Subject: [PATCH 2/7] add back doxygen for internal utility --- cpp/src/text/detokenize.cu | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index 3b40a6e051b..e9900a7bcb4 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -87,6 +87,14 @@ struct index_changed_fn { } }; +/** + * @brief Convert the row indices into token offsets + * + * @param row_indices Indices where each token should land + * @param sorted_indices Map of row_indices sorted + * @param tokens_counts Token counts for each row + * @param stream CUDA stream used for kernel launches + */ rmm::device_uvector create_token_row_offsets(cudf::column_view row_indices, cudf::column_view sorted_indices, cudf::size_type tokens_counts, From a407931dddcd94402ff54051372ce6736806268c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 5 Dec 2022 09:39:33 -0500 Subject: [PATCH 3/7] add const decl to column_view parms --- cpp/src/text/detokenize.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index e9900a7bcb4..8749bb7c9a7 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -95,8 +95,8 @@ struct index_changed_fn { * @param tokens_counts Token counts for each row * @param stream CUDA stream used for kernel launches */ -rmm::device_uvector create_token_row_offsets(cudf::column_view row_indices, - cudf::column_view sorted_indices, +rmm::device_uvector create_token_row_offsets(cudf::column_view const& row_indices, + cudf::column_view const& sorted_indices, cudf::size_type tokens_counts, rmm::cuda_stream_view stream) { From e9b3ab0c3e580494e169648db890bb3ee71051c0 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 5 Dec 2022 09:40:14 -0500 Subject: [PATCH 4/7] fix style --- cpp/src/text/detokenize.cu | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index 8749bb7c9a7..14b86989491 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -95,10 +95,11 @@ struct index_changed_fn { * @param tokens_counts Token counts for each row * @param stream CUDA stream used for kernel launches */ -rmm::device_uvector create_token_row_offsets(cudf::column_view const& row_indices, - cudf::column_view const& sorted_indices, - cudf::size_type tokens_counts, - rmm::cuda_stream_view stream) +rmm::device_uvector create_token_row_offsets( + cudf::column_view const& row_indices, + cudf::column_view const& sorted_indices, + cudf::size_type tokens_counts, + rmm::cuda_stream_view stream) { index_changed_fn pfn{cudf::detail::indexalator_factory::make_input_iterator(row_indices), sorted_indices.data()}; From 609e3bb9e6bb5a6f38555cf759f50e306438aea7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 6 Dec 2022 08:26:32 -0500 Subject: [PATCH 5/7] change row_map decl to size_type --- cpp/src/text/detokenize.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index 14b86989491..fc4026c3cbb 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -48,7 +48,7 @@ namespace { */ struct detokenizer_fn { cudf::column_device_view const d_strings; // these are the tokens - int32_t const* d_row_map; // indices sorted by output row + cudf::size_type const* d_row_map; // indices sorted by output row cudf::size_type const* d_token_offsets; // to each input token array cudf::string_view const d_separator; // append after each token cudf::size_type* d_offsets{}; // offsets to output buffer d_chars @@ -143,7 +143,7 @@ std::unique_ptr detokenize(cudf::strings_column_view const& string // the indices may not be in order so we need to build a sorted map auto sorted_rows = cudf::detail::stable_sorted_order( cudf::table_view({row_indices}), {}, {}, stream, rmm::mr::get_current_device_resource()); - auto const d_row_map = sorted_rows->view().data(); + auto const d_row_map = sorted_rows->view().data(); // create offsets for the tokens for each output string auto tokens_offsets = From 0998f8df05e6cc150165023fd472ccdabacfa1a8 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 6 Dec 2022 13:33:27 -0500 Subject: [PATCH 6/7] remove count-if call --- cpp/src/text/detokenize.cu | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index fc4026c3cbb..1a58e53b8a8 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -101,19 +101,19 @@ rmm::device_uvector create_token_row_offsets( cudf::size_type tokens_counts, rmm::cuda_stream_view stream) { - index_changed_fn pfn{cudf::detail::indexalator_factory::make_input_iterator(row_indices), - sorted_indices.data()}; - auto const output_count = - thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(tokens_counts), - pfn); - auto tokens_offsets = rmm::device_uvector(output_count + 1, stream); - thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(tokens_counts), - tokens_offsets.begin(), - pfn); + auto tokens_offsets = rmm::device_uvector(tokens_counts + 1, stream); + + auto end_itr = thrust::copy_if( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(tokens_counts), + tokens_offsets.begin(), + index_changed_fn{cudf::detail::indexalator_factory::make_input_iterator(row_indices), + sorted_indices.data()}); + + auto const output_count = std::distance(tokens_offsets.begin(), end_itr); + tokens_offsets.resize(output_count + 1, stream); + // set the last element to the total number of tokens tokens_offsets.set_element(output_count, tokens_counts, stream); return tokens_offsets; From 64deeb1bbadb05f38ef124768fd242711dd34511 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 6 Dec 2022 13:40:57 -0500 Subject: [PATCH 7/7] put count-if back in --- cpp/src/text/detokenize.cu | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index 1a58e53b8a8..a17583cf649 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -101,18 +101,22 @@ rmm::device_uvector create_token_row_offsets( cudf::size_type tokens_counts, rmm::cuda_stream_view stream) { - auto tokens_offsets = rmm::device_uvector(tokens_counts + 1, stream); - - auto end_itr = thrust::copy_if( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(tokens_counts), - tokens_offsets.begin(), - index_changed_fn{cudf::detail::indexalator_factory::make_input_iterator(row_indices), - sorted_indices.data()}); - - auto const output_count = std::distance(tokens_offsets.begin(), end_itr); - tokens_offsets.resize(output_count + 1, stream); + index_changed_fn fn{cudf::detail::indexalator_factory::make_input_iterator(row_indices), + sorted_indices.data()}; + + auto const output_count = + thrust::count_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(tokens_counts), + fn); + + auto tokens_offsets = rmm::device_uvector(output_count + 1, stream); + + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(tokens_counts), + tokens_offsets.begin(), + fn); // set the last element to the total number of tokens tokens_offsets.set_element(output_count, tokens_counts, stream);