Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change device_vector to device_uvector in nvtext source files #7512

Merged
merged 7 commits on Mar 11, 2021
Merged
14 changes: 4 additions & 10 deletions cpp/src/text/generate_ngrams.cu
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
auto const d_strings = *strings_column;

// create a vector of ngram offsets for each string
rmm::device_vector<int32_t> ngram_offsets(strings_count + 1);
rmm::device_uvector<int32_t> ngram_offsets(strings_count + 1, stream);
thrust::transform_exclusive_scan(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
Expand All @@ -235,14 +235,8 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
cudf::size_type{0},
thrust::plus<cudf::size_type>());

// total count is the last entry
auto const d_ngram_offsets = ngram_offsets.data().get();
cudf::size_type total_ngrams = 0;
CUDA_TRY(cudaMemcpyAsync(&total_ngrams,
d_ngram_offsets + strings_count,
sizeof(cudf::size_type),
cudaMemcpyDeviceToHost,
stream.value()));
// total ngrams count is the last entry
cudf::size_type const total_ngrams = ngram_offsets.back_element(stream);
CUDF_EXPECTS(total_ngrams > 0,
"Insufficient number of characters in each string to generate ngrams");

Expand All @@ -254,7 +248,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
mr);
auto d_offsets = offsets_column->mutable_view().data<int32_t>();
// compute the size of each ngram -- output goes in d_offsets
character_ngram_generator_fn generator{d_strings, ngrams, d_ngram_offsets, d_offsets};
character_ngram_generator_fn generator{d_strings, ngrams, ngram_offsets.data(), d_offsets};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
strings_count,
Expand Down
35 changes: 18 additions & 17 deletions cpp/src/text/ngrams_tokenize.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -158,21 +158,22 @@ std::unique_ptr<cudf::column> ngrams_tokenize(

// first, get the number of tokens per string to get the token-offsets
// Ex. token-counts = [3,2]; token-offsets = [0,3,5]
rmm::device_vector<int32_t> token_offsets(strings_count + 1);
auto d_token_offsets = token_offsets.data().get();
rmm::device_uvector<int32_t> token_offsets(strings_count + 1, stream);
auto d_token_offsets = token_offsets.data();
thrust::transform_inclusive_scan(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
d_token_offsets + 1,
strings_tokenizer{d_strings, d_delimiter},
thrust::plus<int32_t>());
CUDA_TRY(cudaMemsetAsync(d_token_offsets, 0, sizeof(int32_t), stream.value()));
auto total_tokens = token_offsets[strings_count]; // Ex. 5 tokens
int32_t const zero = 0;
token_offsets.set_element_async(0, zero, stream);
auto const total_tokens = token_offsets.back_element(stream); // Ex. 5 tokens

// get the token positions (in bytes) per string
// Ex. start/end pairs: [(0,1),(2,4),(5,8), (0,2),(3,4)]
rmm::device_vector<position_pair> token_positions(total_tokens);
auto d_token_positions = token_positions.data().get();
rmm::device_uvector<position_pair> token_positions(total_tokens, stream);
auto d_token_positions = token_positions.data();
thrust::for_each_n(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
Expand All @@ -181,8 +182,8 @@ std::unique_ptr<cudf::column> ngrams_tokenize(

// compute the number of ngrams per string to get the total number of ngrams to generate
// Ex. ngram-counts = [2,1]; ngram-offsets = [0,2,3]; total = 3 bigrams
rmm::device_vector<int32_t> ngram_offsets(strings_count + 1);
auto d_ngram_offsets = ngram_offsets.data().get();
rmm::device_uvector<int32_t> ngram_offsets(strings_count + 1, stream);
auto d_ngram_offsets = ngram_offsets.data();
thrust::transform_inclusive_scan(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
Expand All @@ -193,8 +194,8 @@ std::unique_ptr<cudf::column> ngrams_tokenize(
return (token_count >= ngrams) ? token_count - ngrams + 1 : 0;
},
thrust::plus<int32_t>());
CUDA_TRY(cudaMemsetAsync(d_ngram_offsets, 0, sizeof(int32_t), stream.value()));
auto total_ngrams = ngram_offsets[strings_count];
ngram_offsets.set_element_async(0, zero, stream);
auto const total_ngrams = ngram_offsets.back_element(stream);

// Compute the total size of the ngrams for each string (not for each ngram)
// Ex. 2 bigrams in 1st string total to 10 bytes; 1 bigram in 2nd string is 4 bytes
Expand All @@ -204,20 +205,20 @@ std::unique_ptr<cudf::column> ngrams_tokenize(
// ngrams for each string.
// Ex. bigram for first string produces 2 bigrams ("a_bb","bb_ccc") which
// is built in memory like this: "a_bbbb_ccc"
rmm::device_vector<int32_t> chars_offsets(strings_count + 1); // output memory offsets
auto d_chars_offsets = chars_offsets.data().get(); // per input string
rmm::device_uvector<int32_t> chars_offsets(strings_count + 1, stream); // output memory offsets
auto d_chars_offsets = chars_offsets.data(); // per input string
thrust::transform_inclusive_scan(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
d_chars_offsets + 1,
ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions},
thrust::plus<int32_t>());
CUDA_TRY(cudaMemsetAsync(d_chars_offsets, 0, sizeof(int32_t), stream.value()));
auto output_chars_size = chars_offsets[strings_count]; // Ex. 14 output bytes total
chars_offsets.set_element_async(0, zero, stream);
auto const output_chars_size = chars_offsets.back_element(stream); // Ex. 14 output bytes total

rmm::device_vector<int32_t> ngram_sizes(total_ngrams); // size in bytes of each
auto d_ngram_sizes = ngram_sizes.data().get(); // ngram to generate
rmm::device_uvector<int32_t> ngram_sizes(total_ngrams, stream); // size in bytes of each
auto d_ngram_sizes = ngram_sizes.data(); // ngram to generate

// build chars column
auto chars_column = cudf::strings::detail::create_chars_child_column(
Expand Down
9 changes: 6 additions & 3 deletions cpp/src/text/tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/error.hpp>
Expand All @@ -27,9 +28,10 @@
#include <text/utilities/tokenize_ops.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/count.h>
#include <thrust/copy.h>
#include <thrust/transform.h>

namespace nvtext {
Expand Down Expand Up @@ -75,7 +77,8 @@ std::unique_ptr<cudf::column> tokenize_fn(cudf::size_type strings_count,
d_token_counts.template begin<int32_t>(),
d_token_counts.template end<int32_t>(),
token_offsets.begin() + 1);
CUDA_TRY(cudaMemsetAsync(token_offsets.data(), 0, sizeof(int32_t), stream.value()));
int32_t const zero = 0;
token_offsets.set_element_async(0, zero, stream);
auto const total_tokens = token_offsets.back_element(stream);
// build a list of pointers to each token
rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
Expand All @@ -87,7 +90,7 @@ std::unique_ptr<cudf::column> tokenize_fn(cudf::size_type strings_count,
strings_count,
tokenizer);
// create the strings column using the tokens pointers
return cudf::make_strings_column(tokens, stream, mr);
return cudf::strings::detail::make_strings_column(tokens.begin(), tokens.end(), stream, mr);
}

} // namespace
Expand Down