Skip to content

Commit

Permalink
Merge branch 'branch-0.19' into strings-vector2uvector
Browse files: browse the repository at this point in the history
  • Loading branch information
davidwendt committed Mar 11, 2021
2 parents 0818068 + 8cd927f commit 0600ad8
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 30 deletions.
14 changes: 4 additions & 10 deletions cpp/src/text/generate_ngrams.cu
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
auto const d_strings = *strings_column;

// create a vector of ngram offsets for each string
rmm::device_vector<int32_t> ngram_offsets(strings_count + 1);
rmm::device_uvector<int32_t> ngram_offsets(strings_count + 1, stream);
thrust::transform_exclusive_scan(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
Expand All @@ -235,14 +235,8 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
cudf::size_type{0},
thrust::plus<cudf::size_type>());

// total count is the last entry
auto const d_ngram_offsets = ngram_offsets.data().get();
cudf::size_type total_ngrams = 0;
CUDA_TRY(cudaMemcpyAsync(&total_ngrams,
d_ngram_offsets + strings_count,
sizeof(cudf::size_type),
cudaMemcpyDeviceToHost,
stream.value()));
// total ngrams count is the last entry
cudf::size_type const total_ngrams = ngram_offsets.back_element(stream);
CUDF_EXPECTS(total_ngrams > 0,
"Insufficient number of characters in each string to generate ngrams");

Expand All @@ -254,7 +248,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
mr);
auto d_offsets = offsets_column->mutable_view().data<int32_t>();
// compute the size of each ngram -- output goes in d_offsets
character_ngram_generator_fn generator{d_strings, ngrams, d_ngram_offsets, d_offsets};
character_ngram_generator_fn generator{d_strings, ngrams, ngram_offsets.data(), d_offsets};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
strings_count,
Expand Down
35 changes: 18 additions & 17 deletions cpp/src/text/ngrams_tokenize.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -158,21 +158,22 @@ std::unique_ptr<cudf::column> ngrams_tokenize(

// first, get the number of tokens per string to get the token-offsets
// Ex. token-counts = [3,2]; token-offsets = [0,3,5]
rmm::device_vector<int32_t> token_offsets(strings_count + 1);
auto d_token_offsets = token_offsets.data().get();
rmm::device_uvector<int32_t> token_offsets(strings_count + 1, stream);
auto d_token_offsets = token_offsets.data();
thrust::transform_inclusive_scan(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
d_token_offsets + 1,
strings_tokenizer{d_strings, d_delimiter},
thrust::plus<int32_t>());
CUDA_TRY(cudaMemsetAsync(d_token_offsets, 0, sizeof(int32_t), stream.value()));
auto total_tokens = token_offsets[strings_count]; // Ex. 5 tokens
int32_t const zero = 0;
token_offsets.set_element_async(0, zero, stream);
auto const total_tokens = token_offsets.back_element(stream); // Ex. 5 tokens

// get the token positions (in bytes) per string
// Ex. start/end pairs: [(0,1),(2,4),(5,8), (0,2),(3,4)]
rmm::device_vector<position_pair> token_positions(total_tokens);
auto d_token_positions = token_positions.data().get();
rmm::device_uvector<position_pair> token_positions(total_tokens, stream);
auto d_token_positions = token_positions.data();
thrust::for_each_n(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
Expand All @@ -181,8 +182,8 @@ std::unique_ptr<cudf::column> ngrams_tokenize(

// compute the number of ngrams per string to get the total number of ngrams to generate
// Ex. ngram-counts = [2,1]; ngram-offsets = [0,2,3]; total = 3 bigrams
rmm::device_vector<int32_t> ngram_offsets(strings_count + 1);
auto d_ngram_offsets = ngram_offsets.data().get();
rmm::device_uvector<int32_t> ngram_offsets(strings_count + 1, stream);
auto d_ngram_offsets = ngram_offsets.data();
thrust::transform_inclusive_scan(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
Expand All @@ -193,8 +194,8 @@ std::unique_ptr<cudf::column> ngrams_tokenize(
return (token_count >= ngrams) ? token_count - ngrams + 1 : 0;
},
thrust::plus<int32_t>());
CUDA_TRY(cudaMemsetAsync(d_ngram_offsets, 0, sizeof(int32_t), stream.value()));
auto total_ngrams = ngram_offsets[strings_count];
ngram_offsets.set_element_async(0, zero, stream);
auto const total_ngrams = ngram_offsets.back_element(stream);

// Compute the total size of the ngrams for each string (not for each ngram)
// Ex. 2 bigrams in 1st string total to 10 bytes; 1 bigram in 2nd string is 4 bytes
Expand All @@ -204,20 +205,20 @@ std::unique_ptr<cudf::column> ngrams_tokenize(
// ngrams for each string.
// Ex. bigram for first string produces 2 bigrams ("a_bb","bb_ccc") which
// is built in memory like this: "a_bbbb_ccc"
rmm::device_vector<int32_t> chars_offsets(strings_count + 1); // output memory offsets
auto d_chars_offsets = chars_offsets.data().get(); // per input string
rmm::device_uvector<int32_t> chars_offsets(strings_count + 1, stream); // output memory offsets
auto d_chars_offsets = chars_offsets.data(); // per input string
thrust::transform_inclusive_scan(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
d_chars_offsets + 1,
ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions},
thrust::plus<int32_t>());
CUDA_TRY(cudaMemsetAsync(d_chars_offsets, 0, sizeof(int32_t), stream.value()));
auto output_chars_size = chars_offsets[strings_count]; // Ex. 14 output bytes total
chars_offsets.set_element_async(0, zero, stream);
auto const output_chars_size = chars_offsets.back_element(stream); // Ex. 14 output bytes total

rmm::device_vector<int32_t> ngram_sizes(total_ngrams); // size in bytes of each
auto d_ngram_sizes = ngram_sizes.data().get(); // ngram to generate
rmm::device_uvector<int32_t> ngram_sizes(total_ngrams, stream); // size in bytes of each
auto d_ngram_sizes = ngram_sizes.data(); // ngram to generate

// build chars column
auto chars_column = cudf::strings::detail::create_chars_child_column(
Expand Down
9 changes: 6 additions & 3 deletions cpp/src/text/tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/error.hpp>
Expand All @@ -27,9 +28,10 @@
#include <text/utilities/tokenize_ops.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/count.h>
#include <thrust/copy.h>
#include <thrust/transform.h>

namespace nvtext {
Expand Down Expand Up @@ -75,7 +77,8 @@ std::unique_ptr<cudf::column> tokenize_fn(cudf::size_type strings_count,
d_token_counts.template begin<int32_t>(),
d_token_counts.template end<int32_t>(),
token_offsets.begin() + 1);
CUDA_TRY(cudaMemsetAsync(token_offsets.data(), 0, sizeof(int32_t), stream.value()));
int32_t const zero = 0;
token_offsets.set_element_async(0, zero, stream);
auto const total_tokens = token_offsets.back_element(stream);
// build a list of pointers to each token
rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
Expand All @@ -87,7 +90,7 @@ std::unique_ptr<cudf::column> tokenize_fn(cudf::size_type strings_count,
strings_count,
tokenizer);
// create the strings column using the tokens pointers
return cudf::make_strings_column(tokens, stream, mr);
return cudf::strings::detail::make_strings_column(tokens.begin(), tokens.end(), stream, mr);
}

} // namespace
Expand Down

0 comments on commit 0600ad8

Please sign in to comment.