[REVIEW] Add nvtext::detokenize API #5739

Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -49,6 +49,7 @@
- PR #5658 Add `filter_tokens` nvtext API
- PR #5666 Add `filter_characters_of_type` strings API
- PR #5673 Always build and test with per-thread default stream enabled in the GPU CI build
- PR #5739 Add `nvtext::detokenize` API
- PR #5645 Enforce pd.NA and Pandas nullable dtype parity
- PR #5729 Create nvtext normalize_characters API from the subword_tokenize internal function
- PR #5572 Add `cudf::encode` API.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -562,6 +562,7 @@ add_library(cudf
src/lists/lists_column_view.cu
src/lists/copying/concatenate.cu
src/lists/copying/gather.cu
src/text/detokenize.cu
src/text/generate_ngrams.cu
src/text/normalize.cu
src/text/tokenize.cu
39 changes: 39 additions & 0 deletions cpp/include/nvtext/tokenize.hpp
@@ -175,5 +175,44 @@ std::unique_ptr<cudf::column> character_tokenize(
cudf::strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**
* @brief Creates a strings column from a strings column of tokens and an
* associated column of row ids.
*
* Multiple tokens from the input column may be combined into a single row (string)
* in the output column. The tokens are concatenated, with the `separator` string
* placed between them, in the order in which they appear in the `row_indices` column.
*
* @code{.pseudo}
* Example:
* s = ["hello", "world", "one", "two", "three"]
* r = [0, 0, 1, 1, 1]
* s1 = detokenize(s,r)
* s1 is now ["hello world", "one two three"]
* r = [0, 2, 1, 1, 0]
* s2 = detokenize(s,r)
* s2 is now ["hello three", "one two", "world"]
* @endcode
*
* All null row entries are ignored and the output contains all valid rows.
* The values in `row_indices` are expected to be non-negative and sequential,
* with no missing row indices; otherwise the output is undefined.
*
* @throw cudf::logic_error if `separator` is invalid
* @throw cudf::logic_error if `row_indices.size() != strings.size()`
* @throw cudf::logic_error if `row_indices` contains nulls
*
* @param strings Strings column to detokenize.
* @param row_indices The relative output row index assigned to each token in the input column.
* @param separator String to insert between tokens within each output row.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column of detokenized strings.
*/
std::unique_ptr<cudf::column> detokenize(
cudf::strings_column_view const& strings,
cudf::column_view const& row_indices,
cudf::string_scalar const& separator = cudf::string_scalar(" "),
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/** @} */ // end of tokenize group
} // namespace nvtext
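For orientation, the declaration above can be exercised as in the following minimal sketch; the wrapper name `rebuild_rows` is illustrative and the column contents come from the doxygen example, not from code in this PR:

#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <nvtext/tokenize.hpp>

// tokens      = ["hello", "world", "one", "two", "three"]
// row_indices = [0, 0, 1, 1, 1]
// result      = ["hello world", "one two three"]
std::unique_ptr<cudf::column> rebuild_rows(cudf::strings_column_view const& tokens,
                                           cudf::column_view const& row_indices)
{
  // the separator defaults to a single space and the default memory resource is used
  return nvtext::detokenize(tokens, row_indices);
}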
205 changes: 205 additions & 0 deletions cpp/src/text/detokenize.cu
@@ -0,0 +1,205 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/sorting.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>
#include <nvtext/tokenize.hpp>
#include <strings/utilities.cuh>

#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <rmm/device_uvector.hpp>

namespace nvtext {
namespace detail {
namespace {
/**
* @brief Generate strings from tokens.
*
* Each string is created by appending all the tokens assigned to
* the same row. The `d_separator` is appended between each token.
*/
struct detokenizer_fn {
cudf::column_device_view const d_strings; // these are the tokens
int32_t const* d_row_map; // indices sorted by output row
cudf::size_type const* d_token_offsets; // to each input token array
cudf::string_view const d_separator; // append after each token
int32_t const* d_offsets{}; // offsets to output buffer d_chars
char* d_chars{}; // output buffer for characters

__device__ cudf::size_type operator()(cudf::size_type idx)
{
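// when d_chars is null this is the sizing pass and only the byte count is
// computed; otherwise the tokens and separators are written into the buffer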
auto const offset = d_token_offsets[idx];
auto d_tokens = d_row_map + offset;
auto const token_count = d_token_offsets[idx + 1] - offset;
auto out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr;
cudf::size_type nbytes = 0;
for (cudf::size_type jdx = 0; jdx < token_count; ++jdx) {
auto const str_index = d_tokens[jdx];
if (d_strings.is_null(str_index)) continue;
auto const d_str = d_strings.element<cudf::string_view>(str_index);
if (out_ptr) {
// write the separator before every valid token except the first so that a
// trailing null token cannot produce an extra separator beyond the sized bytes
if (nbytes > 0) out_ptr = cudf::strings::detail::copy_string(out_ptr, d_separator);
out_ptr = cudf::strings::detail::copy_string(out_ptr, d_str);
nbytes += d_str.size_bytes() + d_separator.size_bytes();
} else {
nbytes += d_str.size_bytes();
nbytes += d_separator.size_bytes();
}
}
// the loop counts one separator per valid token; drop the trailing one
return (nbytes > 0) ? (nbytes - d_separator.size_bytes()) : 0;
}
};

/**
* @brief Identifies indexes where the row value changes.
*/
template <typename IndexType>
struct index_changed_fn {
IndexType const* d_rows;
int32_t const* d_row_map;
__device__ bool operator()(cudf::size_type idx)
{
return (idx == 0) || (d_rows[d_row_map[idx]] != d_rows[d_row_map[idx - 1]]);
}
};

/**
* @brief This is a type-dispatch function to convert the row indices
* into token offsets.
*/
struct token_row_offsets_fn {
cudf::column_view const row_indices;
cudf::column_view const sorted_indices;
cudf::size_type const tokens_counts;

template <typename T, std::enable_if_t<cudf::is_index_type<T>()>* = nullptr>
std::unique_ptr<rmm::device_uvector<cudf::size_type>> operator()(cudaStream_t stream) const
{
index_changed_fn<T> pfn{row_indices.data<T>(), sorted_indices.template data<int32_t>()};
auto const output_count =
thrust::count_if(rmm::exec_policy(stream)->on(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(tokens_counts),
pfn);
auto tokens_offsets =
std::make_unique<rmm::device_uvector<cudf::size_type>>(output_count + 1, stream);
thrust::copy_if(rmm::exec_policy(stream)->on(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(tokens_counts),
tokens_offsets->begin(),
pfn);
// set the last element to the total number of tokens
tokens_offsets->set_element(output_count, tokens_counts, stream);
return tokens_offsets;
}

// non-integral types throw an exception
template <typename T, typename... Args, std::enable_if_t<not cudf::is_index_type<T>()>* = nullptr>
std::unique_ptr<rmm::device_uvector<cudf::size_type>> operator()(Args&&... args) const
{
CUDF_FAIL("The detokenize indices parameter must be an integer type.");
}
};
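As a worked trace of this dispatch, assuming the row indices from the doxygen example (illustrative, not part of the PR):

// r = [0, 0, 1, 1, 1] is already in row order, so d_row_map = [0, 1, 2, 3, 4].
// index_changed_fn returns true at idx 0 and idx 2, so count_if yields 2 and
// copy_if writes [0, 2]; set_element then appends tokens_counts, producing the
// token offsets [0, 2, 5]: output row 0 owns tokens [0, 2), row 1 owns [2, 5).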

} // namespace

/**
* @copydoc nvtext::detokenize
*/
std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& strings,
cudf::column_view const& row_indices,
cudf::string_scalar const& separator,
cudaStream_t stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be valid");
CUDF_EXPECTS(row_indices.size() == strings.size(),
"Parameter row_indices must be the same size as the input column");
CUDF_EXPECTS(row_indices.has_nulls() == false, "Parameter row_indices must not have nulls");

auto tokens_counts = strings.size();
if (tokens_counts == 0) // if no input strings, return an empty column
return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});

auto strings_column = cudf::column_device_view::create(strings.parent(), stream);
// the indices may not be in order so we need to sort them
auto sorted_rows = cudf::stable_sorted_order(cudf::table_view({row_indices}));
auto const d_row_map = sorted_rows->view().data<int32_t>();

// create offsets for the tokens for each output string
auto tokens_offsets =
cudf::type_dispatcher(row_indices.type(),
token_row_offsets_fn{row_indices, sorted_rows->view(), tokens_counts},
stream);
auto const output_count = tokens_offsets->size() - 1; // number of output strings

// create output strings offsets by calculating the size of each output string
cudf::string_view const d_separator(separator.data(), separator.size());
auto offsets_transformer_itr = thrust::make_transform_iterator(
thrust::make_counting_iterator<cudf::size_type>(0),
detokenizer_fn{*strings_column, d_row_map, tokens_offsets->data(), d_separator});
auto offsets_column = cudf::strings::detail::make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + output_count, mr, stream);
auto d_offsets = offsets_column->view().data<int32_t>();

// build the chars column - append each source token to the appropriate output row
cudf::size_type const total_bytes =
cudf::detail::get_value<int32_t>(offsets_column->view(), output_count, stream);
auto chars_column =
cudf::strings::detail::create_chars_child_column(output_count, 0, total_bytes, mr, stream);
auto d_chars = chars_column->mutable_view().data<char>();
thrust::for_each_n(
rmm::exec_policy(stream)->on(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
output_count,
detokenizer_fn{
*strings_column, d_row_map, tokens_offsets->data(), d_separator, d_offsets, d_chars});
chars_column->set_null_count(0);

// make the output strings column from the offsets and chars column
return cudf::make_strings_column(output_count,
std::move(offsets_column),
std::move(chars_column),
0,
rmm::device_buffer{0, stream, mr},
stream,
mr);
}

} // namespace detail

std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& strings,
cudf::column_view const& row_indices,
cudf::string_scalar const& separator,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::detokenize(strings, row_indices, separator, 0, mr);
}

} // namespace nvtext
2 changes: 1 addition & 1 deletion cpp/src/text/replace.cu
@@ -23,7 +23,7 @@
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/error.hpp>
#include <nvtext/detail/tokenize.hpp>
#include <nvtext/tokenize.hpp>
#include <nvtext/replace.hpp>
#include <strings/utilities.cuh>
#include <text/utilities/tokenize_ops.cuh>

42 changes: 42 additions & 0 deletions cpp/tests/text/tokenize_tests.cpp
@@ -148,3 +148,45 @@ TEST_F(TextTokenizeTest, TokenizeEmptyTest)
results = nvtext::character_tokenize(cudf::strings_column_view(all_null));
EXPECT_EQ(results->size(), 0);
}

TEST_F(TextTokenizeTest, Detokenize)
{
cudf::test::strings_column_wrapper strings{
"the", "fox", "jumped", "over", "the", "dog", "the", "dog", "chased", "the",
"cat", "the", "cat", "chased", "the", "mouse", "the", "mousé", "ate", "cheese"};

{
cudf::test::fixed_width_column_wrapper<int32_t> rows{0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
1, 2, 2, 2, 2, 2, 3, 3, 3, 3};
auto results = nvtext::detokenize(cudf::strings_column_view(strings), rows);
cudf::test::strings_column_wrapper expected{"the fox jumped over the dog",
"the dog chased the cat",
"the cat chased the mouse",
"the mousé ate cheese"};
cudf::test::expect_columns_equal(*results, expected);
}
{
cudf::test::fixed_width_column_wrapper<int16_t> rows{0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
1, 2, 2, 2, 2, 2, 3, 3, 3, 0};
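    // the final token ("cheese") is assigned row index 0, so it is appended
    // to the end of the first output string in `expected` below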
auto results =
nvtext::detokenize(cudf::strings_column_view(strings), rows, cudf::string_scalar("_"));
cudf::test::strings_column_wrapper expected{"the_fox_jumped_over_the_dog_cheese",
"the_dog_chased_the_cat",
"the_cat_chased_the_mouse",
"the_mousé_ate"};
cudf::test::expect_columns_equal(*results, expected);
}
}

TEST_F(TextTokenizeTest, DetokenizeErrors)
{
cudf::test::strings_column_wrapper strings{"this column intentionally left blank"};
cudf::strings_column_view strings_view(strings);

cudf::test::fixed_width_column_wrapper<int32_t> one({0});
cudf::test::fixed_width_column_wrapper<int32_t> none;

EXPECT_THROW(nvtext::detokenize(strings_view, none), cudf::logic_error);
EXPECT_THROW(nvtext::detokenize(strings_view, one, cudf::string_scalar("", false)),
cudf::logic_error);
}
6 changes: 6 additions & 0 deletions python/cudf/cudf/_lib/cpp/nvtext/tokenize.pxd
@@ -31,3 +31,9 @@ cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil:
cdef unique_ptr[column] character_tokenize(
const column_view & strings
) except +

cdef unique_ptr[column] detokenize(
const column_view & strings,
const column_view & row_indices,
const string_scalar & separator
) except +
14 changes: 14 additions & 0 deletions python/cudf/cudf/_lib/nvtext/tokenize.pyx
@@ -9,6 +9,7 @@ from cudf._lib.cpp.types cimport size_type
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.nvtext.tokenize cimport (
tokenize as cpp_tokenize,
detokenize as cpp_detokenize,
count_tokens as cpp_count_tokens,
character_tokenize as cpp_character_tokenize
)
@@ -118,3 +118,16 @@
)

return Column.from_unique_ptr(move(c_result))


def detokenize(Column strings, Column indices, Scalar separator):
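    # Wraps nvtext::detokenize: each value in `indices` assigns the
    # corresponding token in `strings` to an output row, and `separator` is
    # placed between tokens that land in the same row.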
cdef column_view c_strings = strings.view()
cdef column_view c_indices = indices.view()
cdef string_scalar* c_separator = <string_scalar*>separator.c_value.get()
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_detokenize(c_strings, c_indices, c_separator[0])
)

return Column.from_unique_ptr(move(c_result))