[REVIEW] Add nvtext::detokenize API #5739

Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -49,6 +49,7 @@
- PR #5658 Add `filter_tokens` nvtext API
- PR #5666 Add `filter_characters_of_type` strings API
- PR #5673 Always build and test with per-thread default stream enabled in the GPU CI build
- PR #5739 Add `nvtext::detokenize` API
- PR #5645 Enforce pd.NA and Pandas nullable dtype parity
- PR #5729 Create nvtext normalize_characters API from the subword_tokenize internal function
- PR #5572 Add `cudf::encode` API.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -562,6 +562,7 @@ add_library(cudf
src/lists/lists_column_view.cu
src/lists/copying/concatenate.cu
src/lists/copying/gather.cu
src/text/detokenize.cu
src/text/generate_ngrams.cu
src/text/normalize.cu
src/text/tokenize.cu
39 changes: 39 additions & 0 deletions cpp/include/nvtext/tokenize.hpp
@@ -175,5 +175,44 @@ std::unique_ptr<cudf::column> character_tokenize(
cudf::strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**
* @brief Creates a strings column from a strings column of tokens and an
* associated column of row ids.
*
* Multiple tokens from the input column may be combined into a single row (string)
* in the output column. The tokens are concatenated, with the `separator` string
* placed between them, in the order in which they appear in the `row_indices` column.
*
* @code{.pseudo}
* Example:
* s = ["hello", "world", "one", "two", "three"]
* r = [0, 0, 1, 1, 1]
* s1 = detokenize(s,r)
* s1 is now ["hello world", "one two three"]
* r = [0, 2, 1, 1, 0]
* s2 = detokenize(s,r)
* s2 is now ["hello three", "one two", "world"]
* @endcode
*
* All null row entries are ignored and the output contains all valid rows.
* The values in `row_indices` are expected to be non-negative and sequential,
* with no missing row indices; otherwise the output is undefined.
*
* @throw cudf::logic_error if `separator` is invalid
* @throw cudf::logic_error if `row_indices.size() != strings.size()`
* @throw cudf::logic_error if `row_indices` contains nulls
*
* @param strings Strings column to detokenize.
* @param row_indices The relative output row index assigned to each token in the input column.
* @param separator String to insert between tokens within each output row.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column of detokenized strings.
*/
std::unique_ptr<cudf::column> detokenize(
cudf::strings_column_view const& strings,
cudf::column_view const& row_indices,
cudf::string_scalar const& separator = cudf::string_scalar(" "),
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/** @} */ // end of tokenize group
} // namespace nvtext
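For orientation, the declaration above can be exercised as in the following minimal sketch; the wrapper name `rebuild_rows` is illustrative and the column contents come from the doxygen example, not from code in this PR:

#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <nvtext/tokenize.hpp>

// tokens      = ["hello", "world", "one", "two", "three"]
// row_indices = [0, 0, 1, 1, 1]
// result      = ["hello world", "one two three"]
std::unique_ptr<cudf::column> rebuild_rows(cudf::strings_column_view const& tokens,
                                           cudf::column_view const& row_indices)
{
  // the separator defaults to a single space and the default memory resource is used
  return nvtext::detokenize(tokens, row_indices);
}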
205 changes: 205 additions & 0 deletions cpp/src/text/detokenize.cu
@@ -0,0 +1,205 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/sorting.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>
#include <nvtext/tokenize.hpp>
#include <strings/utilities.cuh>

#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <rmm/device_uvector.hpp>

namespace nvtext {
namespace detail {
namespace {
/**
* @brief Generate strings from tokens.
*
* Each string is created by appending all the tokens assigned to
* the same row. The `d_separator` is appended between each token.
*/
struct detokenizer_fn {
cudf::column_device_view const d_strings; // these are the tokens
int32_t const* d_row_map; // indices sorted by output row
cudf::size_type const* d_token_offsets; // to each input token array
cudf::string_view const d_separator; // append after each token
int32_t const* d_offsets{}; // offsets to output buffer d_chars
char* d_chars{}; // output buffer for characters

__device__ cudf::size_type operator()(cudf::size_type idx)
{
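// when d_chars is null this is the sizing pass and only the byte count is
// computed; otherwise the tokens and separators are written into the buffer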
auto const offset = d_token_offsets[idx];
auto d_tokens = d_row_map + offset;
auto const token_count = d_token_offsets[idx + 1] - offset;
auto out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr;
cudf::size_type nbytes = 0;
for (cudf::size_type jdx = 0; jdx < token_count; ++jdx) {
auto const str_index = d_tokens[jdx];
if (d_strings.is_null(str_index)) continue;
auto const d_str = d_strings.element<cudf::string_view>(str_index);
if (out_ptr) {
// write the separator before every valid token except the first so that a
// trailing null token cannot produce an extra separator beyond the sized bytes
if (nbytes > 0) out_ptr = cudf::strings::detail::copy_string(out_ptr, d_separator);
out_ptr = cudf::strings::detail::copy_string(out_ptr, d_str);
nbytes += d_str.size_bytes() + d_separator.size_bytes();
} else {
nbytes += d_str.size_bytes();
nbytes += d_separator.size_bytes();
}
}
// the loop counts one separator per valid token; drop the trailing one
return (nbytes > 0) ? (nbytes - d_separator.size_bytes()) : 0;
}
};

/**
* @brief Identifies indexes where the row value changes.
*/
template <typename IndexType>
struct index_changed_fn {
IndexType const* d_rows;
int32_t const* d_row_map;
__device__ bool operator()(cudf::size_type idx)
{
return (idx == 0) || (d_rows[d_row_map[idx]] != d_rows[d_row_map[idx - 1]]);
}
};

/**
* @brief This is a type-dispatch function to convert the row indices
* into token offsets.
*/
struct token_row_offsets_fn {
cudf::column_view const row_indices;
cudf::column_view const sorted_indices;
cudf::size_type const tokens_counts;

template <typename T, std::enable_if_t<cudf::is_index_type<T>()>* = nullptr>
std::unique_ptr<rmm::device_uvector<cudf::size_type>> operator()(cudaStream_t stream) const
{
index_changed_fn<T> pfn{row_indices.data<T>(), sorted_indices.template data<int32_t>()};
auto const output_count =
thrust::count_if(rmm::exec_policy(stream)->on(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(tokens_counts),
pfn);
auto tokens_offsets =
std::make_unique<rmm::device_uvector<cudf::size_type>>(output_count + 1, stream);
thrust::copy_if(rmm::exec_policy(stream)->on(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(tokens_counts),
tokens_offsets->begin(),
pfn);
// set the last element to the total number of tokens
tokens_offsets->set_element(output_count, tokens_counts, stream);
return tokens_offsets;
}

// non-integral types throw an exception
template <typename T, typename... Args, std::enable_if_t<not cudf::is_index_type<T>()>* = nullptr>
std::unique_ptr<rmm::device_uvector<cudf::size_type>> operator()(Args&&... args) const
{
CUDF_FAIL("The detokenize indices parameter must be an integer type.");
}
};
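As a worked trace of this dispatch, assuming the row indices from the doxygen example (illustrative, not part of the PR):

// r = [0, 0, 1, 1, 1] is already in row order, so d_row_map = [0, 1, 2, 3, 4].
// index_changed_fn returns true at idx 0 and idx 2, so count_if yields 2 and
// copy_if writes [0, 2]; set_element then appends tokens_counts, producing the
// token offsets [0, 2, 5]: output row 0 owns tokens [0, 2), row 1 owns [2, 5).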

} // namespace

/**
* @copydoc nvtext::detokenize
*/
std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& strings,
cudf::column_view const& row_indices,
cudf::string_scalar const& separator,
cudaStream_t stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be valid");
CUDF_EXPECTS(row_indices.size() == strings.size(),
"Parameter row_indices must be the same size as the input column");
CUDF_EXPECTS(row_indices.has_nulls() == false, "Parameter row_indices must not have nulls");

auto tokens_counts = strings.size();
if (tokens_counts == 0) // if no input strings, return an empty column
return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});

auto strings_column = cudf::column_device_view::create(strings.parent(), stream);
// the indices may not be in order so we need to sort them
auto sorted_rows = cudf::stable_sorted_order(cudf::table_view({row_indices}));
auto const d_row_map = sorted_rows->view().data<int32_t>();

// create offsets for the tokens for each output string
auto tokens_offsets =
cudf::type_dispatcher(row_indices.type(),
token_row_offsets_fn{row_indices, sorted_rows->view(), tokens_counts},
stream);
auto const output_count = tokens_offsets->size() - 1; // number of output strings

// create output strings offsets by calculating the size of each output string
cudf::string_view const d_separator(separator.data(), separator.size());
auto offsets_transformer_itr = thrust::make_transform_iterator(
thrust::make_counting_iterator<cudf::size_type>(0),
detokenizer_fn{*strings_column, d_row_map, tokens_offsets->data(), d_separator});
auto offsets_column = cudf::strings::detail::make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + output_count, mr, stream);
auto d_offsets = offsets_column->view().data<int32_t>();

// build the chars column - append each source token to the appropriate output row
cudf::size_type const total_bytes =
cudf::detail::get_value<int32_t>(offsets_column->view(), output_count, stream);
auto chars_column =
cudf::strings::detail::create_chars_child_column(output_count, 0, total_bytes, mr, stream);
auto d_chars = chars_column->mutable_view().data<char>();
thrust::for_each_n(
rmm::exec_policy(stream)->on(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
output_count,
detokenizer_fn{
*strings_column, d_row_map, tokens_offsets->data(), d_separator, d_offsets, d_chars});
chars_column->set_null_count(0);

// make the output strings column from the offsets and chars column
return cudf::make_strings_column(output_count,
std::move(offsets_column),
std::move(chars_column),
0,
rmm::device_buffer{0, stream, mr},
stream,
mr);
}

} // namespace detail

std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& strings,
cudf::column_view const& row_indices,
cudf::string_scalar const& separator,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::detokenize(strings, row_indices, separator, 0, mr);
}

} // namespace nvtext
2 changes: 1 addition & 1 deletion cpp/src/text/replace.cu
@@ -23,7 +23,7 @@
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/error.hpp>
#include <nvtext/detail/tokenize.hpp>
#include <nvtext/tokenize.hpp>
#include <nvtext/replace.hpp>
#include <strings/utilities.cuh>
#include <text/utilities/tokenize_ops.cuh>

42 changes: 42 additions & 0 deletions cpp/tests/text/tokenize_tests.cpp
@@ -148,3 +148,45 @@ TEST_F(TextTokenizeTest, TokenizeEmptyTest)
results = nvtext::character_tokenize(cudf::strings_column_view(all_null));
EXPECT_EQ(results->size(), 0);
}

TEST_F(TextTokenizeTest, Detokenize)
{
cudf::test::strings_column_wrapper strings{
"the", "fox", "jumped", "over", "the", "dog", "the", "dog", "chased", "the",
"cat", "the", "cat", "chased", "the", "mouse", "the", "mousé", "ate", "cheese"};

{
cudf::test::fixed_width_column_wrapper<int32_t> rows{0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
1, 2, 2, 2, 2, 2, 3, 3, 3, 3};
auto results = nvtext::detokenize(cudf::strings_column_view(strings), rows);
cudf::test::strings_column_wrapper expected{"the fox jumped over the dog",
"the dog chased the cat",
"the cat chased the mouse",
"the mousé ate cheese"};
cudf::test::expect_columns_equal(*results, expected);
}
{
cudf::test::fixed_width_column_wrapper<int16_t> rows{0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
1, 2, 2, 2, 2, 2, 3, 3, 3, 0};
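    // the final token ("cheese") is assigned row index 0, so it is appended
    // to the end of the first output string in `expected` below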
auto results =
nvtext::detokenize(cudf::strings_column_view(strings), rows, cudf::string_scalar("_"));
cudf::test::strings_column_wrapper expected{"the_fox_jumped_over_the_dog_cheese",
"the_dog_chased_the_cat",
"the_cat_chased_the_mouse",
"the_mousé_ate"};
cudf::test::expect_columns_equal(*results, expected);
}
}

TEST_F(TextTokenizeTest, DetokenizeErrors)
{
cudf::test::strings_column_wrapper strings{"this column intentionally left blank"};
cudf::strings_column_view strings_view(strings);

cudf::test::fixed_width_column_wrapper<int32_t> one({0});
cudf::test::fixed_width_column_wrapper<int32_t> none;

EXPECT_THROW(nvtext::detokenize(strings_view, none), cudf::logic_error);
EXPECT_THROW(nvtext::detokenize(strings_view, one, cudf::string_scalar("", false)),
cudf::logic_error);
}
6 changes: 6 additions & 0 deletions python/cudf/cudf/_lib/cpp/nvtext/tokenize.pxd
@@ -31,3 +31,9 @@ cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil:
cdef unique_ptr[column] character_tokenize(
const column_view & strings
) except +

cdef unique_ptr[column] detokenize(
const column_view & strings,
const column_view & row_indices,
const string_scalar & separator
) except +
14 changes: 14 additions & 0 deletions python/cudf/cudf/_lib/nvtext/tokenize.pyx
@@ -9,6 +9,7 @@ from cudf._lib.cpp.types cimport size_type
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.nvtext.tokenize cimport (
tokenize as cpp_tokenize,
detokenize as cpp_detokenize,
count_tokens as cpp_count_tokens,
character_tokenize as cpp_character_tokenize
)
@@ -118,3 +118,16 @@
)

return Column.from_unique_ptr(move(c_result))


def detokenize(Column strings, Column indices, Scalar separator):
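    # Wraps nvtext::detokenize: each value in `indices` assigns the
    # corresponding token in `strings` to an output row, and `separator` is
    # placed between tokens that land in the same row.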
cdef column_view c_strings = strings.view()
cdef column_view c_indices = indices.view()
cdef string_scalar* c_separator = <string_scalar*>separator.c_value.get()
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_detokenize(c_strings, c_indices, c_separator[0])
)

return Column.from_unique_ptr(move(c_result))