Add format API for list column of strings (#9454)

Closes #8351. This PR adds the API `cudf::strings::format_list_column` to create the formatted output described in #8351. The API only accepts lists columns of strings. ``` Example 1 l1 = { [[a,b,c], [d,e]], [[f,g], [h]] } s1 = format_list_column(l1) s1 is now ["[[a,b,c],[d,e]]", "[[f,g],[h]]"] Example 2 l2 = { [[a,b,c], [d,e]], [NULL], [[f,g], NULL, [h]] } s2 = format_list_column(l2, '-', [':', '{', '}']) s2 is now ["{{a:b:c}:{d:e}}", "{-}", "{{f:g}:-:{h}}"] ``` The format API takes parameters to specify the strings to use for `[`, `]` and `,` as well as the string used to represent null entries. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Robert Maynard (https://github.com/robertmaynard) - AJ Schmidt (https://github.com/ajschmidt8) - Vyas Ramasubramani (https://github.com/vyasr) - Karthikeyan (https://github.com/karthikeyann) URL: #9454
rapidsai · Nov 9, 2021 · 499ebae · 499ebae
1 parent a7d520c
commit 499ebae
Show file tree

Hide file tree

Showing 10 changed files with 595 additions and 1 deletion.
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
@@ -189,6 +189,7 @@ test:
     - test -f $PREFIX/include/cudf/strings/convert/convert_floats.hpp
     - test -f $PREFIX/include/cudf/strings/convert/convert_integers.hpp
     - test -f $PREFIX/include/cudf/strings/convert/convert_ipv4.hpp
+    - test -f $PREFIX/include/cudf/strings/convert/convert_lists.hpp
     - test -f $PREFIX/include/cudf/strings/convert/convert_urls.hpp
     - test -f $PREFIX/include/cudf/strings/detail/combine.hpp
     - test -f $PREFIX/include/cudf/strings/detail/concatenate.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -414,6 +414,7 @@ add_library(
   src/strings/convert/convert_integers.cu
   src/strings/convert/convert_ipv4.cu
   src/strings/convert/convert_urls.cu
+  src/strings/convert/convert_lists.cu
   src/strings/copying/concatenate.cu
   src/strings/copying/copying.cu
   src/strings/copying/shift.cu

diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/lists/lists_column_view.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+namespace cudf {
+namespace strings {
+/**
+ * @addtogroup strings_convert
+ * @{
+ * @file
+ */
+
+/**
+ * @brief Convert a list column of strings into a formatted strings column.
+ *
+ * Each row of `input` produces one output string. Null list rows and null
+ * string elements are written using the `na_rep` string.
+ *
+ * The `separators` column should either be empty (the default), in which case
+ * the default strings listed below are used, or contain exactly 3 string
+ * elements in the following order:
+ * - element separator (default is comma `,`)
+ * - left-hand enclosure (default is `[`)
+ * - right-hand enclosure (default is `]`)
+ *
+ * @code{.pseudo}
+ * l1 = { [[a,b,c], [d,e]], [[f,g], [h]] }
+ * s1 = format_list_column(l1)
+ * s1 is now ["[[a,b,c],[d,e]]", "[[f,g],[h]]"]
+ *
+ * l2 = { [[a,b,c], [d,e]], [NULL], [[f,g], NULL, [h]] }
+ * s2 = format_list_column(l2, '-', [':', '{', '}'])
+ * s2 is now ["{{a:b:c}:{d:e}}", "{-}", "{{f:g}:-:{h}}"]
+ * @endcode
+ *
+ * An empty `input` returns an empty strings column before any of the checks
+ * below are performed.
+ *
+ * @throw cudf::logic_error if the input column is not a LIST type with a STRING child.
+ * @throw cudf::logic_error if `separators` contains a number of strings other than 0 or 3.
+ * @throw cudf::logic_error if `na_rep` is not valid.
+ *
+ * @param input Lists column to format.
+ * @param na_rep Replacement string for null elements.
+ * @param separators Strings to use for enclosing list components and separating elements.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @return New strings column.
+ */
+std::unique_ptr<column> format_list_column(
+  lists_column_view const& input,
+  string_scalar const& na_rep           = string_scalar("NULL"),
+  strings_column_view const& separators = strings_column_view(column_view{
+    data_type{type_id::STRING}, 0, nullptr}),
+  rmm::mr::device_memory_resource* mr   = rmm::mr::get_current_device_resource());
+
+/** @} */  // end of doxygen group
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/strings/convert/convert_lists.hpp>
+#include <cudf/strings/detail/utilities.cuh>
+#include <cudf/strings/string_view.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+namespace {
+
+// Row positions within the optional `separators` column, which is laid out as
+// [element separator, left enclosure, right enclosure].
+constexpr size_type separator_index{0};  // element separator, e.g. comma ','
+// enclosure strings, e.g. '[' and ']'
+constexpr size_type left_brace_index{1};
+constexpr size_type right_brace_index{2};
+
+/**
+ * @brief Pending separator type for `stack_item`
+ *
+ * Recorded on a saved parent item so the formatter knows what output is still
+ * owed once the nested list it descended into has been processed.
+ */
+enum class item_separator : int8_t {
+  NONE,     ///< nothing pending (default for a freshly pushed range)
+  ELEMENT,  ///< parent was saved with more sibling elements still to process
+  LIST      ///< parent was saved with no sibling elements remaining
+};
+
+/**
+ * @brief Entry of the explicit stack used to walk nested lists.
+ *
+ * Holds the half-open index range `[left_idx, right_idx)` still to be processed
+ * at one nesting level, together with the separator that is pending for it.
+ */
+struct alignas(8) stack_item {
+  size_type left_idx;   // first index to process
+  size_type right_idx;  // one past the last index to process
+  item_separator separator = item_separator::NONE;
+};
+
+/**
+ * @brief Formatting lists functor.
+ *
+ * Formats row `idx` of the input lists column into a single string using the
+ * specified separators and null-representation (na_rep) string.
+ *
+ * The call operator has two modes selected by `d_chars`:
+ * - `d_chars == nullptr`: nothing is written; the byte count of the formatted
+ *   row is stored in `d_offsets[idx]`.
+ * - otherwise: the formatted row is written starting at `d_chars + d_offsets[idx]`.
+ * (Presumably `make_strings_children` runs the sizing mode first and the
+ * writing mode second -- confirm against that utility.)
+ *
+ * Recursion over nested lists is simulated with an explicit stack; each row
+ * uses its own `max_depth`-entry slice of `d_stack`.
+ */
+struct format_lists_fn {
+  column_device_view const d_input;       // lists column to format
+  column_device_view const d_separators;  // empty, or the 3 separator strings
+  string_view const d_na_rep;             // text written for null entries
+  stack_item* d_stack;                    // stack storage: `max_depth` items per row
+  size_type const max_depth;              // number of stack items reserved per row
+  size_type* d_offsets{};                 // per-row sizes (sizing mode) / offsets (writing mode)
+  char* d_chars{};                        // output characters; null in sizing mode
+
+  /**
+   * @brief Returns the lists column found `idx` child levels below `d_input`.
+   *
+   * `idx == 0` returns `d_input` itself.
+   */
+  __device__ column_device_view get_nested_child(size_type idx)
+  {
+    auto current = d_input;
+    while (idx > 0) {
+      current = current.child(cudf::lists_column_view::child_column_index);
+      --idx;
+    }
+    return current;
+  }
+
+  /**
+   * @brief Emits one of the separator strings and returns its size in bytes.
+   *
+   * The string is read from `d_separators` when that column has an entry at
+   * `sep_idx`; otherwise the built-in default ("[", "]" or ",") is used.
+   * `d_output` is advanced past the written bytes; nothing is written when it
+   * is null.
+   */
+  __device__ size_type write_separator(char*& d_output, size_type sep_idx = separator_index)
+  {
+    auto d_str = [&] {
+      if (d_separators.size() > sep_idx) return d_separators.element<string_view>(sep_idx);
+      if (sep_idx == left_brace_index) return string_view("[", 1);
+      if (sep_idx == right_brace_index) return string_view("]", 1);
+      return string_view(",", 1);
+    }();
+    if (d_output) d_output = copy_string(d_output, d_str);
+    return d_str.size_bytes();
+  }
+
+  /**
+   * @brief Emits the null-representation string and returns its size in bytes.
+   *
+   * `d_output` is advanced past the written bytes; nothing is written when it
+   * is null.
+   */
+  __device__ size_type write_na_rep(char*& d_output)
+  {
+    if (d_output) d_output = copy_string(d_output, d_na_rep);
+    return d_na_rep.size_bytes();
+  }
+
+  /**
+   * @brief Emits the elements `[left_idx, right_idx)` of a strings column,
+   * joined by the element separator, and returns the total size in bytes.
+   *
+   * Null elements are emitted as the null-representation string.
+   * `d_output` is taken by value, so the caller must advance its own pointer
+   * by the returned size.
+   */
+  __device__ size_type write_strings(column_device_view const& col,
+                                     size_type left_idx,
+                                     size_type right_idx,
+                                     char* d_output)
+  {
+    size_type bytes = 0;
+    for (size_type idx = left_idx; idx < right_idx; ++idx) {
+      if (col.is_null(idx)) {
+        bytes += write_na_rep(d_output);  // e.g. 'NULL'
+      } else {
+        auto d_str = col.element<string_view>(idx);
+        if (d_output) d_output = copy_string(d_output, d_str);
+        bytes += d_str.size_bytes();
+      }
+      if (idx + 1 < right_idx) {
+        bytes += write_separator(d_output);  // e.g. comma ','
+      }
+    }
+    return bytes;
+  }
+
+  /**
+   * @brief Formats (or sizes) the output string for input row `idx`.
+   */
+  __device__ void operator()(size_type idx)
+  {
+    size_type bytes = 0;
+    char* d_output  = d_chars ? d_chars + d_offsets[idx] : nullptr;
+
+    // push first item to the stack
+    auto item_stack         = d_stack + idx * max_depth;
+    auto stack_idx          = size_type{0};
+    item_stack[stack_idx++] = stack_item{idx, idx + 1};
+
+    // process until stack is empty
+    while (stack_idx > 0) {
+      --stack_idx;  // pop from stack
+      auto const item = item_stack[stack_idx];
+      // the stack position doubles as the nesting level of the popped item
+      auto const view = get_nested_child(stack_idx);
+
+      auto offsets = view.child(cudf::lists_column_view::offsets_column_index);
+      // named differently from the `d_offsets` member to avoid shadowing it
+      auto d_list_offsets = offsets.data<offset_type>() + view.offset();
+
+      // A pending separator marks a parent item that was saved before descending
+      // into a nested list. That nested list has now been fully written, so it
+      // must be closed in both cases; previously the ELEMENT case wrote only the
+      // element separator and the closing enclosure was lost.
+      if (item.separator != item_separator::NONE) {
+        bytes += write_separator(d_output, right_brace_index);
+        if (item.separator == item_separator::ELEMENT) {
+          // more sibling elements follow the nested list
+          bytes += write_separator(d_output, separator_index);
+        }
+      }
+
+      // loop through the child elements for the current view
+      for (auto jdx = item.left_idx; jdx < item.right_idx; ++jdx) {
+        auto const lhs = d_list_offsets[jdx];
+        auto const rhs = d_list_offsets[jdx + 1];
+
+        if (view.is_null(jdx)) {
+          bytes += write_na_rep(d_output);  // e.g. 'NULL'
+        } else if (lhs == rhs) {            // e.g. '[]'
+          bytes += write_separator(d_output, left_brace_index);
+          bytes += write_separator(d_output, right_brace_index);
+        } else {
+          auto child = view.child(cudf::lists_column_view::child_column_index);
+          bytes += write_separator(d_output, left_brace_index);
+
+          // if child is a list type, then recurse into it
+          if (child.type().id() == type_id::LIST) {
+            // push current state to the stack
+            item_stack[stack_idx++] =
+              stack_item{jdx + 1,
+                         item.right_idx,
+                         jdx + 1 < item.right_idx ? item_separator::ELEMENT : item_separator::LIST};
+            // push child to the stack
+            item_stack[stack_idx++] = stack_item{lhs, rhs};
+            break;  // back to the stack (while-loop)
+          }
+
+          // otherwise, the child is a strings column;
+          // write out the string elements
+          auto const size = write_strings(child, lhs, rhs, d_output);
+          bytes += size;
+          // write_strings received a copy of the pointer, so advance ours here
+          if (d_output) d_output += size;
+
+          bytes += write_separator(d_output, right_brace_index);
+        }
+
+        // write element separator (e.g. comma ',') if not at the end
+        if (jdx + 1 < item.right_idx) { bytes += write_separator(d_output); }
+      }
+    }
+
+    // sizing mode: record the number of bytes this row needs
+    if (!d_chars) d_offsets[idx] = bytes;
+  }
+};
+
+}  // namespace
+
+/**
+ * @brief Stream-aware implementation of `cudf::strings::format_list_column`.
+ *
+ * Returns an empty strings column for an empty input. Otherwise verifies that
+ * the innermost child is a STRING column, that `separators` holds 0 or 3
+ * strings and that `na_rep` is valid, then formats every row with
+ * `format_lists_fn`. The result is built with a null count of 0 and no null mask.
+ *
+ * @param input Lists column to format.
+ * @param na_rep Replacement string for null elements.
+ * @param separators Strings to use for enclosing list components and separating elements.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @return New strings column.
+ */
+std::unique_ptr<column> format_list_column(lists_column_view const& input,
+                                           string_scalar const& na_rep,
+                                           strings_column_view const& separators,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::mr::device_memory_resource* mr)
+{
+  if (input.is_empty()) { return make_empty_column(data_type{type_id::STRING}); }
+
+  // walk down the nested children, counting the list levels above the leaf column
+  auto leaf            = input.child();
+  size_type num_levels = 1;
+  for (; leaf.type().id() == type_id::LIST; ++num_levels) {
+    leaf = cudf::lists_column_view(leaf).child();
+  }
+  CUDF_EXPECTS(leaf.type().id() == type_id::STRING, "lists child must be a STRING column");
+
+  auto const num_separators = separators.size();
+  CUDF_EXPECTS(num_separators == 0 || num_separators == 3, "Invalid number of separator strings");
+  CUDF_EXPECTS(na_rep.is_valid(stream), "Null replacement string must be valid");
+
+  // one stack slot per nesting level for every row
+  rmm::device_uvector<stack_item> nesting_stack(input.size() * num_levels, stream);
+
+  auto const input_dv      = column_device_view::create(input.parent(), stream);
+  auto const separators_dv = column_device_view::create(separators.parent(), stream);
+  auto const na_rep_view   = na_rep.value(stream);
+
+  auto const formatter =
+    format_lists_fn{*input_dv, *separators_dv, na_rep_view, nesting_stack.data(), num_levels};
+  auto offsets_and_chars =
+    cudf::strings::detail::make_strings_children(formatter, input.size(), stream, mr);
+
+  // no output row is null: null inputs were rendered with na_rep
+  return make_strings_column(input.size(),
+                             std::move(offsets_and_chars.first),
+                             std::move(offsets_and_chars.second),
+                             0,
+                             rmm::device_buffer{});
+}
+
+}  // namespace detail
+
+// external API: forwards to the detail implementation on the default CUDA stream
+
+std::unique_ptr<column> format_list_column(lists_column_view const& input,
+                                           string_scalar const& na_rep,
+                                           strings_column_view const& separators,
+                                           rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  auto const stream = rmm::cuda_stream_default;
+  return detail::format_list_column(input, na_rep, separators, stream, mr);
+}
+
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
@@ -367,6 +367,7 @@ ConfigureTest(
   strings/find_multiple_tests.cpp
   strings/fixed_point_tests.cpp
   strings/floats_tests.cpp
+  strings/format_lists_tests.cpp
   strings/integers_tests.cpp
   strings/ipv4_tests.cpp
   strings/json_tests.cpp