Skip to content

Commit

Permalink
Add format API for list column of strings (#9454)
Browse files Browse the repository at this point in the history
Closes #8351 

This PR adds API `cudf::strings::format_list_column` to create the formatted output as described in #8351. The API only accepts lists columns of strings. 

```
Example 1
  l1 = { [[a,b,c], [d,e]], [[f,g], [h]] }
  s1 = format_list_column(l1)
  s1 is now ["[[a,b,c],[d,e]]", "[[f,g],[h]]"]

Example 2
  l2 = { [[a,b,c], [d,e]], [NULL], [[f,g], NULL, [h]] }
  s2 = format_list_column(l2, '-', [':', '{', '}'])
  s2 is now ["{{a:b:c}:{d:e}}", "{-}", "{{f:g}:-:{h}}"]
```

The format API takes parameters to specify the strings to use for `[` , `]` and ',' as well as the string used to represent null entries.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #9454
  • Loading branch information
davidwendt authored Nov 9, 2021
1 parent a7d520c commit 499ebae
Show file tree
Hide file tree
Showing 10 changed files with 595 additions and 1 deletion.
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ test:
- test -f $PREFIX/include/cudf/strings/convert/convert_floats.hpp
- test -f $PREFIX/include/cudf/strings/convert/convert_integers.hpp
- test -f $PREFIX/include/cudf/strings/convert/convert_ipv4.hpp
- test -f $PREFIX/include/cudf/strings/convert/convert_lists.hpp
- test -f $PREFIX/include/cudf/strings/convert/convert_urls.hpp
- test -f $PREFIX/include/cudf/strings/detail/combine.hpp
- test -f $PREFIX/include/cudf/strings/detail/concatenate.hpp
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,7 @@ add_library(
src/strings/convert/convert_integers.cu
src/strings/convert/convert_ipv4.cu
src/strings/convert/convert_urls.cu
src/strings/convert/convert_lists.cu
src/strings/copying/concatenate.cu
src/strings/copying/copying.cu
src/strings/copying/shift.cu
Expand Down
66 changes: 66 additions & 0 deletions cpp/include/cudf/strings/convert/convert_lists.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>

namespace cudf {
namespace strings {
/**
* @addtogroup strings_convert
* @{
* @file
*/

/**
* @brief Convert a list column of strings into a formatted strings column.
*
* Each row of the input is formatted into a single output string. Nested lists
* are formatted recursively, so the input may be a list of strings or a list of
* lists (of lists ...) of strings.
*
* The `separators` column must either be empty, in which case the defaults below
* are used, or contain exactly 3 string elements in the following order:
* - element separator (default is comma `,`)
* - left-hand enclosure (default is `[`)
* - right-hand enclosure (default is `]`)
*
* Null list rows, null nested lists and null strings are all written as `na_rep`.
* Empty lists are written as the left-hand enclosure immediately followed by the
* right-hand enclosure (e.g. `[]`).
*
* @code{.pseudo}
* l1 = { [[a,b,c], [d,e]], [[f,g], [h]] }
* s1 = format_list_column(l1)
* s1 is now ["[[a,b,c],[d,e]]", "[[f,g],[h]]"]
*
* l2 = { [[a,b,c], [d,e]], [NULL], [[f,g], NULL, [h]] }
* s2 = format_list_column(l2, '-', [':', '{', '}'])
* s2 is now ["{{a:b:c}:{d:e}}", "{-}", "{{f:g}:-:{h}}"]
* @endcode
*
* An empty input column returns an empty strings column.
*
* @throw cudf::logic_error if the input column is not a LIST type with a STRING child.
* @throw cudf::logic_error if `separators` contains a number of strings other than 0 or 3.
* @throw cudf::logic_error if `na_rep` is not a valid (non-null) scalar.
*
* @param input Lists column to format.
* @param na_rep Replacement string for null elements.
* @param separators Strings to use for enclosing list components and separating elements.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column with one formatted string per input row.
*/
std::unique_ptr<column> format_list_column(
lists_column_view const& input,
string_scalar const& na_rep = string_scalar("NULL"),
strings_column_view const& separators = strings_column_view(column_view{
data_type{type_id::STRING}, 0, nullptr}),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
} // namespace strings
} // namespace cudf
241 changes: 241 additions & 0 deletions cpp/src/strings/convert/convert_lists.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/convert/convert_lists.hpp>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>

#include <rmm/cuda_stream_view.hpp>

#include <cstddef>

namespace cudf {
namespace strings {
namespace detail {
namespace {

// Row positions of the individual strings within the separators column.
constexpr size_type separator_index{0};    // element separator string (e.g. comma ',')
constexpr size_type left_brace_index{1};   // opening enclosure string (e.g. '[')
constexpr size_type right_brace_index{2};  // closing enclosure string (e.g. ']')

/**
 * @brief Kind of output that is still pending when a `stack_item` is resumed.
 *
 * The value is chosen when a parent list is suspended in order to descend into
 * one of its nested child lists.
 */
enum class item_separator : int8_t {
  NONE    = 0,  ///< nothing is pending for this item
  ELEMENT = 1,  ///< the suspended parent still has elements after the nested child
  LIST    = 2   ///< the nested child was the last element of the suspended parent
};

/**
 * @brief One entry of the explicit stack used to walk nested lists.
 *
 * An entry records the half-open range `[left_idx, right_idx)` of list rows
 * that remain to be processed at one nesting level, together with the
 * separator that must be emitted before processing of that range resumes.
 */
struct alignas(8) stack_item {
  size_type left_idx;                               // first row still to process
  size_type right_idx;                              // one past the last row to process
  item_separator separator = item_separator::NONE;  // output owed on resume
};

/**
 * @brief Functor that formats one row of a lists column into one output string.
 *
 * The input lists column is formatted using the specified separators and
 * null-representation (`na_rep`) string.
 *
 * The call operator runs in one of two modes selected by `d_chars`:
 * - `d_chars == nullptr`: nothing is written; the size in bytes of the formatted
 *   row is stored in `d_offsets[idx]`.
 * - `d_chars != nullptr`: the formatted row is written starting at
 *   `d_chars + d_offsets[idx]`.
 *
 * Recursion into nested lists is simulated with an explicit stack. Each row owns
 * `max_depth` consecutive `stack_item` entries inside `d_stack`.
 */
struct format_lists_fn {
  column_device_view const d_input;       // lists column being formatted
  column_device_view const d_separators;  // empty, or the separator/enclosure strings
  string_view const d_na_rep;             // text written in place of null entries
  stack_item* d_stack;                    // scratch stack: max_depth items per row
  size_type const max_depth;              // number of stack items reserved per row
  size_type* d_offsets{};                 // sizes (first mode) or offsets (second mode)
  char* d_chars{};                        // output buffer; null while sizes are computed

  /**
   * @brief Returns the list view found `idx` nesting levels below the input column.
   *
   * @param idx Number of child levels to descend; 0 returns the input itself.
   */
  __device__ column_device_view get_nested_child(size_type idx)
  {
    auto current = d_input;
    while (idx > 0) {
      current = current.child(cudf::lists_column_view::child_column_index);
      --idx;
    }
    return current;
  }

  /**
   * @brief Writes one of the separator/enclosure strings.
   *
   * The string is read from `d_separators` if that column has a row at `sep_idx`;
   * otherwise the built-in defaults `[`, `]` and `,` are used.
   *
   * @param d_output Output position; advanced past the written bytes. Nothing is
   *                 written if this is null.
   * @param sep_idx Which string to write (see the `*_index` constants).
   * @return Size in bytes of the selected string.
   */
  __device__ size_type write_separator(char*& d_output, size_type sep_idx = separator_index)
  {
    auto d_str = [&] {
      if (d_separators.size() > sep_idx) return d_separators.element<string_view>(sep_idx);
      if (sep_idx == left_brace_index) return string_view("[", 1);
      if (sep_idx == right_brace_index) return string_view("]", 1);
      return string_view(",", 1);
    }();
    if (d_output) d_output = copy_string(d_output, d_str);
    return d_str.size_bytes();
  }

  /**
   * @brief Writes the null-representation string.
   *
   * @param d_output Output position; advanced past the written bytes. Nothing is
   *                 written if this is null.
   * @return Size in bytes of the null-representation string.
   */
  __device__ size_type write_na_rep(char*& d_output)
  {
    if (d_output) d_output = copy_string(d_output, d_na_rep);
    return d_na_rep.size_bytes();
  }

  /**
   * @brief Writes the strings in rows `[left_idx, right_idx)` of `col`, separated
   * by the element separator.
   *
   * `d_output` is taken by value, so the caller's pointer is not advanced; the
   * caller must advance it by the returned size.
   *
   * @param col Strings column holding the elements.
   * @param left_idx First row to write.
   * @param right_idx One past the last row to write.
   * @param d_output Output position, or null to only compute the size.
   * @return Total number of bytes the elements and separators occupy.
   */
  __device__ size_type write_strings(column_device_view const& col,
                                     size_type left_idx,
                                     size_type right_idx,
                                     char* d_output)
  {
    size_type bytes = 0;
    for (size_type idx = left_idx; idx < right_idx; ++idx) {
      if (col.is_null(idx)) {
        bytes += write_na_rep(d_output);  // e.g. 'NULL'
      } else {
        auto d_str = col.element<string_view>(idx);
        if (d_output) d_output = copy_string(d_output, d_str);
        bytes += d_str.size_bytes();
      }
      if (idx + 1 < right_idx) {
        bytes += write_separator(d_output);  // e.g. comma ','
      }
    }
    return bytes;
  }

  /**
   * @brief Formats row `idx` of the input column.
   *
   * @param idx Row of the input lists column to format.
   */
  __device__ void operator()(size_type idx)
  {
    size_type bytes = 0;
    char* d_output  = d_chars ? d_chars + d_offsets[idx] : nullptr;

    // this row's slice of the stack; the index is widened before multiplying so
    // that idx * max_depth cannot overflow size_type for large, deep inputs
    auto item_stack = d_stack + static_cast<std::size_t>(idx) * max_depth;
    auto stack_idx  = size_type{0};
    // push first item to the stack: the single top-level row to format
    item_stack[stack_idx++] = stack_item{idx, idx + 1};

    // process until stack is empty
    while (stack_idx > 0) {
      --stack_idx;  // pop from stack
      auto const item = item_stack[stack_idx];
      // the stack position doubles as the nesting level of the popped item
      auto const view = get_nested_child(stack_idx);

      auto offsets = view.child(cudf::lists_column_view::offsets_column_index);
      // named to avoid shadowing the d_offsets member used at the end of this function
      auto list_offsets = offsets.data<offset_type>() + view.offset();

      // A pending separator means this item was suspended right after opening a
      // nested child list which has now been fully written. That child must be
      // closed in both cases; if the parent still has elements to process, the
      // element separator follows the closing enclosure.
      if (item.separator != item_separator::NONE) {
        bytes += write_separator(d_output, right_brace_index);
        if (item.separator == item_separator::ELEMENT) {
          bytes += write_separator(d_output, separator_index);
        }
      }

      // loop through the child elements for the current view
      for (auto jdx = item.left_idx; jdx < item.right_idx; ++jdx) {
        auto const lhs = list_offsets[jdx];
        auto const rhs = list_offsets[jdx + 1];

        if (view.is_null(jdx)) {
          bytes += write_na_rep(d_output);  // e.g. 'NULL'
        } else if (lhs == rhs) {            // e.g. '[]'
          bytes += write_separator(d_output, left_brace_index);
          bytes += write_separator(d_output, right_brace_index);
        } else {
          auto child = view.child(cudf::lists_column_view::child_column_index);
          bytes += write_separator(d_output, left_brace_index);

          // if child is a list type, then recurse into it
          if (child.type().id() == type_id::LIST) {
            // push current state to the stack; it resumes at the next element and
            // remembers whether an element separator is needed after the child
            item_stack[stack_idx++] =
              stack_item{jdx + 1,
                         item.right_idx,
                         jdx + 1 < item.right_idx ? item_separator::ELEMENT : item_separator::LIST};
            // push child to the stack
            item_stack[stack_idx++] = stack_item{lhs, rhs};
            break;  // back to the stack (while-loop)
          }

          // otherwise, the child is a strings column;
          // write out the string elements
          auto const size = write_strings(child, lhs, rhs, d_output);
          bytes += size;
          // write_strings() takes the pointer by value so advance it here
          if (d_output) d_output += size;

          bytes += write_separator(d_output, right_brace_index);
        }

        // write element separator (e.g. comma ',') if not at the end
        if (jdx + 1 < item.right_idx) { bytes += write_separator(d_output); }
      }
    }

    // size-only mode: report the number of bytes this row needs
    if (!d_chars) d_offsets[idx] = bytes;
  }
};

} // namespace

/**
 * @brief Formats each row of a lists column of strings into a single string.
 *
 * An empty input returns an empty strings column before any other validation.
 *
 * @throw cudf::logic_error if the innermost child of `input` is not a STRING column.
 * @throw cudf::logic_error if `separators` does not contain exactly 0 or 3 strings.
 * @throw cudf::logic_error if `na_rep` is not valid.
 *
 * @param input Lists column to format.
 * @param na_rep String written in place of null entries.
 * @param separators Empty to use the defaults, otherwise the element separator,
 *                   left enclosure and right enclosure strings, in that order.
 * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr Device memory resource used to allocate the returned column's device memory.
 * @return New strings column with `input.size()` rows and no nulls.
 */
std::unique_ptr<column> format_list_column(lists_column_view const& input,
                                           string_scalar const& na_rep,
                                           strings_column_view const& separators,
                                           rmm::cuda_stream_view stream,
                                           rmm::mr::device_memory_resource* mr)
{
  if (input.is_empty()) return make_empty_column(data_type{type_id::STRING});

  size_type depth = 1;  // count the depth to the strings column
  auto child_col  = input.child();
  while (child_col.type().id() == type_id::LIST) {
    child_col = cudf::lists_column_view(child_col).child();
    ++depth;
  }
  CUDF_EXPECTS(child_col.type().id() == type_id::STRING, "lists child must be a STRING column");

  CUDF_EXPECTS(separators.size() == 0 || separators.size() == 3,
               "Invalid number of separator strings");
  CUDF_EXPECTS(na_rep.is_valid(stream), "Null replacement string must be valid");

  // create stack memory for processing nested lists: `depth` items for each row;
  // the product is computed in std::size_t because rows * depth can exceed the
  // range of size_type for large, deeply nested inputs
  auto const stack_size =
    static_cast<std::size_t>(input.size()) * static_cast<std::size_t>(depth);
  auto stack_buffer = rmm::device_uvector<stack_item>(stack_size, stream);

  auto const d_input      = column_device_view::create(input.parent(), stream);
  auto const d_separators = column_device_view::create(separators.parent(), stream);
  auto const d_na_rep     = na_rep.value(stream);

  auto children = cudf::strings::detail::make_strings_children(
    format_lists_fn{*d_input, *d_separators, d_na_rep, stack_buffer.data(), depth},
    input.size(),
    stream,
    mr);

  // null rows are rendered as na_rep, so the result carries no null mask
  return make_strings_column(
    input.size(), std::move(children.first), std::move(children.second), 0, rmm::device_buffer{});
}

} // namespace detail

// external API

// Public entry point: forwards to the detail implementation on the default stream.
std::unique_ptr<column> format_list_column(lists_column_view const& input,
                                           string_scalar const& na_rep,
                                           strings_column_view const& separators,
                                           rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  auto const stream = rmm::cuda_stream_default;
  auto formatted    = detail::format_list_column(input, na_rep, separators, stream, mr);
  return formatted;
}

} // namespace strings
} // namespace cudf
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,7 @@ ConfigureTest(
strings/find_multiple_tests.cpp
strings/fixed_point_tests.cpp
strings/floats_tests.cpp
strings/format_lists_tests.cpp
strings/integers_tests.cpp
strings/ipv4_tests.cpp
strings/json_tests.cpp
Expand Down
Loading

0 comments on commit 499ebae

Please sign in to comment.