Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add format API for list column of strings #9454

Merged
merged 29 commits into from
Nov 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
f79ef82
Add format API for list column of strings
davidwendt Oct 15, 2021
cd24c76
add python/cython for format_list_column
davidwendt Oct 15, 2021
ccebd72
fix style check error
davidwendt Oct 17, 2021
64500f7
update comments
davidwendt Oct 18, 2021
3f4c2d1
support nested lists in as_string_column
davidwendt Oct 18, 2021
96c4327
add pytest for list.as_string_column
davidwendt Oct 18, 2021
23c4013
fix mypy style check error
davidwendt Oct 18, 2021
1ee7005
fix mypy style check error 2
davidwendt Oct 18, 2021
9e08148
fix mypy style check error 3
davidwendt Oct 18, 2021
e62ca31
add convert_lists.hpp to meta.yaml
davidwendt Oct 18, 2021
17c1b37
add pytest data with floats
davidwendt Oct 19, 2021
a6f8793
fix typo in doxygen comment
davidwendt Oct 19, 2021
5869ffd
fix comments, spacing
davidwendt Oct 19, 2021
e293ad0
Merge branch 'branch-21.12' into fea-strings-format-lists
davidwendt Oct 19, 2021
223885f
Merge branch 'branch-21.12' into fea-strings-format-lists
davidwendt Oct 20, 2021
8ca60f9
fix doxygen comments, typos
davidwendt Oct 20, 2021
9ec95c6
add const decls
davidwendt Oct 20, 2021
6d6bd80
reinstall test_listcol_setitem
davidwendt Oct 20, 2021
7deb88a
fix meta.yaml
davidwendt Oct 20, 2021
4ae319e
Merge branch 'branch-21.12' into fea-strings-format-lists
davidwendt Oct 20, 2021
55cc488
fix merge conflict
davidwendt Oct 21, 2021
7fcccce
name hardcoded separator index
davidwendt Oct 21, 2021
7c6320f
reformat some comments
davidwendt Oct 21, 2021
484146c
Merge branch 'branch-21.12' into fea-strings-format-lists
davidwendt Oct 26, 2021
6a1da5f
change depth++ to ++depth
davidwendt Nov 1, 2021
4e387d1
Merge branch 'branch-21.12' into fea-strings-format-lists
davidwendt Nov 1, 2021
5a2ed2b
item-separator inherit from int8
davidwendt Nov 1, 2021
96b07d5
fix merge conflicts
davidwendt Nov 3, 2021
e9dbe8d
Merge branch 'branch-21.12' into fea-strings-format-lists
davidwendt Nov 4, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ test:
- test -f $PREFIX/include/cudf/strings/convert/convert_floats.hpp
- test -f $PREFIX/include/cudf/strings/convert/convert_integers.hpp
- test -f $PREFIX/include/cudf/strings/convert/convert_ipv4.hpp
- test -f $PREFIX/include/cudf/strings/convert/convert_lists.hpp
- test -f $PREFIX/include/cudf/strings/convert/convert_urls.hpp
- test -f $PREFIX/include/cudf/strings/detail/combine.hpp
- test -f $PREFIX/include/cudf/strings/detail/concatenate.hpp
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,7 @@ add_library(
src/strings/convert/convert_integers.cu
src/strings/convert/convert_ipv4.cu
src/strings/convert/convert_urls.cu
src/strings/convert/convert_lists.cu
src/strings/copying/concatenate.cu
src/strings/copying/copying.cu
src/strings/copying/shift.cu
Expand Down
66 changes: 66 additions & 0 deletions cpp/include/cudf/strings/convert/convert_lists.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>

namespace cudf {
namespace strings {
/**
* @addtogroup strings_convert
* @{
* @file
*/

/**
* @brief Convert a list column of strings into a formatted strings column.
*
* The `separators` column should either be empty (to use the defaults) or contain 3 string elements in the following order:
* - element separator (default is comma `,`)
* - left-hand enclosure (default is `[`)
* - right-hand enclosure (default is `]`)
*
* @code{.pseudo}
* l1 = { [[a,b,c], [d,e]], [[f,g], [h]] }
* s1 = format_list_column(l1)
* s1 is now ["[[a,b,c],[d,e]]", "[[f,g],[h]]"]
*
* l2 = { [[a,b,c], [d,e]], [NULL], [[f,g], NULL, [h]] }
* s2 = format_list_column(l2, '-', [':', '{', '}'])
* s2 is now ["{{a:b:c}:{d:e}}", "{-}", "{{f:g}:-:{h}}"]
* @endcode
*
* @throw cudf::logic_error if the input column is not a LIST type with a STRING child.
*
* @param input Lists column to format.
* @param na_rep Replacement string for null elements.
* @param separators Strings to use for enclosing list components and separating elements.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column.
*/
std::unique_ptr<column> format_list_column(
lists_column_view const& input,
string_scalar const& na_rep = string_scalar("NULL"),
strings_column_view const& separators = strings_column_view(column_view{
data_type{type_id::STRING}, 0, nullptr}),
Comment on lines +60 to +61
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason to prefer a column rather than three separate scalars? Admittedly that adds more parameters to the API, but it seems awkward to stuff all three in a column (especially since I would anticipate that overloading the element separator would be a largely independent request from overriding the enclosures).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is a little more efficient to create a column of strings which normally is a single device copy rather than individual scalars which would be 3 small device copies.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's true, but it feels like a (minor) abuse of a column to stuff in three values that are semantically different but happen to be of the same type. The elements of a column seem like they should all "mean" the same thing, if that makes sense. This feels like a premature optimization, but H2D copies are expensive so maybe the improvement is worth it. I trust your judgment there, just felt odd to me.

rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
} // namespace strings
} // namespace cudf
241 changes: 241 additions & 0 deletions cpp/src/strings/convert/convert_lists.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/convert/convert_lists.hpp>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>

#include <rmm/cuda_stream_view.hpp>

namespace cudf {
namespace strings {
namespace detail {
namespace {

// Row positions of the individual strings within the separators column.
// Row holding the element separator string (e.g. comma ',')
constexpr size_type separator_index{0};
// Rows holding the opening and closing enclosure strings (e.g. '[' and ']')
constexpr size_type left_brace_index{1};
constexpr size_type right_brace_index{2};

/**
 * @brief Kind of output that is still owed when a `stack_item` is resumed.
 *
 * A parent item is parked on the stack while one of its nested lists is
 * being written; this value records what the parent's position was.
 */
enum class item_separator : int8_t {
  NONE,     ///< nothing is pending for this item
  ELEMENT,  ///< the nested list is followed by more elements in the parent
  LIST      ///< the nested list was the last element in the parent
};

/**
 * @brief One entry of the per-row stack used to walk nested lists.
 *
 * An entry records the half-open range `[left_idx, right_idx)` of rows that
 * still have to be processed at its nesting level, together with the
 * separator that is pending when the entry is resumed.
 */
struct alignas(8) stack_item {
  size_type left_idx;   // first row of the range still to be processed
  size_type right_idx;  // one past the last row of the range
  item_separator separator{item_separator::NONE};  // pending output on resume
};

/**
* @brief Formatting lists functor.
*
* This formats the input list column into individual strings using the
* specified separators and null-representation (na_rep) string.
*
* Recursion is simulated by using stack allocating per output string.
*/
struct format_lists_fn {
column_device_view const d_input;
column_device_view const d_separators;
string_view const d_na_rep;
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
stack_item* d_stack;
size_type const max_depth;
size_type* d_offsets{};
char* d_chars{};
davidwendt marked this conversation as resolved.
Show resolved Hide resolved

__device__ column_device_view get_nested_child(size_type idx)
{
auto current = d_input;
while (idx > 0) {
current = current.child(cudf::lists_column_view::child_column_index);
--idx;
}
return current;
}

__device__ size_type write_separator(char*& d_output, size_type sep_idx = separator_index)
{
auto d_str = [&] {
if (d_separators.size() > sep_idx) return d_separators.element<string_view>(sep_idx);
if (sep_idx == left_brace_index) return string_view("[", 1);
if (sep_idx == right_brace_index) return string_view("]", 1);
return string_view(",", 1);
}();
if (d_output) d_output = copy_string(d_output, d_str);
return d_str.size_bytes();
}

__device__ size_type write_na_rep(char*& d_output)
{
if (d_output) d_output = copy_string(d_output, d_na_rep);
return d_na_rep.size_bytes();
}

__device__ size_type write_strings(column_device_view const& col,
size_type left_idx,
size_type right_idx,
char* d_output)
{
size_type bytes = 0;
for (size_type idx = left_idx; idx < right_idx; ++idx) {
if (col.is_null(idx)) {
bytes += write_na_rep(d_output); // e.g. 'NULL'
} else {
auto d_str = col.element<string_view>(idx);
if (d_output) d_output = copy_string(d_output, d_str);
bytes += d_str.size_bytes();
}
if (idx + 1 < right_idx) {
bytes += write_separator(d_output); // e.g. comma ','
}
}
return bytes;
}

__device__ void operator()(size_type idx)
{
size_type bytes = 0;
char* d_output = d_chars ? d_chars + d_offsets[idx] : nullptr;

// push first item to the stack
auto item_stack = d_stack + idx * max_depth;
auto stack_idx = size_type{0};
item_stack[stack_idx++] = stack_item{idx, idx + 1};

// process until stack is empty
while (stack_idx > 0) {
--stack_idx; // pop from stack
auto const item = item_stack[stack_idx];
auto const view = get_nested_child(stack_idx);

auto offsets = view.child(cudf::lists_column_view::offsets_column_index);
auto d_offsets = offsets.data<offset_type>() + view.offset();

// add pending separator
if (item.separator == item_separator::LIST) {
bytes += write_separator(d_output, right_brace_index);
} else if (item.separator == item_separator::ELEMENT) {
bytes += write_separator(d_output, separator_index);
}

// loop through the child elements for the current view
for (auto jdx = item.left_idx; jdx < item.right_idx; ++jdx) {
auto const lhs = d_offsets[jdx];
auto const rhs = d_offsets[jdx + 1];

if (view.is_null(jdx)) {
bytes += write_na_rep(d_output); // e.g. 'NULL'
} else if (lhs == rhs) { // e.g. '[]'
bytes += write_separator(d_output, left_brace_index);
bytes += write_separator(d_output, right_brace_index);
} else {
auto child = view.child(cudf::lists_column_view::child_column_index);
bytes += write_separator(d_output, left_brace_index);

// if child is a list type, then recurse into it
if (child.type().id() == type_id::LIST) {
// push current state to the stack
item_stack[stack_idx++] =
stack_item{jdx + 1,
item.right_idx,
jdx + 1 < item.right_idx ? item_separator::ELEMENT : item_separator::LIST};
// push child to the stack
item_stack[stack_idx++] = stack_item{lhs, rhs};
break; // back to the stack (while-loop)
vyasr marked this conversation as resolved.
Show resolved Hide resolved
}

// otherwise, the child is a strings column;
// write out the string elements
auto const size = write_strings(child, lhs, rhs, d_output);
bytes += size;
if (d_output) d_output += size;

bytes += write_separator(d_output, right_brace_index);
}

// write element separator (e.g. comma ',') if not at the end
if (jdx + 1 < item.right_idx) { bytes += write_separator(d_output); }
}
}

if (!d_chars) d_offsets[idx] = bytes;
}
};

} // namespace

/**
 * @brief Stream-ordered implementation of format_list_column.
 *
 * Validates the arguments, allocates the per-row stack needed to walk the
 * nested lists and builds the output strings column with `format_lists_fn`.
 *
 * @throw cudf::logic_error if the innermost child is not a STRING column,
 *        if `separators` has a size other than 0 or 3, or if `na_rep` is
 *        not valid.
 *
 * @param input      Lists column to format.
 * @param na_rep     Replacement string for null elements.
 * @param separators Empty to use the defaults, or the 3 separator strings.
 * @param stream     CUDA stream used for device memory operations and kernels.
 * @param mr         Device memory resource used to allocate the returned column.
 * @return New strings column with one row per input row and no null rows.
 */
std::unique_ptr<column> format_list_column(lists_column_view const& input,
                                           string_scalar const& na_rep,
                                           strings_column_view const& separators,
                                           rmm::cuda_stream_view stream,
                                           rmm::mr::device_memory_resource* mr)
{
  if (input.is_empty()) return make_empty_column(data_type{type_id::STRING});

  size_type depth = 1;  // count the depth to the strings column
  auto child_col  = input.child();
  while (child_col.type().id() == type_id::LIST) {
    child_col = cudf::lists_column_view(child_col).child();
    ++depth;
  }
  CUDF_EXPECTS(child_col.type().id() == type_id::STRING, "lists child must be a STRING column");

  CUDF_EXPECTS(separators.size() == 0 || separators.size() == 3,
               "Invalid number of separator strings");
  CUDF_EXPECTS(na_rep.is_valid(stream), "Null replacement string must be valid");

  // Create stack memory for processing nested lists: `depth` entries per row.
  // The product is computed in std::size_t since rows * depth can exceed the
  // range of size_type.
  auto const stack_size =
    static_cast<std::size_t>(input.size()) * static_cast<std::size_t>(depth);
  auto stack_buffer = rmm::device_uvector<stack_item>(stack_size, stream);

  auto const d_input      = column_device_view::create(input.parent(), stream);
  auto const d_separators = column_device_view::create(separators.parent(), stream);
  auto const d_na_rep     = na_rep.value(stream);

  auto children = cudf::strings::detail::make_strings_children(
    format_lists_fn{*d_input, *d_separators, d_na_rep, stack_buffer.data(), depth},
    input.size(),
    stream,
    mr);

  // null input rows are rendered as na_rep, so the output has no null mask
  return make_strings_column(
    input.size(), std::move(children.first), std::move(children.second), 0, rmm::device_buffer{});
}

} // namespace detail

// external API

/**
 * Public entry point: opens a profiling range and forwards to the detail
 * implementation on the default CUDA stream.
 */
std::unique_ptr<column> format_list_column(lists_column_view const& input,
                                           string_scalar const& na_rep,
                                           strings_column_view const& separators,
                                           rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  auto const stream = rmm::cuda_stream_default;
  return detail::format_list_column(input, na_rep, separators, stream, mr);
}

} // namespace strings
} // namespace cudf
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,7 @@ ConfigureTest(
strings/find_multiple_tests.cpp
strings/fixed_point_tests.cpp
strings/floats_tests.cpp
strings/format_lists_tests.cpp
strings/integers_tests.cpp
strings/ipv4_tests.cpp
strings/json_tests.cpp
Expand Down
Loading