-
Notifications
You must be signed in to change notification settings - Fork 917
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add format API for list column of strings (#9454)
Closes #8351 This PR adds API `cudf::strings::format_list_column` to create the formatted output as described in #8351. The API only accepts lists columns of strings. ``` Example 1 l1 = { [[a,b,c], [d,e]], [[f,g], [h]] } s1 = format_list_column(l1) s1 is now ["[[a,b,c],[d,e]]", "[[f,g],[h]]"] Example 2 l2 = { [[a,b,c], [d,e]], [NULL], [[f,g], NULL [h]] } s2 = format_list_column(l1, '-', [':', '{', '}']) s2 is now ["{{a:b:c}:{d:e}}", "{-}", "{{f:g}:-:{h}}"] ``` The format API takes parameters to specify the strings to use for `[` , `]` and ',' as well as the string used to represent null entries. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Robert Maynard (https://github.com/robertmaynard) - AJ Schmidt (https://github.com/ajschmidt8) - Vyas Ramasubramani (https://github.com/vyasr) - Karthikeyan (https://github.com/karthikeyann) URL: #9454
- Loading branch information
1 parent
a7d520c
commit 499ebae
Showing
10 changed files
with
595 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
/* | ||
* Copyright (c) 2021, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#pragma once | ||
|
||
#include <cudf/column/column.hpp> | ||
#include <cudf/lists/lists_column_view.hpp> | ||
#include <cudf/scalar/scalar.hpp> | ||
#include <cudf/strings/strings_column_view.hpp> | ||
|
||
namespace cudf { | ||
namespace strings { | ||
/** | ||
* @addtogroup strings_convert | ||
* @{ | ||
* @file | ||
*/ | ||
|
||
/** | ||
* @brief Convert a list column of strings into a formatted strings column. | ||
* | ||
* The `separators` column should contain 3 strings elements in the following order: | ||
* - element separator (default is comma `,`) | ||
* - left-hand enclosure (default is `[`) | ||
* - right-hand enclosure (default is `]`) | ||
* | ||
* @code{.pseudo} | ||
* l1 = { [[a,b,c], [d,e]], [[f,g], [h]] } | ||
* s1 = format_list_column(l1) | ||
* s1 is now ["[[a,b,c],[d,e]]", "[[f,g],[h]]"] | ||
* | ||
* l2 = { [[a,b,c], [d,e]], [NULL], [[f,g], NULL, [h]] } | ||
* s2 = format_list_column(l1, '-', [':', '{', '}']) | ||
* s2 is now ["{{a:b:c}:{d:e}}", "{-}", "{{f:g}:-:{h}}"] | ||
* @endcode | ||
* | ||
* @throw cudf::logic_error if the input column is not a LIST type with a STRING child. | ||
* | ||
* @param input Lists column to format. | ||
* @param na_rep Replacment string for null elements. | ||
* @param separator Strings to use for enclosing list components and separating elements. | ||
* @param mr Device memory resource used to allocate the returned column's device memory. | ||
* @return New strings column. | ||
*/ | ||
std::unique_ptr<column> format_list_column( | ||
lists_column_view const& input, | ||
string_scalar const& na_rep = string_scalar("NULL"), | ||
strings_column_view const& separators = strings_column_view(column_view{ | ||
data_type{type_id::STRING}, 0, nullptr}), | ||
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); | ||
|
||
/** @} */ // end of doxygen group | ||
} // namespace strings | ||
} // namespace cudf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,241 @@ | ||
/* | ||
* Copyright (c) 2021, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <cudf/column/column_device_view.cuh> | ||
#include <cudf/detail/nvtx/ranges.hpp> | ||
#include <cudf/strings/convert/convert_lists.hpp> | ||
#include <cudf/strings/detail/utilities.cuh> | ||
#include <cudf/strings/string_view.cuh> | ||
|
||
#include <rmm/cuda_stream_view.hpp> | ||
|
||
namespace cudf { | ||
namespace strings { | ||
namespace detail { | ||
namespace { | ||
|
||
// position of the element separator string (e.g. comma ',') within the separators column | ||
constexpr size_type separator_index = 0; | ||
// position of the enclosure strings (e.g. []) within the separators column | ||
constexpr size_type left_brace_index = 1; | ||
constexpr size_type right_brace_index = 2; | ||
|
||
/** | ||
* @brief Pending separator type for `stack_item` | ||
*/ | ||
enum class item_separator : int8_t { NONE, ELEMENT, LIST }; | ||
|
||
/** | ||
* @brief Stack item used to manage nested lists. | ||
* | ||
* Each item includes the current range and the pending separator. | ||
*/ | ||
struct alignas(8) stack_item { | ||
size_type left_idx; | ||
size_type right_idx; | ||
item_separator separator{item_separator::NONE}; | ||
}; | ||
|
||
/** | ||
* @brief Formatting lists functor. | ||
* | ||
* This formats the input list column into individual strings using the | ||
* specified separators and null-representation (na_rep) string. | ||
* | ||
* Recursion is simulated by using stack allocating per output string. | ||
*/ | ||
struct format_lists_fn { | ||
column_device_view const d_input; | ||
column_device_view const d_separators; | ||
string_view const d_na_rep; | ||
stack_item* d_stack; | ||
size_type const max_depth; | ||
size_type* d_offsets{}; | ||
char* d_chars{}; | ||
|
||
__device__ column_device_view get_nested_child(size_type idx) | ||
{ | ||
auto current = d_input; | ||
while (idx > 0) { | ||
current = current.child(cudf::lists_column_view::child_column_index); | ||
--idx; | ||
} | ||
return current; | ||
} | ||
|
||
__device__ size_type write_separator(char*& d_output, size_type sep_idx = separator_index) | ||
{ | ||
auto d_str = [&] { | ||
if (d_separators.size() > sep_idx) return d_separators.element<string_view>(sep_idx); | ||
if (sep_idx == left_brace_index) return string_view("[", 1); | ||
if (sep_idx == right_brace_index) return string_view("]", 1); | ||
return string_view(",", 1); | ||
}(); | ||
if (d_output) d_output = copy_string(d_output, d_str); | ||
return d_str.size_bytes(); | ||
} | ||
|
||
__device__ size_type write_na_rep(char*& d_output) | ||
{ | ||
if (d_output) d_output = copy_string(d_output, d_na_rep); | ||
return d_na_rep.size_bytes(); | ||
} | ||
|
||
__device__ size_type write_strings(column_device_view const& col, | ||
size_type left_idx, | ||
size_type right_idx, | ||
char* d_output) | ||
{ | ||
size_type bytes = 0; | ||
for (size_type idx = left_idx; idx < right_idx; ++idx) { | ||
if (col.is_null(idx)) { | ||
bytes += write_na_rep(d_output); // e.g. 'NULL' | ||
} else { | ||
auto d_str = col.element<string_view>(idx); | ||
if (d_output) d_output = copy_string(d_output, d_str); | ||
bytes += d_str.size_bytes(); | ||
} | ||
if (idx + 1 < right_idx) { | ||
bytes += write_separator(d_output); // e.g. comma ',' | ||
} | ||
} | ||
return bytes; | ||
} | ||
|
||
__device__ void operator()(size_type idx) | ||
{ | ||
size_type bytes = 0; | ||
char* d_output = d_chars ? d_chars + d_offsets[idx] : nullptr; | ||
|
||
// push first item to the stack | ||
auto item_stack = d_stack + idx * max_depth; | ||
auto stack_idx = size_type{0}; | ||
item_stack[stack_idx++] = stack_item{idx, idx + 1}; | ||
|
||
// process until stack is empty | ||
while (stack_idx > 0) { | ||
--stack_idx; // pop from stack | ||
auto const item = item_stack[stack_idx]; | ||
auto const view = get_nested_child(stack_idx); | ||
|
||
auto offsets = view.child(cudf::lists_column_view::offsets_column_index); | ||
auto d_offsets = offsets.data<offset_type>() + view.offset(); | ||
|
||
// add pending separator | ||
if (item.separator == item_separator::LIST) { | ||
bytes += write_separator(d_output, right_brace_index); | ||
} else if (item.separator == item_separator::ELEMENT) { | ||
bytes += write_separator(d_output, separator_index); | ||
} | ||
|
||
// loop through the child elements for the current view | ||
for (auto jdx = item.left_idx; jdx < item.right_idx; ++jdx) { | ||
auto const lhs = d_offsets[jdx]; | ||
auto const rhs = d_offsets[jdx + 1]; | ||
|
||
if (view.is_null(jdx)) { | ||
bytes += write_na_rep(d_output); // e.g. 'NULL' | ||
} else if (lhs == rhs) { // e.g. '[]' | ||
bytes += write_separator(d_output, left_brace_index); | ||
bytes += write_separator(d_output, right_brace_index); | ||
} else { | ||
auto child = view.child(cudf::lists_column_view::child_column_index); | ||
bytes += write_separator(d_output, left_brace_index); | ||
|
||
// if child is a list type, then recurse into it | ||
if (child.type().id() == type_id::LIST) { | ||
// push current state to the stack | ||
item_stack[stack_idx++] = | ||
stack_item{jdx + 1, | ||
item.right_idx, | ||
jdx + 1 < item.right_idx ? item_separator::ELEMENT : item_separator::LIST}; | ||
// push child to the stack | ||
item_stack[stack_idx++] = stack_item{lhs, rhs}; | ||
break; // back to the stack (while-loop) | ||
} | ||
|
||
// otherwise, the child is a strings column; | ||
// write out the string elements | ||
auto const size = write_strings(child, lhs, rhs, d_output); | ||
bytes += size; | ||
if (d_output) d_output += size; | ||
|
||
bytes += write_separator(d_output, right_brace_index); | ||
} | ||
|
||
// write element separator (e.g. comma ',') if not at the end | ||
if (jdx + 1 < item.right_idx) { bytes += write_separator(d_output); } | ||
} | ||
} | ||
|
||
if (!d_chars) d_offsets[idx] = bytes; | ||
} | ||
}; | ||
|
||
} // namespace | ||
|
||
std::unique_ptr<column> format_list_column(lists_column_view const& input, | ||
string_scalar const& na_rep, | ||
strings_column_view const& separators, | ||
rmm::cuda_stream_view stream, | ||
rmm::mr::device_memory_resource* mr) | ||
{ | ||
if (input.is_empty()) return make_empty_column(data_type{type_id::STRING}); | ||
|
||
size_type depth = 1; // count the depth to the strings column | ||
auto child_col = input.child(); | ||
while (child_col.type().id() == type_id::LIST) { | ||
child_col = cudf::lists_column_view(child_col).child(); | ||
++depth; | ||
} | ||
CUDF_EXPECTS(child_col.type().id() == type_id::STRING, "lists child must be a STRING column"); | ||
|
||
CUDF_EXPECTS(separators.size() == 0 || separators.size() == 3, | ||
"Invalid number of separator strings"); | ||
CUDF_EXPECTS(na_rep.is_valid(stream), "Null replacement string must be valid"); | ||
|
||
// create stack memory for processing nested lists | ||
auto stack_buffer = rmm::device_uvector<stack_item>(input.size() * depth, stream); | ||
|
||
auto const d_input = column_device_view::create(input.parent(), stream); | ||
auto const d_separators = column_device_view::create(separators.parent(), stream); | ||
auto const d_na_rep = na_rep.value(stream); | ||
|
||
auto children = cudf::strings::detail::make_strings_children( | ||
format_lists_fn{*d_input, *d_separators, d_na_rep, stack_buffer.data(), depth}, | ||
input.size(), | ||
stream, | ||
mr); | ||
|
||
return make_strings_column( | ||
input.size(), std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); | ||
} | ||
|
||
} // namespace detail | ||
|
||
// external API | ||
|
||
std::unique_ptr<column> format_list_column(lists_column_view const& input, | ||
string_scalar const& na_rep, | ||
strings_column_view const& separators, | ||
rmm::mr::device_memory_resource* mr) | ||
{ | ||
CUDF_FUNC_RANGE(); | ||
return detail::format_list_column(input, na_rep, separators, rmm::cuda_stream_default, mr); | ||
} | ||
|
||
} // namespace strings | ||
} // namespace cudf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.