From 499ebaeebf25981fefcc43aa230085dd1db6f6c1 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 9 Nov 2021 18:05:45 -0500 Subject: [PATCH] Add format API for list column of strings (#9454) Closes #8351 This PR adds API `cudf::strings::format_list_column` to create the formatted output as described in #8351. The API only accepts lists columns of strings. ``` Example 1 l1 = { [[a,b,c], [d,e]], [[f,g], [h]] } s1 = format_list_column(l1) s1 is now ["[[a,b,c],[d,e]]", "[[f,g],[h]]"] Example 2 l2 = { [[a,b,c], [d,e]], [NULL], [[f,g], NULL, [h]] } s2 = format_list_column(l2, '-', [':', '{', '}']) s2 is now ["{{a:b:c}:{d:e}}", "{-}", "{{f:g}:-:{h}}"] ``` The format API takes parameters to specify the strings to use for `[`, `]` and `,` as well as the string used to represent null entries. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Robert Maynard (https://github.com/robertmaynard) - AJ Schmidt (https://github.com/ajschmidt8) - Vyas Ramasubramani (https://github.com/vyasr) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/9454 --- conda/recipes/libcudf/meta.yaml | 1 + cpp/CMakeLists.txt | 1 + .../cudf/strings/convert/convert_lists.hpp | 66 +++++ cpp/src/strings/convert/convert_lists.cu | 241 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/strings/format_lists_tests.cpp | 167 ++++++++++++ .../cpp/strings/convert/convert_lists.pxd | 15 ++ .../_lib/strings/convert/convert_lists.pyx | 48 ++++ python/cudf/cudf/core/column/lists.py | 36 ++- python/cudf/cudf/tests/test_list.py | 20 ++ 10 files changed, 595 insertions(+), 1 deletion(-) create mode 100644 cpp/include/cudf/strings/convert/convert_lists.hpp create mode 100644 cpp/src/strings/convert/convert_lists.cu create mode 100644 cpp/tests/strings/format_lists_tests.cpp create mode 100644 python/cudf/cudf/_lib/cpp/strings/convert/convert_lists.pxd create mode 100644 
python/cudf/cudf/_lib/strings/convert/convert_lists.pyx diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 0e0fc816c62..d644369c264 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -189,6 +189,7 @@ test: - test -f $PREFIX/include/cudf/strings/convert/convert_floats.hpp - test -f $PREFIX/include/cudf/strings/convert/convert_integers.hpp - test -f $PREFIX/include/cudf/strings/convert/convert_ipv4.hpp + - test -f $PREFIX/include/cudf/strings/convert/convert_lists.hpp - test -f $PREFIX/include/cudf/strings/convert/convert_urls.hpp - test -f $PREFIX/include/cudf/strings/detail/combine.hpp - test -f $PREFIX/include/cudf/strings/detail/concatenate.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e5777b77e45..cf7b5be0e3e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -414,6 +414,7 @@ add_library( src/strings/convert/convert_integers.cu src/strings/convert/convert_ipv4.cu src/strings/convert/convert_urls.cu + src/strings/convert/convert_lists.cu src/strings/copying/concatenate.cu src/strings/copying/copying.cu src/strings/copying/shift.cu diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp new file mode 100644 index 00000000000..ec22186ea99 --- /dev/null +++ b/cpp/include/cudf/strings/convert/convert_lists.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +namespace cudf { +namespace strings { +/** + * @addtogroup strings_convert + * @{ + * @file + */ + +/** + * @brief Convert a list column of strings into a formatted strings column. + * + * The `separators` column should contain 3 string elements in the following order: + * - element separator (default is comma `,`) + * - left-hand enclosure (default is `[`) + * - right-hand enclosure (default is `]`) + * + * @code{.pseudo} + * l1 = { [[a,b,c], [d,e]], [[f,g], [h]] } + * s1 = format_list_column(l1) + * s1 is now ["[[a,b,c],[d,e]]", "[[f,g],[h]]"] + * + * l2 = { [[a,b,c], [d,e]], [NULL], [[f,g], NULL, [h]] } + * s2 = format_list_column(l2, '-', [':', '{', '}']) + * s2 is now ["{{a:b:c}:{d:e}}", "{-}", "{{f:g}:-:{h}}"] + * @endcode + * + * @throw cudf::logic_error if the input column is not a LIST type with a STRING child. + * + * @param input Lists column to format. + * @param na_rep Replacement string for null elements. + * @param separators Strings to use for enclosing list components and separating elements. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New strings column. 
+ */ +std::unique_ptr format_list_column( + lists_column_view const& input, + string_scalar const& na_rep = string_scalar("NULL"), + strings_column_view const& separators = strings_column_view(column_view{ + data_type{type_id::STRING}, 0, nullptr}), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu new file mode 100644 index 00000000000..7f325bf29ed --- /dev/null +++ b/cpp/src/strings/convert/convert_lists.cu @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace { + +// position of the element separator string (e.g. comma ',') within the separators column +constexpr size_type separator_index = 0; +// position of the enclosure strings (e.g. []) within the separators column +constexpr size_type left_brace_index = 1; +constexpr size_type right_brace_index = 2; + +/** + * @brief Pending separator type for `stack_item` + */ +enum class item_separator : int8_t { NONE, ELEMENT, LIST }; + +/** + * @brief Stack item used to manage nested lists. + * + * Each item includes the current range and the pending separator. 
+ */ +struct alignas(8) stack_item { + size_type left_idx; + size_type right_idx; + item_separator separator{item_separator::NONE}; +}; + +/** + * @brief Formatting lists functor. + * + * This formats the input list column into individual strings using the + * specified separators and null-representation (na_rep) string. + * + * Recursion is simulated by using stack allocating per output string. + */ +struct format_lists_fn { + column_device_view const d_input; + column_device_view const d_separators; + string_view const d_na_rep; + stack_item* d_stack; + size_type const max_depth; + size_type* d_offsets{}; + char* d_chars{}; + + __device__ column_device_view get_nested_child(size_type idx) + { + auto current = d_input; + while (idx > 0) { + current = current.child(cudf::lists_column_view::child_column_index); + --idx; + } + return current; + } + + __device__ size_type write_separator(char*& d_output, size_type sep_idx = separator_index) + { + auto d_str = [&] { + if (d_separators.size() > sep_idx) return d_separators.element(sep_idx); + if (sep_idx == left_brace_index) return string_view("[", 1); + if (sep_idx == right_brace_index) return string_view("]", 1); + return string_view(",", 1); + }(); + if (d_output) d_output = copy_string(d_output, d_str); + return d_str.size_bytes(); + } + + __device__ size_type write_na_rep(char*& d_output) + { + if (d_output) d_output = copy_string(d_output, d_na_rep); + return d_na_rep.size_bytes(); + } + + __device__ size_type write_strings(column_device_view const& col, + size_type left_idx, + size_type right_idx, + char* d_output) + { + size_type bytes = 0; + for (size_type idx = left_idx; idx < right_idx; ++idx) { + if (col.is_null(idx)) { + bytes += write_na_rep(d_output); // e.g. 'NULL' + } else { + auto d_str = col.element(idx); + if (d_output) d_output = copy_string(d_output, d_str); + bytes += d_str.size_bytes(); + } + if (idx + 1 < right_idx) { + bytes += write_separator(d_output); // e.g. 
comma ',' + } + } + return bytes; + } + + __device__ void operator()(size_type idx) + { + size_type bytes = 0; + char* d_output = d_chars ? d_chars + d_offsets[idx] : nullptr; + + // push first item to the stack + auto item_stack = d_stack + idx * max_depth; + auto stack_idx = size_type{0}; + item_stack[stack_idx++] = stack_item{idx, idx + 1}; + + // process until stack is empty + while (stack_idx > 0) { + --stack_idx; // pop from stack + auto const item = item_stack[stack_idx]; + auto const view = get_nested_child(stack_idx); + + auto offsets = view.child(cudf::lists_column_view::offsets_column_index); + auto d_offsets = offsets.data() + view.offset(); + + // add pending separators: a pending item means the nested child list just finished + if (item.separator != item_separator::NONE) { + // always close that child list; also add the element separator when siblings remain + bytes += write_separator(d_output, right_brace_index); + if (item.separator == item_separator::ELEMENT) { bytes += write_separator(d_output); } + } + + // loop through the child elements for the current view + for (auto jdx = item.left_idx; jdx < item.right_idx; ++jdx) { + auto const lhs = d_offsets[jdx]; + auto const rhs = d_offsets[jdx + 1]; + + if (view.is_null(jdx)) { + bytes += write_na_rep(d_output); // e.g. 'NULL' + } else if (lhs == rhs) { // e.g. '[]' + bytes += write_separator(d_output, left_brace_index); + bytes += write_separator(d_output, right_brace_index); + } else { + auto child = view.child(cudf::lists_column_view::child_column_index); + bytes += write_separator(d_output, left_brace_index); + + // if child is a list type, then recurse into it + if (child.type().id() == type_id::LIST) { + // push current state to the stack + item_stack[stack_idx++] = + stack_item{jdx + 1, + item.right_idx, + jdx + 1 < item.right_idx ? 
item_separator::ELEMENT : item_separator::LIST}; + // push child to the stack + item_stack[stack_idx++] = stack_item{lhs, rhs}; + break; // back to the stack (while-loop) + } + + // otherwise, the child is a strings column; + // write out the string elements + auto const size = write_strings(child, lhs, rhs, d_output); + bytes += size; + if (d_output) d_output += size; + + bytes += write_separator(d_output, right_brace_index); + } + + // write element separator (e.g. comma ',') if not at the end + if (jdx + 1 < item.right_idx) { bytes += write_separator(d_output); } + } + } + + if (!d_chars) d_offsets[idx] = bytes; + } +}; + +} // namespace + +std::unique_ptr format_list_column(lists_column_view const& input, + string_scalar const& na_rep, + strings_column_view const& separators, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.is_empty()) return make_empty_column(data_type{type_id::STRING}); + + size_type depth = 1; // count the depth to the strings column + auto child_col = input.child(); + while (child_col.type().id() == type_id::LIST) { + child_col = cudf::lists_column_view(child_col).child(); + ++depth; + } + CUDF_EXPECTS(child_col.type().id() == type_id::STRING, "lists child must be a STRING column"); + + CUDF_EXPECTS(separators.size() == 0 || separators.size() == 3, + "Invalid number of separator strings"); + CUDF_EXPECTS(na_rep.is_valid(stream), "Null replacement string must be valid"); + + // create stack memory for processing nested lists + auto stack_buffer = rmm::device_uvector(input.size() * depth, stream); + + auto const d_input = column_device_view::create(input.parent(), stream); + auto const d_separators = column_device_view::create(separators.parent(), stream); + auto const d_na_rep = na_rep.value(stream); + + auto children = cudf::strings::detail::make_strings_children( + format_lists_fn{*d_input, *d_separators, d_na_rep, stack_buffer.data(), depth}, + input.size(), + stream, + mr); + + return 
make_strings_column( + input.size(), std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); +} + +} // namespace detail + +// external API + +std::unique_ptr format_list_column(lists_column_view const& input, + string_scalar const& na_rep, + strings_column_view const& separators, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::format_list_column(input, na_rep, separators, rmm::cuda_stream_default, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d66a3e64bfc..98bade7e15f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -367,6 +367,7 @@ ConfigureTest( strings/find_multiple_tests.cpp strings/fixed_point_tests.cpp strings/floats_tests.cpp + strings/format_lists_tests.cpp strings/integers_tests.cpp strings/ipv4_tests.cpp strings/json_tests.cpp diff --git a/cpp/tests/strings/format_lists_tests.cpp b/cpp/tests/strings/format_lists_tests.cpp new file mode 100644 index 00000000000..63fcdf6f00e --- /dev/null +++ b/cpp/tests/strings/format_lists_tests.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include + +#include +#include +#include + +struct StringsFormatListsTest : public cudf::test::BaseFixture { +}; + +TEST_F(StringsFormatListsTest, EmptyList) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + + auto const input = STR_LISTS{}; + auto const view = cudf::lists_column_view(input); + + auto results = cudf::strings::format_list_column(view); + cudf::test::expect_strings_empty(results->view()); +} + +TEST_F(StringsFormatListsTest, EmptyNestedList) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + + auto const input = STR_LISTS{STR_LISTS{STR_LISTS{}, STR_LISTS{}}, STR_LISTS{STR_LISTS{}}}; + auto const view = cudf::lists_column_view(input); + + auto results = cudf::strings::format_list_column(view); + auto expected = cudf::test::strings_column_wrapper({"[[],[]]", "[[]]"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsFormatListsTest, WithNulls) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + + auto const input = STR_LISTS{{STR_LISTS{{"a", "", "ccc"}, cudf::test::iterators::null_at(1)}, + STR_LISTS{}, + STR_LISTS{{"", "bb", "ddd"}, cudf::test::iterators::null_at(0)}, + STR_LISTS{"zzz", "xxxxx"}, + STR_LISTS{{"v", "", "", "w"}, cudf::test::iterators::null_at(2)}}, + cudf::test::iterators::null_at(1)}; + auto const view = cudf::lists_column_view(input); + + auto results = cudf::strings::format_list_column(view); + auto expected = cudf::test::strings_column_wrapper( + {"[a,NULL,ccc]", "NULL", "[NULL,bb,ddd]", "[zzz,xxxxx]", "[v,,NULL,w]"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsFormatListsTest, CustomParameters) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + + auto const input = + STR_LISTS{STR_LISTS{{STR_LISTS{{"a", "", "ccc"}, cudf::test::iterators::null_at(1)}, + STR_LISTS{}, + STR_LISTS{"ddd", "ee", "f"}}, + cudf::test::iterators::null_at(1)}, + {STR_LISTS{"gg", "hhh"}, STR_LISTS{"i", "", "", "jj"}}}; + 
auto const view = cudf::lists_column_view(input); + auto separators = cudf::test::strings_column_wrapper({": ", "{", "}"}); + + auto results = cudf::strings::format_list_column( + view, cudf::string_scalar("null"), cudf::strings_column_view(separators)); + auto expected = cudf::test::strings_column_wrapper( + {"{{a: null: ccc}: null: {ddd: ee: f}}", "{{gg: hhh}: {i: : : jj}}"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsFormatListsTest, NestedList) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + + auto const input = + STR_LISTS{{STR_LISTS{"a", "bb", "ccc"}, STR_LISTS{}, STR_LISTS{"ddd", "ee", "f"}}, + {STR_LISTS{"gg", "hhh"}, STR_LISTS{"i", "", "", "jj"}}}; + auto const view = cudf::lists_column_view(input); + + auto results = cudf::strings::format_list_column(view); + auto expected = + cudf::test::strings_column_wrapper({"[[a,bb,ccc],[],[ddd,ee,f]]", "[[gg,hhh],[i,,,jj]]"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsFormatListsTest, SlicedLists) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + + auto const input = + STR_LISTS{{STR_LISTS{{"a", "", "bb"}, cudf::test::iterators::null_at(1)}, + STR_LISTS{}, + STR_LISTS{{"", "ccc", "dddd"}, cudf::test::iterators::null_at(0)}, + STR_LISTS{"zzz", ""}, + STR_LISTS{}, + STR_LISTS{{"abcdef", "012345", "", ""}, cudf::test::iterators::null_at(2)}, + STR_LISTS{{"", "11111", "00000"}, cudf::test::iterators::null_at(0)}, + STR_LISTS{"hey hey", "way way"}, + STR_LISTS{}, + STR_LISTS{"ééé", "12345abcdef"}, + STR_LISTS{"www", "12345"}}, + cudf::test::iterators::nulls_at({1, 4, 8})}; + + // matching expected strings + auto const h_expected = std::vector({"[a,NULL,bb]", + "NULL", + "[NULL,ccc,dddd]", + "[zzz,]", + "NULL", + "[abcdef,012345,NULL,]", + "[NULL,11111,00000]", + "[hey hey,way way]", + "NULL", + "[ééé,12345abcdef]", + "[www,12345]"}); + + // set of slice intervals: covers slicing the front, back, and middle + std::vector> index_pairs({{0, 
11}, {0, 4}, {3, 8}, {5, 11}}); + for (auto indexes : index_pairs) { + auto sliced = cudf::lists_column_view(cudf::slice(input, {indexes.first, indexes.second})[0]); + auto results = cudf::strings::format_list_column(sliced); + auto expected = cudf::test::strings_column_wrapper(h_expected.begin() + indexes.first, + h_expected.begin() + indexes.second); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + +TEST_F(StringsFormatListsTest, Errors) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + + cudf::test::lists_column_wrapper invalid({1, 2, 3}); + EXPECT_THROW(cudf::strings::format_list_column(cudf::lists_column_view(invalid)), + cudf::logic_error); + + auto const input = STR_LISTS{STR_LISTS{}, STR_LISTS{}}; + auto const view = cudf::lists_column_view(input); + auto separators = cudf::test::strings_column_wrapper({"{", "}"}); + + EXPECT_THROW(cudf::strings::format_list_column( + view, cudf::string_scalar(""), cudf::strings_column_view(separators)), + cudf::logic_error); + + EXPECT_THROW(cudf::strings::format_list_column(view, cudf::string_scalar("", false)), + cudf::logic_error); +} diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_lists.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_lists.pxd new file mode 100644 index 00000000000..99bb80a813d --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_lists.pxd @@ -0,0 +1,15 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.scalar.scalar cimport string_scalar + + +cdef extern from "cudf/strings/convert/convert_lists.hpp" namespace \ + "cudf::strings" nogil: + + cdef unique_ptr[column] format_list_column( + column_view input_col, + string_scalar na_rep, + column_view separators) except + diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx new file mode 100644 index 00000000000..7ffa69cd680 --- /dev/null +++ b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx @@ -0,0 +1,48 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.scalar.scalar cimport string_scalar +from cudf._lib.cpp.strings.convert.convert_lists cimport ( + format_list_column as cpp_format_list_column, +) + +from cudf._lib.scalar import as_device_scalar + +from cudf._lib.scalar cimport DeviceScalar + + +def format_list_column(Column source_list, Column separators): + """ + Format a list column of strings into a strings column. + + Parameters + ---------- + source_list : input column of type list with strings child. 
+ + separators: strings used for formatting (', ', '[', ']') + + Returns + ------- + Formatted strings column + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_list.view() + cdef column_view separators_view = separators.view() + # Use 'None' as null-replacement string + cdef DeviceScalar str_na_rep = as_device_scalar("None") + cdef const string_scalar* string_scalar_na_rep = ( + str_na_rep.get_raw_ptr()) + + with nogil: + c_result = move(cpp_format_list_column( + source_view, string_scalar_na_rep[0], separators_view + )) + + return Column.from_unique_ptr( + move(c_result) + ) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index effeb957238..da51ce3becc 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -1,7 +1,7 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. import pickle -from typing import Sequence +from typing import List, Sequence import numpy as np import pyarrow as pa @@ -17,6 +17,7 @@ extract_element, sort_lists, ) +from cudf._lib.strings.convert.convert_lists import format_list_column from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.api.types import _is_non_decimal_numeric_dtype, is_list_dtype from cudf.core.buffer import Buffer @@ -317,6 +318,39 @@ def from_sequences( ) return res + def as_string_column( + self, dtype: Dtype, format=None, **kwargs + ) -> "cudf.core.column.StringColumn": + """ + Create a strings column from a list column + """ + # Convert the leaf child column to strings column + cc: List[ListColumn] = [] + c: ColumnBase = self + while isinstance(c, ListColumn): + cc.insert(0, c) + c = c.children[1] + s = c.as_string_column(dtype) + + # Rebuild the list column replacing just the leaf child + lc = s + for c in cc: + o = c.children[0] + lc = cudf.core.column.ListColumn( # type: ignore + size=c.size, + dtype=cudf.ListDtype(lc.dtype), + mask=c.mask, + offset=c.offset, + 
null_count=c.null_count, + children=(o, lc), + ) + + # Separator strings to match the Python format + separators = as_column([", ", "[", "]"]) + + # Call libcudf to format the list column + return format_list_column(lc, separators) + class ListMethods(ColumnMethods): """ diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index abd24ddd0fd..2b71ca7ac36 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -540,6 +540,26 @@ def test_listcol_setitem(data, item): assert_eq(expect, sr) +@pytest.mark.parametrize( + "data", + [ + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + [ + [[1, 2, 3], [4, 5, 6]], + [[1, 2, 3], [4, 5, 6]], + [[1, 2, 3], [4, 5, 6]], + ], + [[[1, 2, 3], [4, None, 6]], [], None, [[7, 8], [], None, [9]]], + [[1, 2, 3], [4, None, 6], [7, 8], [], None, [9]], + [[1.0, 2.0, 3.0], [4.0, None, 6.0], [7.0, 8.0], [], None, [9.0]], + ], +) +def test_listcol_as_string(data): + got = cudf.Series(data).astype("str") + expect = pd.Series(data).astype("str") + assert_eq(expect, got) + + @pytest.mark.parametrize( "data,item,error", [