From 1f188e18804be140bd461cd0d2cd4dca75b93244 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 20 Apr 2021 12:08:48 -0400 Subject: [PATCH 1/8] Add initial list ravel --- python/cudf/cudf/core/column/lists.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index da953df5478..c919b5c1fd8 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -17,6 +17,7 @@ from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethodsMixin +from cudf.core.dtypes import ListDtype from cudf.utils.dtypes import is_list_dtype, is_numerical_dtype @@ -451,3 +452,26 @@ def sort_values( sort_lists(self._column, ascending, na_position), retain_index=not ignore_index, ) + + def ravel(self): + result_dtype = self._column.dtype.element_type + if not isinstance(result_dtype, ListDtype): + raise ValueError( + "Cannot ravel a list column with just 1 level of nesting" + ) + + self_offsets = self._column.children[0] + child_offsets = self._column.children[1].children[0] + result_offsets = child_offsets[self_offsets] + result_children = (result_offsets, self._column.list().leaves) + + return self._return_or_inplace( + ListColumn( + self._column.size, + self._column.dtype.element_type, + self._column.mask, + self._column.offset, + self._column.null_count, + result_children, + ) + ) From ddf2907ddb228ed50bdb00048940c06cc6e3c82e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 27 Apr 2021 08:32:04 -0400 Subject: [PATCH 2/8] Add typing for list methods. Ravel tests. --- python/cudf/cudf/core/column/categorical.py | 6 +- python/cudf/cudf/core/column/lists.py | 93 ++++++++++++++------- python/cudf/cudf/core/column/methods.py | 2 + python/cudf/cudf/core/column/string.py | 5 +- python/cudf/cudf/tests/test_list.py | 29 +++++++ 5 files changed, 97 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index bb1bf3c5d5c..4b9758b9d9d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -11,7 +11,6 @@ Optional, Sequence, Tuple, - Union, cast, ) @@ -26,7 +25,7 @@ from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import column -from cudf.core.column.methods import ColumnMethodsMixin +from cudf.core.column.methods import ColumnMethodsMixin, ParentType from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import ( is_categorical_dtype, @@ -45,9 +44,6 @@ ) -ParentType = Union["cudf.Series", "cudf.Index"] - - class CategoricalAccessor(ColumnMethodsMixin): _column: CategoricalColumn diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c919b5c1fd8..62787770f87 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -14,14 +14,17 @@ extract_element, sort_lists, ) +from cudf._typing import ColumnLike, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column -from cudf.core.column.methods import ColumnMethodsMixin +from cudf.core.column.methods import ColumnMethodsMixin, ParentType from cudf.core.dtypes import ListDtype from cudf.utils.dtypes import is_list_dtype, is_numerical_dtype class ListColumn(ColumnBase): + dtype: ListDtype + def __init__( self, size, dtype, mask=None, offset=0, null_count=None, children=(), ): @@ -178,14 +181,16 @@ class ListMethods(ColumnMethodsMixin): List methods for Series """ - def __init__(self, column, parent=None): + _column: ListColumn + + def __init__(self, column: ListColumn, parent: ParentType = None): if not is_list_dtype(column.dtype): raise AttributeError( "Can only use .list accessor with a 'list' dtype" ) super().__init__(column=column, parent=parent) - def get(self, index): + def get(self, index: int) -> ParentType: """ Extract element at the given index from each component @@ -217,10 +222,10 @@ def get(self, index): else: raise IndexError("list index out of range") - def contains(self, search_key): + def contains(self, search_key: ScalarLike) -> ParentType: """ - Creates a column of bool values indicating whether the specified scalar - is an element of each row of a list column. + Returns boolean values indicating whether the specified scalar + is an element of each row. Parameters ---------- @@ -229,7 +234,7 @@ def contains(self, search_key): Returns ------- - Column + Series or Index Examples -------- @@ -257,14 +262,14 @@ def contains(self, search_key): return res @property - def leaves(self): + def leaves(self) -> ParentType: """ From a Series of (possibly nested) lists, obtain the elements from the innermost lists as a flat Series (one value per row). Returns ------- - Series + Series or Index Examples -------- @@ -285,7 +290,7 @@ def leaves(self): self._column.elements, retain_index=False ) - def len(self): + def len(self) -> ParentType: """ Computes the length of each element in the Series/Index. @@ -309,18 +314,18 @@ def len(self): """ return self._return_or_inplace(count_elements(self._column)) - def take(self, lists_indices): + def take(self, lists_indices: ColumnLike) -> ParentType: """ Collect list elements based on given indices. Parameters ---------- - lists_indices: List type arrays + lists_indices: Series-like of lists Specifies what to collect from each row Returns ------- - ListColumn + Series or Index Examples -------- @@ -364,14 +369,14 @@ def take(self, lists_indices): else: return res - def unique(self): + def unique(self) -> ParentType: """ - Returns unique element for each list in the column, order for each - unique element is not guaranteed. + Returns the unique elements in each list. + The ordering of elements is not guaranteed. Returns ------- - ListColumn + Series or Index Examples -------- @@ -401,12 +406,12 @@ def unique(self): def sort_values( self, - ascending=True, - inplace=False, - kind="quicksort", - na_position="last", - ignore_index=False, - ): + ascending: bool = True, + inplace: bool = False, + kind: str = "quicksort", + na_position: str = "last", + ignore_index: bool = False, + ) -> ParentType: """ Sort each list by the values. @@ -423,7 +428,7 @@ def sort_values( Returns ------- - ListColumn with each list sorted + Series or Index with each list sorted Notes ----- @@ -453,17 +458,47 @@ def sort_values( retain_index=not ignore_index, ) - def ravel(self): + def ravel(self) -> ParentType: + """ + Removes one level of nesting from each row of the list Series. + + Returns + ------- + Series or Index with one level of nesting removed from each row. + + Examples + -------- + >>> s1 + 0 [[1.0, 2.0], [3.0, 4.0, 5.0]] + 1 [[6.0, nan], [7.0], [8.0, 9.0]] + dtype: list + >>> s1.list.ravel() + 0 [1.0, 2.0, 3.0, 4.0, 5.0] + 1 [6.0, nan, 7.0, 8.0, 9.0] + dtype: list + + Null values at the top-level in each row are dropped: + + >>> s2 + 0 [[1.0, 2.0], None, [3.0, 4.0, 5.0]] + 1 [[6.0, nan], [7.0], [8.0, 9.0]] + dtype: list + >>> s2.list.ravel() + 0 [1.0, 2.0, 3.0, 4.0, 5.0] + 1 [6.0, nan, 7.0, 8.0, 9.0] + dtype: list + """ result_dtype = self._column.dtype.element_type if not isinstance(result_dtype, ListDtype): - raise ValueError( - "Cannot ravel a list column with just 1 level of nesting" - ) + return self._return_or_inplace(self._column) self_offsets = self._column.children[0] child_offsets = self._column.children[1].children[0] result_offsets = child_offsets[self_offsets] - result_children = (result_offsets, self._column.list().leaves) + result_children = ( + result_offsets, + self._column.children[1].children[1], + ) return self._return_or_inplace( ListColumn( diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index eec9c2a7860..953da311cc1 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -11,6 +11,8 @@ if TYPE_CHECKING: from cudf.core.column import ColumnBase +ParentType = Union["cudf.Series", "cudf.Index"] + class ColumnMethodsMixin: _column: ColumnBase diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index de2df9b50d7..6fd6f7cbeb8 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -154,7 +154,7 @@ from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import column, datetime -from cudf.core.column.methods import ColumnMethodsMixin +from cudf.core.column.methods import ColumnMethodsMixin, ParentType from cudf.utils import utils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -209,9 +209,6 @@ } -ParentType = Union["cudf.Series", "cudf.Index"] - - class StringMethods(ColumnMethodsMixin): def __init__(self, column, parent=None): """ diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 9906600304b..985cd71ead2 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,5 +1,6 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. import functools +import itertools import numpy as np import pandas as pd @@ -315,3 +316,31 @@ def test_contains_null_search_key(data, expect): expect = cudf.Series(expect, dtype="bool") got = sr.list.contains(cudf.Scalar(cudf.NA, sr.dtype.element_type)) assert_eq(expect, got) + + +@pytest.mark.parametrize( + "row", + [ + [[]], + [[1]], + [[1, 2]], + [[1, 2], [3, 4, 5]], + [[1, 2, None], [3, 4, 5]], + [[[1, 2], [3, 4]], [[5, 6, 7], [8, 9]]], + [[["a", "c", "de", None], None, ["fg"]], [["abc", "de"], None]], + ], +) +def test_ravel(row): + def ravel(row): + return list( + itertools.chain.from_iterable([x for x in row if x is not None]) + ) + + expect = pd.Series([ravel(row)]) + got = cudf.Series([row]).list.ravel() + assert_eq(expect, got) + + +def test_ravel_no_nesting(): + s = cudf.Series([[1, 2], [3, 4, 5]]) + assert_eq(s, s.list.ravel()) From dfd1719ec4e5e570848b5649cf56fb2dc66e7262 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 3 May 2021 10:52:44 -0400 Subject: [PATCH 3/8] Rename ravel -> flatten --- python/cudf/cudf/core/column/lists.py | 6 +++--- python/cudf/cudf/tests/test_list.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 62787770f87..aa1b96ba1f0 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -458,7 +458,7 @@ def sort_values( retain_index=not ignore_index, ) - def ravel(self) -> ParentType: + def flatten(self) -> ParentType: """ Removes one level of nesting from each row of the list Series. @@ -472,7 +472,7 @@ def ravel(self) -> ParentType: 0 [[1.0, 2.0], [3.0, 4.0, 5.0]] 1 [[6.0, nan], [7.0], [8.0, 9.0]] dtype: list - >>> s1.list.ravel() + >>> s1.list.flatten() 0 [1.0, 2.0, 3.0, 4.0, 5.0] 1 [6.0, nan, 7.0, 8.0, 9.0] dtype: list @@ -483,7 +483,7 @@ def ravel(self) -> ParentType: 0 [[1.0, 2.0], None, [3.0, 4.0, 5.0]] 1 [[6.0, nan], [7.0], [8.0, 9.0]] dtype: list - >>> s2.list.ravel() + >>> s2.list.flatten() 0 [1.0, 2.0, 3.0, 4.0, 5.0] 1 [6.0, nan, 7.0, 8.0, 9.0] dtype: list diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 985cd71ead2..37e8934fbff 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -330,17 +330,17 @@ def test_contains_null_search_key(data, expect): [[["a", "c", "de", None], None, ["fg"]], [["abc", "de"], None]], ], ) -def test_ravel(row): - def ravel(row): +def test_flatten(row): + def flatten(row): return list( itertools.chain.from_iterable([x for x in row if x is not None]) ) - expect = pd.Series([ravel(row)]) - got = cudf.Series([row]).list.ravel() + expect = pd.Series([flatten(row)]) + got = cudf.Series([row]).list.flatten() assert_eq(expect, got) -def test_ravel_no_nesting(): +def test_flatten_no_nesting(): s = cudf.Series([[1, 2], [3, 4, 5]]) - assert_eq(s, s.list.ravel()) + assert_eq(s, s.list.flatten()) From f3ddaba49693979ca1ff19967e775b3da6919676 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 3 May 2021 10:57:18 -0400 Subject: [PATCH 4/8] Don't access children by index --- python/cudf/cudf/core/column/lists.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index aa1b96ba1f0..9b686b5a8ca 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -492,13 +492,10 @@ def flatten(self) -> ParentType: if not isinstance(result_dtype, ListDtype): return self._return_or_inplace(self._column) - self_offsets = self._column.children[0] - child_offsets = self._column.children[1].children[0] + self_offsets = self._column.offsets + child_offsets = self._column.elements.offsets result_offsets = child_offsets[self_offsets] - result_children = ( - result_offsets, - self._column.children[1].children[1], - ) + result_children = (result_offsets, self._column.elements.elements) return self._return_or_inplace( ListColumn( From 65cfa914503cdb5575d26d33c4d7ed1e70541764 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 26 May 2021 18:01:17 -0400 Subject: [PATCH 5/8] Add bindings for lists::concatenate_list_elements --- python/cudf/cudf/_lib/cpp/lists/combine.pxd | 16 +++++++ python/cudf/cudf/_lib/lists.pyx | 22 ++++++++- python/cudf/cudf/core/column/lists.py | 51 ++++++++++----------- python/cudf/cudf/tests/test_list.py | 24 ++++------ 4 files changed, 72 insertions(+), 41 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/lists/combine.pxd b/python/cudf/cudf/_lib/cpp/lists/combine.pxd index ea9ade178e2..164253e39b5 100644 --- a/python/cudf/cudf/_lib/cpp/lists/combine.pxd +++ b/python/cudf/cudf/_lib/cpp/lists/combine.pxd @@ -3,10 +3,26 @@ from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table_view cimport table_view cdef extern from "cudf/lists/combine.hpp" namespace \ "cudf::lists" nogil: + + ctypedef enum concatenate_null_policy: + IGNORE "cudf::lists::concatenate_null_policy::IGNORE" + NULLIFY_OUTPUT_ROW \ + "cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW" + cdef unique_ptr[column] concatenate_rows( const table_view input_table ) except + + + cdef unique_ptr[column] concatenate_list_elements( + const table_view input_table, + ) except + + + cdef unique_ptr[column] concatenate_list_elements( + const column_view input_table, + concatenate_null_policy null_policy + ) except + diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 7d8909610dc..9fd7d7611ae 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -17,8 +17,11 @@ from cudf._lib.cpp.lists.sorting cimport ( sort_lists as cpp_sort_lists ) from cudf._lib.cpp.lists.combine cimport ( - concatenate_rows as cpp_concatenate_rows + concatenate_rows as cpp_concatenate_rows, + concatenate_null_policy, + concatenate_list_elements as cpp_concatenate_list_elements ) + from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.column.column cimport column @@ -181,3 +184,20 @@ def concatenate_rows(Table tbl): result = Column.from_unique_ptr(move(c_result)) return result + + +def concatenate_list_elements(Column input_column, dropna=False): + cdef concatenate_null_policy policy = ( + concatenate_null_policy.IGNORE if dropna + else concatenate_null_policy.NULLIFY_OUTPUT_ROW + ) + cdef column_view c_input = input_column.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_concatenate_list_elements( + c_input, + policy + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 219f4a18a5d..2b6badff02b 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -8,6 +8,7 @@ import cudf from cudf._lib.copying import segmented_gather from cudf._lib.lists import ( + concatenate_list_elements, concatenate_rows, contains_scalar, count_elements, @@ -518,52 +519,50 @@ def sort_values( retain_index=not ignore_index, ) - def flatten(self) -> ParentType: + def concat(self, dropna=True) -> ParentType: """ - Removes one level of nesting from each row of the list Series. + Concatenates the lists in each row + + Parameters + ---------- + dropna: bool,optional + If True (default), ignores top-level null elements in each row. + If False, and top-level null elements are present, the resulting + row in the output is null. Returns ------- - Series or Index with one level of nesting removed from each row. + Series or Index Examples -------- >>> s1 0 [[1.0, 2.0], [3.0, 4.0, 5.0]] - 1 [[6.0, nan], [7.0], [8.0, 9.0]] + 1 [[6.0, None], [7.0], [8.0, 9.0]] dtype: list - >>> s1.list.flatten() + >>> s1.list.concat() 0 [1.0, 2.0, 3.0, 4.0, 5.0] - 1 [6.0, nan, 7.0, 8.0, 9.0] + 1 [6.0, None, 7.0, 8.0, 9.0] dtype: list - Null values at the top-level in each row are dropped: + Null values at the top-level in each row are dropped by default: >>> s2 0 [[1.0, 2.0], None, [3.0, 4.0, 5.0]] - 1 [[6.0, nan], [7.0], [8.0, 9.0]] + 1 [[6.0, None], [7.0], [8.0, 9.0]] dtype: list - >>> s2.list.flatten() + >>> s2.list.concat() 0 [1.0, 2.0, 3.0, 4.0, 5.0] - 1 [6.0, nan, 7.0, 8.0, 9.0] + 1 [6.0, None, 7.0, 8.0, 9.0] dtype: list - """ - result_dtype = self._column.dtype.element_type - if not isinstance(result_dtype, ListDtype): - return self._return_or_inplace(self._column) - self_offsets = self._column.offsets - child_offsets = self._column.elements.offsets - result_offsets = child_offsets[self_offsets] - result_children = (result_offsets, self._column.elements.elements) + Use ``dropna=False`` to produce a null instead: + >>> s2.list.concat(dropna=False) + 0 None + 1 [6.0, nan, 7.0, 8.0, 9.0] + dtype: list + """ return self._return_or_inplace( - ListColumn( - self._column.size, - self._column.dtype.element_type, - self._column.mask, - self._column.offset, - self._column.null_count, - result_children, - ) + concatenate_list_elements(self._column, dropna=dropna) ) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 6f959c97f2e..11e3fcd0b6f 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,6 +1,6 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. import functools -import itertools +import operator import numpy as np import pandas as pd @@ -331,20 +331,16 @@ def test_contains_null_search_key(data, expect): [[["a", "c", "de", None], None, ["fg"]], [["abc", "de"], None]], ], ) -def test_flatten(row): - def flatten(row): - return list( - itertools.chain.from_iterable([x for x in row if x is not None]) - ) - - expect = pd.Series([flatten(row)]) - got = cudf.Series([row]).list.flatten() - assert_eq(expect, got) - +@pytest.mark.parametrize("dropna", [True, False]) +def test_concat_elements(row, dropna): + if not dropna and any(x is None for x in row): + result = None + else: + result = functools.reduce(operator.add, row) -def test_flatten_no_nesting(): - s = cudf.Series([[1, 2], [3, 4, 5]]) - assert_eq(s, s.list.flatten()) + expect = pd.Series([result]) + got = cudf.Series([row]).list.concat(dropna=dropna) + assert_eq(expect, got) def test_concatenate_rows_of_lists(): From 48f694980989a2d0d2280e97660a66ee1b82106b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 26 May 2021 18:14:54 -0400 Subject: [PATCH 6/8] Fix test --- python/cudf/cudf/tests/test_list.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 11e3fcd0b6f..34afd2a5036 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -326,15 +326,22 @@ def test_contains_null_search_key(data, expect): [[1]], [[1, 2]], [[1, 2], [3, 4, 5]], + [[1, 2], [], [3, 4, 5]], [[1, 2, None], [3, 4, 5]], + [[1, 2, None], None, [3, 4, 5]], + [[1, 2, None], None, [], [3, 4, 5]], [[[1, 2], [3, 4]], [[5, 6, 7], [8, 9]]], [[["a", "c", "de", None], None, ["fg"]], [["abc", "de"], None]], ], ) @pytest.mark.parametrize("dropna", [True, False]) def test_concat_elements(row, dropna): - if not dropna and any(x is None for x in row): - result = None + if any(x is None for x in row): + if dropna: + row = [x for x in row if x is not None] + result = functools.reduce(operator.add, row) + else: + result = None else: result = functools.reduce(operator.add, row) From 978807aab98fede136a037b5d881a92734d04462 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 26 May 2021 18:23:14 -0400 Subject: [PATCH 7/8] Test raise case --- python/cudf/cudf/core/column/lists.py | 16 ++++++++++++---- python/cudf/cudf/tests/test_list.py | 6 ++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 2b6badff02b..d217309b976 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -521,7 +521,8 @@ def sort_values( def concat(self, dropna=True) -> ParentType: """ - Concatenates the lists in each row + For a column with at least one level of nesting, concatenate the + lists in each row. Parameters ---------- @@ -563,6 +564,13 @@ def concat(self, dropna=True) -> ParentType: 1 [6.0, nan, 7.0, 8.0, 9.0] dtype: list """ - return self._return_or_inplace( - concatenate_list_elements(self._column, dropna=dropna) - ) + try: + result = concatenate_list_elements(self._column, dropna=dropna) + except RuntimeError as e: + if "Rows of the input column must be lists." in str(e): + raise ValueError( + "list.concat() can only be called on " + "list columns with at least one level " + "of nesting" + ) + return self._return_or_inplace(result) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 34afd2a5036..f4bd4dec7b4 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -350,6 +350,12 @@ def test_concat_elements(row, dropna): assert_eq(expect, got) +def test_concat_elements_raise(): + s = cudf.Series([[1, 2, 3]]) # no nesting + with pytest.raises(ValueError): + s.list.concat() + + def test_concatenate_rows_of_lists(): pdf = pd.DataFrame({"val": [["a", "a"], ["b"], ["c"]]}) gdf = cudf.from_pandas(pdf) From cdb5f4b7f6ea2b8a91cd449d84548f6f081070ef Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Wed, 26 May 2021 18:23:54 -0400 Subject: [PATCH 8/8] Update python/cudf/cudf/core/column/lists.py Co-authored-by: Nghia Truong --- python/cudf/cudf/core/column/lists.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index d217309b976..2eb72ae0c2d 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -526,7 +526,7 @@ def concat(self, dropna=True) -> ParentType: Parameters ---------- - dropna: bool,optional + dropna: bool, optional If True (default), ignores top-level null elements in each row. If False, and top-level null elements are present, the resulting row in the output is null.