From df45976b5e27f565d9a4d6a435a74a59b2b4a9d6 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Wed, 30 Jun 2021 19:12:23 -0400 Subject: [PATCH] Add Python bindings for `lists::concatenate_list_elements` and expose them as `.list.concat()` (#8006) Adds a method to concatenate the lists in a nested list Series: ```python In [15]: s Out[15]: 0 [[1, 2], [3, 4]] dtype: list In [16]: s.list.concat() Out[16]: 0 [1, 2, 3, 4] dtype: list ``` Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - Nghia Truong (https://github.com/ttnghia) - GALI PREM SAGAR (https://github.com/galipremsagar) - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/cudf/pull/8006 --- python/cudf/cudf/_lib/cpp/lists/combine.pxd | 16 +++ python/cudf/cudf/_lib/lists.pyx | 22 +++- python/cudf/cudf/core/column/categorical.py | 6 +- python/cudf/cudf/core/column/lists.py | 111 +++++++++++++++----- python/cudf/cudf/core/column/methods.py | 12 ++- python/cudf/cudf/core/column/string.py | 5 +- python/cudf/cudf/tests/test_list.py | 38 +++++++ 7 files changed, 170 insertions(+), 40 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/lists/combine.pxd b/python/cudf/cudf/_lib/cpp/lists/combine.pxd index ea9ade178e2..164253e39b5 100644 --- a/python/cudf/cudf/_lib/cpp/lists/combine.pxd +++ b/python/cudf/cudf/_lib/cpp/lists/combine.pxd @@ -3,10 +3,26 @@ from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table_view cimport table_view cdef extern from "cudf/lists/combine.hpp" namespace \ "cudf::lists" nogil: + + ctypedef enum concatenate_null_policy: + IGNORE "cudf::lists::concatenate_null_policy::IGNORE" + NULLIFY_OUTPUT_ROW \ + "cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW" + cdef unique_ptr[column] concatenate_rows( const table_view input_table ) except + + + cdef unique_ptr[column] 
concatenate_list_elements( + const table_view input_table, + ) except + + + cdef unique_ptr[column] concatenate_list_elements( + const column_view input_table, + concatenate_null_policy null_policy + ) except + diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 7d8909610dc..9fd7d7611ae 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -17,8 +17,11 @@ from cudf._lib.cpp.lists.sorting cimport ( sort_lists as cpp_sort_lists ) from cudf._lib.cpp.lists.combine cimport ( - concatenate_rows as cpp_concatenate_rows + concatenate_rows as cpp_concatenate_rows, + concatenate_null_policy, + concatenate_list_elements as cpp_concatenate_list_elements ) + from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.column.column cimport column @@ -181,3 +184,20 @@ def concatenate_rows(Table tbl): result = Column.from_unique_ptr(move(c_result)) return result + + +def concatenate_list_elements(Column input_column, dropna=False): + cdef concatenate_null_policy policy = ( + concatenate_null_policy.IGNORE if dropna + else concatenate_null_policy.NULLIFY_OUTPUT_ROW + ) + cdef column_view c_input = input_column.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_concatenate_list_elements( + c_input, + policy + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 135fb6e6f30..cbcc30d38a7 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -12,7 +12,6 @@ Optional, Sequence, Tuple, - Union, cast, ) @@ -28,7 +27,7 @@ from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import column -from cudf.core.column.methods import ColumnMethodsMixin +from cudf.core.column.methods import 
ColumnMethodsMixin, ParentType from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import ( is_categorical_dtype, @@ -48,9 +47,6 @@ ) -ParentType = Union["cudf.Series", "cudf.Index"] - - class CategoricalAccessor(ColumnMethodsMixin): _column: CategoricalColumn diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index ff63d8c5aaa..843190f38aa 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -8,6 +8,7 @@ import cudf from cudf._lib.copying import segmented_gather from cudf._lib.lists import ( + concatenate_list_elements, concatenate_rows, contains_scalar, count_elements, @@ -16,15 +17,17 @@ sort_lists, ) from cudf._lib.table import Table -from cudf._typing import BinaryOperand, Dtype +from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column -from cudf.core.column.methods import ColumnMethodsMixin +from cudf.core.column.methods import ColumnMethodsMixin, ParentType from cudf.core.dtypes import ListDtype from cudf.utils.dtypes import _is_non_decimal_numeric_dtype, is_list_dtype class ListColumn(ColumnBase): + dtype: ListDtype + def __init__( self, size, dtype, mask=None, offset=0, null_count=None, children=(), ): @@ -278,14 +281,16 @@ class ListMethods(ColumnMethodsMixin): List methods for Series """ - def __init__(self, column, parent=None): + _column: ListColumn + + def __init__(self, column: ListColumn, parent: ParentType = None): if not is_list_dtype(column.dtype): raise AttributeError( "Can only use .list accessor with a 'list' dtype" ) super().__init__(column=column, parent=parent) - def get(self, index): + def get(self, index: int) -> ParentType: """ Extract element at the given index from each component @@ -317,10 +322,10 @@ def get(self, index): else: raise IndexError("list index out of range") - def contains(self, search_key): + def contains(self, 
search_key: ScalarLike) -> ParentType: """ - Creates a column of bool values indicating whether the specified scalar - is an element of each row of a list column. + Returns boolean values indicating whether the specified scalar + is an element of each row. Parameters ---------- @@ -329,7 +334,7 @@ def contains(self, search_key): Returns ------- - Column + Series or Index Examples -------- @@ -357,14 +362,14 @@ def contains(self, search_key): return res @property - def leaves(self): + def leaves(self) -> ParentType: """ From a Series of (possibly nested) lists, obtain the elements from the innermost lists as a flat Series (one value per row). Returns ------- - Series + Series or Index Examples -------- @@ -385,7 +390,7 @@ def leaves(self): self._column.elements, retain_index=False ) - def len(self): + def len(self) -> ParentType: """ Computes the length of each element in the Series/Index. @@ -409,18 +414,18 @@ def len(self): """ return self._return_or_inplace(count_elements(self._column)) - def take(self, lists_indices): + def take(self, lists_indices: ColumnLike) -> ParentType: """ Collect list elements based on given indices. Parameters ---------- - lists_indices: List type arrays + lists_indices: Series-like of lists Specifies what to collect from each row Returns ------- - ListColumn + Series or Index Examples -------- @@ -464,14 +469,14 @@ def take(self, lists_indices): else: return res - def unique(self): + def unique(self) -> ParentType: """ - Returns unique element for each list in the column, order for each - unique element is not guaranteed. + Returns the unique elements in each list. + The ordering of elements is not guaranteed. 
Returns ------- - ListColumn + Series or Index Examples -------- @@ -501,12 +506,12 @@ def unique(self): def sort_values( self, - ascending=True, - inplace=False, - kind="quicksort", - na_position="last", - ignore_index=False, - ): + ascending: bool = True, + inplace: bool = False, + kind: str = "quicksort", + na_position: str = "last", + ignore_index: bool = False, + ) -> ParentType: """ Sort each list by the values. @@ -523,7 +528,7 @@ def sort_values( Returns ------- - ListColumn with each list sorted + Series or Index with each list sorted Notes ----- @@ -552,3 +557,59 @@ def sort_values( sort_lists(self._column, ascending, na_position), retain_index=not ignore_index, ) + + def concat(self, dropna=True) -> ParentType: + """ + For a column with at least one level of nesting, concatenate the + lists in each row. + + Parameters + ---------- + dropna: bool, optional + If True (default), ignores top-level null elements in each row. + If False, and top-level null elements are present, the resulting + row in the output is null. + + Returns + ------- + Series or Index + + Examples + -------- + >>> s1 + 0 [[1.0, 2.0], [3.0, 4.0, 5.0]] + 1 [[6.0, None], [7.0], [8.0, 9.0]] + dtype: list + >>> s1.list.concat() + 0 [1.0, 2.0, 3.0, 4.0, 5.0] + 1 [6.0, None, 7.0, 8.0, 9.0] + dtype: list + + Null values at the top-level in each row are dropped by default: + + >>> s2 + 0 [[1.0, 2.0], None, [3.0, 4.0, 5.0]] + 1 [[6.0, None], [7.0], [8.0, 9.0]] + dtype: list + >>> s2.list.concat() + 0 [1.0, 2.0, 3.0, 4.0, 5.0] + 1 [6.0, None, 7.0, 8.0, 9.0] + dtype: list + + Use ``dropna=False`` to produce a null instead: + + >>> s2.list.concat(dropna=False) + 0 None + 1 [6.0, nan, 7.0, 8.0, 9.0] + dtype: list + """ + try: + result = concatenate_list_elements(self._column, dropna=dropna) + except RuntimeError as e: + if "Rows of the input column must be lists." 
in str(e): + raise ValueError( + "list.concat() can only be called on " + "list columns with at least one level of nesting" + ) from e + raise  # propagate unrelated RuntimeErrors; `result` is unbound here + return self._return_or_inplace(result) diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index d7b416d06c9..4b448e27a53 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -11,15 +11,17 @@ if TYPE_CHECKING: from cudf.core.column import ColumnBase +ParentType = Union["cudf.Series", "cudf.BaseIndex"] + class ColumnMethodsMixin: _column: ColumnBase - _parent: Optional[Union["cudf.Series", "cudf.Index"]] + _parent: Optional[Union["cudf.Series", "cudf.BaseIndex"]] def __init__( self, column: ColumnBase, - parent: Union["cudf.Series", "cudf.Index"] = None, + parent: Union["cudf.Series", "cudf.BaseIndex"] = None, ): self._column = column self._parent = parent @@ -27,13 +29,13 @@ def __init__( @overload def _return_or_inplace( self, new_col, inplace: Literal[False], expand=False, retain_index=True - ) -> Union["cudf.Series", "cudf.Index"]: + ) -> Union["cudf.Series", "cudf.BaseIndex"]: ... @overload def _return_or_inplace( self, new_col, expand: bool = False, retain_index: bool = True - ) -> Union["cudf.Series", "cudf.Index"]: + ) -> Union["cudf.Series", "cudf.BaseIndex"]: ... @overload @@ -49,7 +51,7 @@ def _return_or_inplace( inplace: bool = False, expand: bool = False, retain_index: bool = True, - ) -> Optional[Union["cudf.Series", "cudf.Index"]]: + ) -> Optional[Union["cudf.Series", "cudf.BaseIndex"]]: ...
def _return_or_inplace( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index a6a9de2e77b..11051b63920 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -161,7 +161,7 @@ from cudf.api.types import is_integer from cudf.core.buffer import Buffer from cudf.core.column import column, datetime -from cudf.core.column.methods import ColumnMethodsMixin +from cudf.core.column.methods import ColumnMethodsMixin, ParentType from cudf.utils import utils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -216,9 +216,6 @@ } -ParentType = Union["cudf.Series", "cudf.core.index.BaseIndex"] - - class StringMethods(ColumnMethodsMixin): def __init__(self, column, parent=None): """ diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 0e93ff8a232..abd24ddd0fd 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,5 +1,6 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
import functools +import operator import numpy as np import pandas as pd @@ -324,6 +325,43 @@ def test_contains_null_search_key(data, expect): assert_eq(expect, got) +@pytest.mark.parametrize( + "row", + [ + [[]], + [[1]], + [[1, 2]], + [[1, 2], [3, 4, 5]], + [[1, 2], [], [3, 4, 5]], + [[1, 2, None], [3, 4, 5]], + [[1, 2, None], None, [3, 4, 5]], + [[1, 2, None], None, [], [3, 4, 5]], + [[[1, 2], [3, 4]], [[5, 6, 7], [8, 9]]], + [[["a", "c", "de", None], None, ["fg"]], [["abc", "de"], None]], + ], +) +@pytest.mark.parametrize("dropna", [True, False]) +def test_concat_elements(row, dropna): + if any(x is None for x in row): + if dropna: + row = [x for x in row if x is not None] + result = functools.reduce(operator.add, row) + else: + result = None + else: + result = functools.reduce(operator.add, row) + + expect = pd.Series([result]) + got = cudf.Series([row]).list.concat(dropna=dropna) + assert_eq(expect, got) + + +def test_concat_elements_raise(): + s = cudf.Series([[1, 2, 3]]) # no nesting + with pytest.raises(ValueError): + s.list.concat() + + def test_concatenate_rows_of_lists(): pdf = pd.DataFrame({"val": [["a", "a"], ["b"], ["c"]]}) gdf = cudf.from_pandas(pdf)