Skip to content

Commit

Permalink
Add Python bindings for lists::concatenate_list_elements and expose…
Browse files Browse the repository at this point in the history
… them as `.list.concat()` (#8006)

Adds a method to concatenate the lists in a nested list Series:

```python
In [15]: s
Out[15]:
0    [[1, 2], [3, 4]]
dtype: list

In [16]: s.list.concat()
Out[16]:
0    [1, 2, 3, 4]
dtype: list
```

Authors:
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

URL: #8006
  • Loading branch information
shwina authored Jun 30, 2021
1 parent 5884b95 commit df45976
Show file tree
Hide file tree
Showing 7 changed files with 170 additions and 40 deletions.
16 changes: 16 additions & 0 deletions python/cudf/cudf/_lib/cpp/lists/combine.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,26 @@
from libcpp.memory cimport unique_ptr

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.table.table_view cimport table_view

# Cython declarations for the list-combining functions that libcudf exposes in
# "cudf/lists/combine.hpp" (C++ namespace cudf::lists). The whole block is
# declared nogil, so these functions may be called with the GIL released.
cdef extern from "cudf/lists/combine.hpp" namespace \
"cudf::lists" nogil:

# Null-handling policy accepted by concatenate_list_elements. Each member is
# bound to its fully qualified C++ name via the string that follows it.
ctypedef enum concatenate_null_policy:
IGNORE "cudf::lists::concatenate_null_policy::IGNORE"
NULLIFY_OUTPUT_ROW \
"cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW"

# Takes a table_view and returns a newly allocated column; `except +` lets
# Cython translate C++ exceptions into Python exceptions.
cdef unique_ptr[column] concatenate_rows(
const table_view input_table
) except +

# NOTE(review): this table_view overload of concatenate_list_elements is not
# called by the wrapper shown in lists.pyx (which passes a column_view and a
# policy) -- confirm that the C++ header really provides it.
cdef unique_ptr[column] concatenate_list_elements(
const table_view input_table,
) except +

# Overload used by the Python bindings: a column_view plus a null policy.
# NOTE(review): the parameter is a column_view although it is named
# `input_table` -- presumably a copy/paste leftover; confirm before renaming.
cdef unique_ptr[column] concatenate_list_elements(
const column_view input_table,
concatenate_null_policy null_policy
) except +
22 changes: 21 additions & 1 deletion python/cudf/cudf/_lib/lists.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,11 @@ from cudf._lib.cpp.lists.sorting cimport (
sort_lists as cpp_sort_lists
)
from cudf._lib.cpp.lists.combine cimport (
concatenate_rows as cpp_concatenate_rows
concatenate_rows as cpp_concatenate_rows,
concatenate_null_policy,
concatenate_list_elements as cpp_concatenate_list_elements
)

from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.column.column cimport column
Expand Down Expand Up @@ -181,3 +184,20 @@ def concatenate_rows(Table tbl):

result = Column.from_unique_ptr(move(c_result))
return result


def concatenate_list_elements(Column input_column, dropna=False):
    """
    Call libcudf's ``concatenate_list_elements`` on a single column.

    Parameters
    ----------
    input_column : Column
        Column whose view is passed to libcudf.
    dropna : bool, default False
        When truthy the ``IGNORE`` null policy is used; otherwise the
        ``NULLIFY_OUTPUT_ROW`` policy is used.

    Returns
    -------
    Column
        Column built from the ``unique_ptr`` returned by libcudf.
    """
    cdef concatenate_null_policy null_policy
    cdef column_view source_view
    cdef unique_ptr[column] owned_result

    # Translate the Python-level flag into the libcudf policy enum.
    if dropna:
        null_policy = concatenate_null_policy.IGNORE
    else:
        null_policy = concatenate_null_policy.NULLIFY_OUTPUT_ROW
    source_view = input_column.view()

    # The libcudf call itself runs with the GIL released.
    with nogil:
        owned_result = move(
            cpp_concatenate_list_elements(source_view, null_policy)
        )

    return Column.from_unique_ptr(move(owned_result))
6 changes: 1 addition & 5 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
Optional,
Sequence,
Tuple,
Union,
cast,
)

Expand All @@ -28,7 +27,7 @@
from cudf._typing import ColumnLike, Dtype, ScalarLike
from cudf.core.buffer import Buffer
from cudf.core.column import column
from cudf.core.column.methods import ColumnMethodsMixin
from cudf.core.column.methods import ColumnMethodsMixin, ParentType
from cudf.core.dtypes import CategoricalDtype
from cudf.utils.dtypes import (
is_categorical_dtype,
Expand All @@ -48,9 +47,6 @@
)


ParentType = Union["cudf.Series", "cudf.Index"]


class CategoricalAccessor(ColumnMethodsMixin):
_column: CategoricalColumn

Expand Down
111 changes: 86 additions & 25 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import cudf
from cudf._lib.copying import segmented_gather
from cudf._lib.lists import (
concatenate_list_elements,
concatenate_rows,
contains_scalar,
count_elements,
Expand All @@ -16,15 +17,17 @@
sort_lists,
)
from cudf._lib.table import Table
from cudf._typing import BinaryOperand, Dtype
from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase, as_column, column
from cudf.core.column.methods import ColumnMethodsMixin
from cudf.core.column.methods import ColumnMethodsMixin, ParentType
from cudf.core.dtypes import ListDtype
from cudf.utils.dtypes import _is_non_decimal_numeric_dtype, is_list_dtype


class ListColumn(ColumnBase):
dtype: ListDtype

def __init__(
self, size, dtype, mask=None, offset=0, null_count=None, children=(),
):
Expand Down Expand Up @@ -278,14 +281,16 @@ class ListMethods(ColumnMethodsMixin):
List methods for Series
"""

def __init__(self, column, parent=None):
_column: ListColumn

def __init__(self, column: ListColumn, parent: ParentType = None):
if not is_list_dtype(column.dtype):
raise AttributeError(
"Can only use .list accessor with a 'list' dtype"
)
super().__init__(column=column, parent=parent)

def get(self, index):
def get(self, index: int) -> ParentType:
"""
Extract element at the given index from each component
Expand Down Expand Up @@ -317,10 +322,10 @@ def get(self, index):
else:
raise IndexError("list index out of range")

def contains(self, search_key):
def contains(self, search_key: ScalarLike) -> ParentType:
"""
Creates a column of bool values indicating whether the specified scalar
is an element of each row of a list column.
Returns boolean values indicating whether the specified scalar
is an element of each row.
Parameters
----------
Expand All @@ -329,7 +334,7 @@ def contains(self, search_key):
Returns
-------
Column
Series or Index
Examples
--------
Expand Down Expand Up @@ -357,14 +362,14 @@ def contains(self, search_key):
return res

@property
def leaves(self):
def leaves(self) -> ParentType:
"""
From a Series of (possibly nested) lists, obtain the elements from
the innermost lists as a flat Series (one value per row).
Returns
-------
Series
Series or Index
Examples
--------
Expand All @@ -385,7 +390,7 @@ def leaves(self):
self._column.elements, retain_index=False
)

def len(self):
def len(self) -> ParentType:
"""
Computes the length of each element in the Series/Index.
Expand All @@ -409,18 +414,18 @@ def len(self):
"""
return self._return_or_inplace(count_elements(self._column))

def take(self, lists_indices):
def take(self, lists_indices: ColumnLike) -> ParentType:
"""
Collect list elements based on given indices.
Parameters
----------
lists_indices: List type arrays
lists_indices: Series-like of lists
Specifies what to collect from each row
Returns
-------
ListColumn
Series or Index
Examples
--------
Expand Down Expand Up @@ -464,14 +469,14 @@ def take(self, lists_indices):
else:
return res

def unique(self):
def unique(self) -> ParentType:
"""
Returns unique element for each list in the column, order for each
unique element is not guaranteed.
Returns the unique elements in each list.
The ordering of elements is not guaranteed.
Returns
-------
ListColumn
Series or Index
Examples
--------
Expand Down Expand Up @@ -501,12 +506,12 @@ def unique(self):

def sort_values(
self,
ascending=True,
inplace=False,
kind="quicksort",
na_position="last",
ignore_index=False,
):
ascending: bool = True,
inplace: bool = False,
kind: str = "quicksort",
na_position: str = "last",
ignore_index: bool = False,
) -> ParentType:
"""
Sort each list by the values.
Expand All @@ -523,7 +528,7 @@ def sort_values(
Returns
-------
ListColumn with each list sorted
Series or Index with each list sorted
Notes
-----
Expand Down Expand Up @@ -552,3 +557,59 @@ def sort_values(
sort_lists(self._column, ascending, na_position),
retain_index=not ignore_index,
)

def concat(self, dropna: bool = True) -> ParentType:
    """
    For a column with at least one level of nesting, concatenate the
    lists in each row.

    Parameters
    ----------
    dropna: bool, optional
        If True (default), ignores top-level null elements in each row.
        If False, and top-level null elements are present, the resulting
        row in the output is null.

    Returns
    -------
    Series or Index

    Raises
    ------
    ValueError
        If the rows of the column are not themselves lists, i.e. the
        column does not have at least one level of nesting.
    RuntimeError
        Any other error reported by the underlying
        ``concatenate_list_elements`` call is propagated unchanged.

    Examples
    --------
    >>> s1
    0       [[1.0, 2.0], [3.0, 4.0, 5.0]]
    1    [[6.0, None], [7.0], [8.0, 9.0]]
    dtype: list
    >>> s1.list.concat()
    0     [1.0, 2.0, 3.0, 4.0, 5.0]
    1    [6.0, None, 7.0, 8.0, 9.0]
    dtype: list

    Null values at the top-level in each row are dropped by default:

    >>> s2
    0    [[1.0, 2.0], None, [3.0, 4.0, 5.0]]
    1       [[6.0, None], [7.0], [8.0, 9.0]]
    dtype: list
    >>> s2.list.concat()
    0     [1.0, 2.0, 3.0, 4.0, 5.0]
    1    [6.0, None, 7.0, 8.0, 9.0]
    dtype: list

    Use ``dropna=False`` to produce a null instead:

    >>> s2.list.concat(dropna=False)
    0                         None
    1    [6.0, nan, 7.0, 8.0, 9.0]
    dtype: list
    """
    try:
        result = concatenate_list_elements(self._column, dropna=dropna)
    except RuntimeError as e:
        if "Rows of the input column must be lists." in str(e):
            # Translate the low-level message into a user-facing error,
            # keeping the original exception as the cause for debugging.
            raise ValueError(
                "list.concat() can only be called on "
                "list columns with at least one level "
                "of nesting"
            ) from e
        # Any other failure must propagate: swallowing it would leave
        # ``result`` unbound and surface as a confusing UnboundLocalError.
        raise
    return self._return_or_inplace(result)
12 changes: 7 additions & 5 deletions python/cudf/cudf/core/column/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,29 +11,31 @@
if TYPE_CHECKING:
from cudf.core.column import ColumnBase

ParentType = Union["cudf.Series", "cudf.BaseIndex"]


class ColumnMethodsMixin:
_column: ColumnBase
_parent: Optional[Union["cudf.Series", "cudf.Index"]]
_parent: Optional[Union["cudf.Series", "cudf.BaseIndex"]]

def __init__(
self,
column: ColumnBase,
parent: Union["cudf.Series", "cudf.Index"] = None,
parent: Union["cudf.Series", "cudf.BaseIndex"] = None,
):
self._column = column
self._parent = parent

@overload
def _return_or_inplace(
self, new_col, inplace: Literal[False], expand=False, retain_index=True
) -> Union["cudf.Series", "cudf.Index"]:
) -> Union["cudf.Series", "cudf.BaseIndex"]:
...

@overload
def _return_or_inplace(
self, new_col, expand: bool = False, retain_index: bool = True
) -> Union["cudf.Series", "cudf.Index"]:
) -> Union["cudf.Series", "cudf.BaseIndex"]:
...

@overload
Expand All @@ -49,7 +51,7 @@ def _return_or_inplace(
inplace: bool = False,
expand: bool = False,
retain_index: bool = True,
) -> Optional[Union["cudf.Series", "cudf.Index"]]:
) -> Optional[Union["cudf.Series", "cudf.BaseIndex"]]:
...

def _return_or_inplace(
Expand Down
5 changes: 1 addition & 4 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@
from cudf.api.types import is_integer
from cudf.core.buffer import Buffer
from cudf.core.column import column, datetime
from cudf.core.column.methods import ColumnMethodsMixin
from cudf.core.column.methods import ColumnMethodsMixin, ParentType
from cudf.utils import utils
from cudf.utils.docutils import copy_docstring
from cudf.utils.dtypes import (
Expand Down Expand Up @@ -216,9 +216,6 @@
}


ParentType = Union["cudf.Series", "cudf.core.index.BaseIndex"]


class StringMethods(ColumnMethodsMixin):
def __init__(self, column, parent=None):
"""
Expand Down
Loading

0 comments on commit df45976

Please sign in to comment.