Add Python bindings for lists::concatenate_list_elements and expose…

… them as `.list.concat()` (#8006)

Adds a method to concatenate the lists in a nested list Series:

```python
In [15]: s
Out[15]:
0    [[1, 2], [3, 4]]
dtype: list

In [16]: s.list.concat()
Out[16]:
0    [1, 2, 3, 4]
dtype: list
```

Authors:
- Ashwin Srinath (https://github.com/shwina)

Approvers:
- Nghia Truong (https://github.com/ttnghia)
- GALI PREM SAGAR (https://github.com/galipremsagar)
- Charles Blackmon-Luca (https://github.com/charlesbluca)

URL: #8006
rapidsai · Jun 30, 2021 · df45976 · df45976
1 parent 5884b95
commit df45976
Show file tree

Hide file tree

Showing 7 changed files with 170 additions and 40 deletions.
diff --git a/python/cudf/cudf/_lib/cpp/lists/combine.pxd b/python/cudf/cudf/_lib/cpp/lists/combine.pxd
@@ -3,10 +3,26 @@
 from libcpp.memory cimport unique_ptr
 
 from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.table.table_view cimport table_view
 
 cdef extern from "cudf/lists/combine.hpp" namespace \
         "cudf::lists" nogil:
+
+    # Policy passed to concatenate_list_elements for null list elements.
+    # The Python binding selects IGNORE when ``dropna`` is truthy and
+    # NULLIFY_OUTPUT_ROW otherwise.
+    ctypedef enum concatenate_null_policy:
+        IGNORE "cudf::lists::concatenate_null_policy::IGNORE"
+        NULLIFY_OUTPUT_ROW \
+            "cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW"
+
     cdef unique_ptr[column] concatenate_rows(
         const table_view input_table
     ) except +
+
+    # NOTE(review): the binding in lists.pyx only calls the column_view
+    # overload below; confirm that combine.hpp really provides this
+    # table_view form before relying on it.
+    cdef unique_ptr[column] concatenate_list_elements(
+        const table_view input_table,
+    ) except +
+
+    # Overload used by the Python binding: takes a single column plus the
+    # null policy. The parameter is a column view, so it is named
+    # accordingly (extern functions are called positionally, so the name
+    # is documentation only).
+    cdef unique_ptr[column] concatenate_list_elements(
+        const column_view input_column,
+        concatenate_null_policy null_policy
+    ) except +
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
@@ -17,8 +17,11 @@ from cudf._lib.cpp.lists.sorting cimport (
     sort_lists as cpp_sort_lists
 )
 from cudf._lib.cpp.lists.combine cimport (
-    concatenate_rows as cpp_concatenate_rows
+    concatenate_rows as cpp_concatenate_rows,
+    concatenate_null_policy,
+    concatenate_list_elements as cpp_concatenate_list_elements
 )
+
 from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.column.column cimport column
@@ -181,3 +184,20 @@ def concatenate_rows(Table tbl):
 
     result = Column.from_unique_ptr(move(c_result))
     return result
+
+
+def concatenate_list_elements(Column input_column, dropna=False):
+    """
+    Concatenate the list elements of ``input_column`` by calling
+    ``cudf::lists::concatenate_list_elements``.
+
+    Parameters
+    ----------
+    input_column : Column
+        The column whose view is handed to libcudf.
+    dropna : bool, default False
+        When truthy the ``IGNORE`` null policy is used, otherwise
+        ``NULLIFY_OUTPUT_ROW``.
+
+    Returns
+    -------
+    Column
+        A new column that takes ownership of the libcudf result.
+    """
+    cdef concatenate_null_policy null_policy
+    if dropna:
+        null_policy = concatenate_null_policy.IGNORE
+    else:
+        null_policy = concatenate_null_policy.NULLIFY_OUTPUT_ROW
+
+    cdef column_view list_view = input_column.view()
+    cdef unique_ptr[column] c_output
+
+    # The libcudf call needs no Python objects, so release the GIL.
+    with nogil:
+        c_output = move(
+            cpp_concatenate_list_elements(list_view, null_policy)
+        )
+
+    return Column.from_unique_ptr(move(c_output))
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
@@ -12,7 +12,6 @@
     Optional,
     Sequence,
     Tuple,
-    Union,
     cast,
 )
 
@@ -28,7 +27,7 @@
 from cudf._typing import ColumnLike, Dtype, ScalarLike
 from cudf.core.buffer import Buffer
 from cudf.core.column import column
-from cudf.core.column.methods import ColumnMethodsMixin
+from cudf.core.column.methods import ColumnMethodsMixin, ParentType
 from cudf.core.dtypes import CategoricalDtype
 from cudf.utils.dtypes import (
     is_categorical_dtype,
@@ -48,9 +47,6 @@
     )
 
 
-ParentType = Union["cudf.Series", "cudf.Index"]
-
-
 class CategoricalAccessor(ColumnMethodsMixin):
     _column: CategoricalColumn
 

diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
@@ -8,6 +8,7 @@
 import cudf
 from cudf._lib.copying import segmented_gather
 from cudf._lib.lists import (
+    concatenate_list_elements,
     concatenate_rows,
     contains_scalar,
     count_elements,
@@ -16,15 +17,17 @@
     sort_lists,
 )
 from cudf._lib.table import Table
-from cudf._typing import BinaryOperand, Dtype
+from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike
 from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase, as_column, column
-from cudf.core.column.methods import ColumnMethodsMixin
+from cudf.core.column.methods import ColumnMethodsMixin, ParentType
 from cudf.core.dtypes import ListDtype
 from cudf.utils.dtypes import _is_non_decimal_numeric_dtype, is_list_dtype
 
 
 class ListColumn(ColumnBase):
+    dtype: ListDtype
+
     def __init__(
         self, size, dtype, mask=None, offset=0, null_count=None, children=(),
     ):
@@ -278,14 +281,16 @@ class ListMethods(ColumnMethodsMixin):
     List methods for Series
     """
 
-    def __init__(self, column, parent=None):
+    _column: ListColumn
+
+    def __init__(self, column: ListColumn, parent: ParentType = None):
         if not is_list_dtype(column.dtype):
             raise AttributeError(
                 "Can only use .list accessor with a 'list' dtype"
             )
         super().__init__(column=column, parent=parent)
 
-    def get(self, index):
+    def get(self, index: int) -> ParentType:
         """
         Extract element at the given index from each component
 
@@ -317,10 +322,10 @@ def get(self, index):
         else:
             raise IndexError("list index out of range")
 
-    def contains(self, search_key):
+    def contains(self, search_key: ScalarLike) -> ParentType:
         """
-        Creates a column of bool values indicating whether the specified scalar
-        is an element of each row of a list column.
+        Returns boolean values indicating whether the specified scalar
+        is an element of each row.
 
         Parameters
         ----------
@@ -329,7 +334,7 @@ def contains(self, search_key):
 
         Returns
         -------
-        Column
+        Series or Index
 
         Examples
         --------
@@ -357,14 +362,14 @@ def contains(self, search_key):
             return res
 
     @property
-    def leaves(self):
+    def leaves(self) -> ParentType:
         """
         From a Series of (possibly nested) lists, obtain the elements from
         the innermost lists as a flat Series (one value per row).
 
         Returns
         -------
-        Series
+        Series or Index
 
         Examples
         --------
@@ -385,7 +390,7 @@ def leaves(self):
                 self._column.elements, retain_index=False
             )
 
-    def len(self):
+    def len(self) -> ParentType:
         """
         Computes the length of each element in the Series/Index.
 
@@ -409,18 +414,18 @@ def len(self):
         """
         return self._return_or_inplace(count_elements(self._column))
 
-    def take(self, lists_indices):
+    def take(self, lists_indices: ColumnLike) -> ParentType:
         """
         Collect list elements based on given indices.
 
         Parameters
         ----------
-        lists_indices: List type arrays
+        lists_indices: Series-like of lists
             Specifies what to collect from each row
 
         Returns
         -------
-        ListColumn
+        Series or Index
 
         Examples
         --------
@@ -464,14 +469,14 @@ def take(self, lists_indices):
         else:
             return res
 
-    def unique(self):
+    def unique(self) -> ParentType:
         """
-        Returns unique element for each list in the column, order for each
-        unique element is not guaranteed.
+        Returns the unique elements in each list.
+        The ordering of elements is not guaranteed.
 
         Returns
         -------
-        ListColumn
+        Series or Index
 
         Examples
         --------
@@ -501,12 +506,12 @@ def unique(self):
 
     def sort_values(
         self,
-        ascending=True,
-        inplace=False,
-        kind="quicksort",
-        na_position="last",
-        ignore_index=False,
-    ):
+        ascending: bool = True,
+        inplace: bool = False,
+        kind: str = "quicksort",
+        na_position: str = "last",
+        ignore_index: bool = False,
+    ) -> ParentType:
         """
         Sort each list by the values.
 
@@ -523,7 +528,7 @@ def sort_values(
 
         Returns
         -------
-        ListColumn with each list sorted
+        Series or Index with each list sorted
 
         Notes
         -----
@@ -552,3 +557,59 @@ def sort_values(
             sort_lists(self._column, ascending, na_position),
             retain_index=not ignore_index,
         )
+
+    def concat(self, dropna=True) -> ParentType:
+        """
+        For a column with at least one level of nesting, concatenate the
+        lists in each row.
+
+        Parameters
+        ----------
+        dropna: bool, optional
+            If True (default), ignores top-level null elements in each row.
+            If False, and top-level null elements are present, the resulting
+            row in the output is null.
+
+        Returns
+        -------
+        Series or Index
+
+        Examples
+        --------
+        >>> s1
+        0      [[1.0, 2.0], [3.0, 4.0, 5.0]]
+        1    [[6.0, None], [7.0], [8.0, 9.0]]
+        dtype: list
+        >>> s1.list.concat()
+        0    [1.0, 2.0, 3.0, 4.0, 5.0]
+        1    [6.0, None, 7.0, 8.0, 9.0]
+        dtype: list
+
+        Null values at the top-level in each row are dropped by default:
+
+        >>> s2
+        0    [[1.0, 2.0], None, [3.0, 4.0, 5.0]]
+        1        [[6.0, None], [7.0], [8.0, 9.0]]
+        dtype: list
+        >>> s2.list.concat()
+        0    [1.0, 2.0, 3.0, 4.0, 5.0]
+        1    [6.0, None, 7.0, 8.0, 9.0]
+        dtype: list
+
+        Use ``dropna=False`` to produce a null instead:
+
+        >>> s2.list.concat(dropna=False)
+        0                         None
+        1    [6.0, nan, 7.0, 8.0, 9.0]
+        dtype: list
+        """
+        try:
+            result = concatenate_list_elements(self._column, dropna=dropna)
+        except RuntimeError as e:
+            if "Rows of the input column must be lists." in str(e):
+                raise ValueError(
+                    "list.concat() can only be called on "
+                    "list columns with at least one level "
+                    "of nesting"
+                )
+        return self._return_or_inplace(result)
diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py
@@ -11,29 +11,31 @@
 if TYPE_CHECKING:
     from cudf.core.column import ColumnBase
 
+# Shared alias for the object a column accessor is attached to (its
+# ``parent``) and that accessor methods annotate as their return type:
+# either a cudf Series or a cudf BaseIndex.
+ParentType = Union["cudf.Series", "cudf.BaseIndex"]
+
 
 class ColumnMethodsMixin:
     _column: ColumnBase
-    _parent: Optional[Union["cudf.Series", "cudf.Index"]]
+    _parent: Optional[Union["cudf.Series", "cudf.BaseIndex"]]
 
     def __init__(
         self,
         column: ColumnBase,
-        parent: Union["cudf.Series", "cudf.Index"] = None,
+        parent: Union["cudf.Series", "cudf.BaseIndex"] = None,
     ):
         self._column = column
         self._parent = parent
 
     @overload
     def _return_or_inplace(
         self, new_col, inplace: Literal[False], expand=False, retain_index=True
-    ) -> Union["cudf.Series", "cudf.Index"]:
+    ) -> Union["cudf.Series", "cudf.BaseIndex"]:
         ...
 
     @overload
     def _return_or_inplace(
         self, new_col, expand: bool = False, retain_index: bool = True
-    ) -> Union["cudf.Series", "cudf.Index"]:
+    ) -> Union["cudf.Series", "cudf.BaseIndex"]:
         ...
 
     @overload
@@ -49,7 +51,7 @@ def _return_or_inplace(
         inplace: bool = False,
         expand: bool = False,
         retain_index: bool = True,
-    ) -> Optional[Union["cudf.Series", "cudf.Index"]]:
+    ) -> Optional[Union["cudf.Series", "cudf.BaseIndex"]]:
         ...
 
     def _return_or_inplace(

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -161,7 +161,7 @@
 from cudf.api.types import is_integer
 from cudf.core.buffer import Buffer
 from cudf.core.column import column, datetime
-from cudf.core.column.methods import ColumnMethodsMixin
+from cudf.core.column.methods import ColumnMethodsMixin, ParentType
 from cudf.utils import utils
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
@@ -216,9 +216,6 @@
 }
 
 
-ParentType = Union["cudf.Series", "cudf.core.index.BaseIndex"]
-
-
 class StringMethods(ColumnMethodsMixin):
     def __init__(self, column, parent=None):
         """