From df45976b5e27f565d9a4d6a435a74a59b2b4a9d6 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Wed, 30 Jun 2021 19:12:23 -0400
Subject: [PATCH] Add Python bindings for `lists::concatenate_list_elements`
 and expose them as `.list.concat()` (#8006)

Adds a method to concatenate the lists in a nested list Series:

```python
In [15]: s
Out[15]:
0    [[1, 2], [3, 4]]
dtype: list

In [16]: s.list.concat()
Out[16]:
0    [1, 2, 3, 4]
dtype: list
```

Authors:
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

URL: https://github.com/rapidsai/cudf/pull/8006
---
 python/cudf/cudf/_lib/cpp/lists/combine.pxd |  16 +++
 python/cudf/cudf/_lib/lists.pyx             |  22 +++-
 python/cudf/cudf/core/column/categorical.py |   6 +-
 python/cudf/cudf/core/column/lists.py       | 111 +++++++++++++++-----
 python/cudf/cudf/core/column/methods.py     |  12 ++-
 python/cudf/cudf/core/column/string.py      |   5 +-
 python/cudf/cudf/tests/test_list.py         |  38 +++++++
 7 files changed, 170 insertions(+), 40 deletions(-)

diff --git a/python/cudf/cudf/_lib/cpp/lists/combine.pxd b/python/cudf/cudf/_lib/cpp/lists/combine.pxd
index ea9ade178e2..164253e39b5 100644
--- a/python/cudf/cudf/_lib/cpp/lists/combine.pxd
+++ b/python/cudf/cudf/_lib/cpp/lists/combine.pxd
@@ -3,10 +3,26 @@
 from libcpp.memory cimport unique_ptr
 
 from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.table.table_view cimport table_view
 
 cdef extern from "cudf/lists/combine.hpp" namespace \
         "cudf::lists" nogil:
+
+    # How a null child list affects the row it would be concatenated into.
+    ctypedef enum concatenate_null_policy:
+        IGNORE "cudf::lists::concatenate_null_policy::IGNORE"
+        NULLIFY_OUTPUT_ROW \
+            "cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW"
+
     cdef unique_ptr[column] concatenate_rows(
         const table_view input_table
     ) except +
+
+    # Concatenate the child lists within each row of a lists column.
+    # `null_policy` selects whether a null child list is skipped (IGNORE)
+    # or turns the whole output row null (NULLIFY_OUTPUT_ROW).
+    cdef unique_ptr[column] concatenate_list_elements(
+        const column_view input_column,
+        concatenate_null_policy null_policy
+    ) except +
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index 7d8909610dc..9fd7d7611ae 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -17,8 +17,11 @@ from cudf._lib.cpp.lists.sorting cimport (
     sort_lists as cpp_sort_lists
 )
 from cudf._lib.cpp.lists.combine cimport (
-    concatenate_rows as cpp_concatenate_rows
+    concatenate_rows as cpp_concatenate_rows,
+    concatenate_null_policy,
+    concatenate_list_elements as cpp_concatenate_list_elements
 )
+
 from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.column.column cimport column
@@ -181,3 +184,20 @@ def concatenate_rows(Table tbl):
 
     result = Column.from_unique_ptr(move(c_result))
     return result
+
+
+def concatenate_list_elements(Column input_column, dropna=False):
+    """Concatenate the child lists in each row of ``input_column``.
+
+    When ``dropna`` is true, null child lists are ignored; otherwise a
+    row containing one is null in the result.
+    """
+    cdef concatenate_null_policy policy = (
+        concatenate_null_policy.NULLIFY_OUTPUT_ROW if not dropna
+        else concatenate_null_policy.IGNORE
+    )
+    cdef unique_ptr[column] c_result
+    cdef column_view c_input = input_column.view()
+    with nogil:
+        c_result = move(cpp_concatenate_list_elements(c_input, policy))
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 135fb6e6f30..cbcc30d38a7 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -12,7 +12,6 @@
     Optional,
     Sequence,
     Tuple,
-    Union,
     cast,
 )
 
@@ -28,7 +27,7 @@
 from cudf._typing import ColumnLike, Dtype, ScalarLike
 from cudf.core.buffer import Buffer
 from cudf.core.column import column
-from cudf.core.column.methods import ColumnMethodsMixin
+from cudf.core.column.methods import ColumnMethodsMixin, ParentType
 from cudf.core.dtypes import CategoricalDtype
 from cudf.utils.dtypes import (
     is_categorical_dtype,
@@ -48,9 +47,6 @@
     )
 
 
-ParentType = Union["cudf.Series", "cudf.Index"]
-
-
 class CategoricalAccessor(ColumnMethodsMixin):
     _column: CategoricalColumn
 
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index ff63d8c5aaa..843190f38aa 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -8,6 +8,7 @@
 import cudf
 from cudf._lib.copying import segmented_gather
 from cudf._lib.lists import (
+    concatenate_list_elements,
     concatenate_rows,
     contains_scalar,
     count_elements,
@@ -16,15 +17,17 @@
     sort_lists,
 )
 from cudf._lib.table import Table
-from cudf._typing import BinaryOperand, Dtype
+from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike
 from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase, as_column, column
-from cudf.core.column.methods import ColumnMethodsMixin
+from cudf.core.column.methods import ColumnMethodsMixin, ParentType
 from cudf.core.dtypes import ListDtype
 from cudf.utils.dtypes import _is_non_decimal_numeric_dtype, is_list_dtype
 
 
 class ListColumn(ColumnBase):
+    dtype: ListDtype
+
     def __init__(
         self, size, dtype, mask=None, offset=0, null_count=None, children=(),
     ):
@@ -278,14 +281,16 @@ class ListMethods(ColumnMethodsMixin):
     List methods for Series
     """
 
-    def __init__(self, column, parent=None):
+    _column: ListColumn
+
+    def __init__(self, column: ListColumn, parent: ParentType = None):
         if not is_list_dtype(column.dtype):
             raise AttributeError(
                 "Can only use .list accessor with a 'list' dtype"
             )
         super().__init__(column=column, parent=parent)
 
-    def get(self, index):
+    def get(self, index: int) -> ParentType:
         """
         Extract element at the given index from each component
 
@@ -317,10 +322,10 @@ def get(self, index):
         else:
             raise IndexError("list index out of range")
 
-    def contains(self, search_key):
+    def contains(self, search_key: ScalarLike) -> ParentType:
         """
-        Creates a column of bool values indicating whether the specified scalar
-        is an element of each row of a list column.
+        Returns boolean values indicating whether the specified scalar
+        is an element of each row.
 
         Parameters
         ----------
@@ -329,7 +334,7 @@ def contains(self, search_key):
 
         Returns
         -------
-        Column
+        Series or Index
 
         Examples
         --------
@@ -357,14 +362,14 @@ def contains(self, search_key):
             return res
 
     @property
-    def leaves(self):
+    def leaves(self) -> ParentType:
         """
         From a Series of (possibly nested) lists, obtain the elements from
         the innermost lists as a flat Series (one value per row).
 
         Returns
         -------
-        Series
+        Series or Index
 
         Examples
         --------
@@ -385,7 +390,7 @@ def leaves(self):
                 self._column.elements, retain_index=False
             )
 
-    def len(self):
+    def len(self) -> ParentType:
         """
         Computes the length of each element in the Series/Index.
 
@@ -409,18 +414,18 @@ def len(self):
         """
         return self._return_or_inplace(count_elements(self._column))
 
-    def take(self, lists_indices):
+    def take(self, lists_indices: ColumnLike) -> ParentType:
         """
         Collect list elements based on given indices.
 
         Parameters
         ----------
-        lists_indices: List type arrays
+        lists_indices: Series-like of lists
             Specifies what to collect from each row
 
         Returns
         -------
-        ListColumn
+        Series or Index
 
         Examples
         --------
@@ -464,14 +469,14 @@ def take(self, lists_indices):
         else:
             return res
 
-    def unique(self):
+    def unique(self) -> ParentType:
         """
-        Returns unique element for each list in the column, order for each
-        unique element is not guaranteed.
+        Returns the unique elements in each list.
+        The ordering of elements is not guaranteed.
 
         Returns
         -------
-        ListColumn
+        Series or Index
 
         Examples
         --------
@@ -501,12 +506,12 @@ def unique(self):
 
     def sort_values(
         self,
-        ascending=True,
-        inplace=False,
-        kind="quicksort",
-        na_position="last",
-        ignore_index=False,
-    ):
+        ascending: bool = True,
+        inplace: bool = False,
+        kind: str = "quicksort",
+        na_position: str = "last",
+        ignore_index: bool = False,
+    ) -> ParentType:
         """
         Sort each list by the values.
 
@@ -523,7 +528,7 @@ def sort_values(
 
         Returns
         -------
-        ListColumn with each list sorted
+        Series or Index with each list sorted
 
         Notes
         -----
@@ -552,3 +557,59 @@ def sort_values(
             sort_lists(self._column, ascending, na_position),
             retain_index=not ignore_index,
         )
+
+    def concat(self, dropna=True) -> ParentType:
+        """
+        Concatenate the lists in each row. The column must have at least
+        one level of nesting; otherwise a ``ValueError`` is raised.
+
+        Parameters
+        ----------
+        dropna: bool, optional
+            If True (default), ignores top-level null elements in each row.
+            If False, and top-level null elements are present, the resulting
+            row in the output is null.
+
+        Returns
+        -------
+        Series or Index
+
+        Examples
+        --------
+        >>> s1
+        0      [[1.0, 2.0], [3.0, 4.0, 5.0]]
+        1    [[6.0, None], [7.0], [8.0, 9.0]]
+        dtype: list
+        >>> s1.list.concat()
+        0    [1.0, 2.0, 3.0, 4.0, 5.0]
+        1    [6.0, None, 7.0, 8.0, 9.0]
+        dtype: list
+
+        Null values at the top-level in each row are dropped by default:
+
+        >>> s2
+        0    [[1.0, 2.0], None, [3.0, 4.0, 5.0]]
+        1        [[6.0, None], [7.0], [8.0, 9.0]]
+        dtype: list
+        >>> s2.list.concat()
+        0    [1.0, 2.0, 3.0, 4.0, 5.0]
+        1    [6.0, None, 7.0, 8.0, 9.0]
+        dtype: list
+
+        Use ``dropna=False`` to produce a null instead:
+
+        >>> s2.list.concat(dropna=False)
+        0                          None
+        1    [6.0, None, 7.0, 8.0, 9.0]
+        dtype: list
+        """
+        try:
+            result = concatenate_list_elements(self._column, dropna=dropna)
+        except RuntimeError as e:
+            if "Rows of the input column must be lists." not in str(e):
+                raise  # unrelated error: propagate unchanged
+            raise ValueError(
+                "list.concat() can only be called on "
+                "list columns with at least one level of nesting"
+            ) from e
+        return self._return_or_inplace(result)
diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py
index d7b416d06c9..4b448e27a53 100644
--- a/python/cudf/cudf/core/column/methods.py
+++ b/python/cudf/cudf/core/column/methods.py
@@ -11,15 +11,17 @@
 if TYPE_CHECKING:
     from cudf.core.column import ColumnBase
 
+ParentType = Union["cudf.Series", "cudf.BaseIndex"]
+
 
 class ColumnMethodsMixin:
     _column: ColumnBase
-    _parent: Optional[Union["cudf.Series", "cudf.Index"]]
+    _parent: Optional[ParentType]
 
     def __init__(
         self,
         column: ColumnBase,
-        parent: Union["cudf.Series", "cudf.Index"] = None,
+        parent: Optional[ParentType] = None,
     ):
         self._column = column
         self._parent = parent
@@ -27,13 +29,13 @@ def __init__(
     @overload
     def _return_or_inplace(
         self, new_col, inplace: Literal[False], expand=False, retain_index=True
-    ) -> Union["cudf.Series", "cudf.Index"]:
+    ) -> ParentType:
         ...
 
     @overload
     def _return_or_inplace(
         self, new_col, expand: bool = False, retain_index: bool = True
-    ) -> Union["cudf.Series", "cudf.Index"]:
+    ) -> ParentType:
         ...
 
     @overload
@@ -49,7 +51,7 @@ def _return_or_inplace(
         inplace: bool = False,
         expand: bool = False,
         retain_index: bool = True,
-    ) -> Optional[Union["cudf.Series", "cudf.Index"]]:
+    ) -> Optional[Union["cudf.Series", "cudf.BaseIndex"]]:
         ...
 
     def _return_or_inplace(
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index a6a9de2e77b..11051b63920 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -161,7 +161,7 @@
 from cudf.api.types import is_integer
 from cudf.core.buffer import Buffer
 from cudf.core.column import column, datetime
-from cudf.core.column.methods import ColumnMethodsMixin
+from cudf.core.column.methods import ColumnMethodsMixin, ParentType
 from cudf.utils import utils
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
@@ -216,9 +216,6 @@
 }
 
 
-ParentType = Union["cudf.Series", "cudf.core.index.BaseIndex"]
-
-
 class StringMethods(ColumnMethodsMixin):
     def __init__(self, column, parent=None):
         """
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 0e93ff8a232..abd24ddd0fd 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2020-2021, NVIDIA CORPORATION.
 import functools
+import operator
 
 import numpy as np
 import pandas as pd
@@ -324,6 +325,43 @@ def test_contains_null_search_key(data, expect):
     assert_eq(expect, got)
 
 
+@pytest.mark.parametrize(
+    "row",
+    [
+        [[]],
+        [[1]],
+        [[1, 2]],
+        [[1, 2], [3, 4, 5]],
+        [[1, 2], [], [3, 4, 5]],
+        [[1, 2, None], [3, 4, 5]],
+        [[1, 2, None], None, [3, 4, 5]],
+        [[1, 2, None], None, [], [3, 4, 5]],
+        [[[1, 2], [3, 4]], [[5, 6, 7], [8, 9]]],
+        [[["a", "c", "de", None], None, ["fg"]], [["abc", "de"], None]],
+    ],
+)
+@pytest.mark.parametrize("dropna", [True, False])
+def test_concat_elements(row, dropna):
+    """Check ``.list.concat()`` on one row against a pure-Python result."""
+    # Build the expectation from a filtered copy: `row` itself must reach
+    # cudf unmodified so that `dropna=True` really has nulls to drop.
+    non_null = [x for x in row if x is not None]
+    if dropna or len(non_null) == len(row):
+        result = functools.reduce(operator.add, non_null)
+    else:
+        result = None
+
+    expect = pd.Series([result])
+    got = cudf.Series([row]).list.concat(dropna=dropna)
+    assert_eq(expect, got)
+
+
+def test_concat_elements_raise():
+    s = cudf.Series([[1, 2, 3]])  # flat lists: no nested level to merge
+    with pytest.raises(ValueError, match="at least one level of nesting"):
+        s.list.concat()
+
+
 def test_concatenate_rows_of_lists():
     pdf = pd.DataFrame({"val": [["a", "a"], ["b"], ["c"]]})
     gdf = cudf.from_pandas(pdf)