Adds explode API (#7607)

Closes #2975 This PR introduces `explode` API, which flattens list columns and turns list elements into rows. Example: ```python >>> s = cudf.Series([[1, 2, 3], [], None, [4, 5]]) >>> s 0 [1, 2, 3] 1 [] 2 None 3 [4, 5] dtype: list >>> s.explode() 0 1 0 2 0 3 1 <NA> 2 <NA> 3 4 3 5 dtype: int64 ``` Supersedes #7538 Authors: - Michael Wang (@isVoid) Approvers: - Keith Kraus (@kkraus14) - GALI PREM SAGAR (@galipremsagar) URL: #7607
rapidsai · Mar 18, 2021 · ec5364c · ec5364c
1 parent f5a4214
commit ec5364c
Show file tree

Hide file tree

Showing 7 changed files with 231 additions and 1 deletion.
diff --git a/python/cudf/cudf/_lib/cpp/lists/explode.pxd b/python/cudf/cudf/_lib/cpp/lists/explode.pxd
@@ -0,0 +1,13 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.cpp.table.table cimport table
+from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.cpp.types cimport size_type
+
+cdef extern from "cudf/lists/explode.hpp" namespace "cudf" nogil:
+    cdef unique_ptr[table] explode_outer(
+        const table_view,
+        size_type explode_column_idx,
+    ) except +
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
@@ -1,17 +1,25 @@
 # Copyright (c) 2021, NVIDIA CORPORATION.
 
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr, shared_ptr, make_shared
 from libcpp.utility cimport move
 
 from cudf._lib.cpp.lists.count_elements cimport (
     count_elements as cpp_count_elements
 )
+from cudf._lib.cpp.lists.explode cimport (
+    explode_outer as cpp_explode_outer
+)
 from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.column.column cimport column
 
-from cudf._lib.column cimport Column
+from cudf._lib.cpp.table.table cimport table
+from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.cpp.types cimport size_type
 
+from cudf._lib.column cimport Column
+from cudf._lib.table cimport Table
 
 from cudf.core.dtypes import ListDtype
 
@@ -32,3 +40,21 @@ def count_elements(Column col):
 
     result = Column.from_unique_ptr(move(c_result))
     return result
+
+
+def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False):
+    cdef table_view c_table_view = (
+        tbl.data_view() if ignore_index else tbl.view()
+    )
+    cdef size_type c_explode_column_idx = explode_column_idx
+
+    cdef unique_ptr[table] c_result
+
+    with nogil:
+        c_result = move(cpp_explode_outer(c_table_view, c_explode_column_idx))
+
+    return Table.from_unique_ptr(
+        move(c_result),
+        column_names=tbl._column_names,
+        index_names=None if ignore_index else tbl._index_names
+    )
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -7705,6 +7705,52 @@ def equals(self, other):
                 return False
         return super().equals(other)
 
+    def explode(self, column, ignore_index=False):
+        """
+        Transform each element of a list-like to a row, replicating index
+        values.
+
+        Parameters
+        ----------
+        column : str or tuple
+            Column to explode.
+        ignore_index : bool, default False
+            If True, the resulting index will be labeled 0, 1, …, n - 1.
+
+        Returns
+        -------
+        DataFrame
+
+        Examples
+        --------
+        >>> import cudf
+        >>> cudf.DataFrame(
+                {"a": [[1, 2, 3], [], None, [4, 5]], "b": [11, 22, 33, 44]})
+                   a   b
+        0  [1, 2, 3]  11
+        1         []  22
+        2       None  33
+        3     [4, 5]  44
+        >>> df.explode('a')
+              a   b
+        0     1  11
+        0     2  11
+        0     3  11
+        1  <NA>  22
+        2  <NA>  33
+        3     4  44
+        3     5  44
+        """
+        if column not in self._column_names:
+            raise KeyError(column)
+
+        if not is_list_dtype(self._data[column].dtype):
+            data = self._data.copy(deep=True)
+            idx = None if ignore_index else self._index.copy(deep=True)
+            return self.__class__._from_data(data, index=idx)
+
+        return super()._explode(column, ignore_index)
+
     _accessors = set()  # type: Set[Any]
 
 

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
@@ -573,6 +573,28 @@ def equals(self, other, **kwargs):
         else:
             return self._index.equals(other._index)
 
+    def _explode(self, explode_column: Any, ignore_index: bool):
+        """Helper function for `explode` in `Series` and `Dataframe`, explodes
+        a specified nested column. Other columns' corresponding rows are
+        duplicated. If ignore_index is set, the original index is not exploded
+        and will be replaced with a `RangeIndex`.
+        """
+        explode_column_num = self._column_names.index(explode_column)
+        if not ignore_index and self._index is not None:
+            explode_column_num += self._index.nlevels
+
+        res_tbl = libcudf.lists.explode_outer(
+            self, explode_column_num, ignore_index
+        )
+        res = self.__class__._from_table(res_tbl)
+
+        res._data.multiindex = self._data.multiindex
+        res._data._level_names = self._data._level_names
+
+        if not ignore_index and self._index is not None:
+            res.index.names = self._index.names
+        return res
+
     def _get_columns_by_label(self, labels, downcast):
         """
         Returns columns of the Frame specified by `labels`

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -6362,6 +6362,47 @@ def keys(self):
         """
         return self.index
 
+    def explode(self, ignore_index=False):
+        """
+        Transform each element of a list-like to a row, replicating index
+        values.
+
+        Parameters
+        ----------
+        ignore_index : bool, default False
+            If True, the resulting index will be labeled 0, 1, …, n - 1.
+
+        Returns
+        -------
+        DataFrame
+
+        Examples
+        --------
+        >>> import cudf
+        >>> s = cudf.Series([[1, 2, 3], [], None, [4, 5]])
+        >>> s
+        0    [1, 2, 3]
+        1           []
+        2         None
+        3       [4, 5]
+        dtype: list
+        >>> s.explode()
+        0       1
+        0       2
+        0       3
+        1    <NA>
+        2    <NA>
+        3       4
+        3       5
+        dtype: int64
+        """
+        if not is_list_dtype(self._column.dtype):
+            data = self._data.copy(deep=True)
+            idx = None if ignore_index else self._index.copy(deep=True)
+            return self.__class__._from_data(data, index=idx)
+
+        return super()._explode(self._column_names[0], ignore_index)
+
     _accessors = set()  # type: Set[Any]
 
 

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
@@ -8442,3 +8442,56 @@ def test_rename_for_level_is_None_MC():
     got = gdf.rename(columns={"a": "f"}, level=None)
 
     assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [
+            [[1, 2, 3], 11, "a"],
+            [None, 22, "e"],
+            [[4], 33, "i"],
+            [[], 44, "o"],
+            [[5, 6], 55, "u"],
+        ],  # nested
+        [
+            [1, 11, "a"],
+            [2, 22, "e"],
+            [3, 33, "i"],
+            [4, 44, "o"],
+            [5, 55, "u"],
+        ],  # non-nested
+    ],
+)
+@pytest.mark.parametrize(
+    ("labels", "label_to_explode"),
+    [
+        (None, 0),
+        (pd.Index(["a", "b", "c"]), "a"),
+        (
+            pd.MultiIndex.from_tuples(
+                [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"]
+            ),
+            (0, "a"),
+        ),
+    ],
+)
+@pytest.mark.parametrize("ignore_index", [True, False])
+@pytest.mark.parametrize(
+    "p_index",
+    [
+        None,
+        ["ia", "ib", "ic", "id", "ie"],
+        pd.MultiIndex.from_tuples(
+            [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")]
+        ),
+    ],
+)
+def test_explode(data, labels, ignore_index, p_index, label_to_explode):
+    pdf = pd.DataFrame(data, index=p_index, columns=labels)
+    gdf = cudf.from_pandas(pdf)
+
+    expect = pdf.explode(label_to_explode, ignore_index)
+    got = gdf.explode(label_to_explode, ignore_index)
+
+    assert_eq(expect, got, check_dtype=False)
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
@@ -1118,3 +1118,32 @@ def test_series_drop_raises():
     actual = gs.drop("p", errors="ignore")
 
     assert_eq(actual, expect)
+
+
+@pytest.mark.parametrize(
+    "data", [[[1, 2, 3], None, [4], [], [5, 6]], [1, 2, 3, 4, 5]],
+)
+@pytest.mark.parametrize("ignore_index", [True, False])
+@pytest.mark.parametrize(
+    "p_index",
+    [
+        None,
+        ["ia", "ib", "ic", "id", "ie"],
+        pd.MultiIndex.from_tuples(
+            [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")]
+        ),
+    ],
+)
+def test_explode(data, ignore_index, p_index):
+    pdf = pd.Series(data, index=p_index, name="someseries")
+    gdf = cudf.from_pandas(pdf)
+
+    expect = pdf.explode(ignore_index)
+    got = gdf.explode(ignore_index)
+
+    if data == [1, 2, 3, 4, 5] and ignore_index and p_index is not None:
+        # https://github.com/pandas-dev/pandas/issues/40487
+        with pytest.raises(AssertionError, match="different"):
+            assert_eq(expect, got, check_dtype=False)
+    else:
+        assert_eq(expect, got, check_dtype=False)