try adding frame

pandas-dev · Jul 11, 2019 · c2b91e2 · c2b91e2
1 parent 2b9e2e6
commit c2b91e2
Show file tree

Hide file tree

Showing 5 changed files with 181 additions and 10 deletions.
diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
@@ -239,6 +239,7 @@ Reshaping, sorting, transposing
    DataFrame.unstack
    DataFrame.swapaxes
    DataFrame.melt
+   DataFrame.explode
    DataFrame.squeeze
    DataFrame.to_xarray
    DataFrame.T

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -15,7 +15,7 @@
 import itertools
 import sys
 from textwrap import dedent
-from typing import FrozenSet, List, Optional, Set, Type, Union
+from typing import FrozenSet, Iterable, List, Optional, Set, Type, Union
 import warnings
 
 import numpy as np
@@ -6252,6 +6252,78 @@ def stack(self, level=-1, dropna=True):
         else:
             return stack(self, level, dropna=dropna)
 
+    def explode(self, subset: Iterable) -> "DataFrame":
+        """
+        Create a new DataFrame by expanding list-like columns into rows.
+
+        .. versionadded:: 0.25.0
+
+        Parameters
+        ----------
+        subset : list-like
+
+        Returns
+        -------
+        DataFrame
+            Frame with the subset columns exploded to rows; index labels are repeated.
+
+        Raises
+        ------
+        ValueError :
+            If the columns of the frame or the entries of subset are not unique.
+        ValueError :
+            If subset is not list-like.
+
+        See Also
+        --------
+        Series.str.split : Split string values on specified separator.
+        Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.
+        DataFrame.melt : Unpivot a DataFrame from wide format to long format
+        Series.explode : Explode a Series of list-likes to long format.
+
+        Notes
+        -----
+        This routine will explode list-likes including lists, tuples, Series, and np.ndarray.
+        The result dtype of the subset rows will be object.
+        Scalars will be returned unchanged.
+        Empty list-likes will result in a np.nan for that row.
+
+        Examples
+        --------
+        In [1]: df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1})
+
+        In [2]: df.explode(['A'])
+        Out[2]:
+             A  B
+        0    1  1
+        0    2  1
+        0    3  1
+        1  foo  1
+        2  NaN  1
+        3    3  1
+        3    4  1
+        """
+
+        if not is_list_like(subset):
+            raise ValueError("subset must be a list-like")
+        if not Index(subset).is_unique:
+            raise ValueError("subset must be unique")
+        if not self.columns.is_unique:
+            raise ValueError("columns must be unique")
+
+        results = [self[s].explode() for s in subset]
+        result = self.drop(subset, axis=1)
+
+        # merge the exploded columns back one by one, joining on the index
+        from pandas.core.reshape.merge import merge
+
+        def merger(left, right):
+            return merge(left, right, left_index=True, right_index=True)
+
+        return functools.reduce(merger, [result] + results).reindex(
+            columns=self.columns, copy=False
+        )
+
     def unstack(self, level=-1, fill_value=None):
         """
         Pivot a level of the (necessarily hierarchical) index labels, returning

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -3654,6 +3654,7 @@ def explode(self) -> "Series":
         Series.str.split : Split string values on specified separator.
         Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.
         DataFrame.melt : Unpivot a DataFrame from wide format to long format
+        DataFrame.explode : Explode a DataFrame from list-like columns to long format.
 
         Notes
         -----

diff --git a/pandas/tests/frame/test_explode.py b/pandas/tests/frame/test_explode.py
@@ -0,0 +1,103 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas.util import testing as tm
+
+
+def test_error():
+    df = pd.DataFrame(
+        {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
+    )
+    df.columns = list("AA")
+    with pytest.raises(ValueError):
+        df.explode(subset=list("AA"))
+
+
+def test_basic():
+    df = pd.DataFrame(
+        {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
+    )
+    result = df.explode(subset=["A"])
+    expected = pd.DataFrame(
+        {
+            "A": pd.Series(
+                [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object
+            ),
+            "B": 1,
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_all_columns():
+    df = pd.DataFrame(
+        {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
+    )
+    result = df.explode(subset=["A", "B"])
+    expected = pd.DataFrame(
+        {
+            "A": pd.Series(
+                [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object
+            ),
+            "B": 1,
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_multiple_columns():
+    df = pd.DataFrame(
+        {
+            "A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")),
+            "B": pd.Series([[0, 1, 2], np.nan, np.nan, 3], index=list("abcd")),
+        }
+    )
+    result = df.explode(subset=["A", "B"])
+    expected = pd.DataFrame(
+        {
+            "A": [0, 0, 0, 1, 1, 1, 2, 2, 2, np.nan, np.nan, 3, 4],
+            "B": [0, 1, 2, 0, 1, 2, 0, 1, 2, np.nan, np.nan, 3, 3],
+        },
+        dtype=object,
+        index=list("aaaaaaaaabcdd"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_usecase():
+    # explode a single column
+    # gh-10511
+    df = pd.DataFrame(
+        [[11, range(5), 10], [22, range(3), 20]], columns=["A", "B", "C"]
+    ).set_index("C")
+    result = df.explode(["B"])
+
+    expected = pd.DataFrame(
+        {
+            "A": [11, 11, 11, 11, 11, 22, 22, 22],
+            "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object),
+            "C": [10, 10, 10, 10, 10, 20, 20, 20],
+        },
+        columns=list("ABC"),
+    ).set_index("C")
+
+    tm.assert_frame_equal(result, expected)
+
+    # gh-8517
+    df = pd.DataFrame(
+        [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]],
+        columns=["dt", "name", "text"],
+    )
+    result = df.assign(text=df.text.str.split(" ")).explode(["text"])
+    expected = pd.DataFrame(
+        [
+            ["2014-01-01", "Alice", "A"],
+            ["2014-01-01", "Alice", "B"],
+            ["2014-01-02", "Bob", "C"],
+            ["2014-01-02", "Bob", "D"],
+        ],
+        columns=["dt", "name", "text"],
+        index=[0, 0, 1, 1],
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py
@@ -6,15 +6,10 @@
 
 
 def test_basic():
-    s = pd.Series([[0, 1, 2], np.nan, [], (3, 4)],
-                  index=list('abcd'),
-                  name="foo")
+    s = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"), name="foo")
     result = s.explode()
     expected = pd.Series(
-        [0, 1, 2, np.nan, np.nan, 3, 4],
-        index=list('aaabcdd'),
-        dtype=object,
-        name="foo",
+        [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object, name="foo"
     )
     tm.assert_series_equal(result, expected)
 
@@ -43,8 +38,7 @@ def test_empty():
 def test_nested_lists():
     s = pd.Series([[[1, 2, 3]], [1, 2], 1])
     result = s.explode()
-    expected = pd.Series([[1, 2, 3], 1, 2, 1],
-                         index=[0, 1, 1, 2])
+    expected = pd.Series([[1, 2, 3], 1, 2, 1], index=[0, 1, 1, 2])
     tm.assert_series_equal(result, expected)