diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index c0b58fd2d99f5..b1c6172fb1261 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -239,6 +239,7 @@ Reshaping, sorting, transposing DataFrame.unstack DataFrame.swapaxes DataFrame.melt + DataFrame.explode DataFrame.squeeze DataFrame.to_xarray DataFrame.T diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2fc5f57b56800..4b0c46d08f228 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -15,7 +15,7 @@ import itertools import sys from textwrap import dedent -from typing import FrozenSet, List, Optional, Set, Type, Union +from typing import FrozenSet, Iterable, List, Optional, Set, Type, Union import warnings import numpy as np @@ -6252,6 +6252,78 @@ def stack(self, level=-1, dropna=True): else: return stack(self, level, dropna=dropna) + def explode(self, subset: Iterable) -> "DataFrame": + """ + Create new DataFrame expanding list-like columns. + + .. versionadded:: 0.25.0 + + Parameters + ---------- + subset : list-like + + Returns + ------- + DataFrame + Exploded lists to rows of the subset columns; index will be duplicated for these rows. + + Raises + ------ + ValueError : + if columns or subset are not unique. + ValueError : + if subset is not list-like. + + See Also + -------- + Series.str.split : Split string values on specified separator. + Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. + DataFrame.melt : Unpivot a DataFrame from wide format to long format. + Series.explode : Explode a Series of list-likes to long format. + + Notes + ----- + This routine will explode list-likes including lists, tuples, Series, and np.ndarray. + The result dtype of the subset columns will be object. + Scalars will be returned unchanged. + Empty list-likes will result in a np.nan for that row. 
+ + Examples + -------- + In [1]: df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1}) + + In [2]: df.explode(['A']) + Out[2]: + A B + 0 1 1 + 0 2 1 + 0 3 1 + 1 foo 1 + 2 NaN 1 + 3 3 1 + 3 4 1 + """ + + if not is_list_like(subset): + raise ValueError("subset must be a list-like") + if not Index(subset).is_unique: + raise ValueError("subset must be unique") + if not self.columns.is_unique: + raise ValueError("columns must be unique") + + results = [self[s].explode() for s in subset] + result = self.drop(subset, axis=1) + + # fold each exploded column back in, joining on the index + from pandas.core.reshape.merge import merge + + def merger(left, right): + return merge(left, right, left_index=True, right_index=True) + + return functools.reduce(merger, [result] + results).reindex( + columns=self.columns, copy=False + ) + def unstack(self, level=-1, fill_value=None): """ Pivot a level of the (necessarily hierarchical) index labels, returning diff --git a/pandas/core/series.py b/pandas/core/series.py index b2214da2c2daa..ba40d708b837b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3654,6 +3654,7 @@ def explode(self) -> "Series": Series.str.split : Split string values on specified separator. Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. DataFrame.melt : Unpivot a DataFrame from wide format to long format + DataFrame.explode : Explode a DataFrame from list-like columns to long format. 
Notes ----- diff --git a/pandas/tests/frame/test_explode.py b/pandas/tests/frame/test_explode.py new file mode 100644 index 0000000000000..56438e7cc290d --- /dev/null +++ b/pandas/tests/frame/test_explode.py @@ -0,0 +1,103 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.util import testing as tm + + +def test_error(): + df = pd.DataFrame( + {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} + ) + df.columns = list("AA") + with pytest.raises(ValueError): + df.explode(subset=list("AA")) + + +def test_basic(): + df = pd.DataFrame( + {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} + ) + result = df.explode(subset=["A"]) + expected = pd.DataFrame( + { + "A": pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object + ), + "B": 1, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_all_columns(): + df = pd.DataFrame( + {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} + ) + result = df.explode(subset=["A", "B"]) + expected = pd.DataFrame( + { + "A": pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object + ), + "B": 1, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_multiple_columns(): + df = pd.DataFrame( + { + "A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), + "B": pd.Series([[0, 1, 2], np.nan, np.nan, 3], index=list("abcd")), + } + ) + result = df.explode(subset=["A", "B"]) + expected = pd.DataFrame( + { + "A": [0, 0, 0, 1, 1, 1, 2, 2, 2, np.nan, np.nan, 3, 4], + "B": [0, 1, 2, 0, 1, 2, 0, 1, 2, np.nan, np.nan, 3, 3], + }, + dtype=object, + index=list("aaaaaaaaabcdd"), + ) + tm.assert_frame_equal(result, expected) + + +def test_usecase(): + # explode a single column + # gh-10511 + df = pd.DataFrame( + [[11, range(5), 10], [22, range(3), 20]], columns=["A", "B", "C"] + ).set_index("C") + result = df.explode(["B"]) + + expected = pd.DataFrame( + { + "A": [11, 11, 
11, 11, 11, 22, 22, 22], + "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object), + "C": [10, 10, 10, 10, 10, 20, 20, 20], + }, + columns=list("ABC"), + ).set_index("C") + + tm.assert_frame_equal(result, expected) + + # gh-8517 + df = pd.DataFrame( + [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]], + columns=["dt", "name", "text"], + ) + result = df.assign(text=df.text.str.split(" ")).explode(["text"]) + expected = pd.DataFrame( + [ + ["2014-01-01", "Alice", "A"], + ["2014-01-01", "Alice", "B"], + ["2014-01-02", "Bob", "C"], + ["2014-01-02", "Bob", "D"], + ], + columns=["dt", "name", "text"], + index=[0, 0, 1, 1], + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py index 661d4cb320cca..3f57f71561232 100644 --- a/pandas/tests/series/test_explode.py +++ b/pandas/tests/series/test_explode.py @@ -6,15 +6,10 @@ def test_basic(): - s = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], - index=list('abcd'), - name="foo") + s = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"), name="foo") result = s.explode() expected = pd.Series( - [0, 1, 2, np.nan, np.nan, 3, 4], - index=list('aaabcdd'), - dtype=object, - name="foo", + [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object, name="foo" ) tm.assert_series_equal(result, expected) @@ -43,8 +38,7 @@ def test_empty(): def test_nested_lists(): s = pd.Series([[[1, 2, 3]], [1, 2], 1]) result = s.explode() - expected = pd.Series([[1, 2, 3], 1, 2, 1], - index=[0, 1, 1, 2]) + expected = pd.Series([[1, 2, 3], 1, 2, 1], index=[0, 1, 1, 2]) tm.assert_series_equal(result, expected)