Skip to content

Commit

Permalink
try adding frame
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Jul 11, 2019
1 parent 2b9e2e6 commit c2b91e2
Show file tree
Hide file tree
Showing 5 changed files with 181 additions and 10 deletions.
1 change: 1 addition & 0 deletions doc/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ Reshaping, sorting, transposing
DataFrame.unstack
DataFrame.swapaxes
DataFrame.melt
DataFrame.explode
DataFrame.squeeze
DataFrame.to_xarray
DataFrame.T
Expand Down
74 changes: 73 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import itertools
import sys
from textwrap import dedent
from typing import FrozenSet, List, Optional, Set, Type, Union
from typing import FrozenSet, Iterable, List, Optional, Set, Type, Union
import warnings

import numpy as np
Expand Down Expand Up @@ -6252,6 +6252,78 @@ def stack(self, level=-1, dropna=True):
else:
return stack(self, level, dropna=dropna)

def explode(self, subset: Iterable) -> "DataFrame":
    """
    Create a new DataFrame by expanding list-like columns into rows.

    Every element of a list-like held in one of the ``subset`` columns
    becomes a row of its own; the index label and the values of the
    remaining columns are repeated for each of those rows.

    .. versionadded:: 0.25.0

    Parameters
    ----------
    subset : list-like
        Labels of the columns to explode. The labels must be unique.

    Returns
    -------
    DataFrame
        Exploded lists to rows of the subset columns; index will be
        duplicated for these rows.

    Raises
    ------
    ValueError :
        if subset is not list-like.
    ValueError :
        if the labels in subset are not unique.
    ValueError :
        if the columns of the frame are not unique.

    See Also
    --------
    Series.str.split : Split string values on specified separator.
    Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex
        to produce DataFrame.
    DataFrame.melt : Unpivot a DataFrame from wide format to long format.
    Series.explode : Explode a Series of list-likes to long format.

    Notes
    -----
    This routine will explode list-likes including lists, tuples, Series,
    and np.ndarray. The result dtype of the subset rows will be object.
    Scalars will be returned unchanged. Empty list-likes will result in a
    np.nan for that row.

    When several columns are exploded, the exploded values are combined
    per original row, so a row yields one output row for every combination
    of its exploded values.

    Examples
    --------
    >>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1})
    >>> df.explode(['A'])
         A  B
    0    1  1
    0    2  1
    0    3  1
    1  foo  1
    2  NaN  1
    3    3  1
    3    4  1
    """
    if not is_list_like(subset):
        raise ValueError("subset must be a list-like")

    # Materialize once: ``subset`` is walked several times below, and a
    # one-shot iterator would already be exhausted after the uniqueness check.
    subset = list(subset)

    if not Index(subset).is_unique:
        raise ValueError("subset must be unique")
    if not self.columns.is_unique:
        raise ValueError("columns must be unique")

    # Imported here rather than at module level, as in the original code.
    from pandas.core.reshape.merge import merge

    def merger(left, right):
        return merge(left, right, left_index=True, right_index=True)

    # Explode against a positional index so that rows sharing an index
    # label are not cross-multiplied by the index-on-index merges; the
    # original labels are restored by position afterwards.
    frame = self.reset_index(drop=True)
    exploded = [frame[label].explode() for label in subset]
    remainder = frame.drop(subset, axis=1)

    result = functools.reduce(merger, [remainder] + exploded)
    result.index = self.index.take(result.index)

    # Restore the caller's column order.
    return result.reindex(columns=self.columns, copy=False)

def unstack(self, level=-1, fill_value=None):
"""
Pivot a level of the (necessarily hierarchical) index labels, returning
Expand Down
1 change: 1 addition & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3654,6 +3654,7 @@ def explode(self) -> "Series":
Series.str.split : Split string values on specified separator.
Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.
DataFrame.melt : Unpivot a DataFrame from wide format to long format
DataFrame.explode : Explode a DataFrame from list-like columns to long format.
Notes
-----
Expand Down
103 changes: 103 additions & 0 deletions pandas/tests/frame/test_explode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import numpy as np
import pytest

import pandas as pd
from pandas.util import testing as tm


def test_error():
    """Each validation in ``DataFrame.explode`` raises its own ValueError."""
    df = pd.DataFrame(
        {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
    )

    # A bare string is not list-like.
    with pytest.raises(ValueError, match="subset must be a list-like"):
        df.explode(subset="A")

    # Repeated labels in subset, frame columns still unique.
    with pytest.raises(ValueError, match="subset must be unique"):
        df.explode(subset=list("AA"))

    # Unique subset, but the frame itself has duplicate column labels.
    df.columns = list("AA")
    with pytest.raises(ValueError, match="columns must be unique"):
        df.explode(subset=["A"])


def test_basic():
    """Exploding one list-like column repeats the index labels and column B."""
    nested = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"))
    df = pd.DataFrame({"A": nested, "B": 1})

    result = df.explode(subset=["A"])

    # NaN and the empty list each contribute a single NaN row.
    flattened = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object
    )
    expected = pd.DataFrame({"A": flattened, "B": 1})
    tm.assert_frame_equal(result, expected)


def test_all_columns():
    """Listing the scalar column B in subset too gives the same expected frame."""
    nested = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"))
    df = pd.DataFrame({"A": nested, "B": 1})

    result = df.explode(subset=["A", "B"])

    flattened = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object
    )
    expected = pd.DataFrame({"A": flattened, "B": 1})
    tm.assert_frame_equal(result, expected)


def test_multiple_columns():
    """Exploding two list-like columns yields every per-row combination."""
    labels = list("abcd")
    col_a = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=labels)
    col_b = pd.Series([[0, 1, 2], np.nan, np.nan, 3], index=labels)
    df = pd.DataFrame({"A": col_a, "B": col_b})

    result = df.explode(subset=["A", "B"])

    # Row "a" has 3 x 3 combinations, "b" and "c" one NaN row each,
    # and "d" pairs both elements of A with the scalar 3 in B.
    expected_values = {
        "A": [0, 0, 0, 1, 1, 1, 2, 2, 2, np.nan, np.nan, 3, 4],
        "B": [0, 1, 2, 0, 1, 2, 0, 1, 2, np.nan, np.nan, 3, 3],
    }
    expected = pd.DataFrame(
        expected_values, dtype=object, index=list("aaaaaaaaabcdd")
    )
    tm.assert_frame_equal(result, expected)


def test_usecase():
    """Use cases from the issue tracker: gh-10511 and gh-8517."""
    # gh-10511: explode a single column of ranges on a frame indexed by "C"
    rows = [[11, range(5), 10], [22, range(3), 20]]
    df = pd.DataFrame(rows, columns=["A", "B", "C"]).set_index("C")
    result = df.explode(["B"])

    expected_data = {
        "A": [11, 11, 11, 11, 11, 22, 22, 22],
        "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object),
        "C": [10, 10, 10, 10, 10, 20, 20, 20],
    }
    expected = pd.DataFrame(expected_data, columns=list("ABC")).set_index("C")
    tm.assert_frame_equal(result, expected)

    # gh-8517: split a text column into tokens, then explode the tokens
    records = [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]]
    df = pd.DataFrame(records, columns=["dt", "name", "text"])
    tokenized = df.assign(text=df.text.str.split(" "))
    result = tokenized.explode(["text"])

    expected_rows = [
        ["2014-01-01", "Alice", "A"],
        ["2014-01-01", "Alice", "B"],
        ["2014-01-02", "Bob", "C"],
        ["2014-01-02", "Bob", "D"],
    ]
    expected = pd.DataFrame(
        expected_rows, columns=["dt", "name", "text"], index=[0, 0, 1, 1]
    )
    tm.assert_frame_equal(result, expected)
12 changes: 3 additions & 9 deletions pandas/tests/series/test_explode.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,10 @@


def test_basic():
    """Series.explode flattens list-likes, repeating index labels and keeping the name."""
    s = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"), name="foo")
    result = s.explode()
    # NaN and the empty list each become a single NaN entry.
    expected = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object, name="foo"
    )
    tm.assert_series_equal(result, expected)

Expand Down Expand Up @@ -43,8 +38,7 @@ def test_empty():
def test_nested_lists():
    """Only the outermost list level is exploded; inner lists stay intact."""
    s = pd.Series([[[1, 2, 3]], [1, 2], 1])
    result = s.explode()
    expected = pd.Series([[1, 2, 3], 1, 2, 1], index=[0, 1, 1, 2])
    tm.assert_series_equal(result, expected)


Expand Down

0 comments on commit c2b91e2

Please sign in to comment.