Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add unstack() support for non-multiindexed dataframes #7054

Merged
merged 6 commits into from
Jan 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 35 additions & 5 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -902,6 +902,11 @@ def unstack(df, level, fill_value=None):
Pivots the specified levels of the index labels of df to the innermost
levels of the columns labels of the result.

* If the index of ``df`` has multiple levels, returns a ``DataFrame`` with
the specified level of the index pivoted to the column levels.
* If the index of ``df`` has a single level, returns a ``Series`` with all
column levels pivoted to the index levels.

Parameters
----------
df : DataFrame
Expand All @@ -913,7 +918,7 @@ def unstack(df, level, fill_value=None):

Returns
-------
DataFrame with specified index levels pivoted to column levels
Series or DataFrame

Examples
--------
Expand Down Expand Up @@ -964,18 +969,43 @@ def unstack(df, level, fill_value=None):
a
1 5 <NA> 6 <NA> 7
2 <NA> 8 <NA> 9 <NA>

Unstacking single level index dataframe:

>>> df = cudf.DataFrame({('c', 1): [1, 2, 3], ('c', 2):[9, 8, 7]})
>>> df.unstack()
c 1 0 1
1 2
2 3
2 0 9
1 8
2 7
dtype: int64
"""
if not isinstance(df, cudf.DataFrame):
raise ValueError("`df` should be a cudf Dataframe object.")

if df.empty:
raise ValueError("Cannot unstack an empty dataframe.")

if fill_value is not None:
raise NotImplementedError("fill_value is not supported.")
if pd.api.types.is_list_like(level):
if not level:
return df
df = df.copy(deep=False)
if not isinstance(df.index, cudf.MultiIndex):
raise NotImplementedError(
"Calling unstack() on a DataFrame without a MultiIndex "
"is not supported"
)
dtype = df._columns[0].dtype
for col in df._columns:
if not col.dtype == dtype:
raise ValueError(
"Calling unstack() on single index dataframe"
" with different column datatype is not supported."
)
res = df.T.stack(dropna=False)
# Result's index is a multiindex
res.index.names = tuple(df.columns.names) + df.index.names
return res
else:
columns = df.index._poplevels(level)
index = df.index
Expand Down
51 changes: 50 additions & 1 deletion python/cudf/cudf/tests/test_reshape.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

import numpy as np
import pandas as pd
import pytest
Expand Down Expand Up @@ -402,7 +404,7 @@ def test_pivot_multi_values():
),
],
)
def test_unstack(level):
def test_unstack_multiindex(level):
pdf = pd.DataFrame(
{
"foo": ["one", "one", "one", "two", "two", "two"],
Expand All @@ -417,6 +419,53 @@ def test_unstack(level):
)


@pytest.mark.parametrize(
    "data",
    [{"A": [1.0, 2.0, 3.0, 4.0, 5.0], "B": [11.0, 12.0, 13.0, 14.0, 15.0]}],
)
@pytest.mark.parametrize(
    "index",
    [
        pd.Index(range(0, 5), name=None),
        pd.Index(range(0, 5), name="row_index"),
    ],
)
@pytest.mark.parametrize(
    "col_idx",
    [
        pd.Index(["a", "b"], name=None),
        pd.Index(["a", "b"], name="col_index"),
        pd.MultiIndex.from_tuples([("c", 1), ("c", 2)], names=[None, None]),
        pd.MultiIndex.from_tuples(
            [("c", 1), ("c", 2)], names=["col_index1", "col_index2"]
        ),
    ],
)
def test_unstack_index(data, index, col_idx):
    """Compare ``unstack()`` on a single-level-index frame with pandas.

    Every combination of named/unnamed row index and flat/multi-level
    column index (named and unnamed) is exercised through the
    parametrization above.
    """
    expected_frame = pd.DataFrame(data)
    # The cudf frame is created before any labels are assigned; the row and
    # column labels are then set on each frame separately.
    actual_frame = cudf.from_pandas(expected_frame)

    expected_frame.index, expected_frame.columns = index, col_idx

    actual_frame.index = cudf.from_pandas(index)
    actual_frame.columns = cudf.from_pandas(col_idx)

    expected = expected_frame.unstack()
    actual = actual_frame.unstack()
    assert_eq(expected, actual)


def test_unstack_index_invalid():
    """Check that ``unstack()`` rejects mixed column dtypes.

    The frame has a default (single-level) index, one integer column and
    one string column; the call is expected to raise ``ValueError`` with
    the message matched below.
    """
    mixed_dtype_frame = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    # The message contains regex metacharacters ("(", ")", "."), so it is
    # escaped before being used as the ``match`` pattern.
    expected_message = re.escape(
        "Calling unstack() on single index dataframe with "
        "different column datatype is not supported."
    )
    with pytest.raises(ValueError, match=expected_message):
        mixed_dtype_frame.unstack()


def test_pivot_duplicate_error():
gdf = cudf.DataFrame(
{"a": [0, 1, 2, 2], "b": [1, 2, 3, 3], "d": [1, 2, 3, 4]}
Expand Down