Patch summary
-------------
reshape.py: ``unstack`` now rejects non-DataFrame and empty input with
ValueError. For a frame whose index is not a MultiIndex it returns
``df.T.stack(dropna=False)`` with the index names set to the column-level
names followed by the row-index names; columns of differing dtype raise
ValueError.
test_reshape.py: renames ``test_unstack`` to ``test_unstack_multiindex`` and
adds a pandas-comparison test plus a mixed-dtype error test.

NOTE(review): this patch was rebuilt from a copy whose line breaks and
column alignment had been collapsed. Context rows inside docstring tables
carry only the tokens that were visible; re-sync them against the target
file (or regenerate with ``git diff``) before applying. The ``index`` blob
ids are kept from the original and are stale for the edited hunks.

diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index b5707a3a07c..cd1ce36c6c5 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -902,6 +902,11 @@ def unstack(df, level, fill_value=None):
     Pivots the specified levels of the index labels of df to the innermost
     levels of the columns labels of the result.
 
+    * If the index of ``df`` has multiple levels, returns a ``DataFrame`` with
+      the specified level of the index pivoted to the column levels.
+    * If the index of ``df`` has a single level, returns a ``Series`` with all
+      column levels pivoted to the index levels.
+
     Parameters
     ----------
     df : DataFrame
@@ -913,7 +918,7 @@ def unstack(df, level, fill_value=None):
 
     Returns
     -------
-    DataFrame with specified index levels pivoted to column levels
+    Series or DataFrame
 
     Examples
     --------
@@ -964,7 +969,25 @@ def unstack(df, level, fill_value=None):
     a
     1 5 6 7
     2 8 9
+
+    Unstacking a DataFrame with a single-level index:
+
+    >>> df = cudf.DataFrame({('c', 1): [1, 2, 3], ('c', 2): [9, 8, 7]})
+    >>> df.unstack()
+    c  1  0    1
+          1    2
+          2    3
+       2  0    9
+          1    8
+          2    7
+    dtype: int64
     """
+    if not isinstance(df, cudf.DataFrame):
+        raise ValueError("`df` should be a cudf Dataframe object.")
+
+    if df.empty:
+        raise ValueError("Cannot unstack an empty dataframe.")
+
     if fill_value is not None:
         raise NotImplementedError("fill_value is not supported.")
     if pd.api.types.is_list_like(level):
@@ -972,10 +995,17 @@ def unstack(df, level, fill_value=None):
             return df
     df = df.copy(deep=False)
     if not isinstance(df.index, cudf.MultiIndex):
-        raise NotImplementedError(
-            "Calling unstack() on a DataFrame without a MultiIndex "
-            "is not supported"
-        )
+        dtype = df._columns[0].dtype
+        for col in df._columns:
+            if col.dtype != dtype:
+                raise ValueError(
+                    "Calling unstack() on single index dataframe"
+                    " with different column datatype is not supported."
+                )
+        res = df.T.stack(dropna=False)
+        # Result index is a MultiIndex: column levels, then the row level
+        res.index.names = tuple(df.columns.names) + tuple(df.index.names)
+        return res
     else:
         columns = df.index._poplevels(level)
         index = df.index
diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py
index 890c69b660a..315762c931f 100644
--- a/python/cudf/cudf/tests/test_reshape.py
+++ b/python/cudf/cudf/tests/test_reshape.py
@@ -1,3 +1,5 @@
+import re
+
 import numpy as np
 import pandas as pd
 import pytest
@@ -402,7 +404,7 @@ def test_pivot_multi_values():
         ),
     ],
 )
-def test_unstack(level):
+def test_unstack_multiindex(level):
     pdf = pd.DataFrame(
         {
             "foo": ["one", "one", "one", "two", "two", "two"],
@@ -417,6 +419,53 @@ def test_unstack(level):
     )
 
 
+@pytest.mark.parametrize(
+    "data",
+    [{"A": [1.0, 2.0, 3.0, 4.0, 5.0], "B": [11.0, 12.0, 13.0, 14.0, 15.0]}],
+)
+@pytest.mark.parametrize(
+    "index",
+    [
+        pd.Index(range(0, 5), name=None),
+        pd.Index(range(0, 5), name="row_index"),
+    ],
+)
+@pytest.mark.parametrize(
+    "col_idx",
+    [
+        pd.Index(["a", "b"], name=None),
+        pd.Index(["a", "b"], name="col_index"),
+        pd.MultiIndex.from_tuples([("c", 1), ("c", 2)], names=[None, None]),
+        pd.MultiIndex.from_tuples(
+            [("c", 1), ("c", 2)], names=["col_index1", "col_index2"]
+        ),
+    ],
+)
+def test_unstack_index(data, index, col_idx):
+    pdf = pd.DataFrame(data)
+    gdf = cudf.from_pandas(pdf)
+
+    pdf.index = index
+    pdf.columns = col_idx
+
+    gdf.index = cudf.from_pandas(index)
+    gdf.columns = cudf.from_pandas(col_idx)
+
+    assert_eq(pdf.unstack(), gdf.unstack())
+
+
+def test_unstack_index_invalid():
+    gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+    with pytest.raises(
+        ValueError,
+        match=re.escape(
+            "Calling unstack() on single index dataframe with "
+            "different column datatype is not supported."
+        ),
+    ):
+        gdf.unstack()
+
+
 def test_pivot_duplicate_error():
     gdf = cudf.DataFrame(
         {"a": [0, 1, 2, 2], "b": [1, 2, 3, 3], "d": [1, 2, 3, 4]}