From 193ab17eb69405a9f3252d681001184a4a8055a5 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 29 Dec 2020 14:54:22 -0800 Subject: [PATCH 1/6] Initial --- python/cudf/cudf/core/reshape.py | 14 +++++++--- python/cudf/cudf/tests/test_reshape.py | 37 +++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index b5707a3a07c..7a0238cbefb 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -972,10 +972,16 @@ def unstack(df, level, fill_value=None): return df df = df.copy(deep=False) if not isinstance(df.index, cudf.MultiIndex): - raise NotImplementedError( - "Calling unstack() on a DataFrame without a MultiIndex " - "is not supported" - ) + if isinstance(df, cudf.DataFrame): + res = df.T.stack(dropna=False) + # Result's index is a multiindex + res.index.names = tuple(df.columns.names) + df.index.names + return res + else: + raise NotImplementedError( + "Calling unstack() on a Series without a MultiIndex " + "is not supported" + ) else: columns = df.index._poplevels(level) index = df.index diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 890c69b660a..2ed9fc1888a 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -402,7 +402,7 @@ def test_pivot_multi_values(): ), ], ) -def test_unstack(level): +def test_unstack_multiindex(level): pdf = pd.DataFrame( { "foo": ["one", "one", "one", "two", "two", "two"], @@ -417,6 +417,41 @@ def test_unstack(level): ) +@pytest.mark.parametrize( + "data", + [{"A": [1.0, 2.0, 3.0, 4.0, 5.0], "B": [11.0, 12.0, 13.0, 14.0, 15.0]}], +) +@pytest.mark.parametrize( + "index", + [ + pd.Index(range(0, 5), name=None), + pd.Index(range(0, 5), name="row_index"), + ], +) +@pytest.mark.parametrize( + "col_idx", + [ + pd.Index(["a", "b"], name=None), + pd.Index(["a", "b"], name="col_index"), + 
pd.MultiIndex.from_tuples([("c", 1), ("c", 2)], names=[None, None]), + pd.MultiIndex.from_tuples( + [("c", 1), ("c", 2)], names=["col_index1", "col_index2"] + ), + ], +) +def test_unstack_index(data, index, col_idx): + pdf = pd.DataFrame(data) + gdf = cudf.from_pandas(pdf) + + pdf.index = index + pdf.columns = col_idx + + gdf.index = cudf.from_pandas(index) + gdf.columns = cudf.from_pandas(col_idx) + + assert_eq(pdf.unstack(), gdf.unstack()) + + def test_pivot_duplicate_error(): gdf = cudf.DataFrame( {"a": [0, 1, 2, 2], "b": [1, 2, 3, 3], "d": [1, 2, 3, 4]} From d9326be3d918a372fee0c4d3b2adc8e0c881b21c Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 29 Dec 2020 16:03:23 -0800 Subject: [PATCH 2/6] docstrings --- python/cudf/cudf/core/reshape.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 7a0238cbefb..2f813c3b704 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -902,6 +902,11 @@ def unstack(df, level, fill_value=None): Pivots the specified levels of the index labels of df to the innermost levels of the columns labels of the result. + * If the index of ``df`` has multiple levels, returns a ``DataFrame`` with + the specified level of the index pivoted to the column levels. + * If the index of ``df`` has a single level, returns a ``Series`` with all + column levels pivoted to the index levels. 
+ Parameters ---------- df : DataFrame @@ -913,7 +918,7 @@ def unstack(df, level, fill_value=None): Returns ------- - DataFrame with specified index levels pivoted to column levels + Series or DataFrame Examples -------- @@ -964,6 +969,21 @@ def unstack(df, level, fill_value=None): a 1 5 6 7 2 8 9 + + Unstacking single level index dataframe: + + >>> df.unstack(['b', 'd']).unstack() + b d a + c 1 a 1 5 + 2 + d 1 + 2 8 + 2 b 1 6 + 2 + e 1 + 2 9 + 3 a 1 7 + 2 """ if fill_value is not None: raise NotImplementedError("fill_value is not supported.") From 4545bc966448e9046d1c121bde871aa749e5d908 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 30 Dec 2020 10:19:53 -0800 Subject: [PATCH 3/6] clearer example --- python/cudf/cudf/core/reshape.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 2f813c3b704..5985c767495 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -972,18 +972,15 @@ def unstack(df, level, fill_value=None): Unstacking single level index dataframe: - >>> df.unstack(['b', 'd']).unstack() - b d a - c 1 a 1 5 - 2 - d 1 - 2 8 - 2 b 1 6 - 2 - e 1 - 2 9 - 3 a 1 7 - 2 + >>> df = cudf.DataFrame({('c', 1): [1, 2, 3], ('c', 2):[9, 8, 7]}) + >>> df.unstack() + c 1 0 1 + 1 2 + 2 3 + 2 0 9 + 1 8 + 2 7 + dtype: int64 """ if fill_value is not None: raise NotImplementedError("fill_value is not supported.") From 7c6002813af54593cf9963b514b97a9a35cf7046 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 4 Jan 2021 10:24:05 -0800 Subject: [PATCH 4/6] rev: check column dtypes --- python/cudf/cudf/core/reshape.py | 22 +++++++++++++++++----- python/cudf/cudf/tests/test_reshape.py | 14 ++++++++++++++ 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 5985c767495..7265ad62748 100644 --- a/python/cudf/cudf/core/reshape.py +++ 
b/python/cudf/cudf/core/reshape.py @@ -982,6 +982,12 @@ def unstack(df, level, fill_value=None): 2 7 dtype: int64 """ + if not isinstance(df, cudf.DataFrame): + raise ValueError("`df` should be a cudf Dataframe object.") + + if df.empty: + raise ValueError("Cannot unstack an empty dataframe.") + if fill_value is not None: raise NotImplementedError("fill_value is not supported.") if pd.api.types.is_list_like(level): @@ -990,15 +996,21 @@ def unstack(df, level, fill_value=None): df = df.copy(deep=False) if not isinstance(df.index, cudf.MultiIndex): if isinstance(df, cudf.DataFrame): + dtype = df._columns[0].dtype + if any( + [ + not df._columns[i].dtype == dtype + for i in range(len(df._columns)) + ] + ): + raise ValueError( + "Calling unstack() on single index dataframe" + " with different column datatype is not supported." + ) res = df.T.stack(dropna=False) # Result's index is a multiindex res.index.names = tuple(df.columns.names) + df.index.names return res - else: - raise NotImplementedError( - "Calling unstack() on a Series without a MultiIndex " - "is not supported" - ) else: columns = df.index._poplevels(level) index = df.index diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 2ed9fc1888a..315762c931f 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pandas as pd import pytest @@ -452,6 +454,18 @@ def test_unstack_index(data, index, col_idx): assert_eq(pdf.unstack(), gdf.unstack()) +def test_unstack_index_invalid(): + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + with pytest.raises( + ValueError, + match=re.escape( + "Calling unstack() on single index dataframe with " + "different column datatype is not supported." 
+ ), + ): + gdf.unstack() + + def test_pivot_duplicate_error(): gdf = cudf.DataFrame( {"a": [0, 1, 2, 2], "b": [1, 2, 3, 3], "d": [1, 2, 3, 4]} From e0c2f2a751faf80577d4a374f46609c72108ed18 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 4 Jan 2021 16:50:35 -0800 Subject: [PATCH 5/6] Efficient datatype checking. --- python/cudf/cudf/core/reshape.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 7265ad62748..8c732474b8c 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -997,16 +997,12 @@ def unstack(df, level, fill_value=None): if not isinstance(df.index, cudf.MultiIndex): if isinstance(df, cudf.DataFrame): dtype = df._columns[0].dtype - if any( - [ - not df._columns[i].dtype == dtype - for i in range(len(df._columns)) - ] - ): - raise ValueError( - "Calling unstack() on single index dataframe" - " with different column datatype is not supported." - ) + for col in df._columns: + if not col.dtype == dtype: + raise ValueError( + "Calling unstack() on single index dataframe" + " with different column datatype is not supported." 
+ ) res = df.T.stack(dropna=False) # Result's index is a multiindex res.index.names = tuple(df.columns.names) + df.index.names From 57fdd4a4c674ac4406807ce75d9c74fda93db819 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 4 Jan 2021 16:56:42 -0800 Subject: [PATCH 6/6] Remove stale dataframe check --- python/cudf/cudf/core/reshape.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 8c732474b8c..cd1ce36c6c5 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -995,18 +995,17 @@ def unstack(df, level, fill_value=None): return df df = df.copy(deep=False) if not isinstance(df.index, cudf.MultiIndex): - if isinstance(df, cudf.DataFrame): - dtype = df._columns[0].dtype - for col in df._columns: - if not col.dtype == dtype: - raise ValueError( - "Calling unstack() on single index dataframe" - " with different column datatype is not supported." - ) - res = df.T.stack(dropna=False) - # Result's index is a multiindex - res.index.names = tuple(df.columns.names) + df.index.names - return res + dtype = df._columns[0].dtype + for col in df._columns: + if not col.dtype == dtype: + raise ValueError( + "Calling unstack() on single index dataframe" + " with different column datatype is not supported." + ) + res = df.T.stack(dropna=False) + # Result's index is a multiindex + res.index.names = tuple(df.columns.names) + df.index.names + return res else: columns = df.index._poplevels(level) index = df.index