From dcf949caf895dbbcd936b21d372d4072b95be37e Mon Sep 17 00:00:00 2001
From: skirui-source <71867292+skirui-source@users.noreply.github.com>
Date: Tue, 23 Feb 2021 22:31:10 -0800
Subject: [PATCH] Adding support to specify "level" parameter for `Dataframe.rename` (#7135)

Authors:
  - @skirui-source
  - Ashwin Srinath (@shwina)

Approvers:
  - Michael Wang (@isVoid)
  - Ashwin Srinath (@shwina)
  - Karthikeyan (@karthikeyann)
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7135
---
 python/cudf/cudf/core/column_accessor.py      | 85 ++++++++++++++++++-
 python/cudf/cudf/core/dataframe.py            | 47 ++++------
 python/cudf/cudf/core/frame.py                |  1 -
 .../cudf/cudf/tests/test_column_accessor.py   | 30 +++++++
 python/cudf/cudf/tests/test_dataframe.py      | 73 ++++++++++++++--
 5 files changed, 198 insertions(+), 38 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index f5823528d02..ad1a0c80ef5 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -5,7 +5,15 @@
 import itertools
 from collections import OrderedDict
 from collections.abc import MutableMapping
-from typing import TYPE_CHECKING, Any, Tuple, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Mapping,
+    Optional,
+    Tuple,
+    Union,
+)
 
 import pandas as pd
 
@@ -341,6 +349,81 @@ def _pad_key(self, key: Any, pad_value="") -> Any:
             key = (key,)
         return key + (pad_value,) * (self.nlevels - len(key))
 
+    def rename_levels(
+        self, mapper: Union[Mapping[Any, Any], Callable], level: Optional[int]
+    ) -> ColumnAccessor:
+        """
+        Rename the specified levels of the given ColumnAccessor
+
+        Parameters
+        ----------
+        self : ColumnAccessor of a given dataframe
+
+        mapper : dict-like or function transformations to apply to
+            the column label values depending on selected ``level``.
+
+            If dict-like, only replace the specified level of the
+            ColumnAccessor's keys (that match the mapper's keys) with
+            mapper's values
+
+            If callable, the function is applied only to the specified level
+            of the ColumnAccessor's keys.
+
+        level : int or None
+            For single-level columns, only 0 or None is accepted.
+            For MultiIndex columns, ``level`` is required; only that
+            level of the ColumnAccessor's keys is transformed.
+
+        Returns
+        -------
+        A new ColumnAccessor with values in the keys replaced according
+        to the given mapper and level.
+
+        """
+        if self.multiindex:
+
+            def rename_column(x):
+                x = list(x)
+                if isinstance(mapper, Mapping):
+                    x[level] = mapper.get(x[level], x[level])
+                else:
+                    x[level] = mapper(x[level])
+                x = tuple(x)
+                return x
+
+            if level is None:
+                raise NotImplementedError(
+                    "Renaming columns with a MultiIndex and level=None is "
+                    "not supported"
+                )
+            new_names = map(rename_column, self.keys())
+            ca = ColumnAccessor(
+                dict(zip(new_names, self.values())),
+                level_names=self.level_names,
+                multiindex=self.multiindex,
+            )
+
+        else:
+            if level is None:
+                level = 0
+            if level != 0:
+                raise IndexError(
+                    f"Too many levels: Index has only 1 level, not {level+1}"
+                )
+            if isinstance(mapper, Mapping):
+                new_names = (
+                    mapper.get(col_name, col_name) for col_name in self.keys()
+                )
+            else:
+                new_names = (mapper(col_name) for col_name in self.keys())
+            ca = ColumnAccessor(
+                dict(zip(new_names, self.values())),
+                level_names=self.level_names,
+                multiindex=self.multiindex,
+            )
+
+        return self.__class__(ca)
+
 
 def _compare_keys(target: Any, key: Any) -> bool:
     """
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 3e7e6625abe..605c1fbc6c6 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -9,7 +9,7 @@
 import sys
 import warnings
 from collections import OrderedDict, defaultdict
-from collections.abc import Iterable, Mapping, Sequence
+from collections.abc import Iterable, Sequence
 from typing import Any, Set, TypeVar
 
 import cupy
@@ -3445,11 +3445,6 @@ def rename(
                 "Only errors='ignore' is currently supported"
             )
 
-        if level:
-            raise NotImplementedError(
-                "Only level=False is currently supported"
-            )
-
         if mapper is None and index is None and columns is None:
             return self.copy(deep=copy)
 
@@ -3467,35 +3462,29 @@ def rename(
                     "Implicit conversion of index to "
                     "mixed type is not yet supported."
                 )
-            out = DataFrame(
-                index=self.index.replace(
+
+            if level is not None and isinstance(
+                self.index, cudf.core.multiindex.MultiIndex
+            ):
+                out_index = self.index.copy(deep=copy)
+                out_index.get_level_values(level).to_frame().replace(
                     to_replace=list(index.keys()),
-                    replacement=list(index.values()),
+                    value=list(index.values()),
+                    inplace=True,
+                )
+                out = DataFrame(index=out_index)
+            else:
+                out = DataFrame(
+                    index=self.index.replace(
+                        to_replace=list(index.keys()),
+                        replacement=list(index.values()),
+                    )
                 )
-            )
         else:
             out = DataFrame(index=self.index)
 
         if columns:
-            postfix = 1
-            if isinstance(columns, Mapping):
-                # It is possible for DataFrames with a MultiIndex columns
-                # object to have columns with the same name. The following
-                # use of _cols.items and ("_1", "_2"... allows the use of
-                # rename in this case
-                for key, col in self._data.items():
-                    if key in columns:
-                        if columns[key] in out._data:
-                            out_column = columns[key] + "_" + str(postfix)
-                            postfix += 1
-                        else:
-                            out_column = columns[key]
-                        out[out_column] = col
-                    else:
-                        out[key] = col
-            elif callable(columns):
-                for key, col in self._data.items():
-                    out[columns(key)] = col
+            out._data = self._data.rename_levels(mapper=columns, level=level)
         else:
             out._data = self._data.copy(deep=copy)
 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index e763a164003..a21201a7f10 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -27,7 +27,6 @@
     min_scalar_type,
 )
 
-
 T = TypeVar("T", bound="Frame")
 
 if TYPE_CHECKING:
diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py
index 964e79a57b0..86a7927dcac 100644
--- a/python/cudf/cudf/tests/test_column_accessor.py
+++ b/python/cudf/cudf/tests/test_column_accessor.py
@@ -255,3 +255,33 @@ def test_select_by_index_empty():
     got = ca.select_by_index([])
 
     check_ca_equal(expect, got)
+
+
+def test_replace_level_values_RangeIndex():
+    ca = ColumnAccessor(
+        {("a"): [1, 2, 3], ("b"): [2, 3, 4], ("c"): [3, 4, 5]},
+        multiindex=False,
+    )
+
+    expect = ColumnAccessor(
+        {("f"): [1, 2, 3], ("b"): [2, 3, 4], ("c"): [3, 4, 5]},
+        multiindex=False,
+    )
+
+    got = ca.rename_levels(mapper={"a": "f"}, level=0)
+    check_ca_equal(expect, got)
+
+
+def test_replace_level_values_MultiColumn():
+    ca = ColumnAccessor(
+        {("a", 1): [1, 2, 3], ("a", 2): [2, 3, 4], ("b", 1): [3, 4, 5]},
+        multiindex=True,
+    )
+
+    expect = ColumnAccessor(
+        {("f", 1): [1, 2, 3], ("f", 2): [2, 3, 4], ("b", 1): [3, 4, 5]},
+        multiindex=True,
+    )
+
+    got = ca.rename_levels(mapper={"a": "f"}, level=0)
+    check_ca_equal(expect, got)
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index d8005911fcd..b45d71bd088 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -648,13 +648,6 @@ def test_dataframe_column_rename(axis):
 
     assert_eq(expect, got)
 
-    gdf = gd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
-    rename_mapper = {"a": "z", "b": "z", "c": "z"}
-    expect = gd.DataFrame({"z": [1, 2, 3], "z_1": [4, 5, 6], "z_2": [7, 8, 9]})
-    got = gdf.rename(columns=rename_mapper)
-
-    assert_eq(expect, got)
-
 
 def test_dataframe_pop():
     pdf = pd.DataFrame(
@@ -8327,3 +8320,69 @@ def test_dataframe_setitem_cupy_array():
     gdf[gpu_array] = 1.5
 
     assert_eq(pdf, gdf)
+
+
+@pytest.mark.parametrize(
+    "data", [{"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}]
+)
+@pytest.mark.parametrize(
+    "index", [{0: 123, 1: 4, 2: 6}],
+)
+@pytest.mark.parametrize(
+    "level", ["x", 0],
+)
+def test_rename_for_level_MultiIndex_dataframe(data, index, level):
+    pdf = pd.DataFrame(
+        data,
+        index=pd.MultiIndex.from_tuples([(0, 1, 2), (1, 2, 3), (2, 3, 4)]),
+    )
+    pdf.index.names = ["x", "y", "z"]
+    gdf = gd.from_pandas(pdf)
+
+    expect = pdf.rename(index=index, level=level)
+    got = gdf.rename(index=index, level=level)
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "data", [{"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}]
+)
+@pytest.mark.parametrize(
+    "columns", [{"a": "f", "b": "g"}, {1: 3, 2: 4}, lambda s: 2 * s],
+)
+@pytest.mark.parametrize(
+    "level", [0, 1],
+)
+def test_rename_for_level_MultiColumn_dataframe(data, columns, level):
+    gdf = gd.DataFrame(data)
+    gdf.columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
+
+    pdf = gdf.to_pandas()
+
+    expect = pdf.rename(columns=columns, level=level)
+    got = gdf.rename(columns=columns, level=level)
+
+    assert_eq(expect, got)
+
+
+def test_rename_for_level_RangeIndex_dataframe():
+    gdf = gd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
+    pdf = gdf.to_pandas()
+
+    expect = pdf.rename(columns={"a": "f"}, index={0: 3, 1: 4}, level=0)
+    got = gdf.rename(columns={"a": "f"}, index={0: 3, 1: 4}, level=0)
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.xfail(reason="level=None not implemented yet")
+def test_rename_for_level_is_None_MC():
+    gdf = gd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
+    gdf.columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
+    pdf = gdf.to_pandas()
+
+    expect = pdf.rename(columns={"a": "f"}, level=None)
+    got = gdf.rename(columns={"a": "f"}, level=None)
+
+    assert_eq(expect, got)