From 9678c900a484818b489b723e2568e7b7c0d0b090 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Mar 2024 12:54:09 -1000 Subject: [PATCH] Avoid factorization in MultiIndex.to_pandas (#15150) This also uncovered a bug in `DataFrame.rename` where the underlying `MultiIndex` `ColumnAccessor` was not being replaced Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15150 --- python/cudf/cudf/core/dataframe.py | 6 +++++- python/cudf/cudf/core/multiindex.py | 15 +++++++++++---- python/cudf/cudf/tests/test_dataframe.py | 16 ++++------------ python/cudf/cudf/tests/test_dropna.py | 11 +---------- 4 files changed, 21 insertions(+), 27 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1dc79127f60..6a4fe346eb1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3583,12 +3583,16 @@ def rename( ) if level is not None and isinstance(self.index, MultiIndex): + level = self.index._get_level_label(level) out_index = self.index.copy(deep=copy) - out_index.get_level_values(level).to_frame().replace( + level_values = out_index.get_level_values(level) + level_values.to_frame().replace( to_replace=list(index.keys()), value=list(index.values()), inplace=True, ) + out_index._data[level] = column.as_column(level_values) + out_index._compute_levels_and_codes() out = DataFrame(index=out_index) else: to_replace = list(index.keys()) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 315a21020a2..019daacddba 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1577,10 +1577,17 @@ def droplevel(self, level=-1): def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.MultiIndex: - result = self.to_frame( - index=False, name=list(range(self.nlevels)) - ).to_pandas(nullable=nullable, arrow_type=arrow_type) - return pd.MultiIndex.from_frame(result, names=self.names) + # cudf uses np.iinfo(size_type_dtype).min as missing code + # pandas uses -1 as missing code + pd_codes = self._codes_frame.replace(np.iinfo(size_type_dtype).min, -1) + return pd.MultiIndex( + levels=[ + level.to_pandas(nullable=nullable, arrow_type=arrow_type) + for level in self.levels + ], + codes=[col.values_host for col in pd_codes._columns], + names=self.names, + ) @classmethod @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 444a4c60055..e6cf3988d23 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9332,18 +9332,10 @@ def test_dataframe_setitem_cupy_array(): assert_eq(pdf, gdf) -@pytest.mark.parametrize( - "data", [{"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}] -) -@pytest.mark.parametrize( - "index", - [{0: 123, 1: 4, 2: 6}], -) -@pytest.mark.parametrize( - "level", - ["x", 0], -) -def test_rename_for_level_MultiIndex_dataframe(data, index, level): +@pytest.mark.parametrize("level", ["x", 0]) +def test_rename_for_level_MultiIndex_dataframe(level): + data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} + index = {0: 123, 1: 4, 2: 6} pdf = pd.DataFrame( data, index=pd.MultiIndex.from_tuples([(0, 1, 2), (1, 2, 3), (2, 3, 4)]), diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index f1acd7b4320..c3c8ed922f0 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -252,21 +252,12 @@ def test_dropna_index(data, dtype): @pytest.mark.parametrize("data", [[[1, None, 2], [None, None, 2]]]) @pytest.mark.parametrize("how", ["all", "any"]) -def test_dropna_multiindex(data, how, request): +def test_dropna_multiindex(data, how): pi = pd.MultiIndex.from_arrays(data) gi = cudf.from_pandas(pi) expect = pi.dropna(how) got = gi.dropna(how) - - if how == "all" and "data0" in request.node.callspec.id: - request.applymarker( - pytest.mark.xfail( - reason="pandas NA value np.nan results in float type. " - "cuDF correctly retains int type " - "(https://github.com/pandas-dev/pandas/issues/44792)" - ) - ) assert_eq(expect, got)