From 8ac7f7ea4235e056064e56778892e43f67165239 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 26 Feb 2024 16:50:56 -0800 Subject: [PATCH 1/2] Avoid factorization in MultiIndex.to_pandas --- python/cudf/cudf/core/dataframe.py | 6 +++++- python/cudf/cudf/core/multiindex.py | 15 +++++++++++---- python/cudf/cudf/tests/test_dataframe.py | 16 ++++------------ 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 9b4a79c6841..d33372e9035 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3543,12 +3543,16 @@ def rename( ) if level is not None and isinstance(self.index, MultiIndex): + level = self.index._get_level_label(level) out_index = self.index.copy(deep=copy) - out_index.get_level_values(level).to_frame().replace( + level_values = out_index.get_level_values(level) + level_values.to_frame().replace( to_replace=list(index.keys()), value=list(index.values()), inplace=True, ) + out_index._data[level] = column.as_column(level_values) + out_index._compute_levels_and_codes() out = DataFrame(index=out_index) else: to_replace = list(index.keys()) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index df1b1ea10cd..32ca67e6828 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1575,10 +1575,17 @@ def droplevel(self, level=-1): @_cudf_nvtx_annotate def to_pandas(self, *, nullable: bool = False) -> pd.MultiIndex: - result = self.to_frame( - index=False, name=list(range(self.nlevels)) - ).to_pandas(nullable=nullable) - return pd.MultiIndex.from_frame(result, names=self.names) + return pd.MultiIndex( + levels=[ + level.to_pandas(nullable=nullable) for level in self.levels + ], + # np.iinfo.min used as missing code, but pandas uses -1 + codes=[ + cp.clip(col.values, a_min=-1, a_max=None).get() + for col in self._codes_frame._columns + ], + names=self.names, + ) @classmethod @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2084db89909..5380503b64f 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9332,18 +9332,10 @@ def test_dataframe_setitem_cupy_array(): assert_eq(pdf, gdf) -@pytest.mark.parametrize( - "data", [{"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}] -) -@pytest.mark.parametrize( - "index", - [{0: 123, 1: 4, 2: 6}], -) -@pytest.mark.parametrize( - "level", - ["x", 0], -) -def test_rename_for_level_MultiIndex_dataframe(data, index, level): +@pytest.mark.parametrize("level", ["x", 0]) +def test_rename_for_level_MultiIndex_dataframe(level): + data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} + index = {0: 123, 1: 4, 2: 6} pdf = pd.DataFrame( data, index=pd.MultiIndex.from_tuples([(0, 1, 2), (1, 2, 3), (2, 3, 4)]), From 3687a1140942195912a55526a2293a1aafb9d57d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 27 Feb 2024 16:03:42 -0800 Subject: [PATCH 2/2] Use replace --- python/cudf/cudf/core/multiindex.py | 9 ++++----- python/cudf/cudf/tests/test_dropna.py | 11 +---------- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 32ca67e6828..0df63df6830 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1575,15 +1575,14 @@ def droplevel(self, level=-1): @_cudf_nvtx_annotate def to_pandas(self, *, nullable: bool = False) -> pd.MultiIndex: + # cudf uses np.iinfo(size_type_dtype).min as missing code + # pandas uses -1 as missing code + pd_codes = self._codes_frame.replace(np.iinfo(size_type_dtype).min, -1) return pd.MultiIndex( levels=[ level.to_pandas(nullable=nullable) for level in self.levels ], - # np.iinfo.min used as missing code, but pandas uses -1 - codes=[ - cp.clip(col.values, a_min=-1, a_max=None).get() - for col in self._codes_frame._columns - ], + codes=[col.values_host for col in pd_codes._columns], names=self.names, ) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index f1acd7b4320..c3c8ed922f0 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -252,21 +252,12 @@ def test_dropna_index(data, dtype): @pytest.mark.parametrize("data", [[[1, None, 2], [None, None, 2]]]) @pytest.mark.parametrize("how", ["all", "any"]) -def test_dropna_multiindex(data, how, request): +def test_dropna_multiindex(data, how): pi = pd.MultiIndex.from_arrays(data) gi = cudf.from_pandas(pi) expect = pi.dropna(how) got = gi.dropna(how) - - if how == "all" and "data0" in request.node.callspec.id: - request.applymarker( - pytest.mark.xfail( - reason="pandas NA value np.nan results in float type. " - "cuDF correctly retains int type " - "(https://github.com/pandas-dev/pandas/issues/44792)" - ) - ) assert_eq(expect, got)