Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid factorization in MultiIndex.to_pandas #15150

Merged
6 changes: 5 additions & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3549,12 +3549,16 @@ def rename(
)

if level is not None and isinstance(self.index, MultiIndex):
level = self.index._get_level_label(level)
out_index = self.index.copy(deep=copy)
out_index.get_level_values(level).to_frame().replace(
level_values = out_index.get_level_values(level)
level_values.to_frame().replace(
to_replace=list(index.keys()),
value=list(index.values()),
inplace=True,
)
out_index._data[level] = column.as_column(level_values)
out_index._compute_levels_and_codes()
out = DataFrame(index=out_index)
else:
to_replace = list(index.keys())
Expand Down
15 changes: 11 additions & 4 deletions python/cudf/cudf/core/multiindex.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -1577,10 +1577,17 @@ def droplevel(self, level=-1):
def to_pandas(
self, *, nullable: bool = False, arrow_type: bool = False
) -> pd.MultiIndex:
result = self.to_frame(
index=False, name=list(range(self.nlevels))
).to_pandas(nullable=nullable, arrow_type=arrow_type)
return pd.MultiIndex.from_frame(result, names=self.names)
# cudf uses np.iinfo(size_type_dtype).min as missing code
# pandas uses -1 as missing code
pd_codes = self._codes_frame.replace(np.iinfo(size_type_dtype).min, -1)
return pd.MultiIndex(
levels=[
level.to_pandas(nullable=nullable, arrow_type=arrow_type)
for level in self.levels
],
codes=[col.values_host for col in pd_codes._columns],
names=self.names,
)

@classmethod
@_cudf_nvtx_annotate
Expand Down
16 changes: 4 additions & 12 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -9331,18 +9331,10 @@ def test_dataframe_setitem_cupy_array():
assert_eq(pdf, gdf)


@pytest.mark.parametrize(
"data", [{"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}]
)
@pytest.mark.parametrize(
"index",
[{0: 123, 1: 4, 2: 6}],
)
@pytest.mark.parametrize(
"level",
["x", 0],
)
def test_rename_for_level_MultiIndex_dataframe(data, index, level):
@pytest.mark.parametrize("level", ["x", 0])
def test_rename_for_level_MultiIndex_dataframe(level):
data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
index = {0: 123, 1: 4, 2: 6}
pdf = pd.DataFrame(
data,
index=pd.MultiIndex.from_tuples([(0, 1, 2), (1, 2, 3), (2, 3, 4)]),
Expand Down
11 changes: 1 addition & 10 deletions python/cudf/cudf/tests/test_dropna.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -252,21 +252,12 @@ def test_dropna_index(data, dtype):

@pytest.mark.parametrize("data", [[[1, None, 2], [None, None, 2]]])
@pytest.mark.parametrize("how", ["all", "any"])
def test_dropna_multiindex(data, how, request):
def test_dropna_multiindex(data, how):
pi = pd.MultiIndex.from_arrays(data)
gi = cudf.from_pandas(pi)

expect = pi.dropna(how)
got = gi.dropna(how)

if how == "all" and "data0" in request.node.callspec.id:
request.applymarker(
pytest.mark.xfail(
reason="pandas NA value np.nan results in float type. "
"cuDF correctly retains int type "
"(https://github.com/pandas-dev/pandas/issues/44792)"
)
)
assert_eq(expect, got)


Expand Down
Loading