Skip to content

Commit

Permalink
Avoid factorization in MultiIndex.to_pandas (#15150)
Browse files Browse the repository at this point in the history
This also uncovered a bug in `DataFrame.rename`: when renaming at a given `level` of a `MultiIndex`, the underlying `ColumnAccessor` of the `MultiIndex` was not being replaced with the renamed values.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #15150
  • Loading branch information
mroeschke authored Mar 6, 2024
1 parent c299a62 commit 9678c90
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 27 deletions.
6 changes: 5 additions & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3583,12 +3583,16 @@ def rename(
)

if level is not None and isinstance(self.index, MultiIndex):
level = self.index._get_level_label(level)
out_index = self.index.copy(deep=copy)
out_index.get_level_values(level).to_frame().replace(
level_values = out_index.get_level_values(level)
level_values.to_frame().replace(
to_replace=list(index.keys()),
value=list(index.values()),
inplace=True,
)
out_index._data[level] = column.as_column(level_values)
out_index._compute_levels_and_codes()
out = DataFrame(index=out_index)
else:
to_replace = list(index.keys())
Expand Down
15 changes: 11 additions & 4 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1577,10 +1577,17 @@ def droplevel(self, level=-1):
def to_pandas(
self, *, nullable: bool = False, arrow_type: bool = False
) -> pd.MultiIndex:
result = self.to_frame(
index=False, name=list(range(self.nlevels))
).to_pandas(nullable=nullable, arrow_type=arrow_type)
return pd.MultiIndex.from_frame(result, names=self.names)
# cudf uses np.iinfo(size_type_dtype).min as missing code
# pandas uses -1 as missing code
pd_codes = self._codes_frame.replace(np.iinfo(size_type_dtype).min, -1)
return pd.MultiIndex(
levels=[
level.to_pandas(nullable=nullable, arrow_type=arrow_type)
for level in self.levels
],
codes=[col.values_host for col in pd_codes._columns],
names=self.names,
)

@classmethod
@_cudf_nvtx_annotate
Expand Down
16 changes: 4 additions & 12 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -9332,18 +9332,10 @@ def test_dataframe_setitem_cupy_array():
assert_eq(pdf, gdf)


@pytest.mark.parametrize(
"data", [{"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}]
)
@pytest.mark.parametrize(
"index",
[{0: 123, 1: 4, 2: 6}],
)
@pytest.mark.parametrize(
"level",
["x", 0],
)
def test_rename_for_level_MultiIndex_dataframe(data, index, level):
@pytest.mark.parametrize("level", ["x", 0])
def test_rename_for_level_MultiIndex_dataframe(level):
data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
index = {0: 123, 1: 4, 2: 6}
pdf = pd.DataFrame(
data,
index=pd.MultiIndex.from_tuples([(0, 1, 2), (1, 2, 3), (2, 3, 4)]),
Expand Down
11 changes: 1 addition & 10 deletions python/cudf/cudf/tests/test_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,21 +252,12 @@ def test_dropna_index(data, dtype):

@pytest.mark.parametrize("data", [[[1, None, 2], [None, None, 2]]])
@pytest.mark.parametrize("how", ["all", "any"])
def test_dropna_multiindex(data, how, request):
def test_dropna_multiindex(data, how):
pi = pd.MultiIndex.from_arrays(data)
gi = cudf.from_pandas(pi)

expect = pi.dropna(how)
got = gi.dropna(how)

if how == "all" and "data0" in request.node.callspec.id:
request.applymarker(
pytest.mark.xfail(
reason="pandas NA value np.nan results in float type. "
"cuDF correctly retains int type "
"(https://github.com/pandas-dev/pandas/issues/44792)"
)
)
assert_eq(expect, got)


Expand Down

0 comments on commit 9678c90

Please sign in to comment.