Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid factorization in MultiIndex.to_pandas #15150

Merged
6 changes: 5 additions & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3543,12 +3543,16 @@ def rename(
)

if level is not None and isinstance(self.index, MultiIndex):
level = self.index._get_level_label(level)
out_index = self.index.copy(deep=copy)
out_index.get_level_values(level).to_frame().replace(
level_values = out_index.get_level_values(level)
level_values.to_frame().replace(
to_replace=list(index.keys()),
value=list(index.values()),
inplace=True,
)
out_index._data[level] = column.as_column(level_values)
out_index._compute_levels_and_codes()
out = DataFrame(index=out_index)
else:
to_replace = list(index.keys())
Expand Down
15 changes: 11 additions & 4 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1575,10 +1575,17 @@ def droplevel(self, level=-1):

@_cudf_nvtx_annotate
def to_pandas(self, *, nullable: bool = False) -> pd.MultiIndex:
result = self.to_frame(
index=False, name=list(range(self.nlevels))
).to_pandas(nullable=nullable)
return pd.MultiIndex.from_frame(result, names=self.names)
return pd.MultiIndex(
levels=[
level.to_pandas(nullable=nullable) for level in self.levels
],
# np.iinfo.min used as missing code, but pandas uses -1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if I follow what this comment means.

Is the np.iinfo.min coming from libcudf? (More generally, where is that value coming from?)

Should this be a "replace value" rather than a "clip" call?

Should we prefer to call something from libcudf instead of cupy?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it comes from libcudf indirectly, because that value is assigned to missing codes in MultiIndex.__init__:

code[code == -1] = np.iinfo(size_type_dtype).min

> Should this be a "replace value" rather than a "clip" call? Should we prefer to call something from libcudf instead of cupy?

Good point. A "replace value" would be more suitable here. I was somewhat blindly following how MultiIndex.codes generates cupy arrays and building on that.

codes=[
cp.clip(col.values, a_min=-1, a_max=None).get()
for col in self._codes_frame._columns
],
names=self.names,
)

@classmethod
@_cudf_nvtx_annotate
Expand Down
16 changes: 4 additions & 12 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -9332,18 +9332,10 @@ def test_dataframe_setitem_cupy_array():
assert_eq(pdf, gdf)


@pytest.mark.parametrize(
"data", [{"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}]
)
@pytest.mark.parametrize(
"index",
[{0: 123, 1: 4, 2: 6}],
)
@pytest.mark.parametrize(
"level",
["x", 0],
)
def test_rename_for_level_MultiIndex_dataframe(data, index, level):
@pytest.mark.parametrize("level", ["x", 0])
def test_rename_for_level_MultiIndex_dataframe(level):
data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
index = {0: 123, 1: 4, 2: 6}
pdf = pd.DataFrame(
data,
index=pd.MultiIndex.from_tuples([(0, 1, 2), (1, 2, 3), (2, 3, 4)]),
Expand Down
Loading