Skip to content

Commit

Permalink
Ensure MultiIndex.to_frame deep copies columns (rapidsai#16110)
Browse files Browse the repository at this point in the history
Additionally, this allows simplification in `MultiIndex.__repr__` which avoids a shallow copy and also caught a bug where `NaT` was not supposed to be quoted

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: rapidsai#16110
  • Loading branch information
mroeschke authored Jun 27, 2024
1 parent a71c249 commit 2ed69c9
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 63 deletions.
88 changes: 30 additions & 58 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from cudf.api.types import is_integer, is_list_like, is_object_dtype
from cudf.core import column
from cudf.core._base_index import _return_get_indexer_result
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.frame import Frame
from cudf.core.index import (
BaseIndex,
Expand Down Expand Up @@ -446,45 +447,26 @@ def __repr__(self):
)
preprocess = self.take(indices)
else:
preprocess = self.copy(deep=False)

if any(col.has_nulls() for col in preprocess._data.columns):
preprocess_df = preprocess.to_frame(index=False)
for name, col in preprocess._data.items():
if isinstance(
col,
(
column.datetime.DatetimeColumn,
column.timedelta.TimeDeltaColumn,
),
):
preprocess_df[name] = col.astype("str").fillna(
str(cudf.NaT)
)
preprocess = self

tuples_list = list(
zip(
*list(
map(lambda val: pd.NA if val is None else val, col)
for col in preprocess_df.to_arrow()
.to_pydict()
.values()
)
)
)
arrays = []
for name, col in zip(self.names, preprocess._columns):
try:
pd_idx = col.to_pandas(nullable=True)
except NotImplementedError:
pd_idx = col.to_pandas(nullable=False)
pd_idx.name = name
arrays.append(pd_idx)

preprocess = preprocess.to_pandas(nullable=True)
preprocess.values[:] = tuples_list
else:
preprocess = preprocess.to_pandas(nullable=True)
preprocess_pd = pd.MultiIndex.from_arrays(arrays)

output = repr(preprocess)
output = repr(preprocess_pd)
output_prefix = self.__class__.__name__ + "("
output = output.lstrip(output_prefix)
lines = output.split("\n")

if len(lines) > 1:
if "length=" in lines[-1] and len(self) != len(preprocess):
if "length=" in lines[-1] and len(self) != len(preprocess_pd):
last_line = lines[-1]
length_index = last_line.index("length=")
last_line = last_line[:length_index] + f"length={len(self)})"
Expand Down Expand Up @@ -1022,42 +1004,32 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False):
a c a c
b d b d
"""
# TODO: Currently this function makes a shallow copy, which is
# incorrect. We want to make a deep copy, otherwise further
# modifications of the resulting DataFrame will affect the MultiIndex.
if name is no_default:
column_names = [
level if name is None else name
for level, name in enumerate(self.names)
]
elif not is_list_like(name):
raise TypeError(
"'name' must be a list / sequence of column names."
)
elif len(name) != len(self.levels):
raise ValueError(
"'name' should have the same length as "
"number of levels on index."
)
else:
if not is_list_like(name):
raise TypeError(
"'name' must be a list / sequence of column names."
)
if len(name) != len(self.levels):
raise ValueError(
"'name' should have the same length as "
"number of levels on index."
)
column_names = name

all_none_names = None
if not (
all_none_names := all(x is None for x in column_names)
) and len(column_names) != len(set(column_names)):
if len(column_names) != len(set(column_names)):
raise ValueError("Duplicate column names are not allowed")
df = cudf.DataFrame._from_data(
data=self._data,
columns=column_names
if name is not no_default and not all_none_names
else None,
ca = ColumnAccessor(
dict(zip(column_names, (col.copy() for col in self._columns))),
verify=False,
)
return cudf.DataFrame._from_data(
data=ca, index=self if index else None
)

if index:
df = df.set_index(self)

return df

@_cudf_nvtx_annotate
def get_level_values(self, level):
Expand Down Expand Up @@ -1243,7 +1215,7 @@ def values(self):

@classmethod
@_cudf_nvtx_annotate
def from_frame(cls, df, names=None):
def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None):
"""
Make a MultiIndex from a DataFrame.
Expand Down
10 changes: 5 additions & 5 deletions python/cudf/cudf/tests/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1210,7 +1210,7 @@ def test_multiindex_repr(pmi, max_seq_items):
.index,
textwrap.dedent(
"""
MultiIndex([('abc', 'NaT', 0.345),
MultiIndex([('abc', NaT, 0.345),
( <NA>, '0 days 00:00:00.000000001', <NA>),
('xyz', '0 days 00:00:00.000000002', 100.0),
( <NA>, '0 days 00:00:00.000000003', 10.0)],
Expand Down Expand Up @@ -1252,10 +1252,10 @@ def test_multiindex_repr(pmi, max_seq_items):
.index,
textwrap.dedent(
"""
MultiIndex([('NaT', <NA>),
('NaT', <NA>),
('NaT', <NA>),
('NaT', <NA>)],
MultiIndex([(NaT, <NA>),
(NaT, <NA>),
(NaT, <NA>),
(NaT, <NA>)],
names=['b', 'a'])
"""
),
Expand Down

0 comments on commit 2ed69c9

Please sign in to comment.