Skip to content

Commit

Permalink
Add ability to modify and propagate names of columns object (#17597)
Browse files Browse the repository at this point in the history
Fixes: #17482, #14012

This PR fixes a long-standing issue where modifying the `name` of a `columns` object never propagated to the parent object. It does so by making `to_pandas_index` a cached property and, whenever that property has been invoked, reading its names inside the `level_names` property.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #17597
  • Loading branch information
galipremsagar authored Dec 19, 2024
1 parent 0e01dbd commit 8e9254b
Show file tree
Hide file tree
Showing 10 changed files with 85 additions and 46 deletions.
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1447,7 +1447,7 @@ def _union(self, other, sort=None):
other_df["order"] = other_df.index
res = self_df.merge(other_df, on=[0], how="outer")
res = res.sort_values(
by=res._data.to_pandas_index()[1:], ignore_index=True
by=res._data.to_pandas_index[1:], ignore_index=True
)
union_result = cudf.core.index._index_from_data({0: res._data[0]})

Expand Down
17 changes: 14 additions & 3 deletions python/cudf/cudf/core/column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,11 +207,16 @@ def _from_columns_like_self(

@property
def level_names(self) -> tuple[abc.Hashable, ...]:
    """Return the names of the column levels.

    If a pandas Index has already been materialized (and cached) via
    ``to_pandas_index``, its ``names`` take precedence so that external
    mutations of that Index propagate back to this accessor.
    """
    if self.is_cached("to_pandas_index"):
        return self.to_pandas_index.names
    # Fall back to stored level names; when unset/empty, synthesize a
    # tuple of Nones sized to the number of levels (at least one).
    if not self._level_names:
        return (None,) * max(1, self.nlevels)
    return self._level_names

def is_cached(self, attr_name: str) -> bool:
    """Return True if ``attr_name`` is a cached property already computed
    (i.e. stored in the instance ``__dict__``) on this object."""
    return attr_name in vars(self)

@property
def nlevels(self) -> int:
if len(self) == 0:
Expand Down Expand Up @@ -262,7 +267,12 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None:
new_ncols: int
len(self) after self._data was modified
"""
cached_properties = ("columns", "names", "_grouped_data")
cached_properties = (
"columns",
"names",
"_grouped_data",
"to_pandas_index",
)
for attr in cached_properties:
try:
self.__delattr__(attr)
Expand All @@ -276,6 +286,7 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None:
except AttributeError:
pass

@cached_property
def to_pandas_index(self) -> pd.Index:
"""Convert the keys of the ColumnAccessor to a Pandas Index object."""
if self.multiindex and len(self.level_names) > 0:
Expand Down Expand Up @@ -726,10 +737,10 @@ def droplevel(self, level: int) -> None:
}
new_ncols = len(self)
self._level_names = (
self._level_names[:level] + self._level_names[level + 1 :]
self.level_names[:level] + self.level_names[level + 1 :]
)

if len(self._level_names) == 1:
if len(self.level_names) == 1:
# can't use nlevels, as it depends on multiindex
self.multiindex = False
self._clear_cache(old_ncols, new_ncols)
Expand Down
46 changes: 23 additions & 23 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -961,7 +961,7 @@ def _init_from_series_list(self, data, columns, index):
warnings.simplefilter("ignore", FutureWarning)
concat_df = cudf.concat(data, axis=1)

cols = concat_df._data.to_pandas_index()
cols = concat_df._data.to_pandas_index
if cols.dtype == "object":
concat_df.columns = cols.astype("str")

Expand Down Expand Up @@ -2092,7 +2092,7 @@ def _make_operands_and_index_for_binop(
equal_columns = True
elif isinstance(other, Series):
if (
not (self_pd_columns := self._data.to_pandas_index()).equals(
not (self_pd_columns := self._data.to_pandas_index).equals(
other_pd_index := other.index.to_pandas()
)
and not can_reindex
Expand All @@ -2117,8 +2117,8 @@ def _make_operands_and_index_for_binop(
and fn in cudf.utils.utils._EQUALITY_OPS
and (
not self.index.equals(other.index)
or not self._data.to_pandas_index().equals(
other._data.to_pandas_index()
or not self._data.to_pandas_index.equals(
other._data.to_pandas_index
)
)
):
Expand Down Expand Up @@ -2162,11 +2162,11 @@ def _make_operands_and_index_for_binop(

if not equal_columns:
if isinstance(other, DataFrame):
column_names_list = self._data.to_pandas_index().join(
other._data.to_pandas_index(), how="outer"
column_names_list = self._data.to_pandas_index.join(
other._data.to_pandas_index, how="outer"
)
elif isinstance(other, Series):
column_names_list = self._data.to_pandas_index().join(
column_names_list = self._data.to_pandas_index.join(
other.index.to_pandas(), how="outer"
)
else:
Expand Down Expand Up @@ -2626,8 +2626,8 @@ def update(
if not isinstance(other, DataFrame):
other = DataFrame(other)

self_cols = self._data.to_pandas_index()
if not self_cols.equals(other._data.to_pandas_index()):
self_cols = self._data.to_pandas_index
if not self_cols.equals(other._data.to_pandas_index):
other = other.reindex(self_cols, axis=1)
if not self.index.equals(other.index):
other = other.reindex(self.index, axis=0)
Expand Down Expand Up @@ -2663,7 +2663,7 @@ def __iter__(self):
def __contains__(self, item):
# This must check against containment in the pandas Index and not
# self._column_names to handle NA, None, nan, etc. correctly.
return item in self._data.to_pandas_index()
return item in self._data.to_pandas_index

@_performance_tracking
def items(self):
Expand Down Expand Up @@ -2700,14 +2700,14 @@ def at(self):

@property # type: ignore
@_external_only_api(
"Use _column_names instead, or _data.to_pandas_index() if a pandas "
"Use _column_names instead, or _data.to_pandas_index if a pandas "
"index is absolutely necessary. For checking if the columns are a "
"MultiIndex, use _data.multiindex."
)
@_performance_tracking
def columns(self):
"""Returns a tuple of columns"""
return self._data.to_pandas_index()
return self._data.to_pandas_index

@columns.setter # type: ignore
@_performance_tracking
Expand Down Expand Up @@ -2916,7 +2916,7 @@ def reindex(
df = self
else:
columns = cudf.Index(columns)
intersection = self._data.to_pandas_index().intersection(
intersection = self._data.to_pandas_index.intersection(
columns.to_pandas()
)
df = self.loc[:, intersection]
Expand Down Expand Up @@ -3430,7 +3430,7 @@ def axes(self):
Index(['key', 'k2', 'val', 'temp'], dtype='object')]
"""
return [self.index, self._data.to_pandas_index()]
return [self.index, self._data.to_pandas_index]

def diff(self, periods=1, axis=0):
"""
Expand Down Expand Up @@ -4129,7 +4129,7 @@ def transpose(self):
Not supporting *copy* because default and only behavior is
copy=True
"""
index = self._data.to_pandas_index()
index = self._data.to_pandas_index
columns = self.index.copy(deep=False)
if self._num_columns == 0 or self._num_rows == 0:
return DataFrame(index=index, columns=columns)
Expand Down Expand Up @@ -5535,7 +5535,7 @@ def to_pandas(
}

out_df = pd.DataFrame(out_data, index=out_index)
out_df.columns = self._data.to_pandas_index()
out_df.columns = self._data.to_pandas_index

return out_df

Expand Down Expand Up @@ -6487,7 +6487,7 @@ def _reduce(
source = self._get_columns_by_label(numeric_cols)
if source.empty:
return Series(
index=self._data.to_pandas_index()[:0]
index=self._data.to_pandas_index[:0]
if axis == 0
else source.index,
dtype="float64",
Expand Down Expand Up @@ -6540,7 +6540,7 @@ def _reduce(
"Columns must all have the same dtype to "
f"perform {op=} with {axis=}"
)
pd_index = source._data.to_pandas_index()
pd_index = source._data.to_pandas_index
if source._data.multiindex:
idx = MultiIndex.from_pandas(pd_index)
else:
Expand Down Expand Up @@ -7242,7 +7242,7 @@ def stack(
]
has_unnamed_levels = len(unnamed_levels_indices) > 0

column_name_idx = self._data.to_pandas_index()
column_name_idx = self._data.to_pandas_index
# Construct new index from the levels specified by `level`
named_levels = pd.MultiIndex.from_arrays(
[column_name_idx.get_level_values(lv) for lv in level_indices]
Expand Down Expand Up @@ -7432,7 +7432,7 @@ def cov(self, min_periods=None, ddof: int = 1, numeric_only: bool = False):
)

cov = cupy.cov(self.values, ddof=ddof, rowvar=False)
cols = self._data.to_pandas_index()
cols = self._data.to_pandas_index
df = DataFrame(cupy.asfortranarray(cov), index=cols)
df._set_columns_like(self._data)
return df
Expand Down Expand Up @@ -7475,7 +7475,7 @@ def corr(
)

corr = cupy.corrcoef(values, rowvar=False)
cols = self._data.to_pandas_index()
cols = self._data.to_pandas_index
df = DataFrame(cupy.asfortranarray(corr), index=cols)
df._set_columns_like(self._data)
return df
Expand Down Expand Up @@ -7544,7 +7544,7 @@ def keys(self):
>>> df.keys()
Index([0, 1, 2, 3], dtype='int64')
"""
return self._data.to_pandas_index()
return self._data.to_pandas_index

def itertuples(self, index=True, name="Pandas"):
"""
Expand Down Expand Up @@ -7778,7 +7778,7 @@ def nunique(self, axis=0, dropna: bool = True) -> Series:
raise NotImplementedError("axis parameter is not supported yet.")
counts = [col.distinct_count(dropna=dropna) for col in self._columns]
return self._constructor_sliced(
counts, index=self._data.to_pandas_index()
counts, index=self._data.to_pandas_index
)

def _sample_axis_1(
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -3087,7 +3087,7 @@ def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs):

# drop the first level if we have a multiindex
if result._data.nlevels > 1:
result.columns = result._data.to_pandas_index().droplevel(0)
result.columns = result._data.to_pandas_index.droplevel(0)

return result

Expand Down
12 changes: 5 additions & 7 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1106,13 +1106,11 @@ def dot(self, other, reflect=False):
lhs = self.reindex(index=common, copy=False).values
rhs = other.reindex(index=common, copy=False).values
if isinstance(other, cudf.DataFrame):
result_index = other._data.to_pandas_index()
result_index = other._data.to_pandas_index
elif isinstance(self, cudf.DataFrame) and isinstance(
other, (cudf.Series, cudf.DataFrame)
):
common = self._data.to_pandas_index().union(
other.index.to_pandas()
)
common = self._data.to_pandas_index.union(other.index.to_pandas())
if len(common) > self._num_columns or len(common) > len(
other.index
):
Expand All @@ -1124,7 +1122,7 @@ def dot(self, other, reflect=False):
rhs = other.reindex(index=common, copy=False).values
lhs = lhs.values
if isinstance(other, cudf.DataFrame):
result_cols = other._data.to_pandas_index()
result_cols = other._data.to_pandas_index

elif isinstance(
other, (cp.ndarray, np.ndarray)
Expand Down Expand Up @@ -2244,7 +2242,7 @@ def truncate(self, before=None, after=None, axis=0, copy=True):
if not copy:
raise ValueError("Truncating with copy=False is not supported.")
axis = self._get_axis_from_axis_arg(axis)
ax = self.index if axis == 0 else self._data.to_pandas_index()
ax = self.index if axis == 0 else self._data.to_pandas_index

if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
raise ValueError("truncate requires a sorted index")
Expand Down Expand Up @@ -6770,7 +6768,7 @@ def _drop_rows_by_labels(
return obj.__class__._from_data(
join_res.iloc[:, idx_nlv:]._data,
index=midx,
columns=obj._data.to_pandas_index(),
columns=obj._data.to_pandas_index,
)

else:
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1123,7 +1123,7 @@ def _concat(cls, objs) -> Self:
# TODO: Verify if this is really necessary or if we can rely on
# DataFrame._concat.
if len(source_data) > 1:
colnames = source_data[0]._data.to_pandas_index()
colnames = source_data[0]._data.to_pandas_index
for obj in source_data[1:]:
obj.columns = colnames

Expand Down Expand Up @@ -2068,7 +2068,7 @@ def _union(self, other, sort=None) -> Self:

result_df = self_df.merge(other_df, on=col_names, how="outer")
result_df = result_df.sort_values(
by=result_df._data.to_pandas_index()[self.nlevels :],
by=result_df._data.to_pandas_index[self.nlevels :],
ignore_index=True,
)

Expand Down
9 changes: 5 additions & 4 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,8 +431,9 @@ def concat(

result_columns = (
objs[0]
._data.to_pandas_index()
.append([obj._data.to_pandas_index() for obj in objs[1:]])
._data.to_pandas_index.append(
[obj._data.to_pandas_index for obj in objs[1:]]
)
.unique()
)

Expand Down Expand Up @@ -689,7 +690,7 @@ def _tile(A, reps):
if not value_vars:
# TODO: Use frame._data.label_dtype when it's more consistently set
var_data = cudf.Series(
value_vars, dtype=frame._data.to_pandas_index().dtype
value_vars, dtype=frame._data.to_pandas_index.dtype
)
else:
var_data = (
Expand Down Expand Up @@ -1273,7 +1274,7 @@ def unstack(df, level, fill_value=None, sort: bool = True):
res = df.T.stack(future_stack=False)
# Result's index is a multiindex
res.index.names = (
tuple(df._data.to_pandas_index().names) + df.index.names
tuple(df._data.to_pandas_index.names) + df.index.names
)
return res
else:
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/testing/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,8 +692,8 @@ def assert_frame_equal(
)

pd.testing.assert_index_equal(
left._data.to_pandas_index(),
right._data.to_pandas_index(),
left._data.to_pandas_index,
right._data.to_pandas_index,
exact=check_column_type,
check_names=check_names,
check_exact=check_exact,
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/tests/test_column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def test_to_pandas_simple(simple_data):
# Index([], dtype='object'), and `integer` for RangeIndex()
# to ignore this `inferred_type` comparison, we pass exact=False.
assert_eq(
ca.to_pandas_index(),
ca.to_pandas_index,
pd.DataFrame(
{key: value.values_host for key, value in simple_data.items()}
).columns,
Expand All @@ -75,7 +75,7 @@ def test_to_pandas_simple(simple_data):
def test_to_pandas_multiindex(mi_data):
ca = ColumnAccessor(mi_data, multiindex=True)
assert_eq(
ca.to_pandas_index(),
ca.to_pandas_index,
pd.DataFrame(
{key: value.values_host for key, value in mi_data.items()}
).columns,
Expand All @@ -89,7 +89,7 @@ def test_to_pandas_multiindex_names():
level_names=("foo", "bar"),
)
assert_eq(
ca.to_pandas_index(),
ca.to_pandas_index,
pd.MultiIndex.from_tuples(
(("a", "b"), ("c", "d")), names=("foo", "bar")
),
Expand Down
Loading

0 comments on commit 8e9254b

Please sign in to comment.