diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index f4543bc6156..c2f3c782d10 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1447,7 +1447,7 @@ def _union(self, other, sort=None): other_df["order"] = other_df.index res = self_df.merge(other_df, on=[0], how="outer") res = res.sort_values( - by=res._data.to_pandas_index()[1:], ignore_index=True + by=res._data.to_pandas_index[1:], ignore_index=True ) union_result = cudf.core.index._index_from_data({0: res._data[0]}) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index e4fd82e819b..aaf7d071dff 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -207,11 +207,16 @@ def _from_columns_like_self( @property def level_names(self) -> tuple[abc.Hashable, ...]: + if self.is_cached("to_pandas_index"): + return self.to_pandas_index.names if self._level_names is None or len(self._level_names) == 0: return tuple((None,) * max(1, self.nlevels)) else: return self._level_names + def is_cached(self, attr_name: str) -> bool: + return attr_name in self.__dict__ + @property def nlevels(self) -> int: if len(self) == 0: @@ -262,7 +267,12 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None: new_ncols: int len(self) after self._data was modified """ - cached_properties = ("columns", "names", "_grouped_data") + cached_properties = ( + "columns", + "names", + "_grouped_data", + "to_pandas_index", + ) for attr in cached_properties: try: self.__delattr__(attr) @@ -276,6 +286,7 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None: except AttributeError: pass + @cached_property def to_pandas_index(self) -> pd.Index: """Convert the keys of the ColumnAccessor to a Pandas Index object.""" if self.multiindex and len(self.level_names) > 0: @@ -726,10 +737,10 @@ def droplevel(self, level: int) -> None: } new_ncols = len(self) self._level_names = ( - self._level_names[:level] + self._level_names[level + 1 :] + self.level_names[:level] + self.level_names[level + 1 :] ) - if len(self._level_names) == 1: + if len(self.level_names) == 1: # can't use nlevels, as it depends on multiindex self.multiindex = False self._clear_cache(old_ncols, new_ncols) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e66e4f41642..3334b57ce1b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -961,7 +961,7 @@ def _init_from_series_list(self, data, columns, index): warnings.simplefilter("ignore", FutureWarning) concat_df = cudf.concat(data, axis=1) - cols = concat_df._data.to_pandas_index() + cols = concat_df._data.to_pandas_index if cols.dtype == "object": concat_df.columns = cols.astype("str") @@ -2092,7 +2092,7 @@ def _make_operands_and_index_for_binop( equal_columns = True elif isinstance(other, Series): if ( - not (self_pd_columns := self._data.to_pandas_index()).equals( + not (self_pd_columns := self._data.to_pandas_index).equals( other_pd_index := other.index.to_pandas() ) and not can_reindex @@ -2117,8 +2117,8 @@ def _make_operands_and_index_for_binop( and fn in cudf.utils.utils._EQUALITY_OPS and ( not self.index.equals(other.index) - or not self._data.to_pandas_index().equals( - other._data.to_pandas_index() + or not self._data.to_pandas_index.equals( + other._data.to_pandas_index ) ) ): @@ -2162,11 +2162,11 @@ def _make_operands_and_index_for_binop( if not equal_columns: if isinstance(other, DataFrame): - column_names_list = self._data.to_pandas_index().join( - other._data.to_pandas_index(), how="outer" + column_names_list = self._data.to_pandas_index.join( + other._data.to_pandas_index, how="outer" ) elif isinstance(other, Series): - column_names_list = self._data.to_pandas_index().join( + column_names_list = self._data.to_pandas_index.join( other.index.to_pandas(), how="outer" ) else: @@ -2626,8 +2626,8 @@ def update( if not isinstance(other, DataFrame): other = DataFrame(other) - self_cols = self._data.to_pandas_index() - if not self_cols.equals(other._data.to_pandas_index()): + self_cols = self._data.to_pandas_index + if not self_cols.equals(other._data.to_pandas_index): other = other.reindex(self_cols, axis=1) if not self.index.equals(other.index): other = other.reindex(self.index, axis=0) @@ -2663,7 +2663,7 @@ def __iter__(self): def __contains__(self, item): # This must check against containment in the pandas Index and not # self._column_names to handle NA, None, nan, etc. correctly. - return item in self._data.to_pandas_index() + return item in self._data.to_pandas_index @_performance_tracking def items(self): @@ -2700,14 +2700,14 @@ def at(self): @property # type: ignore @_external_only_api( - "Use _column_names instead, or _data.to_pandas_index() if a pandas " + "Use _column_names instead, or _data.to_pandas_index if a pandas " "index is absolutely necessary. For checking if the columns are a " "MultiIndex, use _data.multiindex." ) @_performance_tracking def columns(self): """Returns a tuple of columns""" - return self._data.to_pandas_index() + return self._data.to_pandas_index @columns.setter # type: ignore @_performance_tracking @@ -2916,7 +2916,7 @@ def reindex( df = self else: columns = cudf.Index(columns) - intersection = self._data.to_pandas_index().intersection( + intersection = self._data.to_pandas_index.intersection( columns.to_pandas() ) df = self.loc[:, intersection] @@ -3430,7 +3430,7 @@ def axes(self): Index(['key', 'k2', 'val', 'temp'], dtype='object')] """ - return [self.index, self._data.to_pandas_index()] + return [self.index, self._data.to_pandas_index] def diff(self, periods=1, axis=0): """ @@ -4129,7 +4129,7 @@ def transpose(self): Not supporting *copy* because default and only behavior is copy=True """ - index = self._data.to_pandas_index() + index = self._data.to_pandas_index columns = self.index.copy(deep=False) if self._num_columns == 0 or self._num_rows == 0: return DataFrame(index=index, columns=columns) @@ -5535,7 +5535,7 @@ def to_pandas( } out_df = pd.DataFrame(out_data, index=out_index) - out_df.columns = self._data.to_pandas_index() + out_df.columns = self._data.to_pandas_index return out_df @@ -6487,7 +6487,7 @@ def _reduce( source = self._get_columns_by_label(numeric_cols) if source.empty: return Series( - index=self._data.to_pandas_index()[:0] + index=self._data.to_pandas_index[:0] if axis == 0 else source.index, dtype="float64", @@ -6540,7 +6540,7 @@ def _reduce( "Columns must all have the same dtype to " f"perform {op=} with {axis=}" ) - pd_index = source._data.to_pandas_index() + pd_index = source._data.to_pandas_index if source._data.multiindex: idx = MultiIndex.from_pandas(pd_index) else: @@ -7242,7 +7242,7 @@ def stack( ] has_unnamed_levels = len(unnamed_levels_indices) > 0 - column_name_idx = self._data.to_pandas_index() + column_name_idx = self._data.to_pandas_index # Construct new index from the levels specified by `level` named_levels = pd.MultiIndex.from_arrays( [column_name_idx.get_level_values(lv) for lv in level_indices] @@ -7432,7 +7432,7 @@ def cov(self, min_periods=None, ddof: int = 1, numeric_only: bool = False): ) cov = cupy.cov(self.values, ddof=ddof, rowvar=False) - cols = self._data.to_pandas_index() + cols = self._data.to_pandas_index df = DataFrame(cupy.asfortranarray(cov), index=cols) df._set_columns_like(self._data) return df @@ -7475,7 +7475,7 @@ def corr( ) corr = cupy.corrcoef(values, rowvar=False) - cols = self._data.to_pandas_index() + cols = self._data.to_pandas_index df = DataFrame(cupy.asfortranarray(corr), index=cols) df._set_columns_like(self._data) return df @@ -7544,7 +7544,7 @@ def keys(self): >>> df.keys() Index([0, 1, 2, 3], dtype='int64') """ - return self._data.to_pandas_index() + return self._data.to_pandas_index def itertuples(self, index=True, name="Pandas"): """ @@ -7778,7 +7778,7 @@ def nunique(self, axis=0, dropna: bool = True) -> Series: raise NotImplementedError("axis parameter is not supported yet.") counts = [col.distinct_count(dropna=dropna) for col in self._columns] return self._constructor_sliced( - counts, index=self._data.to_pandas_index() + counts, index=self._data.to_pandas_index ) def _sample_axis_1( diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6cd8e11695f..e8a9e599cb0 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -3087,7 +3087,7 @@ def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): # drop the first level if we have a multiindex if result._data.nlevels > 1: - result.columns = result._data.to_pandas_index().droplevel(0) + result.columns = result._data.to_pandas_index.droplevel(0) return result diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 72bb85821fa..6854cb02aa5 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1106,13 +1106,11 @@ def dot(self, other, reflect=False): lhs = self.reindex(index=common, copy=False).values rhs = other.reindex(index=common, copy=False).values if isinstance(other, cudf.DataFrame): - result_index = other._data.to_pandas_index() + result_index = other._data.to_pandas_index elif isinstance(self, cudf.DataFrame) and isinstance( other, (cudf.Series, cudf.DataFrame) ): - common = self._data.to_pandas_index().union( - other.index.to_pandas() - ) + common = self._data.to_pandas_index.union(other.index.to_pandas()) if len(common) > self._num_columns or len(common) > len( other.index ): @@ -1124,7 +1122,7 @@ def dot(self, other, reflect=False): rhs = other.reindex(index=common, copy=False).values lhs = lhs.values if isinstance(other, cudf.DataFrame): - result_cols = other._data.to_pandas_index() + result_cols = other._data.to_pandas_index elif isinstance( other, (cp.ndarray, np.ndarray) @@ -2244,7 +2242,7 @@ def truncate(self, before=None, after=None, axis=0, copy=True): if not copy: raise ValueError("Truncating with copy=False is not supported.") axis = self._get_axis_from_axis_arg(axis) - ax = self.index if axis == 0 else self._data.to_pandas_index() + ax = self.index if axis == 0 else self._data.to_pandas_index if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing: raise ValueError("truncate requires a sorted index") @@ -6770,7 +6768,7 @@ def _drop_rows_by_labels( return obj.__class__._from_data( join_res.iloc[:, idx_nlv:]._data, index=midx, - columns=obj._data.to_pandas_index(), + columns=obj._data.to_pandas_index, ) else: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index d2afe643dc4..1e613e49ffc 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1123,7 +1123,7 @@ def _concat(cls, objs) -> Self: # TODO: Verify if this is really necessary or if we can rely on # DataFrame._concat. if len(source_data) > 1: - colnames = source_data[0]._data.to_pandas_index() + colnames = source_data[0]._data.to_pandas_index for obj in source_data[1:]: obj.columns = colnames @@ -2068,7 +2068,7 @@ def _union(self, other, sort=None) -> Self: result_df = self_df.merge(other_df, on=col_names, how="outer") result_df = result_df.sort_values( - by=result_df._data.to_pandas_index()[self.nlevels :], + by=result_df._data.to_pandas_index[self.nlevels :], ignore_index=True, ) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 3ab6ed306b6..0abd42d4d4e 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -431,8 +431,9 @@ def concat( result_columns = ( objs[0] - ._data.to_pandas_index() - .append([obj._data.to_pandas_index() for obj in objs[1:]]) + ._data.to_pandas_index.append( + [obj._data.to_pandas_index for obj in objs[1:]] + ) .unique() ) @@ -689,7 +690,7 @@ def _tile(A, reps): if not value_vars: # TODO: Use frame._data.label_dtype when it's more consistently set var_data = cudf.Series( - value_vars, dtype=frame._data.to_pandas_index().dtype + value_vars, dtype=frame._data.to_pandas_index.dtype ) else: var_data = ( @@ -1273,7 +1274,7 @@ def unstack(df, level, fill_value=None, sort: bool = True): res = df.T.stack(future_stack=False) # Result's index is a multiindex res.index.names = ( - tuple(df._data.to_pandas_index().names) + df.index.names + tuple(df._data.to_pandas_index.names) + df.index.names ) return res else: diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 0b09cf7dc34..a1df2c7d857 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -692,8 +692,8 @@ def assert_frame_equal( ) pd.testing.assert_index_equal( - left._data.to_pandas_index(), - right._data.to_pandas_index(), + left._data.to_pandas_index, + right._data.to_pandas_index, exact=check_column_type, check_names=check_names, check_exact=check_exact, diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index 5cef077c18d..27ec4fcd1f3 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -64,7 +64,7 @@ def test_to_pandas_simple(simple_data): # Index([], dtype='object'), and `integer` for RangeIndex() # to ignore this `inferred_type` comparison, we pass exact=False. assert_eq( - ca.to_pandas_index(), + ca.to_pandas_index, pd.DataFrame( {key: value.values_host for key, value in simple_data.items()} ).columns, @@ -75,7 +75,7 @@ def test_to_pandas_simple(simple_data): def test_to_pandas_multiindex(mi_data): ca = ColumnAccessor(mi_data, multiindex=True) assert_eq( - ca.to_pandas_index(), + ca.to_pandas_index, pd.DataFrame( {key: value.values_host for key, value in mi_data.items()} ).columns, @@ -89,7 +89,7 @@ def test_to_pandas_multiindex_names(): level_names=("foo", "bar"), ) assert_eq( - ca.to_pandas_index(), + ca.to_pandas_index, pd.MultiIndex.from_tuples( (("a", "b"), ("c", "d")), names=("foo", "bar") ), diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d04fd97dcbd..11a9b398b50 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -11193,3 +11193,32 @@ def test_dataframe_init_column(): expect = cudf.DataFrame({"a": s}) actual = cudf.DataFrame._from_arrays(s._column, columns=["a"]) assert_eq(expect, actual) + + +@pytest.mark.parametrize("name", [None, "foo", 1, 1.0]) +def test_dataframe_column_name(name): + df = cudf.DataFrame({"a": [1, 2, 3]}) + pdf = df.to_pandas() + + df.columns.name = name + pdf.columns.name = name + + assert_eq(df, pdf) + assert_eq(df.columns.name, pdf.columns.name) + + +@pytest.mark.parametrize("names", [["abc", "def"], [1, 2], ["abc", 10]]) +def test_dataframe_multiindex_column_names(names): + arrays = [["A", "A", "B", "B"], ["one", "two", "one", "two"]] + tuples = list(zip(*arrays)) + index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) + + pdf = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=index) + df = cudf.from_pandas(pdf) + + assert_eq(df, pdf) + assert_eq(df.columns.names, pdf.columns.names) + pdf.columns.names = names + df.columns.names = names + assert_eq(df, pdf) + assert_eq(df.columns.names, pdf.columns.names)