diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index afbd00bb00c..6b5f3809c98 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1618,7 +1618,7 @@ def _concat( # Reassign index and column names if objs[0]._data.multiindex: - out.columns = objs[0].columns + out._set_column_names_like(objs[0]) else: out.columns = names if not ignore_index: @@ -6606,10 +6606,10 @@ def _align_indices(lhs, rhs): df = df.sort_index() lhs_out = DataFrame(index=df.index) rhs_out = DataFrame(index=df.index) - common = set(lhs.columns) & set(rhs.columns) + common = set(lhs._column_names) & set(rhs._column_names) common_x = {f"{x}_x" for x in common} common_y = {f"{x}_y" for x in common} - for col in df.columns: + for col in df._column_names: if col in common_x: lhs_out[col[:-2]] = df[col] elif col in common_y: @@ -6639,7 +6639,7 @@ def _setitem_with_dataframe( """ if input_cols is None: - input_cols = input_df.columns + input_cols = input_df._column_names if len(input_cols) != len(replace_df._column_names): raise ValueError( diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 8f258ce27b2..8f00289afcb 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. + import collections import enum from typing import ( @@ -535,7 +537,7 @@ def metadata(self): return {"cudf.index": self._df.index} def num_columns(self) -> int: - return len(self._df.columns) + return len(self._df._column_names) def num_rows(self) -> int: return len(self._df) @@ -544,7 +546,7 @@ def num_chunks(self) -> int: return 1 def column_names(self) -> Iterable[str]: - return self._df.columns.tolist() + return self._df._column_names def get_column(self, i: int) -> _CuDFColumn: return _CuDFColumn( diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index cc4a6b1adc6..5aa7f616e35 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -494,7 +494,7 @@ def melt( if not isinstance(id_vars, collections.abc.Sequence): id_vars = [id_vars] id_vars = list(id_vars) - missing = set(id_vars) - set(frame.columns) + missing = set(id_vars) - set(frame._column_names) if not len(missing) == 0: raise KeyError( f"The following 'id_vars' are not present" @@ -508,7 +508,7 @@ def melt( if not isinstance(value_vars, collections.abc.Sequence): value_vars = [value_vars] value_vars = list(value_vars) - missing = set(value_vars) - set(frame.columns) + missing = set(value_vars) - set(frame._column_names) if not len(missing) == 0: raise KeyError( f"The following 'value_vars' are not present" @@ -516,8 +516,7 @@ def melt( ) else: # then all remaining columns in frame - value_vars = frame.columns.drop(id_vars) - value_vars = list(value_vars) + value_vars = list(set(frame._column_names) - set(id_vars)) # Error for unimplemented support for datatype dtypes = [frame[col].dtype for col in id_vars + value_vars] @@ -691,7 +690,9 @@ def get_dummies( encode_fallback_dtypes = ["object", "category"] if columns is None or len(columns) == 0: - columns = df.select_dtypes(include=encode_fallback_dtypes).columns + columns = df.select_dtypes( + include=encode_fallback_dtypes + )._column_names _length_check_params(prefix, columns, "prefix") _length_check_params(prefix_sep, columns, "prefix_sep") @@ -1062,7 +1063,9 @@ def unstack(df, level, fill_value=None): ) res = df.T.stack(dropna=False) # Result's index is a multiindex - res.index.names = tuple(df.columns.names) + df.index.names + res.index.names = ( + tuple(df._data.to_pandas_index().names) + df.index.names + ) return res else: columns = df.index._poplevels(level) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 8ffd75b1d76..2282a435ed3 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION +# Copyright (c) 2020-2022, NVIDIA CORPORATION import itertools @@ -198,8 +198,7 @@ def __getitem__(self, arg): center=self.center, ) - def _apply_agg_series(self, sr, agg_name): - source_column = sr._column + def _apply_agg_column(self, source_column, agg_name): min_periods = self.min_periods or 1 if isinstance(self.window, int): preceding_window = None @@ -230,7 +229,7 @@ def _apply_agg_series(self, sr, agg_name): ) window = None - result_col = libcudf.rolling.rolling( + return libcudf.rolling.rolling( source_column=source_column, pre_column_window=preceding_window, fwd_column_window=following_window, @@ -240,19 +239,26 @@ def _apply_agg_series(self, sr, agg_name): op=agg_name, agg_params=self.agg_params, ) - return sr._from_data({sr.name: result_col}, sr._index) def _apply_agg_dataframe(self, df, agg_name): - result_df = cudf.DataFrame({}) - for i, col_name in enumerate(df.columns): - result_col = self._apply_agg_series(df[col_name], agg_name) - result_df.insert(i, col_name, result_col) - result_df.index = df.index - return result_df + return cudf.DataFrame._from_data( + { + col_name: self._apply_agg_column(col, agg_name) + for col_name, col in df._data.items() + }, + index=df.index, + ) def _apply_agg(self, agg_name): if isinstance(self.obj, cudf.Series): - return self._apply_agg_series(self.obj, agg_name) + return cudf.Series._from_data( + { + self.obj.name: self._apply_agg_column( + self.obj._column, agg_name + ) + }, + index=self.obj.index, + ) else: return self._apply_agg_dataframe(self.obj, agg_name) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 948428de4f0..6e4e104df4d 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -565,7 +565,7 @@ def to_parquet( if engine == "cudf": # Ensure that no columns dtype is 'category' - for col in df.columns: + for col in df._column_names: if partition_cols is None or col not in partition_cols: if df[col].dtype.name == "category": raise ValueError( diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index b3e30fac7d5..5f7616cc75e 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from __future__ import annotations @@ -696,8 +696,8 @@ def assert_frame_equal( if PANDAS_GE_110: pd.testing.assert_index_equal( - left.columns, - right.columns, + left._data.to_pandas_index(), + right._data.to_pandas_index(), exact=check_column_type, check_names=check_names, check_exact=check_exact, @@ -708,8 +708,8 @@ def assert_frame_equal( ) else: pd.testing.assert_index_equal( - left.columns, - right.columns, + left._data.to_pandas_index(), + right._data.to_pandas_index(), exact=check_column_type, check_names=check_names, check_exact=check_exact, @@ -717,7 +717,7 @@ def assert_frame_equal( obj=f"{obj}.columns", ) - for col in left.columns: + for col in left._column_names: assert_column_equal( left._data[col], right._data[col], diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index d24c8ca2860..de6aa0a6bf3 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. + from typing import Any, Tuple import cupy as cp @@ -74,7 +76,7 @@ def assert_dataframe_equal(dfo: DataFrameObject, df: cudf.DataFrame): assert dfo.num_columns() == len(df.columns) assert dfo.num_rows() == len(df) assert dfo.num_chunks() == 1 - assert dfo.column_names() == list(df.columns) + assert dfo.column_names() == tuple(df.columns) for col in df.columns: assert_column_equal(dfo.get_column_by_name(col), df[col]._column)