From c36779e59aa0ebb45c7e06225712871875327d8d Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 26 Jul 2023 11:40:03 -0700 Subject: [PATCH 1/5] preserve column names --- python/cudf/cudf/core/dataframe.py | 7 +++++++ python/cudf/cudf/core/indexed_frame.py | 4 +++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0fe89490905..faa1f5b2396 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -723,6 +723,10 @@ def __init__( if dtype: self._data = self.astype(dtype)._data + self._data.multiindex = self._data.multiindex or isinstance( + columns, pd.MultiIndex + ) + @_cudf_nvtx_annotate def _init_from_series_list(self, data, columns, index): if index is None: @@ -5042,6 +5046,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null) df = cls._from_data(data, index) + df._data._level_names = list(dataframe.columns.names) # Set columns only if it is a MultiIndex if isinstance(dataframe.columns, pd.MultiIndex): @@ -5337,6 +5342,8 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): df._data[names[0]] = column.as_column( data, nan_as_null=nan_as_null ) + if isinstance(columns, pd.Index): + df._data._level_names = list(columns.names) if index is None: df._index = RangeIndex(start=0, stop=len(data)) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index aa0f060c8da..c7ef015bf23 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -293,7 +293,9 @@ def _from_data( @_cudf_nvtx_annotate def _from_data_like_self(self, data: MutableMapping): - return self._from_data(data, self._index) + out = self._from_data(data, self._index) + out._data._level_names = self._data._level_names + return out @classmethod @_cudf_nvtx_annotate From e845c72b44ff789da2a7097999c10823cd6b98ab Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 26 Jul 2023 12:48:39 -0700 Subject: [PATCH 2/5] add tests --- python/cudf/cudf/core/dataframe.py | 40 ++++++++++++++++++------ python/cudf/cudf/core/indexed_frame.py | 20 +++++++++--- python/cudf/cudf/core/series.py | 5 ++- python/cudf/cudf/tests/test_dataframe.py | 15 +++++++-- 4 files changed, 64 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index faa1f5b2396..667341588fd 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1824,19 +1824,29 @@ def _make_operands_and_index_for_binop( NotImplementedType, ], Optional[BaseIndex], + bool, ]: lhs, rhs = self._data, other index = self._index fill_requires_key = False left_default: Any = False + equal_columns = False + can_use_self_column_name = True if _is_scalar_or_zero_d_array(other): rhs = {name: other for name in self._data} + equal_columns = True elif isinstance(other, Series): - rhs = dict(zip(other.index.values_host, other.values_host)) + rhs = dict(zip(other.index.to_pandas(), other.values_host)) # For keys in right but not left, perform binops between NaN (not # NULL!) and the right value (result is NaN). left_default = as_column(np.nan, length=len(self)) + equal_columns = other.index.to_pandas().equals( + self._data.to_pandas_index() + ) + can_use_self_column_name = equal_columns or ( + list(other._index._data.names) == self._data._level_names + ) elif isinstance(other, DataFrame): if ( not can_reindex @@ -1858,13 +1868,18 @@ def _make_operands_and_index_for_binop( # For DataFrame-DataFrame ops, always default to operating against # the fill value. left_default = fill_value + equal_columns = self._column_names == other._column_names + can_use_self_column_name = ( + equal_columns + or self._data._level_names == other._data._level_names + ) elif isinstance(other, (dict, abc.Mapping)): # Need to fail early on host mapping types because we ultimately # convert everything to a dict. - return NotImplemented, None + return NotImplemented, None, True if not isinstance(rhs, (dict, abc.Mapping)): - return NotImplemented, None + return NotImplemented, None, True operands = { k: ( @@ -1880,7 +1895,8 @@ def _make_operands_and_index_for_binop( for k, v in rhs.items(): if k not in lhs: operands[k] = (left_default, v, reflect, None) - return operands, index + + return operands, index, can_use_self_column_name @classmethod @_cudf_nvtx_annotate @@ -5090,13 +5106,19 @@ def from_arrow(cls, table): 2 3 6 """ index_col = None + col_index_names = None if isinstance(table, pa.Table) and isinstance( table.schema.pandas_metadata, dict ): index_col = table.schema.pandas_metadata["index_columns"] + if "column_indexes" in table.schema.pandas_metadata: + col_index_names = [] + for col_meta in table.schema.pandas_metadata["column_indexes"]: + col_index_names.append(col_meta["name"]) out = super().from_arrow(table) - + if col_index_names is not None: + out._data._level_names = col_index_names if index_col: if isinstance(index_col[0], dict): idx = cudf.RangeIndex( @@ -7450,13 +7472,13 @@ def _align_indices(lhs, rhs): lhs_out = DataFrame(index=df.index) rhs_out = DataFrame(index=df.index) common = set(lhs._column_names) & set(rhs._column_names) - common_x = {f"{x}_x" for x in common} - common_y = {f"{x}_y" for x in common} + common_x = {f"{x}_x": x for x in common} + common_y = {f"{x}_y": x for x in common} for col in df._column_names: if col in common_x: - lhs_out[col[:-2]] = df[col] + lhs_out[common_x[col]] = df[col] elif col in common_y: - rhs_out[col[:-2]] = df[col] + rhs_out[common_y[col]] = df[col] elif col in lhs: lhs_out[col] = df[col] elif col in rhs: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index c7ef015bf23..492cbfd8cb6 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3130,7 +3130,9 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): # inserted to the left of existing data columns. return ( ColumnAccessor( - {**new_column_data, **self._data}, self._data.multiindex + {**new_column_data, **self._data}, + self._data.multiindex, + self._data._level_names, ), index, ) @@ -3467,14 +3469,23 @@ def _binaryop( **kwargs, ): reflect, op = self._check_reflected_op(op) - operands, out_index = self._make_operands_and_index_for_binop( + ( + operands, + out_index, + can_use_self_column_name, + ) = self._make_operands_and_index_for_binop( other, op, fill_value, reflect, can_reindex ) if operands is NotImplemented: return NotImplemented return self._from_data( - ColumnAccessor(type(self)._colwise_binop(operands, op)), + ColumnAccessor( + type(self)._colwise_binop(operands, op), + level_names=None + if not can_use_self_column_name + else self._data._level_names, + ), index=out_index, ) @@ -3493,6 +3504,7 @@ def _make_operands_and_index_for_binop( NotImplementedType, ], Optional[cudf.BaseIndex], + bool, ]: raise NotImplementedError( f"Binary operations are not supported for {self.__class__}" @@ -3518,7 +3530,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if cupy_func: if ufunc.nin == 2: other = inputs[self is inputs[0]] - inputs, index = self._make_operands_and_index_for_binop( + inputs, index, _ = self._make_operands_and_index_for_binop( other, fname ) else: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index aaac91e927a..02de3b8282a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1472,6 +1472,7 @@ def _make_operands_and_index_for_binop( NotImplementedType, ], Optional[BaseIndex], + bool, ]: # Specialize binops to align indices. if isinstance(other, Series): @@ -1484,11 +1485,13 @@ def _make_operands_and_index_for_binop( "Can only compare identically-labeled Series objects" ) lhs, other = _align_indices([self, other], allow_non_unique=True) + can_use_self_column_name = self.name == other.name else: lhs = self + can_use_self_column_name = False operands = lhs._make_operands_for_binop(other, fill_value, reflect) - return operands, lhs._index + return operands, lhs._index, can_use_self_column_name @copy_docstring(CategoricalAccessor) # type: ignore @property diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 25a17697538..e512c8e9530 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1708,6 +1708,7 @@ def test_nonmatching_index_setitem(nrows): ) def test_from_pandas(dtype): df = pd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0], dtype=dtype) + df.columns.name = "custom_column_name" gdf = cudf.DataFrame.from_pandas(df) assert isinstance(gdf, cudf.DataFrame) @@ -2483,8 +2484,12 @@ def test_bitwise_binops_series(pdf, gdf, binop): @pytest.mark.parametrize("unaryop", [operator.neg, operator.inv, operator.abs]) -def test_unaryops_df(pdf, gdf, unaryop): - d = unaryop(pdf - 5) +@pytest.mark.parametrize("col_name", [None, "abc"]) +def test_unaryops_df(pdf, unaryop, col_name): + pd_df = pdf.copy() + pd_df.columns.name = col_name + gdf = cudf.from_pandas(pd_df) + d = unaryop(pd_df - 5) g = unaryop(gdf - 5) assert_eq(d, g) @@ -2626,6 +2631,12 @@ def test_arrow_pandas_compat(pdf, gdf, preserve_index): pdf2 = pdf_arrow_table.to_pandas() assert_eq(pdf2, gdf2) + pdf.columns.name = "abc" + pdf_arrow_table = pa.Table.from_pandas(pdf, preserve_index=preserve_index) + + gdf2 = cudf.DataFrame.from_arrow(pdf_arrow_table) + pdf2 = pdf_arrow_table.to_pandas() + assert_eq(pdf2, gdf2) @pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) From 388e6f49a8f8782201699fedef4247b75e9eb106 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 26 Jul 2023 13:00:51 -0700 Subject: [PATCH 3/5] revert some changes --- python/cudf/cudf/core/dataframe.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 667341588fd..fc6c669256f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1837,7 +1837,7 @@ def _make_operands_and_index_for_binop( rhs = {name: other for name in self._data} equal_columns = True elif isinstance(other, Series): - rhs = dict(zip(other.index.to_pandas(), other.values_host)) + rhs = dict(zip(other.index.values_host, other.values_host)) # For keys in right but not left, perform binops between NaN (not # NULL!) and the right value (result is NaN). left_default = as_column(np.nan, length=len(self)) @@ -7472,13 +7472,13 @@ def _align_indices(lhs, rhs): lhs_out = DataFrame(index=df.index) rhs_out = DataFrame(index=df.index) common = set(lhs._column_names) & set(rhs._column_names) - common_x = {f"{x}_x": x for x in common} - common_y = {f"{x}_y": x for x in common} + common_x = {f"{x}_x" for x in common} + common_y = {f"{x}_y" for x in common} for col in df._column_names: if col in common_x: - lhs_out[common_x[col]] = df[col] + lhs_out[col[:-2]] = df[col] elif col in common_y: - rhs_out[common_y[col]] = df[col] + rhs_out[col[:-2]] = df[col] elif col in lhs: lhs_out[col] = df[col] elif col in rhs: From a892023f072afca2f32a414cab5e19480371e0e2 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 26 Jul 2023 14:38:23 -0700 Subject: [PATCH 4/5] Add more tests --- python/cudf/cudf/tests/test_dataframe.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e512c8e9530..9db45ceba44 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2923,6 +2923,7 @@ def test_tail_for_string(): ["v0", "v1"], ["v0", "index"], pd.MultiIndex.from_tuples([("x0", "x1"), ("y0", "y1")]), + pd.MultiIndex.from_tuples([(1, 2), (10, 11)], names=["ABC", "DEF"]), ], ) @pytest.mark.parametrize("inplace", [True, False]) @@ -10158,3 +10159,25 @@ def test_dataframe_init_length_error(data, index): {"data": data, "index": index}, ), ) + + +def test_dataframe_init_columns_named_multiindex(): + np.random.seed(0) + data = np.random.randn(2, 2) + columns = cudf.MultiIndex.from_tuples( + [("A", "one"), ("A", "two")], names=["y", "z"] + ) + gdf = cudf.DataFrame(data, columns=columns) + pdf = pd.DataFrame(data, columns=columns.to_pandas()) + + assert_eq(gdf, pdf) + + +def test_dataframe_init_columns_named_index(): + np.random.seed(0) + data = np.random.randn(2, 2) + columns = pd.Index(["a", "b"], name="custom_name") + gdf = cudf.DataFrame(data, columns=columns) + pdf = pd.DataFrame(data, columns=columns) + + assert_eq(gdf, pdf) From 4513ba87aa0a3d3828a28b3b55208a252551e2ad Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 28 Jul 2023 07:57:59 -0700 Subject: [PATCH 5/5] address reviews --- python/cudf/cudf/core/indexed_frame.py | 7 ++++--- python/cudf/cudf/tests/test_dataframe.py | 9 ++++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 492cbfd8cb6..0ffc3948e67 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3479,12 +3479,13 @@ def _binaryop( if operands is NotImplemented: return NotImplemented + level_names = ( + None if not can_use_self_column_name else self._data._level_names + ) return self._from_data( ColumnAccessor( type(self)._colwise_binop(operands, op), - level_names=None - if not can_use_self_column_name - else self._data._level_names, + level_names=level_names, ), index=out_index, ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 9db45ceba44..d443cd92968 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2484,10 +2484,13 @@ def test_bitwise_binops_series(pdf, gdf, binop): @pytest.mark.parametrize("unaryop", [operator.neg, operator.inv, operator.abs]) -@pytest.mark.parametrize("col_name", [None, "abc"]) -def test_unaryops_df(pdf, unaryop, col_name): +@pytest.mark.parametrize( + "col_name,assign_col_name", [(None, False), (None, True), ("abc", True)] +) +def test_unaryops_df(pdf, unaryop, col_name, assign_col_name): pd_df = pdf.copy() - pd_df.columns.name = col_name + if assign_col_name: + pd_df.columns.name = col_name gdf = cudf.from_pandas(pd_df) d = unaryop(pd_df - 5) g = unaryop(gdf - 5)