Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preserve column metadata during more DataFrame operations #15519

Merged
merged 11 commits into from
May 3, 2024
33 changes: 33 additions & 0 deletions python/cudf/cudf/core/column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import itertools
import sys
from collections import abc
from functools import cached_property, reduce
from typing import (
Expand Down Expand Up @@ -174,6 +175,38 @@ def __repr__(self) -> str:
)
return f"{type_info}\n{column_info}"

def _from_columns_like_self(
    self, columns: abc.Iterable[ColumnBase], verify: bool = True
):
    """
    Return a new ColumnAccessor with columns and the properties of self.

    The new columns are paired positionally with ``self.names``; the
    ``multiindex``, ``level_names``, ``rangeindex`` and ``label_dtype``
    attributes of ``self`` are forwarded unchanged to the constructor.

    Parameters
    ----------
    columns : iterable of Columns
        New columns for the ColumnAccessor. May be a one-shot iterator.
    verify : bool, optional
        Whether to verify column length and type. Passed through to the
        constructor of ``type(self)``.

    Returns
    -------
    An instance of ``type(self)`` holding ``columns`` under the labels
    of ``self``.

    Raises
    ------
    ValueError
        If the number of columns differs from the number of labels in
        ``self.names``.
    """
    # Compare the version tuple as a whole: checking ``major`` and
    # ``minor`` independently would send any future X.0-X.9 release
    # (X > 3) down the legacy branch.
    if sys.version_info >= (3, 10):
        # ``strict=True`` raises ValueError on a length mismatch once the
        # zip is consumed by ``dict`` below.
        data = zip(self.names, columns, strict=True)
    else:
        # Materialize so the length can be checked without ``strict``.
        columns = list(columns)
        if len(columns) != len(self.names):
            raise ValueError(
                f"The number of columns ({len(columns)}) must match "
                f"the number of existing column labels ({len(self.names)})."
            )
        data = zip(self.names, columns)
    return type(self)(
        data=dict(data),
        multiindex=self.multiindex,
        level_names=self.level_names,
        rangeindex=self.rangeindex,
        label_dtype=self.label_dtype,
        verify=verify,
    )

@property
def level_names(self) -> Tuple[Any, ...]:
if self._level_names is None or len(self._level_names) == 0:
Expand Down
16 changes: 10 additions & 6 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3036,8 +3036,11 @@ def where(self, cond, other=None, inplace=False):

# First process the condition.
if isinstance(cond, Series):
cond = self._from_data_like_self(
{name: cond._column for name in self._column_names},
cond = self._from_data(
self._data._from_columns_like_self(
itertools.repeat(cond._column, len(self._column_names)),
verify=False,
)
)
elif hasattr(cond, "__cuda_array_interface__"):
cond = DataFrame(
Expand Down Expand Up @@ -3078,7 +3081,7 @@ def where(self, cond, other=None, inplace=False):
should be equal to number of columns of self"""
)

out = {}
out = []
for (name, col), other_col in zip(self._data.items(), other_cols):
col, other_col = _check_and_cast_columns_with_other(
source_col=col,
Expand All @@ -3091,16 +3094,17 @@ def where(self, cond, other=None, inplace=False):
col, other_col, cond_col
)

out[name] = _make_categorical_like(result, self._data[name])
out.append(_make_categorical_like(result, self._data[name]))
else:
out_mask = cudf._lib.null_mask.create_null_mask(
len(col),
state=cudf._lib.null_mask.MaskState.ALL_NULL,
)
out[name] = col.set_mask(out_mask)
out.append(col.set_mask(out_mask))

return self._mimic_inplace(
self._from_data_like_self(out), inplace=inplace
self._from_data_like_self(self._data._from_columns_like_self(out)),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still see a number of instances of _from_data_like_self. Can we use _from_data instead consistently now?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use _from_data instead consistently now?

Getting there, but not quite yet, because the index needs to be preserved. IndexedFrame._from_data_like_self is like _from_data except that it preserves the existing index, which IndexedFrame._from_data doesn't do by default.

inplace=inplace,
)

@docutils.doc_apply(
Expand Down
33 changes: 20 additions & 13 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1120,7 +1120,9 @@ def isna(self):
array([False, False, True, True, False, False])
"""
data_columns = (col.isnull() for col in self._columns)
return self._from_data_like_self(zip(self._column_names, data_columns))
return self._from_data_like_self(
self._data._from_columns_like_self(data_columns)
)

# Alias for isna
isnull = isna
Expand Down Expand Up @@ -1199,7 +1201,9 @@ def notna(self):
array([ True, True, False, False, True, True])
"""
data_columns = (col.notnull() for col in self._columns)
return self._from_data_like_self(zip(self._column_names, data_columns))
return self._from_data_like_self(
self._data._from_columns_like_self(data_columns)
)

# Alias for notna
notnull = notna
Expand Down Expand Up @@ -1506,7 +1510,9 @@ def _encode(self):
@_cudf_nvtx_annotate
def _unaryop(self, op):
    """Apply the unary operator ``op`` to every column of this frame.

    Each column's ``unary_operator(op)`` result is collected lazily and
    handed to ``self._data._from_columns_like_self`` so the new columns
    are paired with the existing labels, rather than rebuilding the
    mapping from ``self._column_names``.
    """
    data_columns = (col.unary_operator(op) for col in self._columns)
    return self._from_data_like_self(
        self._data._from_columns_like_self(data_columns)
    )

@classmethod
@_cudf_nvtx_annotate
Expand Down Expand Up @@ -1638,12 +1644,14 @@ def _apply_cupy_ufunc_to_operands(
def __neg__(self):
    """Negate for integral dtypes, logical NOT for bools.

    Columns whose dtype kind is ``"b"`` are passed through
    ``unary_operator("not")``; every other column is multiplied by -1.
    The results are re-labelled through
    ``self._data._from_columns_like_self`` so the existing column
    labels are reused instead of being rebuilt from a plain dict.
    """
    return self._from_data_like_self(
        self._data._from_columns_like_self(
            (
                col.unary_operator("not")
                if col.dtype.kind == "b"
                else -1 * col
                for col in self._data.columns
            )
        )
    )

@_cudf_nvtx_annotate
Expand Down Expand Up @@ -1897,10 +1905,9 @@ def __copy__(self):
def __invert__(self):
    """Bitwise invert (~) for integral dtypes, logical NOT for bools.

    Every column is transformed with ``_apply_inverse_column`` and the
    results are re-labelled through
    ``self._data._from_columns_like_self`` so the existing column
    labels are reused instead of being rebuilt from a plain dict.
    """
    return self._from_data_like_self(
        self._data._from_columns_like_self(
            (_apply_inverse_column(col) for col in self._data.columns)
        )
    )

@_cudf_nvtx_annotate
Expand Down
16 changes: 9 additions & 7 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1903,13 +1903,15 @@ def nans_to_nulls(self):
1 <NA> 3.14
2 <NA> <NA>
"""
result_data = {}
for name, col in self._data.items():
try:
result_data[name] = col.nans_to_nulls()
except AttributeError:
result_data[name] = col.copy()
return self._from_data_like_self(result_data)
result = (
col.nans_to_nulls()
if isinstance(col, cudf.core.column.NumericalColumn)
else col.copy()
for col in self._data.columns
)
return self._from_data_like_self(
self._data._from_columns_like_self(result)
)

def _copy_type_metadata(
self,
Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -2088,6 +2088,8 @@ def _split_columns_by_levels(self, levels):
return data_columns, index_columns, data_names, index_names

def repeat(self, repeats, axis=None):
    """Repeat the elements of this index.

    The repeated columns come from the parent class's ``_repeat`` and
    are wrapped with ``self._data._from_columns_like_self`` before being
    passed to ``self._from_data``, so they are paired with the labels
    already held by ``self._data``.
    """
    return self._from_data(
        self._data._from_columns_like_self(
            super()._repeat([*self._columns], repeats, axis)
        )
    )
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3654,7 +3654,9 @@ def pct_change(
def where(self, cond, other=None, inplace=False):
    """Delegate to the parent ``where`` and wrap its result column.

    The single result column is re-labelled through
    ``self._data._from_columns_like_self`` instead of a fresh
    ``{self.name: ...}`` dict, then returned via ``_mimic_inplace``
    according to ``inplace``.
    """
    result_col = super().where(cond, other, inplace)
    return self._mimic_inplace(
        self._from_data_like_self(
            self._data._from_columns_like_self([result_col])
        ),
        inplace=inplace,
    )

Expand Down
20 changes: 20 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10986,3 +10986,23 @@ def test_squeeze(axis, data):
result = df.squeeze(axis=axis)
expected = df.to_pandas().squeeze(axis=axis)
assert_eq(result, expected)


@pytest.mark.parametrize("column", [range(1), np.array([1], dtype=np.int8)])
@pytest.mark.parametrize(
    "operation",
    [
        lambda frame: frame.where(frame < 2, 2),
        lambda frame: frame.nans_to_nulls(),
        lambda frame: frame.isna(),
        lambda frame: frame.notna(),
        lambda frame: abs(frame),
        lambda frame: -frame,
        lambda frame: ~frame,
    ],
)
def test_op_preserves_column_metadata(column, operation):
    """Check that each operation leaves the column index unchanged.

    The frame's columns are built from ``column``; after applying
    ``operation`` the resulting ``.columns`` must match
    ``pd.Index(column)`` exactly (``exact=True``).
    """
    frame = cudf.DataFrame([1], columns=cudf.Index(column))
    pd.testing.assert_index_equal(
        operation(frame).columns,
        pd.Index(column),
        exact=True,
    )
Loading