Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preserve names of column object in various APIs #13772

Merged
merged 6 commits into from
Jul 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 33 additions & 4 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,10 @@ def __init__(
if dtype:
self._data = self.astype(dtype)._data

self._data.multiindex = self._data.multiindex or isinstance(
columns, pd.MultiIndex
)

@_cudf_nvtx_annotate
def _init_from_series_list(self, data, columns, index):
if index is None:
Expand Down Expand Up @@ -1820,19 +1824,29 @@ def _make_operands_and_index_for_binop(
NotImplementedType,
],
Optional[BaseIndex],
bool,
]:
lhs, rhs = self._data, other
index = self._index
fill_requires_key = False
left_default: Any = False
equal_columns = False
can_use_self_column_name = True

if _is_scalar_or_zero_d_array(other):
rhs = {name: other for name in self._data}
equal_columns = True
elif isinstance(other, Series):
rhs = dict(zip(other.index.values_host, other.values_host))
# For keys in right but not left, perform binops between NaN (not
# NULL!) and the right value (result is NaN).
left_default = as_column(np.nan, length=len(self))
equal_columns = other.index.to_pandas().equals(
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
self._data.to_pandas_index()
)
can_use_self_column_name = equal_columns or (
list(other._index._data.names) == self._data._level_names
)
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
elif isinstance(other, DataFrame):
if (
not can_reindex
Expand All @@ -1854,13 +1868,18 @@ def _make_operands_and_index_for_binop(
# For DataFrame-DataFrame ops, always default to operating against
# the fill value.
left_default = fill_value
equal_columns = self._column_names == other._column_names
can_use_self_column_name = (
equal_columns
or self._data._level_names == other._data._level_names
)
elif isinstance(other, (dict, abc.Mapping)):
# Need to fail early on host mapping types because we ultimately
# convert everything to a dict.
return NotImplemented, None
return NotImplemented, None, True

if not isinstance(rhs, (dict, abc.Mapping)):
return NotImplemented, None
return NotImplemented, None, True

operands = {
k: (
Expand All @@ -1876,7 +1895,8 @@ def _make_operands_and_index_for_binop(
for k, v in rhs.items():
if k not in lhs:
operands[k] = (left_default, v, reflect, None)
return operands, index

return operands, index, can_use_self_column_name

@classmethod
@_cudf_nvtx_annotate
Expand Down Expand Up @@ -5042,6 +5062,7 @@ def from_pandas(cls, dataframe, nan_as_null=None):

index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null)
df = cls._from_data(data, index)
df._data._level_names = list(dataframe.columns.names)

# Set columns only if it is a MultiIndex
if isinstance(dataframe.columns, pd.MultiIndex):
Expand Down Expand Up @@ -5085,13 +5106,19 @@ def from_arrow(cls, table):
2 3 6
"""
index_col = None
col_index_names = None
if isinstance(table, pa.Table) and isinstance(
table.schema.pandas_metadata, dict
):
index_col = table.schema.pandas_metadata["index_columns"]
if "column_indexes" in table.schema.pandas_metadata:
col_index_names = []
for col_meta in table.schema.pandas_metadata["column_indexes"]:
col_index_names.append(col_meta["name"])

out = super().from_arrow(table)

if col_index_names is not None:
out._data._level_names = col_index_names
if index_col:
if isinstance(index_col[0], dict):
idx = cudf.RangeIndex(
Expand Down Expand Up @@ -5337,6 +5364,8 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
df._data[names[0]] = column.as_column(
data, nan_as_null=nan_as_null
)
if isinstance(columns, pd.Index):
df._data._level_names = list(columns.names)

if index is None:
df._index = RangeIndex(start=0, stop=len(data))
Expand Down
25 changes: 20 additions & 5 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,9 @@ def _from_data(

@_cudf_nvtx_annotate
def _from_data_like_self(self, data: MutableMapping):
return self._from_data(data, self._index)
out = self._from_data(data, self._index)
out._data._level_names = self._data._level_names
return out

@classmethod
@_cudf_nvtx_annotate
Expand Down Expand Up @@ -3128,7 +3130,9 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""):
# inserted to the left of existing data columns.
return (
ColumnAccessor(
{**new_column_data, **self._data}, self._data.multiindex
{**new_column_data, **self._data},
self._data.multiindex,
self._data._level_names,
),
index,
)
Expand Down Expand Up @@ -3465,14 +3469,24 @@ def _binaryop(
**kwargs,
):
reflect, op = self._check_reflected_op(op)
operands, out_index = self._make_operands_and_index_for_binop(
(
operands,
out_index,
can_use_self_column_name,
) = self._make_operands_and_index_for_binop(
other, op, fill_value, reflect, can_reindex
)
if operands is NotImplemented:
return NotImplemented

level_names = (
None if not can_use_self_column_name else self._data._level_names
)
return self._from_data(
ColumnAccessor(type(self)._colwise_binop(operands, op)),
ColumnAccessor(
type(self)._colwise_binop(operands, op),
level_names=level_names,
),
index=out_index,
)

Expand All @@ -3491,6 +3505,7 @@ def _make_operands_and_index_for_binop(
NotImplementedType,
],
Optional[cudf.BaseIndex],
bool,
]:
raise NotImplementedError(
f"Binary operations are not supported for {self.__class__}"
Expand All @@ -3516,7 +3531,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
if cupy_func:
if ufunc.nin == 2:
other = inputs[self is inputs[0]]
inputs, index = self._make_operands_and_index_for_binop(
inputs, index, _ = self._make_operands_and_index_for_binop(
other, fname
)
else:
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1472,6 +1472,7 @@ def _make_operands_and_index_for_binop(
NotImplementedType,
],
Optional[BaseIndex],
bool,
]:
# Specialize binops to align indices.
if isinstance(other, Series):
Expand All @@ -1484,11 +1485,13 @@ def _make_operands_and_index_for_binop(
"Can only compare identically-labeled Series objects"
)
lhs, other = _align_indices([self, other], allow_non_unique=True)
can_use_self_column_name = self.name == other.name
else:
lhs = self
can_use_self_column_name = False

operands = lhs._make_operands_for_binop(other, fill_value, reflect)
return operands, lhs._index
return operands, lhs._index, can_use_self_column_name

@copy_docstring(CategoricalAccessor) # type: ignore
@property
Expand Down
41 changes: 39 additions & 2 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1708,6 +1708,7 @@ def test_nonmatching_index_setitem(nrows):
)
def test_from_pandas(dtype):
df = pd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0], dtype=dtype)
df.columns.name = "custom_column_name"
gdf = cudf.DataFrame.from_pandas(df)
assert isinstance(gdf, cudf.DataFrame)

Expand Down Expand Up @@ -2483,8 +2484,15 @@ def test_bitwise_binops_series(pdf, gdf, binop):


@pytest.mark.parametrize("unaryop", [operator.neg, operator.inv, operator.abs])
def test_unaryops_df(pdf, gdf, unaryop):
d = unaryop(pdf - 5)
@pytest.mark.parametrize(
"col_name,assign_col_name", [(None, False), (None, True), ("abc", True)]
)
def test_unaryops_df(pdf, unaryop, col_name, assign_col_name):
pd_df = pdf.copy()
if assign_col_name:
pd_df.columns.name = col_name
gdf = cudf.from_pandas(pd_df)
d = unaryop(pd_df - 5)
g = unaryop(gdf - 5)
assert_eq(d, g)

Expand Down Expand Up @@ -2626,6 +2634,12 @@ def test_arrow_pandas_compat(pdf, gdf, preserve_index):
pdf2 = pdf_arrow_table.to_pandas()

assert_eq(pdf2, gdf2)
pdf.columns.name = "abc"
pdf_arrow_table = pa.Table.from_pandas(pdf, preserve_index=preserve_index)

gdf2 = cudf.DataFrame.from_arrow(pdf_arrow_table)
pdf2 = pdf_arrow_table.to_pandas()
assert_eq(pdf2, gdf2)


@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"])
Expand Down Expand Up @@ -2912,6 +2926,7 @@ def test_tail_for_string():
["v0", "v1"],
["v0", "index"],
pd.MultiIndex.from_tuples([("x0", "x1"), ("y0", "y1")]),
pd.MultiIndex.from_tuples([(1, 2), (10, 11)], names=["ABC", "DEF"]),
],
)
@pytest.mark.parametrize("inplace", [True, False])
Expand Down Expand Up @@ -10147,3 +10162,25 @@ def test_dataframe_init_length_error(data, index):
{"data": data, "index": index},
),
)


def test_dataframe_init_columns_named_multiindex():
    """Build a DataFrame whose ``columns`` is a named cudf MultiIndex.

    The same 2x2 random payload is handed to both cudf and pandas (pandas
    receives the MultiIndex converted via ``to_pandas()``), and the two
    frames are compared with ``assert_eq``.
    """
    # Fixed seed so the random payload is reproducible across runs.
    np.random.seed(0)
    values = np.random.randn(2, 2)

    level_tuples = [("A", "one"), ("A", "two")]
    level_names = ["y", "z"]
    named_columns = cudf.MultiIndex.from_tuples(
        level_tuples, names=level_names
    )

    actual = cudf.DataFrame(values, columns=named_columns)
    expected = pd.DataFrame(values, columns=named_columns.to_pandas())

    assert_eq(actual, expected)


def test_dataframe_init_columns_named_index():
    """Build a DataFrame whose ``columns`` is a named ``pd.Index``.

    The same 2x2 random payload and the same named index are handed to both
    cudf and pandas, and the two frames are compared with ``assert_eq``.
    """
    # Fixed seed so the random payload is reproducible across runs.
    np.random.seed(0)
    values = np.random.randn(2, 2)

    named_columns = pd.Index(["a", "b"], name="custom_name")

    actual = cudf.DataFrame(values, columns=named_columns)
    expected = pd.DataFrame(values, columns=named_columns)

    assert_eq(actual, expected)