Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify preprocessing of arguments for DataFrame binops #10563

Merged
merged 9 commits into from
Apr 12, 2022
125 changes: 57 additions & 68 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import sys
import warnings
from collections import defaultdict
from collections.abc import Iterable, Sequence
from collections.abc import Iterable, Mapping, Sequence
from typing import (
Any,
Dict,
Expand Down Expand Up @@ -1854,86 +1854,75 @@ def _make_operands_and_index_for_binop(
],
Optional[BaseIndex],
]:
lhs, rhs = self, other

if _is_scalar_or_zero_d_array(rhs):
rhs = [rhs] * lhs._num_columns

# For columns that exist in rhs but not lhs, we swap the order so that
# we can always assume that left has a binary operator. This
# implementation assumes that binary operations between a column and
# NULL are always commutative, even for binops (like subtraction) that
# are normally anticommutative.
# TODO: The above should no longer be necessary once we switch to
# properly invoking the operator since we can then rely on reflection.
if isinstance(rhs, Sequence):
# TODO: Consider validating sequence length (pandas does).
operands = {
name: (left, right, reflect, fill_value)
for right, (name, left) in zip(rhs, lhs._data.items())
}
elif isinstance(rhs, DataFrame):
# Check built-in types first for speed.
if isinstance(other, (list, dict, Sequence, Mapping)):
vyasr marked this conversation as resolved.
Show resolved Hide resolved
warnings.warn(
"Binary operations between host objects such as "
f"{type(other)} and cudf.DataFrame are deprecated and will be "
"removed in a future release. Please convert it to a cudf "
"object before performing the operation.",
FutureWarning,
)
if len(other) != self._num_columns:
raise ValueError(
"Other is of the wrong length. Expected "
f"{self._num_columns}, got {len(other)}"
)

lhs, rhs = self._data, other
index = self._index
fill_requires_key = False
left_default: Any = False

if _is_scalar_or_zero_d_array(other):
rhs = {name: other for name in self._data}
elif isinstance(other, (list, Sequence)):
rhs = {name: o for (name, o) in zip(self._data, other)}
elif isinstance(other, Series):
rhs = dict(zip(other.index.values_host, other.values_host))
# For keys in right but not left, perform binops between NaN (not
# NULL!) and the right value (result is NaN).
left_default = as_column(np.nan, length=len(self))
elif isinstance(other, DataFrame):
if (
not can_reindex
and fn in cudf.utils.utils._EQUALITY_OPS
and (
not lhs._data.to_pandas_index().equals(
rhs._data.to_pandas_index()
not self.index.equals(other.index)
or not self._data.to_pandas_index().equals(
other._data.to_pandas_index()
)
or not lhs.index.equals(rhs.index)
)
):
raise ValueError(
"Can only compare identically-labeled DataFrame objects"
)
new_lhs, new_rhs = _align_indices(self, other)
index = new_lhs._index
lhs, rhs = new_lhs._data, new_rhs._data
fill_requires_key = True
# For DataFrame-DataFrame ops, always default to operating against
# the fill value.
left_default = fill_value

if not isinstance(rhs, (dict, Mapping)):
return NotImplemented, None
vyasr marked this conversation as resolved.
Show resolved Hide resolved

lhs, rhs = _align_indices(lhs, rhs)

operands = {
name: (
lcol,
rhs._data[name]
if name in rhs._data
else (fill_value or None),
reflect,
fill_value if name in rhs._data else None,
)
for name, lcol in lhs._data.items()
}
for name, col in rhs._data.items():
if name not in lhs._data:
operands[name] = (
col,
(fill_value or None),
not reflect,
None,
)
elif isinstance(rhs, Series):
# Note: This logic will need updating if any of the user-facing
# binop methods (e.g. DataFrame.add) ever support axis=0/rows.
right_dict = dict(zip(rhs.index.values_host, rhs.values_host))
left_cols = lhs._column_names
# mypy thinks lhs._column_names is a List rather than a Tuple, so
# we have to ignore the type check.
result_cols = left_cols + tuple( # type: ignore
col for col in right_dict if col not in left_cols
operands = {
k: (
v,
rhs.get(k, fill_value),
reflect,
fill_value if (not fill_requires_key or k in rhs) else None,
)
operands = {}
for col in result_cols:
if col in left_cols:
left = lhs._data[col]
right = right_dict[col] if col in right_dict else None
else:
# We match pandas semantics here by performing binops
# between a NaN (not NULL!) column and the actual values,
# which results in nans, the pandas output.
left = as_column(np.nan, length=lhs._num_rows)
right = right_dict[col]
operands[col] = (left, right, reflect, fill_value)
else:
return NotImplemented, None
for k, v in lhs.items()
}

return operands, lhs._index
if left_default is not False:
for k, v in rhs.items():
if k not in lhs:
operands[k] = (left_default, v, reflect, None)
return operands, index

@_cudf_nvtx_annotate
def update(
Expand Down
12 changes: 10 additions & 2 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2475,7 +2475,10 @@ def _colwise_binop(
) in operands.items():
output_mask = None
if fill_value is not None:
if isinstance(right_column, ColumnBase):
left_is_column = isinstance(left_column, ColumnBase)
right_is_column = isinstance(right_column, ColumnBase)

if left_is_column and right_is_column:
# If both columns are nullable, pandas semantics dictate
# that nulls that are present in both left_column and
# right_column are not filled.
Expand All @@ -2489,9 +2492,14 @@ def _colwise_binop(
left_column = left_column.fillna(fill_value)
elif right_column.nullable:
right_column = right_column.fillna(fill_value)
else:
elif left_is_column:
if left_column.nullable:
left_column = left_column.fillna(fill_value)
elif right_is_column:
if right_column.nullable:
right_column = right_column.fillna(fill_value)
else:
assert False, "At least one operand must be a column."
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved

# TODO: Disable logical and binary operators between columns that
# are not numerical using the new binops mixin.
Expand Down
12 changes: 12 additions & 0 deletions python/cudf/cudf/core/single_column_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from __future__ import annotations

import warnings
from typing import Any, Dict, Optional, Tuple, Type, TypeVar, Union

import cupy
Expand Down Expand Up @@ -333,6 +334,17 @@ def _make_operands_for_binop(
if isinstance(other, SingleColumnFrame):
other = other._column
elif not _is_scalar_or_zero_d_array(other):
if not hasattr(other, "__cuda_array_interface__"):
# TODO: When this deprecated behavior is removed, also change
# the above conditional to stop checking for pd.Series and
# pd.Index since we only need to support SingleColumnFrame.
warnings.warn(
f"Binary operations between host objects such as "
f"{type(other)} and {type(self)} are deprecated and will "
"be removed in a future release. Please convert it to a "
"cudf object before performing the operation.",
FutureWarning,
)
# Non-scalar right operands are valid iff they convert to columns.
try:
other = as_column(other)
Expand Down
81 changes: 75 additions & 6 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import re
import string
import textwrap
import warnings
from contextlib import contextmanager
from copy import copy

import cupy
Expand Down Expand Up @@ -2017,6 +2019,15 @@ def test_dataframe_min_count_ops(data, ops, skipna, min_count):
)


@contextmanager
def _hide_host_other_warning(other):
if isinstance(other, (dict, list)):
with pytest.warns(FutureWarning):
yield
else:
yield


@pytest.mark.parametrize(
"binop",
[
Expand All @@ -2034,12 +2045,70 @@ def test_dataframe_min_count_ops(data, ops, skipna, min_count):
operator.ne,
],
)
def test_binops_df(pdf, gdf, binop):
pdf = pdf + 1.0
gdf = gdf + 1.0
d = binop(pdf, pdf)
g = binop(gdf, gdf)
assert_eq(d, g)
@pytest.mark.parametrize(
"other",
[
1.0,
[1.0],
[1.0, 2.0],
[1.0, 2.0, 3.0],
{"x": 1.0},
{"x": 1.0, "y": 2.0},
{"x": 1.0, "y": 2.0, "z": 3.0},
{"x": 1.0, "z": 3.0},
pd.Series([1.0]),
pd.Series([1.0, 2.0]),
pd.Series([1.0, 2.0, 3.0]),
pd.Series([1.0], index=["x"]),
pd.Series([1.0, 2.0], index=["x", "y"]),
pd.Series([1.0, 2.0, 3.0], index=["x", "y", "z"]),
pd.DataFrame({"x": [1.0]}),
pd.DataFrame({"x": [1.0], "y": [2.0]}),
pd.DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]}),
],
)
def test_binops_df(pdf, gdf, binop, other):
# Avoid 1**NA cases: https://github.com/pandas-dev/pandas/issues/29997
pdf[pdf == 1.0] = 2
gdf[gdf == 1.0] = 2
try:
with warnings.catch_warnings(record=True) as w:
d = binop(pdf, other)
except Exception:
if isinstance(other, (pd.Series, pd.DataFrame)):
other = cudf.from_pandas(other)

# TODO: When we remove support for binary operations with lists and
# dicts, those cases should all be checked in a `pytest.raises` block
# that returns before we enter this try-except.
with _hide_host_other_warning(other):
assert_exceptions_equal(
lfunc=binop,
rfunc=binop,
lfunc_args_and_kwargs=([pdf, other], {}),
rfunc_args_and_kwargs=([gdf, other], {}),
compare_error_message=False,
)
else:
if isinstance(other, (pd.Series, pd.DataFrame)):
other = cudf.from_pandas(other)
with _hide_host_other_warning(other):
g = binop(gdf, other)
try:
assert_eq(d, g)
except AssertionError:
# Currently we will not match pandas for equality/inequality
# operators when there are columns that exist in a Series but not
# the DataFrame because pandas returns True/False values whereas we
# return NA. However, this reindexing is deprecated in pandas so we
# opt not to add support.
if w and "DataFrame vs Series comparisons is deprecated" in str(w):
pass


def test_binops_df_invalid(gdf):
with pytest.raises(TypeError):
gdf + np.array([1, 2])


@pytest.mark.parametrize("binop", [operator.and_, operator.or_, operator.xor])
Expand Down