diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 08a30729e7c..5483dcf653c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -10,7 +10,7 @@ import sys import warnings from collections import defaultdict -from collections.abc import Iterable, Sequence +from collections.abc import Iterable, Mapping, Sequence from typing import ( Any, Dict, @@ -1854,86 +1854,75 @@ def _make_operands_and_index_for_binop( ], Optional[BaseIndex], ]: - lhs, rhs = self, other - - if _is_scalar_or_zero_d_array(rhs): - rhs = [rhs] * lhs._num_columns - - # For columns that exist in rhs but not lhs, we swap the order so that - # we can always assume that left has a binary operator. This - # implementation assumes that binary operations between a column and - # NULL are always commutative, even for binops (like subtraction) that - # are normally anticommutative. - # TODO: The above should no longer be necessary once we switch to - # properly invoking the operator since we can then rely on reflection. - if isinstance(rhs, Sequence): - # TODO: Consider validating sequence length (pandas does). - operands = { - name: (left, right, reflect, fill_value) - for right, (name, left) in zip(rhs, lhs._data.items()) - } - elif isinstance(rhs, DataFrame): + # Check built-in types first for speed. + if isinstance(other, (list, dict, Sequence, Mapping)): + warnings.warn( + "Binary operations between host objects such as " + f"{type(other)} and cudf.DataFrame are deprecated and will be " + "removed in a future release. Please convert it to a cudf " + "object before performing the operation.", + FutureWarning, + ) + if len(other) != self._num_columns: + raise ValueError( + "Other is of the wrong length. Expected " + f"{self._num_columns}, got {len(other)}" + ) + + lhs, rhs = self._data, other + index = self._index + fill_requires_key = False + left_default: Any = False + + if _is_scalar_or_zero_d_array(other): + rhs = {name: other for name in self._data} + elif isinstance(other, (list, Sequence)): + rhs = {name: o for (name, o) in zip(self._data, other)} + elif isinstance(other, Series): + rhs = dict(zip(other.index.values_host, other.values_host)) + # For keys in right but not left, perform binops between NaN (not + # NULL!) and the right value (result is NaN). + left_default = as_column(np.nan, length=len(self)) + elif isinstance(other, DataFrame): if ( not can_reindex and fn in cudf.utils.utils._EQUALITY_OPS and ( - not lhs._data.to_pandas_index().equals( - rhs._data.to_pandas_index() + not self.index.equals(other.index) + or not self._data.to_pandas_index().equals( + other._data.to_pandas_index() ) - or not lhs.index.equals(rhs.index) ) ): raise ValueError( "Can only compare identically-labeled DataFrame objects" ) + new_lhs, new_rhs = _align_indices(self, other) + index = new_lhs._index + lhs, rhs = new_lhs._data, new_rhs._data + fill_requires_key = True + # For DataFrame-DataFrame ops, always default to operating against + # the fill value. + left_default = fill_value + + if not isinstance(rhs, (dict, Mapping)): + return NotImplemented, None - lhs, rhs = _align_indices(lhs, rhs) - - operands = { - name: ( - lcol, - rhs._data[name] - if name in rhs._data - else (fill_value or None), - reflect, - fill_value if name in rhs._data else None, - ) - for name, lcol in lhs._data.items() - } - for name, col in rhs._data.items(): - if name not in lhs._data: - operands[name] = ( - col, - (fill_value or None), - not reflect, - None, - ) - elif isinstance(rhs, Series): - # Note: This logic will need updating if any of the user-facing - # binop methods (e.g. DataFrame.add) ever support axis=0/rows. - right_dict = dict(zip(rhs.index.values_host, rhs.values_host)) - left_cols = lhs._column_names - # mypy thinks lhs._column_names is a List rather than a Tuple, so - # we have to ignore the type check. - result_cols = left_cols + tuple( # type: ignore - col for col in right_dict if col not in left_cols + operands = { + k: ( + v, + rhs.get(k, fill_value), + reflect, + fill_value if (not fill_requires_key or k in rhs) else None, ) - operands = {} - for col in result_cols: - if col in left_cols: - left = lhs._data[col] - right = right_dict[col] if col in right_dict else None - else: - # We match pandas semantics here by performing binops - # between a NaN (not NULL!) column and the actual values, - # which results in nans, the pandas output. - left = as_column(np.nan, length=lhs._num_rows) - right = right_dict[col] - operands[col] = (left, right, reflect, fill_value) - else: - return NotImplemented, None + for k, v in lhs.items() + } - return operands, lhs._index + if left_default is not False: + for k, v in rhs.items(): + if k not in lhs: + operands[k] = (left_default, v, reflect, None) + return operands, index @_cudf_nvtx_annotate def update( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 75c6e4d0964..fdbeee0e374 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2475,7 +2475,10 @@ def _colwise_binop( ) in operands.items(): output_mask = None if fill_value is not None: - if isinstance(right_column, ColumnBase): + left_is_column = isinstance(left_column, ColumnBase) + right_is_column = isinstance(right_column, ColumnBase) + + if left_is_column and right_is_column: # If both columns are nullable, pandas semantics dictate # that nulls that are present in both left_column and # right_column are not filled. @@ -2489,9 +2492,14 @@ def _colwise_binop( left_column = left_column.fillna(fill_value) elif right_column.nullable: right_column = right_column.fillna(fill_value) - else: + elif left_is_column: if left_column.nullable: left_column = left_column.fillna(fill_value) + elif right_is_column: + if right_column.nullable: + right_column = right_column.fillna(fill_value) + else: + assert False, "At least one operand must be a column." # TODO: Disable logical and binary operators between columns that # are not numerical using the new binops mixin. diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 3e91aa634f4..1a06e98148b 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -3,6 +3,7 @@ from __future__ import annotations +import warnings from typing import Any, Dict, Optional, Tuple, Type, TypeVar, Union import cupy @@ -333,6 +334,17 @@ def _make_operands_for_binop( if isinstance(other, SingleColumnFrame): other = other._column elif not _is_scalar_or_zero_d_array(other): + if not hasattr(other, "__cuda_array_interface__"): + # TODO: When this deprecated behavior is removed, also change + # the above conditional to stop checking for pd.Series and + # pd.Index since we only need to support SingleColumnFrame. + warnings.warn( + f"Binary operations between host objects such as " + f"{type(other)} and {type(self)} are deprecated and will " + "be removed in a future release. Please convert it to a " + "cudf object before performing the operation.", + FutureWarning, + ) # Non-scalar right operands are valid iff they convert to columns. try: other = as_column(other) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 303c245777c..a7fad792bd0 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8,6 +8,8 @@ import re import string import textwrap +import warnings +from contextlib import contextmanager from copy import copy import cupy @@ -2017,6 +2019,15 @@ def test_dataframe_min_count_ops(data, ops, skipna, min_count): ) +@contextmanager +def _hide_host_other_warning(other): + if isinstance(other, (dict, list)): + with pytest.warns(FutureWarning): + yield + else: + yield + + @pytest.mark.parametrize( "binop", [ @@ -2034,12 +2045,70 @@ def test_dataframe_min_count_ops(data, ops, skipna, min_count): operator.ne, ], ) -def test_binops_df(pdf, gdf, binop): - pdf = pdf + 1.0 - gdf = gdf + 1.0 - d = binop(pdf, pdf) - g = binop(gdf, gdf) - assert_eq(d, g) +@pytest.mark.parametrize( + "other", + [ + 1.0, + [1.0], + [1.0, 2.0], + [1.0, 2.0, 3.0], + {"x": 1.0}, + {"x": 1.0, "y": 2.0}, + {"x": 1.0, "y": 2.0, "z": 3.0}, + {"x": 1.0, "z": 3.0}, + pd.Series([1.0]), + pd.Series([1.0, 2.0]), + pd.Series([1.0, 2.0, 3.0]), + pd.Series([1.0], index=["x"]), + pd.Series([1.0, 2.0], index=["x", "y"]), + pd.Series([1.0, 2.0, 3.0], index=["x", "y", "z"]), + pd.DataFrame({"x": [1.0]}), + pd.DataFrame({"x": [1.0], "y": [2.0]}), + pd.DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]}), + ], +) +def test_binops_df(pdf, gdf, binop, other): + # Avoid 1**NA cases: https://github.com/pandas-dev/pandas/issues/29997 + pdf[pdf == 1.0] = 2 + gdf[gdf == 1.0] = 2 + try: + with warnings.catch_warnings(record=True) as w: + d = binop(pdf, other) + except Exception: + if isinstance(other, (pd.Series, pd.DataFrame)): + other = cudf.from_pandas(other) + + # TODO: When we remove support for binary operations with lists and + # dicts, those cases should all be checked in a `pytest.raises` block + # that returns before we enter this try-except. + with _hide_host_other_warning(other): + assert_exceptions_equal( + lfunc=binop, + rfunc=binop, + lfunc_args_and_kwargs=([pdf, other], {}), + rfunc_args_and_kwargs=([gdf, other], {}), + compare_error_message=False, + ) + else: + if isinstance(other, (pd.Series, pd.DataFrame)): + other = cudf.from_pandas(other) + with _hide_host_other_warning(other): + g = binop(gdf, other) + try: + assert_eq(d, g) + except AssertionError: + # Currently we will not match pandas for equality/inequality + # operators when there are columns that exist in a Series but not + # the DataFrame because pandas returns True/False values whereas we + # return NA. However, this reindexing is deprecated in pandas so we + # opt not to add support. + if w and "DataFrame vs Series comparisons is deprecated" in str(w): + pass + + +def test_binops_df_invalid(gdf): + with pytest.raises(TypeError): + gdf + np.array([1, 2]) @pytest.mark.parametrize("binop", [operator.and_, operator.or_, operator.xor])