From ab372f0f7604ac507d6c2dbba662a8a817dfda0b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 21 Mar 2022 15:04:17 -0700 Subject: [PATCH 1/9] Various simplifications of DataFrame._make_index_and_operands_for_binop. --- python/cudf/cudf/core/dataframe.py | 77 ++++++++++++------------------ 1 file changed, 31 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 08a30729e7c..7603dd746c6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1854,47 +1854,42 @@ def _make_operands_and_index_for_binop( ], Optional[BaseIndex], ]: - lhs, rhs = self, other - - if _is_scalar_or_zero_d_array(rhs): - rhs = [rhs] * lhs._num_columns - - # For columns that exist in rhs but not lhs, we swap the order so that - # we can always assume that left has a binary operator. This - # implementation assumes that binary operations between a column and - # NULL are always commutative, even for binops (like subtraction) that - # are normally anticommutative. - # TODO: The above should no longer be necessary once we switch to - # properly invoking the operator since we can then rely on reflection. - if isinstance(rhs, Sequence): + # The default index to return. + index = self._index + if _is_scalar_or_zero_d_array(other): + operands = { + name: (left, other, reflect, fill_value) + for name, left in self._data.items() + } + elif isinstance(other, Sequence): # TODO: Consider validating sequence length (pandas does). operands = { name: (left, right, reflect, fill_value) - for right, (name, left) in zip(rhs, lhs._data.items()) + for right, (name, left) in zip(other, self._data.items()) } - elif isinstance(rhs, DataFrame): + elif isinstance(other, DataFrame): if ( not can_reindex and fn in cudf.utils.utils._EQUALITY_OPS and ( - not lhs._data.to_pandas_index().equals( - rhs._data.to_pandas_index() + not self.index.equals(other.index) + or not self._data.to_pandas_index().equals( + other._data.to_pandas_index() ) - or not lhs.index.equals(rhs.index) ) ): raise ValueError( "Can only compare identically-labeled DataFrame objects" ) - lhs, rhs = _align_indices(lhs, rhs) + lhs, rhs = _align_indices(self, other) + # Return the aligned index. + index = lhs._index operands = { name: ( lcol, - rhs._data[name] - if name in rhs._data - else (fill_value or None), + rhs._data.get(name, fill_value), reflect, fill_value if name in rhs._data else None, ) @@ -1902,38 +1897,28 @@ def _make_operands_and_index_for_binop( } for name, col in rhs._data.items(): if name not in lhs._data: - operands[name] = ( - col, - (fill_value or None), - not reflect, - None, - ) - elif isinstance(rhs, Series): + operands[name] = (fill_value, col, reflect, None) + elif isinstance(other, Series): # Note: This logic will need updating if any of the user-facing # binop methods (e.g. DataFrame.add) ever support axis=0/rows. - right_dict = dict(zip(rhs.index.values_host, rhs.values_host)) - left_cols = lhs._column_names - # mypy thinks lhs._column_names is a List rather than a Tuple, so - # we have to ignore the type check. - result_cols = left_cols + tuple( # type: ignore - col for col in right_dict if col not in left_cols - ) - operands = {} - for col in result_cols: - if col in left_cols: - left = lhs._data[col] - right = right_dict[col] if col in right_dict else None - else: + right = dict(zip(other.index.values_host, other.values_host)) + + operands = { + col: (self._data[col], right.get(col), reflect, fill_value) + for col in self._column_names + } + + for col in right: + if col not in self._column_names: # We match pandas semantics here by performing binops # between a NaN (not NULL!) column and the actual values, # which results in nans, the pandas output. - left = as_column(np.nan, length=lhs._num_rows) - right = right_dict[col] - operands[col] = (left, right, reflect, fill_value) + left = as_column(np.nan, length=self._num_rows) + operands[col] = (left, right[col], reflect, fill_value) else: return NotImplemented, None - return operands, lhs._index + return operands, index @_cudf_nvtx_annotate def update( From 4f4fa3d665174266ebb4407d3e7fe0baa27e7b22 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 21 Mar 2022 17:14:45 -0700 Subject: [PATCH 2/9] Unify more logic for scalar, list, and Series binops with DataFrame. --- python/cudf/cudf/core/dataframe.py | 59 +++++++++++++---------- python/cudf/cudf/tests/test_dataframe.py | 60 +++++++++++++++++++++--- 2 files changed, 89 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7603dd746c6..1abc787e924 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1856,17 +1856,45 @@ def _make_operands_and_index_for_binop( ]: # The default index to return. index = self._index + other_is_series = False + if _is_scalar_or_zero_d_array(other): - operands = { - name: (left, other, reflect, fill_value) - for name, left in self._data.items() - } + other = {name: other for name in self._data} elif isinstance(other, Sequence): - # TODO: Consider validating sequence length (pandas does). + if len(other) != self._num_columns: + raise ValueError( + f"Other is the wrong length. Expected {self._num_columns} " + f", got {len(other)}" + ) + other = {name: o for (name, o) in zip(self._data, other)} + elif isinstance(other, Series): + # Note: This logic will need updating if any of the user-facing + # binop methods (e.g. DataFrame.add) ever support axis=0/rows. + other = dict(zip(other.index.values_host, other.values_host)) + other_is_series = True + + # Check dict first for speed. + if isinstance(other, (dict, MutableMapping)): + # Series can have extra index elements, other objects cannot. + if not other_is_series and len(other) != self._num_columns: + raise ValueError( + f"Other is the wrong length. Expected {self._num_columns} " + f", got {len(other)}" + ) + # pandas checks the length of other, but not the actual elements, + # so we can end up with a mismatch (which produces nulls). operands = { - name: (left, right, reflect, fill_value) - for right, (name, left) in zip(other, self._data.items()) + name: (left, other.get(name), reflect, fill_value) + for name, left in self._data.items() } + if other_is_series: + for col in other: + if col not in self._column_names: + # We match pandas semantics here by performing binops + # between a NaN (not NULL!) column and the actual + # values, which results in nans, the pandas output. + left = as_column(np.nan, length=self._num_rows) + operands[col] = (left, other[col], reflect, fill_value) elif isinstance(other, DataFrame): if ( not can_reindex @@ -1898,23 +1926,6 @@ def _make_operands_and_index_for_binop( for name, col in rhs._data.items(): if name not in lhs._data: operands[name] = (fill_value, col, reflect, None) - elif isinstance(other, Series): - # Note: This logic will need updating if any of the user-facing - # binop methods (e.g. DataFrame.add) ever support axis=0/rows. - right = dict(zip(other.index.values_host, other.values_host)) - - operands = { - col: (self._data[col], right.get(col), reflect, fill_value) - for col in self._column_names - } - - for col in right: - if col not in self._column_names: - # We match pandas semantics here by performing binops - # between a NaN (not NULL!) column and the actual values, - # which results in nans, the pandas output. - left = as_column(np.nan, length=self._num_rows) - operands[col] = (left, right[col], reflect, fill_value) else: return NotImplemented, None diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 303c245777c..816bc8221f3 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8,6 +8,7 @@ import re import string import textwrap +import warnings from copy import copy import cupy @@ -2034,12 +2035,59 @@ def test_dataframe_min_count_ops(data, ops, skipna, min_count): operator.ne, ], ) -def test_binops_df(pdf, gdf, binop): - pdf = pdf + 1.0 - gdf = gdf + 1.0 - d = binop(pdf, pdf) - g = binop(gdf, gdf) - assert_eq(d, g) +@pytest.mark.parametrize( + "other", + [ + 1.0, + [1.0], + [1.0, 2.0], + [1.0, 2.0, 3.0], + {"x": 1.0}, + {"x": 1.0, "y": 2.0}, + {"x": 1.0, "y": 2.0, "z": 3.0}, + pd.Series([1.0]), + # pd.Series([1.0, 2.0]), + # pd.Series([1.0, 2.0, 3.0]), + # pd.Series([1.0], index=["x"]), + # pd.Series([1.0, 2.0], index=["x", "y"]), + # pd.Series([1.0, 2.0, 3.0], index=["x", "y", "z"]), + pd.DataFrame({"x": [1.0]}), + pd.DataFrame({"x": [1.0], "y": [2.0]}), + pd.DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]}), + ], +) +def test_binops_df(pdf, gdf, binop, other): + # Avoid 1**NA cases: https://github.com/pandas-dev/pandas/issues/29997 + pdf[pdf == 1.0] = 2 + gdf[gdf == 1.0] = 2 + try: + with warnings.catch_warnings(record=True) as w: + d = binop(pdf, other) + except Exception: + if isinstance(other, (pd.Series, pd.DataFrame)): + other = cudf.from_pandas(other) + + assert_exceptions_equal( + lfunc=binop, + rfunc=binop, + lfunc_args_and_kwargs=([pdf, other], {}), + rfunc_args_and_kwargs=([gdf, other], {}), + compare_error_message=False, + ) + else: + if isinstance(other, (pd.Series, pd.DataFrame)): + other = cudf.from_pandas(other) + g = binop(gdf, other) + try: + assert_eq(d, g) + except AssertionError: + # Currently we will not match pandas for equality/inequality + # operators when there are columns that exist in a Series but not + # the DataFrame because pandas returns True/False values whereas we + # return NA. However, this reindexing is deprecated in pandas so we + # opt not to add support. + if w and "DataFrame vs Series comparisons is deprecated" in str(w): + pass @pytest.mark.parametrize("binop", [operator.and_, operator.or_, operator.xor]) From c925cea402bfca387cad7a3e2c8741c8d7548580 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 21 Mar 2022 17:30:25 -0700 Subject: [PATCH 3/9] Separate Series back into its own block. --- python/cudf/cudf/core/dataframe.py | 44 +++++++++++++----------------- python/cudf/cudf/core/frame.py | 12 ++++++-- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1abc787e924..70c50dfa72a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1854,47 +1854,41 @@ def _make_operands_and_index_for_binop( ], Optional[BaseIndex], ]: - # The default index to return. index = self._index - other_is_series = False - if _is_scalar_or_zero_d_array(other): - other = {name: other for name in self._data} - elif isinstance(other, Sequence): + # Check built-in types first for speed. + if isinstance(other, (list, dict, Sequence, MutableMapping)): if len(other) != self._num_columns: raise ValueError( f"Other is the wrong length. Expected {self._num_columns} " f", got {len(other)}" ) + + if _is_scalar_or_zero_d_array(other): + other = {name: other for name in self._data} + elif isinstance(other, Sequence): other = {name: o for (name, o) in zip(self._data, other)} - elif isinstance(other, Series): - # Note: This logic will need updating if any of the user-facing - # binop methods (e.g. DataFrame.add) ever support axis=0/rows. - other = dict(zip(other.index.values_host, other.values_host)) - other_is_series = True - # Check dict first for speed. if isinstance(other, (dict, MutableMapping)): - # Series can have extra index elements, other objects cannot. - if not other_is_series and len(other) != self._num_columns: - raise ValueError( - f"Other is the wrong length. Expected {self._num_columns} " - f", got {len(other)}" - ) # pandas checks the length of other, but not the actual elements, # so we can end up with a mismatch (which produces nulls). operands = { name: (left, other.get(name), reflect, fill_value) for name, left in self._data.items() } - if other_is_series: - for col in other: - if col not in self._column_names: - # We match pandas semantics here by performing binops - # between a NaN (not NULL!) column and the actual - # values, which results in nans, the pandas output. - left = as_column(np.nan, length=self._num_rows) - operands[col] = (left, other[col], reflect, fill_value) + elif isinstance(other, Series): + other = dict(zip(other.index.values_host, other.values_host)) + operands = { + name: (left, other.get(name), reflect, fill_value) + for name, left in self._data.items() + } + for col in other: + if col not in self._column_names: + # We match pandas semantics here by performing binops + # between a NaN (not NULL!) column and the actual + # values, which results in nans, the pandas output. + left = as_column(np.nan, length=self._num_rows) + operands[col] = (left, other[col], reflect, fill_value) elif isinstance(other, DataFrame): if ( not can_reindex diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 75c6e4d0964..fdbeee0e374 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2475,7 +2475,10 @@ def _colwise_binop( ) in operands.items(): output_mask = None if fill_value is not None: - if isinstance(right_column, ColumnBase): + left_is_column = isinstance(left_column, ColumnBase) + right_is_column = isinstance(right_column, ColumnBase) + + if left_is_column and right_is_column: # If both columns are nullable, pandas semantics dictate # that nulls that are present in both left_column and # right_column are not filled. @@ -2489,9 +2492,14 @@ def _colwise_binop( left_column = left_column.fillna(fill_value) elif right_column.nullable: right_column = right_column.fillna(fill_value) - else: + elif left_is_column: if left_column.nullable: left_column = left_column.fillna(fill_value) + elif right_is_column: + if right_column.nullable: + right_column = right_column.fillna(fill_value) + else: + assert False, "At least one operand must be a column." # TODO: Disable logical and binary operators between columns that # are not numerical using the new binops mixin. From 8d5d90aa318a010c880ef0bf46f6c4b93369db4c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 21 Mar 2022 17:48:05 -0700 Subject: [PATCH 4/9] Create helper functions for operand generation. --- python/cudf/cudf/core/dataframe.py | 55 ++++++++++++++++-------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 70c50dfa72a..2e037f34e69 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1856,6 +1856,24 @@ def _make_operands_and_index_for_binop( ]: index = self._index + def make_operands(left_data, right_data, right_default=None, fill_requires_key=False): + return { + name: ( + left, + right_data.get(name, right_default), + reflect, + fill_value + if (not fill_requires_key or name in right_data) + else None, + ) + for name, left in left_data.items() + } + + def add_right_operands(operands, left_data, right_data, left_default): + for name, col in right_data.items(): + if name not in left_data: + operands[name] = (left_default(), col, reflect, fill_value) + # Check built-in types first for speed. if isinstance(other, (list, dict, Sequence, MutableMapping)): if len(other) != self._num_columns: @@ -1872,23 +1890,17 @@ def _make_operands_and_index_for_binop( if isinstance(other, (dict, MutableMapping)): # pandas checks the length of other, but not the actual elements, # so we can end up with a mismatch (which produces nulls). - operands = { - name: (left, other.get(name), reflect, fill_value) - for name, left in self._data.items() - } + operands = make_operands(self._data, other) elif isinstance(other, Series): other = dict(zip(other.index.values_host, other.values_host)) - operands = { - name: (left, other.get(name), reflect, fill_value) - for name, left in self._data.items() - } - for col in other: - if col not in self._column_names: - # We match pandas semantics here by performing binops - # between a NaN (not NULL!) column and the actual - # values, which results in nans, the pandas output. - left = as_column(np.nan, length=self._num_rows) - operands[col] = (left, other[col], reflect, fill_value) + operands = make_operands(self._data, other) + + # For keys in right but not left, we match pandas semantics here by + # performing binops between a NaN (not NULL!) column and the actual + # values, which results in nans, the pandas output. + for name, col in other.items(): + if name not in self._data: + operands[name] = (as_column(np.nan, length=self._num_rows), col, reflect, fill_value) elif isinstance(other, DataFrame): if ( not can_reindex @@ -1905,18 +1917,9 @@ def _make_operands_and_index_for_binop( ) lhs, rhs = _align_indices(self, other) - # Return the aligned index. index = lhs._index - - operands = { - name: ( - lcol, - rhs._data.get(name, fill_value), - reflect, - fill_value if name in rhs._data else None, - ) - for name, lcol in lhs._data.items() - } + operands = make_operands(lhs._data, rhs._data, fill_value, True) + add_right_operands(operands, lhs._data, rhs._data, lambda: None) for name, col in rhs._data.items(): if name not in lhs._data: operands[name] = (fill_value, col, reflect, None) From 4efbaba9cf388cab618f7fbef72003e107c6baeb Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 21 Mar 2022 18:23:34 -0700 Subject: [PATCH 5/9] Streamline the implementation. --- python/cudf/cudf/core/dataframe.py | 71 ++++++++++++++---------------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2e037f34e69..a8ceadf2e56 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1854,25 +1854,20 @@ def _make_operands_and_index_for_binop( ], Optional[BaseIndex], ]: - index = self._index - - def make_operands(left_data, right_data, right_default=None, fill_requires_key=False): - return { - name: ( - left, - right_data.get(name, right_default), + def make_operands(left, right, add_operands, fill_if_no_key, right_default): + operands = { + k: ( + v, + right.get(k, right_default), reflect, - fill_value - if (not fill_requires_key or name in right_data) - else None, + fill_value if (not fill_if_no_key or k in right) else None, ) - for name, left in left_data.items() + for k, v in left.items() } - def add_right_operands(operands, left_data, right_data, left_default): - for name, col in right_data.items(): - if name not in left_data: - operands[name] = (left_default(), col, reflect, fill_value) + if add_operands is not None: + add_operands(operands, left, right) + return operands # Check built-in types first for speed. if isinstance(other, (list, dict, Sequence, MutableMapping)): @@ -1882,25 +1877,25 @@ def add_right_operands(operands, left_data, right_data, left_default): f", got {len(other)}" ) + lhs, rhs = self._data, other + index = self._index + add_operands = None + fill_if_no_key = False + right_default = None + if _is_scalar_or_zero_d_array(other): - other = {name: other for name in self._data} + rhs = {name: other for name in self._data} elif isinstance(other, Sequence): - other = {name: o for (name, o) in zip(self._data, other)} - - if isinstance(other, (dict, MutableMapping)): - # pandas checks the length of other, but not the actual elements, - # so we can end up with a mismatch (which produces nulls). - operands = make_operands(self._data, other) + rhs = {name: o for (name, o) in zip(self._data, other)} elif isinstance(other, Series): - other = dict(zip(other.index.values_host, other.values_host)) - operands = make_operands(self._data, other) - + rhs = dict(zip(other.index.values_host, other.values_host)) # For keys in right but not left, we match pandas semantics here by # performing binops between a NaN (not NULL!) column and the actual # values, which results in nans, the pandas output. - for name, col in other.items(): - if name not in self._data: - operands[name] = (as_column(np.nan, length=self._num_rows), col, reflect, fill_value) + def add_operands(operands, left, right): + for k, v in right.items(): + if k not in left: + operands[k] = (as_column(np.nan, length=self._num_rows), v, reflect, fill_value) elif isinstance(other, DataFrame): if ( not can_reindex @@ -1915,18 +1910,20 @@ def add_right_operands(operands, left_data, right_data, left_default): raise ValueError( "Can only compare identically-labeled DataFrame objects" ) - lhs, rhs = _align_indices(self, other) index = lhs._index - operands = make_operands(lhs._data, rhs._data, fill_value, True) - add_right_operands(operands, lhs._data, rhs._data, lambda: None) - for name, col in rhs._data.items(): - if name not in lhs._data: - operands[name] = (fill_value, col, reflect, None) - else: - return NotImplemented, None + lhs, rhs = lhs._data, rhs._data + fill_if_no_key = True + right_default = fill_value - return operands, index + def add_operands(operands, left, right): + for k, v in right.items(): + if k not in left: + operands[k] = (fill_value, v, reflect, None) + + if not isinstance(rhs, MutableMapping): + return NotImplemented, None + return make_operands(lhs, rhs, add_operands, fill_if_no_key, right_default), index @_cudf_nvtx_annotate def update( From 71b8e7b1d471354eda51d0bb3ccc0c0552312f46 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 21 Mar 2022 18:35:50 -0700 Subject: [PATCH 6/9] Inline make_operands. --- python/cudf/cudf/core/dataframe.py | 34 +++++++++++------------- python/cudf/cudf/tests/test_dataframe.py | 11 ++++---- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a8ceadf2e56..41778a971e7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1854,27 +1854,12 @@ def _make_operands_and_index_for_binop( ], Optional[BaseIndex], ]: - def make_operands(left, right, add_operands, fill_if_no_key, right_default): - operands = { - k: ( - v, - right.get(k, right_default), - reflect, - fill_value if (not fill_if_no_key or k in right) else None, - ) - for k, v in left.items() - } - - if add_operands is not None: - add_operands(operands, left, right) - return operands - # Check built-in types first for speed. if isinstance(other, (list, dict, Sequence, MutableMapping)): if len(other) != self._num_columns: raise ValueError( - f"Other is the wrong length. Expected {self._num_columns} " - f", got {len(other)}" + "Other is the wrong length. Expected " + f"{self._num_columns}, got {len(other)}" ) lhs, rhs = self._data, other @@ -1923,7 +1908,20 @@ def add_operands(operands, left, right): if not isinstance(rhs, MutableMapping): return NotImplemented, None - return make_operands(lhs, rhs, add_operands, fill_if_no_key, right_default), index + + operands = { + k: ( + v, + rhs.get(k, right_default), + reflect, + fill_value if (not fill_if_no_key or k in rhs) else None, + ) + for k, v in lhs.items() + } + + if add_operands is not None: + add_operands(operands, lhs, rhs) + return operands, index @_cudf_nvtx_annotate def update( diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 816bc8221f3..8549c9a1b20 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2045,12 +2045,13 @@ def test_dataframe_min_count_ops(data, ops, skipna, min_count): {"x": 1.0}, {"x": 1.0, "y": 2.0}, {"x": 1.0, "y": 2.0, "z": 3.0}, + {"x": 1.0, "z": 3.0}, pd.Series([1.0]), - # pd.Series([1.0, 2.0]), - # pd.Series([1.0, 2.0, 3.0]), - # pd.Series([1.0], index=["x"]), - # pd.Series([1.0, 2.0], index=["x", "y"]), - # pd.Series([1.0, 2.0, 3.0], index=["x", "y", "z"]), + pd.Series([1.0, 2.0]), + pd.Series([1.0, 2.0, 3.0]), + pd.Series([1.0], index=["x"]), + pd.Series([1.0, 2.0], index=["x", "y"]), + pd.Series([1.0, 2.0, 3.0], index=["x", "y", "z"]), pd.DataFrame({"x": [1.0]}), pd.DataFrame({"x": [1.0], "y": [2.0]}), pd.DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]}), From f3219285dcfd52985188ad1ed2361b7370d2e1e7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 22 Mar 2022 16:00:44 -0700 Subject: [PATCH 7/9] Simplify defaults as much as possible. --- python/cudf/cudf/core/dataframe.py | 48 +++++++++++++----------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 41778a971e7..5bdf7e5f202 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1864,23 +1864,18 @@ def _make_operands_and_index_for_binop( lhs, rhs = self._data, other index = self._index - add_operands = None - fill_if_no_key = False - right_default = None + fill_requires_key = False + left_default: Any = False if _is_scalar_or_zero_d_array(other): rhs = {name: other for name in self._data} - elif isinstance(other, Sequence): + elif isinstance(other, (list, Sequence)): rhs = {name: o for (name, o) in zip(self._data, other)} elif isinstance(other, Series): rhs = dict(zip(other.index.values_host, other.values_host)) - # For keys in right but not left, we match pandas semantics here by - # performing binops between a NaN (not NULL!) column and the actual - # values, which results in nans, the pandas output. - def add_operands(operands, left, right): - for k, v in right.items(): - if k not in left: - operands[k] = (as_column(np.nan, length=self._num_rows), v, reflect, fill_value) + # For keys in right but not left, perform binops between NaN (not + # NULL!) and the right value (result is NaN). + left_default = as_column(np.nan, length=len(self)) elif isinstance(other, DataFrame): if ( not can_reindex @@ -1895,32 +1890,31 @@ def add_operands(operands, left, right): raise ValueError( "Can only compare identically-labeled DataFrame objects" ) - lhs, rhs = _align_indices(self, other) - index = lhs._index - lhs, rhs = lhs._data, rhs._data - fill_if_no_key = True - right_default = fill_value - - def add_operands(operands, left, right): - for k, v in right.items(): - if k not in left: - operands[k] = (fill_value, v, reflect, None) - - if not isinstance(rhs, MutableMapping): + new_lhs, new_rhs = _align_indices(self, other) + index = new_lhs._index + lhs, rhs = new_lhs._data, new_rhs._data + fill_requires_key = True + # For DataFrame-DataFrame ops, always default to operating against + # the fill value. + left_default = fill_value + + if not isinstance(rhs, (dict, MutableMapping)): return NotImplemented, None operands = { k: ( v, - rhs.get(k, right_default), + rhs.get(k, fill_value), reflect, - fill_value if (not fill_if_no_key or k in rhs) else None, + fill_value if (not fill_requires_key or k in rhs) else None, ) for k, v in lhs.items() } - if add_operands is not None: - add_operands(operands, lhs, rhs) + if left_default is not False: + for k, v in rhs.items(): + if k not in lhs: + operands[k] = (left_default, v, reflect, None) return operands, index @_cudf_nvtx_annotate From db84ddf821fc157724a555b5fe77ff0b053886b7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 31 Mar 2022 14:22:56 -0700 Subject: [PATCH 8/9] Deprecate binary operations with future unsupported types. --- python/cudf/cudf/core/dataframe.py | 13 ++++++-- python/cudf/cudf/core/single_column_frame.py | 12 ++++++++ python/cudf/cudf/tests/test_dataframe.py | 31 +++++++++++++++----- 3 files changed, 45 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5bdf7e5f202..0e25f245efd 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -10,7 +10,7 @@ import sys import warnings from collections import defaultdict -from collections.abc import Iterable, Sequence +from collections.abc import Iterable, Mapping, Sequence from typing import ( Any, Dict, @@ -1855,7 +1855,14 @@ def _make_operands_and_index_for_binop( Optional[BaseIndex], ]: # Check built-in types first for speed. - if isinstance(other, (list, dict, Sequence, MutableMapping)): + if isinstance(other, (list, dict, Sequence, Mapping)): + warnings.warn( + "Binary operations between host objects such as " + f"{type(other)} and cudf.DataFrame are deprecated and will be " + "removed in a future release. Please convert it to a cudf " + "object before performing the operation.", + FutureWarning, + ) if len(other) != self._num_columns: raise ValueError( "Other is the wrong length. Expected " @@ -1898,7 +1905,7 @@ def _make_operands_and_index_for_binop( # the fill value. left_default = fill_value - if not isinstance(rhs, (dict, MutableMapping)): + if not isinstance(rhs, (dict, Mapping)): return NotImplemented, None operands = { diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 3e91aa634f4..1a06e98148b 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -3,6 +3,7 @@ from __future__ import annotations +import warnings from typing import Any, Dict, Optional, Tuple, Type, TypeVar, Union import cupy @@ -333,6 +334,17 @@ def _make_operands_for_binop( if isinstance(other, SingleColumnFrame): other = other._column elif not _is_scalar_or_zero_d_array(other): + if not hasattr(other, "__cuda_array_interface__"): + # TODO: When this deprecated behavior is removed, also change + # the above conditional to stop checking for pd.Series and + # pd.Index since we only need to support SingleColumnFrame. + warnings.warn( + f"Binary operations between host objects such as " + f"{type(other)} and {type(self)} are deprecated and will " + "be removed in a future release. Please convert it to a " + "cudf object before performing the operation.", + FutureWarning, + ) # Non-scalar right operands are valid iff they convert to columns. try: other = as_column(other) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 8549c9a1b20..bddc1316aec 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9,6 +9,7 @@ import string import textwrap import warnings +from contextlib import contextmanager from copy import copy import cupy @@ -2018,6 +2019,15 @@ def test_dataframe_min_count_ops(data, ops, skipna, min_count): ) +@contextmanager +def _hide_host_other_warning(other): + if isinstance(other, (dict, list)): + with pytest.warns(FutureWarning): + yield + else: + yield + + @pytest.mark.parametrize( "binop", [ @@ -2068,17 +2078,22 @@ def test_binops_df(pdf, gdf, binop, other): if isinstance(other, (pd.Series, pd.DataFrame)): other = cudf.from_pandas(other) - assert_exceptions_equal( - lfunc=binop, - rfunc=binop, - lfunc_args_and_kwargs=([pdf, other], {}), - rfunc_args_and_kwargs=([gdf, other], {}), - compare_error_message=False, - ) + # TODO: When we remove support for binary operations with lists and + # dicts, those cases should all be checked in a `pytest.raises` block + # that returns before we enter this try-except. + with _hide_host_other_warning(other): + assert_exceptions_equal( + lfunc=binop, + rfunc=binop, + lfunc_args_and_kwargs=([pdf, other], {}), + rfunc_args_and_kwargs=([gdf, other], {}), + compare_error_message=False, + ) else: if isinstance(other, (pd.Series, pd.DataFrame)): other = cudf.from_pandas(other) - g = binop(gdf, other) + with _hide_host_other_warning(other): + g = binop(gdf, other) try: assert_eq(d, g) except AssertionError: From 46fd12957ba1cf8ef03d5ef74d5819d20e560a57 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 12 Apr 2022 13:49:23 -0700 Subject: [PATCH 9/9] Address PR comments. --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0e25f245efd..5483dcf653c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1865,7 +1865,7 @@ def _make_operands_and_index_for_binop( ) if len(other) != self._num_columns: raise ValueError( - "Other is the wrong length. Expected " + "Other is of the wrong length. Expected " f"{self._num_columns}, got {len(other)}" ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index bddc1316aec..a7fad792bd0 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2106,6 +2106,11 @@ def test_binops_df(pdf, gdf, binop, other): pass +def test_binops_df_invalid(gdf): + with pytest.raises(TypeError): + gdf + np.array([1, 2]) + + @pytest.mark.parametrize("binop", [operator.and_, operator.or_, operator.xor]) def test_bitwise_binops_df(pdf, gdf, binop): d = binop(pdf, pdf + 1)