From 4afc6d7f608349150c015d43d3d9d1d08fa1f76b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 4 Feb 2020 16:50:57 -0800 Subject: [PATCH 1/5] POC: fix op(frame, frame2) with reindex --- pandas/core/ops/__init__.py | 15 +++++++++++++++ pandas/tests/frame/test_arithmetic.py | 19 +++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 0312c11a6d590..1b988a54a65de 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -722,6 +722,7 @@ def _arith_method_FRAME(cls, op, special): @Appender(doc) def f(self, other, axis=default_axis, level=None, fill_value=None): + orig_self, orig_other = self, other self, other = _align_method_FRAME(self, other, axis, flex=True, level=level) if isinstance(other, ABCDataFrame): @@ -729,6 +730,20 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): pass_op = op if should_series_dispatch(self, other, op) else na_op pass_op = pass_op if not is_logical else op + if isinstance(orig_other, ABCDataFrame): + if fill_value is None and level is None and axis is default_axis: + # TODO: any other cases we should handle here? + cols = orig_self.columns.intersection(orig_other.columns) + if not ( + cols.equals(orig_self.columns) + and cols.equals(orig_other.columns) + ): + # GH#31623, only operate on shared columns + new_left = orig_self[cols] + new_right = orig_other[cols] + result = op(new_left, new_right) + return result.reindex(self.columns, axis=1) + new_data = self._combine_frame(other, pass_op, fill_value) return self._construct_result(new_data) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index c6eacf2bbcd84..44ad55517dcea 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -711,6 +711,25 @@ def test_operations_with_interval_categories_index(self, all_arithmetic_operator expected = pd.DataFrame([[getattr(n, op)(num) for n in data]], columns=ind) tm.assert_frame_equal(result, expected) + def test_frame_with_frame_reindex(self): + # GH#31623 + df = pd.DataFrame( + { + "foo": [pd.Timestamp("2019"), pd.Timestamp("2020")], + "bar": [pd.Timestamp("2018"), pd.Timestamp("2021")], + }, + columns=["foo", "bar"], + ) + df2 = df[["foo"]] + + result = df - df2 + + expected = pd.DataFrame( + {"foo": [pd.Timedelta(0), pd.Timedelta(0)], "bar": [np.nan, np.nan]}, + columns=["bar", "foo"], + ) + tm.assert_frame_equal(result, expected) + def test_frame_with_zero_len_series_corner_cases(): # GH#28600 From 1eeac12513c0a0802320aeb441d3ddaaff5a271e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 11 Feb 2020 14:00:31 -0800 Subject: [PATCH 2/5] REF: separate funcs --- pandas/core/ops/__init__.py | 64 ++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 1b988a54a65de..76a45ae3c1fdd 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -705,6 +705,52 @@ def to_series(right): return left, right +def _should_reindex_op(left, right, axis, default_axis, fill_value, level) -> bool: + """ + Check if this is an operation between DataFrames that will need to reindex. + """ + if not isinstance(right, ABCDataFrame): + return False + + if fill_value is None and level is None and axis is default_axis: + # TODO: any other cases we should handle here? + cols = left.columns.intersection(right.columns) + if not (cols.equals(left.columns) and cols.equals(right.columns)): + return True + + return False + + +def _frame_arith_method_with_reindex(left, right, op): + """ + For DataFrame-with-DataFrame operations that require reindexing, + operate only on shared columns, then reindex. + + Parameters + ---------- + left : DataFrame + right : DataFrame + op : binary operator + + Returns + ------- + DataFrame + """ + # GH#31623, only operate on shared columns + cols = left.columns.intersection(right.columns) + + new_left = left[cols] + new_right = right[cols] + result = op(new_left, new_right) + + # Do the join on the columns instead of using _align_method_FRAME + # to avoid constructing two potentially large/sparse DataFrames + join_columns, _, _ = left.columns.join( + right.columns, how="outer", level=None, return_indexers=True + ) + return result.reindex(join_columns, axis=1) + + def _arith_method_FRAME(cls, op, special): str_rep = _get_opstr(op) op_name = _get_op_name(op, special) @@ -722,7 +768,9 @@ def _arith_method_FRAME(cls, op, special): @Appender(doc) def f(self, other, axis=default_axis, level=None, fill_value=None): - orig_self, orig_other = self, other + if _should_reindex_op(self, other, axis, default_axis, fill_value, level): + return _frame_arith_method_with_reindex(self, other, op) + self, other = _align_method_FRAME(self, other, axis, flex=True, level=level) if isinstance(other, ABCDataFrame): @@ -730,20 +778,6 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): pass_op = op if should_series_dispatch(self, other, op) else na_op pass_op = pass_op if not is_logical else op - if isinstance(orig_other, ABCDataFrame): - if fill_value is None and level is None and axis is default_axis: - # TODO: any other cases we should handle here? - cols = orig_self.columns.intersection(orig_other.columns) - if not ( - cols.equals(orig_self.columns) - and cols.equals(orig_other.columns) - ): - # GH#31623, only operate on shared columns - new_left = orig_self[cols] - new_right = orig_other[cols] - result = op(new_left, new_right) - return result.reindex(self.columns, axis=1) - new_data = self._combine_frame(other, pass_op, fill_value) return self._construct_result(new_data) From 06d206c1b5bffc461701f92068f45681b708c0a0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 12 Feb 2020 12:44:33 -0800 Subject: [PATCH 3/5] rename, assert --- pandas/core/ops/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 2cce030829ddc..2fc349b829874 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -703,10 +703,14 @@ def to_series(right): return left, right -def _should_reindex_op(left, right, axis, default_axis, fill_value, level) -> bool: +def _should_reindex_frame_op( + left, right, axis, default_axis, fill_value, level +) -> bool: """ Check if this is an operation between DataFrames that will need to reindex. """ + assert isinstance(left, ABCDataFrame) + if not isinstance(right, ABCDataFrame): return False @@ -766,7 +770,7 @@ def _arith_method_FRAME(cls, op, special): @Appender(doc) def f(self, other, axis=default_axis, level=None, fill_value=None): - if _should_reindex_op(self, other, axis, default_axis, fill_value, level): + if _should_reindex_frame_op(self, other, axis, default_axis, fill_value, level): return _frame_arith_method_with_reindex(self, other, op) self, other = _align_method_FRAME(self, other, axis, flex=True, level=level) From ce236359f1b4646b672fc3cdb31c6dbebf48eb6c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 12 Feb 2020 12:46:18 -0800 Subject: [PATCH 4/5] whatsnew --- doc/source/whatsnew/v1.0.2.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index 0216007ea5ba8..6d3bb9e2626e5 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`) - Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`) - Fixed regression in :meth:`rolling(..).corr() ` when using a time offset (:issue:`31789`) +- Fixed regression in :class:`DataFrame` arithmetic operations with mis-matched columns (:issue:`31623`) - .. --------------------------------------------------------------------------- From abc73f8d8939c6289a343cde9ee1ce4d952e715c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 17 Feb 2020 11:52:42 -0800 Subject: [PATCH 5/5] annotations --- pandas/core/ops/__init__.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 2fc349b829874..b74dea686a89f 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -5,7 +5,7 @@ """ import datetime import operator -from typing import Optional, Set, Tuple, Union +from typing import TYPE_CHECKING, Optional, Set, Tuple, Union import numpy as np @@ -61,6 +61,9 @@ rxor, ) +if TYPE_CHECKING: + from pandas import DataFrame # noqa:F401 + # ----------------------------------------------------------------------------- # constants ARITHMETIC_BINOPS: Set[str] = { @@ -704,7 +707,7 @@ def to_series(right): def _should_reindex_frame_op( - left, right, axis, default_axis, fill_value, level + left: "DataFrame", right, axis, default_axis: int, fill_value, level ) -> bool: """ Check if this is an operation between DataFrames that will need to reindex. @@ -723,7 +726,9 @@ def _should_reindex_frame_op( return False -def _frame_arith_method_with_reindex(left, right, op): +def _frame_arith_method_with_reindex( + left: "DataFrame", right: "DataFrame", op +) -> "DataFrame": """ For DataFrame-with-DataFrame operations that require reindexing, operate only on shared columns, then reindex.