From 28b237cff807aaf6b083fc32528e7b6007a12cac Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Mon, 7 Jun 2021 14:21:58 -0700 Subject: [PATCH 1/4] Fix empty column binop failures --- python/cudf/cudf/core/series.py | 8 ++++++++ python/cudf/cudf/tests/test_binops.py | 16 ++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c5a7b07d778..6b85e6099ea 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1326,6 +1326,14 @@ def _binaryop( if isinstance(other, cudf.DataFrame): return NotImplemented + if ( + self.isnull().all() + and fill_value is None + and fn + in ("add", "sub", "mul", "mod", "pow", "truediv", "floordiv") + ): + return self + if isinstance(other, Series): if ( not can_reindex diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 40df234580c..e53db07d294 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -2870,3 +2870,19 @@ def set_null_cases(column_l, column_r, case): ) def test_null_equals_columnops(lcol, rcol, ans, case): assert lcol._null_equals(rcol).all() == ans + + +@pytest.mark.parametrize("binop", _binops) +@pytest.mark.parametrize("data", [None, [-9, 7], [5, -2], [12, 18]]) +@pytest.mark.parametrize("scalar", [1, 3, 12, np.nan]) +def test_empty_column(binop, data, scalar): + gdf = cudf.DataFrame(columns=["a", "b"]) + if data is not None: + gdf["a"] = data + + pdf = gdf.to_pandas() + + got = binop(gdf, scalar) + expected = binop(pdf, scalar) + + utils.assert_eq(expected, got) From 7e48e0b8e16251b62e63ef804142523c594891e0 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Tue, 8 Jun 2021 12:27:48 -0700 Subject: [PATCH 2/4] Only change behavior for object columns --- python/cudf/cudf/core/series.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 6b85e6099ea..008053adcd6 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1326,8 +1326,10 @@ def _binaryop( if isinstance(other, cudf.DataFrame): return NotImplemented + # Ignore empty object columns when performing arithmetic operations if ( - self.isnull().all() + self.dtype == "object" + and self.isnull().all() and fill_value is None and fn in ("add", "sub", "mul", "mod", "pow", "truediv", "floordiv") From a2476f530683bb4adeb41ccb6f875e2ff2a390b3 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Tue, 20 Jul 2021 00:55:45 -0700 Subject: [PATCH 3/4] Refactor --- python/cudf/cudf/core/frame.py | 24 ++++++++++++++++ python/cudf/cudf/core/series.py | 10 ------- python/cudf/cudf/tests/test_binops.py | 41 ++++++++++++++++----------- 3 files changed, 49 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index de39e0f0631..67650a09f3b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3401,6 +3401,30 @@ def _colwise_binop( col, (left_column, right_column, reflect, fill_value), ) in operands.items(): + + # Handle object columns that are empty or + # all nulls when performing binary operations + if ( + left_column.dtype == "object" + and left_column.null_count == len(left_column) + and fill_value is None + ): + if fn in ( + "add", + "sub", + "mul", + "mod", + "pow", + "truediv", + "floordiv", + ): + output[col] = left_column + elif fn in ("eq", "lt", "le", "gt", "ge"): + output[col] = left_column.notnull() + elif fn == "ne": + output[col] = left_column.isnull() + continue + if right_column is cudf.NA: right_column = cudf.Scalar( right_column, dtype=left_column.dtype diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index efa61557a6a..4777664a9a9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1342,16 +1342,6 @@ def _binaryop( if isinstance(other, cudf.DataFrame): return NotImplemented - # Ignore empty object columns when performing arithmetic operations - if ( - self.dtype == "object" - and self.isnull().all() - and fill_value is None - and fn - in ("add", "sub", "mul", "mod", "pow", "truediv", "floordiv") - ): - return self - if isinstance(other, SingleColumnFrame): if ( # TODO: The can_reindex logic also needs to be applied for diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 74d36c2e9ca..8277b8e7b32 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -37,6 +37,15 @@ operator.pow, ] +_binops_compare = [ + operator.eq, + operator.ne, + operator.lt, + operator.le, + operator.gt, + operator.ge, +] + @pytest.mark.parametrize("obj_class", ["Series", "Index"]) @pytest.mark.parametrize("binop", _binops) @@ -2867,22 +2876,6 @@ def test_null_equals_columnops(lcol, rcol, ans, case): assert lcol._null_equals(rcol).all() == ans -@pytest.mark.parametrize("binop", _binops) -@pytest.mark.parametrize("data", [None, [-9, 7], [5, -2], [12, 18]]) -@pytest.mark.parametrize("scalar", [1, 3, 12, np.nan]) -def test_empty_column(binop, data, scalar): - gdf = cudf.DataFrame(columns=["a", "b"]) - if data is not None: - gdf["a"] = data - - pdf = gdf.to_pandas() - - got = binop(gdf, scalar) - expected = binop(pdf, scalar) - - utils.assert_eq(expected, got) - - def test_add_series_to_dataframe(): """Verify that missing columns result in NaNs, not NULLs.""" assert cp.all( @@ -2904,3 +2897,19 @@ def test_binops_non_cudf_types(obj_class, binop, other_type): lhs = obj_class(data) rhs = other_type(data) assert cp.all((binop(lhs, rhs) == binop(lhs, lhs)).values) + + +@pytest.mark.parametrize("binop", _binops + _binops_compare) +@pytest.mark.parametrize("data", [None, [-9, 7], [5, -2], [12, 18]]) +@pytest.mark.parametrize("scalar", [1, 3, 12, np.nan]) +def test_empty_column(binop, data, scalar): + gdf = cudf.DataFrame(columns=["a", "b"]) + if data is not None: + gdf["a"] = data + + pdf = gdf.to_pandas() + + got = binop(gdf, scalar) + expected = binop(pdf, scalar) + + utils.assert_eq(expected, got) From 3e59d12dc46c9ab3a228279495f0cd46bc000d85 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Tue, 20 Jul 2021 00:58:32 -0700 Subject: [PATCH 4/4] Fix merge --- python/cudf/cudf/core/series.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4777664a9a9..565b72bd087 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1339,9 +1339,6 @@ def _binaryop( *args, **kwargs, ): - if isinstance(other, cudf.DataFrame): - return NotImplemented - if isinstance(other, SingleColumnFrame): if ( # TODO: The can_reindex logic also needs to be applied for