Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Series comparison vs scalars #12519

Merged
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
# Copyright (c) 2019-2023, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -261,6 +261,11 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike:
return cudf.Scalar(None, dtype=other.dtype)

return cudf.Scalar(other)
elif isinstance(other, str):
try:
return cudf.Scalar(other, dtype=self.dtype)
except ValueError:
pass

return NotImplemented

Expand Down
20 changes: 19 additions & 1 deletion python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5665,7 +5665,7 @@ def normalize_binop_value(
and other.dtype == "object"
):
return other
if isinstance(other, str):
if is_scalar(other):
return cudf.Scalar(other)
return NotImplemented

Expand Down Expand Up @@ -5701,6 +5701,24 @@ def _binaryop(
return NotImplemented

if isinstance(other, (StringColumn, str, cudf.Scalar)):
if isinstance(other, cudf.Scalar) and other.dtype != "O":
if op in {
"__eq__",
"__lt__",
"__le__",
"__gt__",
"__ge__",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems wrong. Both Python and pandas raise TypeError for ordering comparisons between str and non-str values:

In [5]: s = pd.Series(["1", "2", "3"])

In [6]: s < 1
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[6], line 1
----> 1 s < 1

File ~/compose/etc/conda/cuda_11.8/envs/rapids/lib/python3.8/site-packages/pandas/core/ops/common.py:72, in _unpack_zerodim_and_defer.<locals>.new_method(self, other)
     68             return NotImplemented
     70 other = item_from_zerodim(other)
---> 72 return method(self, other)

File ~/compose/etc/conda/cuda_11.8/envs/rapids/lib/python3.8/site-packages/pandas/core/arraylike.py:50, in OpsMixin.__lt__(self, other)
     48 @unpack_zerodim_and_defer("__lt__")
     49 def __lt__(self, other):
---> 50     return self._cmp_method(other, operator.lt)

File ~/compose/etc/conda/cuda_11.8/envs/rapids/lib/python3.8/site-packages/pandas/core/series.py:6243, in Series._cmp_method(self, other, op)
   6240 rvalues = extract_array(other, extract_numpy=True, extract_range=True)
   6242 with np.errstate(all="ignore"):
-> 6243     res_values = ops.comparison_op(lvalues, rvalues, op)
   6245 return self._construct_result(res_values, name=res_name)

File ~/compose/etc/conda/cuda_11.8/envs/rapids/lib/python3.8/site-packages/pandas/core/ops/array_ops.py:287, in comparison_op(left, right, op)
    284     return invalid_comparison(lvalues, rvalues, op)
    286 elif is_object_dtype(lvalues.dtype) or isinstance(rvalues, str):
--> 287     res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
    289 else:
    290     res_values = _na_arithmetic_op(lvalues, rvalues, op, is_cmp=True)

File ~/compose/etc/conda/cuda_11.8/envs/rapids/lib/python3.8/site-packages/pandas/core/ops/array_ops.py:75, in comp_method_OBJECT_ARRAY(op, x, y)
     73     result = libops.vec_compare(x.ravel(), y.ravel(), op)
     74 else:
---> 75     result = libops.scalar_compare(x.ravel(), y, op)
     76 return result.reshape(x.shape)

File ~/compose/etc/conda/cuda_11.8/envs/rapids/lib/python3.8/site-packages/pandas/_libs/ops.pyx:107, in pandas._libs.ops.scalar_compare()

TypeError: '<' not supported between instances of 'str' and 'int'

I think that is achievable by returning NotImplemented instead?

"__ne__",
}:
val = False
if op == "__ne__":
val = True
return column.full(len(self), val, dtype="bool").set_mask(
self.mask
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
val = False
if op == "__ne__":
val = True
return column.full(len(self), val, dtype="bool").set_mask(
self.mask
)
return column.full(len(self), op == "__ne__", dtype="bool").set_mask(
self.mask
)

else:
return NotImplemented

if op == "__add__":
if isinstance(other, cudf.Scalar):
other = cast(
Expand Down
36 changes: 29 additions & 7 deletions python/cudf/cudf/tests/test_binops.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2022, NVIDIA CORPORATION.
# Copyright (c) 2018-2023, NVIDIA CORPORATION.

import decimal
import operator
Expand Down Expand Up @@ -320,13 +320,35 @@ def test_series_compare_nulls(cmpop, dtypes):
utils.assert_eq(expect, got)


def string_series_compare_test_cases():
    """Build the ``(obj, cmp_obj, cmpop)`` triples for string comparisons.

    Two groups of cases are produced, in this order:

    1. String-vs-string operands (series vs series, series vs ``"a"`` and
       the reflected form), paired with every operator in ``_cmpops``.
    2. String-vs-non-string operands (``1``, ``1.5``, ``True`` and their
       reflected forms), paired only with ``operator.eq`` and
       ``operator.ne``.

    Returns
    -------
    list of tuple
        Each entry is ``(left_operand, right_operand, operator_function)``.
    """
    pd_sr = pd.Series(["a", "b", None, "d", "e", None], dtype="string")

    # Operand pairs that are exercised with every comparison operator.
    all_cmpop_cases = [
        (pd_sr, pd_sr),
        (pd_sr, "a"),
        ("a", pd_sr),
    ]
    # Operator-major ordering: all operand pairs for the first operator,
    # then all pairs for the next one, and so on.
    cases = [(*case, op) for op in _cmpops for case in all_cmpop_cases]

    # Operand pairs mixing a string series with a non-string scalar; these
    # are only combined with equality and inequality.
    eq_neq_cases = (
        (pd_sr, 1),
        (1, pd_sr),
        (pd_sr, 1.5),
        (1.5, pd_sr),
        (pd_sr, True),
        (True, pd_sr),
    )
    for case in eq_neq_cases:
        cases += [(*case, operator.eq), (*case, operator.ne)]

    return cases
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use a fixture to generate these cases, something like:

@pytest.fixture
def string_series_compare():
    return pd.Series(...)

@pytest.fixture(params=[False, True], ids=["normal", "reflected"])
def string_series_compare_reflect(request):
    return request.param

@pytest.fixture(params=[1, 1.5, True], ids=["int", "float", "bool"])
def string_series_compare_other(request):
    return request.param

@pytest.fixture(params=_cmpops)
def cmpop(request):
    return request.param

def test_string_series_compare(string_series_compare, string_series_compare_reflect, string_series_compare_other, cmpop):
    assert whatever

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you help me understand a bit more why this approach is better? I've worked it up locally a few different ways and it seems like this forces some kind of programmatic skipping inside the test body since we're not dealing with a cartesian product of the parameterization, whereas the current approach just generates the exact tests we need. Is it the test output that you're concerned about here? (I think this would show up as test_string_series_compare[data0] on the command line for instance as opposed to an expanded output listing the actual parameters)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're trying to move away from a setup where test collection creates heavyweight objects (e.g. the series here).

I see that you don't actually need the cartesian product. Does it work to separate the eq/neq cases from the others?

You can then have two test functions that handle the general case and then the eq/neq case taking different parameters. There's some discussion here https://docs.rapids.ai/api/cudf/nightly/developer_guide/testing.html

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it, thanks for the context. An attempt at 7172f04



@pytest.mark.parametrize(
"obj", [pd.Series(["a", "b", None, "d", "e", None], dtype="string"), "a"]
)
@pytest.mark.parametrize("cmpop", _cmpops)
@pytest.mark.parametrize(
"cmp_obj",
[pd.Series(["b", "a", None, "d", "f", None], dtype="string"), "a"],
"obj, cmp_obj, cmpop", string_series_compare_test_cases()
)
def test_string_series_compare(obj, cmpop, cmp_obj):

Expand Down
26 changes: 26 additions & 0 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@
expect_warning_if,
)

# Comparison functions from the ``operator`` module (ordering, equality and
# inequality) used to parametrize comparison tests in this module.
_cmpops = [
operator.lt,
operator.gt,
operator.le,
operator.ge,
operator.eq,
operator.ne,
]


def data1():
    """Return a date range named ``"times"`` for use as test data.

    The range starts at 2001-01-01, ends no later than 2002-02-15 and
    advances in 400-hour steps.
    """
    return pd.date_range("20010101", "20020215", freq="400h", name="times")
Expand Down Expand Up @@ -986,6 +995,23 @@ def test_datetime_series_ops_with_scalars(data, other_scalars, dtype, op):
)


@pytest.mark.parametrize("data", ["20110101", "20120101", "20130101"])
@pytest.mark.parametrize("other_scalars", ["20110101", "20120101", "20130101"])
@pytest.mark.parametrize("op", _cmpops)
@pytest.mark.parametrize(
    "dtype",
    ["datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]"],
)
def test_datetime_series_cmpops_with_scalars(data, other_scalars, dtype, op):
    """Compare a datetime series against a date string, cudf vs pandas.

    A cudf series is built from ``data`` with the given datetime ``dtype``
    and converted to pandas; ``op`` is then applied to each series with the
    string ``other_scalars`` on the right-hand side, and the two results
    are required to match.
    """
    gsr = cudf.Series(data=data, dtype=dtype)
    psr = gsr.to_pandas()

    # pandas supplies the expected result for the same comparison.
    expect = op(psr, other_scalars)
    got = op(gsr, other_scalars)

    assert_eq(expect, got)


@pytest.mark.parametrize(
"data",
[
Expand Down