Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix type promotion edge cases in numerical binops #12074

Merged
merged 15 commits into from
Nov 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,8 @@ def _wrap_binop_normalization(self, other):
if other is NA or other is None:
return cudf.Scalar(other, dtype=self.dtype)
if isinstance(other, np.ndarray) and other.ndim == 0:
other = other.item()
# Try and maintain the dtype
other = other.dtype.type(other.item())
wence- marked this conversation as resolved.
Show resolved Hide resolved
return self.normalize_binop_value(other)

def _scatter_by_slice(
Expand Down
53 changes: 30 additions & 23 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
is_number,
is_scalar,
)
from cudf.core.buffer import Buffer, as_buffer, cuda_array_interface_wrapper
from cudf.core.buffer import Buffer, cuda_array_interface_wrapper
from cudf.core.column import (
ColumnBase,
as_column,
Expand Down Expand Up @@ -225,10 +225,18 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
(tmp.dtype.type in int_float_dtype_mapping)
and (tmp.dtype.type != np.bool_)
and (
(np.isscalar(tmp) and (0 == tmp))
or (
(isinstance(tmp, NumericalColumn)) and (0.0 in tmp)
(
(
np.isscalar(tmp)
or (
isinstance(tmp, cudf.Scalar)
# host to device copy
and tmp.is_valid()
)
)
and (0 == tmp)
)
or ((isinstance(tmp, NumericalColumn)) and (0 in tmp))
wence- marked this conversation as resolved.
Show resolved Hide resolved
)
):
out_dtype = cudf.dtype("float64")
Expand Down Expand Up @@ -274,7 +282,7 @@ def nans_to_nulls(self: NumericalColumn) -> NumericalColumn:

def normalize_binop_value(
self, other: ScalarLike
) -> Union[ColumnBase, ScalarLike]:
) -> Union[ColumnBase, cudf.Scalar]:
if isinstance(other, ColumnBase):
if not isinstance(other, NumericalColumn):
return NotImplemented
Expand All @@ -285,25 +293,24 @@ def normalize_binop_value(
# expensive device-host transfer just to
# adjust the dtype
other = other.value
other_dtype = np.min_scalar_type(other)
if other_dtype.kind in {"b", "i", "u", "f"}:
if isinstance(other, cudf.Scalar):
return other
other_dtype = np.promote_types(self.dtype, other_dtype)
if other_dtype == np.dtype("float16"):
other_dtype = cudf.dtype("float32")
other = other_dtype.type(other)
# Try and match pandas and hence numpy. Deduce the common
# dtype via the _value_ of other, and the dtype of self. TODO:
# When NEP50 is accepted, this might want changed or
# simplified.
# This is not at all simple:
# np.result_type(np.int64(0), np.uint8)
# => np.uint8
# np.result_type(np.asarray([0], dtype=np.int64), np.uint8)
# => np.int64
# np.promote_types(np.int64(0), np.uint8)
# => np.int64
# np.promote_types(np.asarray([0], dtype=np.int64).dtype, np.uint8)
# => np.int64
common_dtype = np.result_type(self.dtype, other)
wence- marked this conversation as resolved.
Show resolved Hide resolved
if common_dtype.kind in {"b", "i", "u", "f"}:
if self.dtype.kind == "b":
other_dtype = min_signed_type(other)
if np.isscalar(other):
return cudf.dtype(other_dtype).type(other)
else:
ary = full(len(self), other, dtype=other_dtype)
return column.build_column(
data=as_buffer(ary),
dtype=ary.dtype,
mask=self.mask,
)
wence- marked this conversation as resolved.
Show resolved Hide resolved
common_dtype = min_signed_type(other)
wence- marked this conversation as resolved.
Show resolved Hide resolved
return cudf.Scalar(other, dtype=common_dtype)
else:
return NotImplemented

Expand Down
10 changes: 5 additions & 5 deletions python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,17 +181,17 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
out_dtype = determine_out_dtype(self.dtype, other.dtype)
elif op in {"__truediv__", "__floordiv__"}:
common_dtype = determine_out_dtype(self.dtype, other.dtype)
this = self.astype(common_dtype).astype("float64")
out_dtype = np.float64 if op == "__truediv__" else np.int64
this = self.astype(common_dtype).astype(out_dtype)
if isinstance(other, cudf.Scalar):
if other.is_valid():
other = other.value.astype(common_dtype).astype(
"float64"
out_dtype
)
else:
other = cudf.Scalar(None, "float64")
other = cudf.Scalar(None, out_dtype)
else:
other = other.astype(common_dtype).astype("float64")
out_dtype = np.float64 if op == "__truediv__" else np.int64
other = other.astype(common_dtype).astype(out_dtype)
elif op in {"__add__", "__sub__"}:
out_dtype = determine_out_dtype(self.dtype, other.dtype)
elif other.dtype.kind in {"f", "i", "u"}:
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2072,7 +2072,10 @@ def microsecond(self):
""" # noqa: E501
return as_index(
(
self._values.get_dt_field("millisecond")
# Need to manually promote column to int32 because
# pandas-matching binop behaviour requires that this
# __mul__ returns an int16 column.
self._values.get_dt_field("millisecond").astype("int32")
* cudf.Scalar(1000, dtype="int32")
)
+ self._values.get_dt_field("microsecond"),
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3660,7 +3660,10 @@ def microsecond(self):
"""
return Series(
data=(
self.series._column.get_dt_field("millisecond")
# Need to manually promote column to int32 because
# pandas-matching binop behaviour requires that this
# __mul__ returns an int16 column.
self.series._column.get_dt_field("millisecond").astype("int32")
wence- marked this conversation as resolved.
Show resolved Hide resolved
* cudf.Scalar(1000, dtype="int32")
)
+ self.series._column.get_dt_field("microsecond"),
Expand Down
61 changes: 61 additions & 0 deletions python/cudf/cudf/tests/test_binops.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,6 +877,67 @@ def test_binop_bool_uint(func, rhs):
)


@pytest.mark.parametrize(
"series_dtype", (np.bool_, np.int8, np.uint8, np.int64, np.uint64)
)
@pytest.mark.parametrize(
"divisor_dtype",
(
pytest.param(
np.bool_,
marks=pytest.mark.xfail(
reason=(
"Pandas handling of division by zero-bool is too strange"
wence- marked this conversation as resolved.
Show resolved Hide resolved
)
),
),
np.int8,
np.uint8,
np.int64,
np.uint64,
),
)
@pytest.mark.parametrize("scalar_divisor", [False, True])
def test_floordiv_zero_float64(series_dtype, divisor_dtype, scalar_divisor):
sr = pd.Series([1, 2, 3], dtype=series_dtype)
cr = cudf.from_pandas(sr)

if scalar_divisor:
pd_div = divisor_dtype(0)
cudf_div = cudf.Scalar(0, dtype=divisor_dtype)
else:
pd_div = pd.Series([0], dtype=divisor_dtype)
cudf_div = cudf.from_pandas(pd_div)
utils.assert_eq((sr // pd_div), (cr // cudf_div))


@pytest.mark.parametrize(
"dtype",
(
pytest.param(
np.bool_,
marks=pytest.mark.xfail(
reason=(
"Pandas handling of division by zero-bool is too strange"
wence- marked this conversation as resolved.
Show resolved Hide resolved
)
),
),
np.int8,
np.uint8,
np.int64,
np.uint64,
np.float32,
np.float64,
),
)
def test_rmod_zero_nan(dtype):
sr = pd.Series([1, 1, 0], dtype=dtype)
cr = cudf.from_pandas(sr)
utils.assert_eq(1 % sr, 1 % cr)
expected_dtype = np.float64 if cr.dtype.kind != "f" else dtype
utils.assert_eq(1 % cr, cudf.Series([0, 0, None], dtype=expected_dtype))


def test_series_misc_binop():
pds = pd.Series([1, 2, 4], name="abc xyz")
gds = cudf.Series([1, 2, 4], name="abc xyz")
Expand Down
6 changes: 6 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1995,6 +1995,12 @@ def test_set_bool_error(dtype, bool_scalar):
)


def test_int64_equality():
s = cudf.Series(np.asarray([2**63 - 10, 2**63 - 100], dtype=np.int64))
assert (s != np.int64(2**63 - 1)).all()
assert (s != cudf.Scalar(2**63 - 1, dtype=np.int64)).all()


@pytest.mark.parametrize("into", [dict, OrderedDict, defaultdict(list)])
def test_series_to_dict(into):
gs = cudf.Series(["ab", "de", "zx"], index=[10, 20, 100])
Expand Down
Loading