diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 6c17b492f8a..59851a1c11b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -502,7 +502,8 @@ def _wrap_binop_normalization(self, other): if other is NA or other is None: return cudf.Scalar(other, dtype=self.dtype) if isinstance(other, np.ndarray) and other.ndim == 0: - other = other.item() + # Try and maintain the dtype + other = other.dtype.type(other.item()) return self.normalize_binop_value(other) def _scatter_by_slice( diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index f126f47c3c2..7943135afe1 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -35,7 +35,7 @@ is_number, is_scalar, ) -from cudf.core.buffer import Buffer, as_buffer, cuda_array_interface_wrapper +from cudf.core.buffer import Buffer, cuda_array_interface_wrapper from cudf.core.column import ( ColumnBase, as_column, @@ -225,10 +225,18 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: (tmp.dtype.type in int_float_dtype_mapping) and (tmp.dtype.type != np.bool_) and ( - (np.isscalar(tmp) and (0 == tmp)) - or ( - (isinstance(tmp, NumericalColumn)) and (0.0 in tmp) + ( + ( + np.isscalar(tmp) + or ( + isinstance(tmp, cudf.Scalar) + # host to device copy + and tmp.is_valid() + ) + ) + and (0 == tmp) ) + or ((isinstance(tmp, NumericalColumn)) and (0 in tmp)) ) ): out_dtype = cudf.dtype("float64") @@ -274,7 +282,7 @@ def nans_to_nulls(self: NumericalColumn) -> NumericalColumn: def normalize_binop_value( self, other: ScalarLike - ) -> Union[ColumnBase, ScalarLike]: + ) -> Union[ColumnBase, cudf.Scalar]: if isinstance(other, ColumnBase): if not isinstance(other, NumericalColumn): return NotImplemented @@ -285,25 +293,24 @@ def normalize_binop_value( # expensive device-host transfer just to # adjust the dtype other = other.value - 
other_dtype = np.min_scalar_type(other) - if other_dtype.kind in {"b", "i", "u", "f"}: - if isinstance(other, cudf.Scalar): - return other - other_dtype = np.promote_types(self.dtype, other_dtype) - if other_dtype == np.dtype("float16"): - other_dtype = cudf.dtype("float32") - other = other_dtype.type(other) + # Try to match pandas and hence numpy. Deduce the common + # dtype via the _value_ of other, and the dtype of self. TODO: + # When NEP50 is accepted, this might need to be changed or + # simplified. + # This is not at all simple: + # np.result_type(np.int64(0), np.uint8) + # => np.uint8 + # np.result_type(np.asarray([0], dtype=np.int64), np.uint8) + # => np.int64 + # np.promote_types(np.int64(0), np.uint8) + # => np.int64 + # np.promote_types(np.asarray([0], dtype=np.int64).dtype, np.uint8) + # => np.int64 + common_dtype = np.result_type(self.dtype, other) + if common_dtype.kind in {"b", "i", "u", "f"}: if self.dtype.kind == "b": - other_dtype = min_signed_type(other) - if np.isscalar(other): - return cudf.dtype(other_dtype).type(other) - else: - ary = full(len(self), other, dtype=other_dtype) - return column.build_column( - data=as_buffer(ary), - dtype=ary.dtype, - mask=self.mask, - ) + common_dtype = min_signed_type(other) + return cudf.Scalar(other, dtype=common_dtype) else: return NotImplemented diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 3dc923e7ded..901547d94a9 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -181,17 +181,17 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: out_dtype = determine_out_dtype(self.dtype, other.dtype) elif op in {"__truediv__", "__floordiv__"}: common_dtype = determine_out_dtype(self.dtype, other.dtype) - this = self.astype(common_dtype).astype("float64") + out_dtype = np.float64 if op == "__truediv__" else np.int64 + this = self.astype(common_dtype).astype(out_dtype) if isinstance(other, 
cudf.Scalar): if other.is_valid(): other = other.value.astype(common_dtype).astype( - "float64" + out_dtype ) else: - other = cudf.Scalar(None, "float64") + other = cudf.Scalar(None, out_dtype) else: - other = other.astype(common_dtype).astype("float64") - out_dtype = np.float64 if op == "__truediv__" else np.int64 + other = other.astype(common_dtype).astype(out_dtype) elif op in {"__add__", "__sub__"}: out_dtype = determine_out_dtype(self.dtype, other.dtype) elif other.dtype.kind in {"f", "i", "u"}: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 61971e3c749..e561dd0a214 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2072,7 +2072,10 @@ def microsecond(self): """ # noqa: E501 return as_index( ( - self._values.get_dt_field("millisecond") + # Need to manually promote column to int32 because + # pandas-matching binop behaviour would otherwise make + # this __mul__ return an int16 column. + self._values.get_dt_field("millisecond").astype("int32") * cudf.Scalar(1000, dtype="int32") ) + self._values.get_dt_field("microsecond"), diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 873bebf1292..8f4f6fe57d6 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3660,7 +3660,10 @@ def microsecond(self): """ return Series( data=( - self.series._column.get_dt_field("millisecond") + # Need to manually promote column to int32 because + # pandas-matching binop behaviour would otherwise make + # this __mul__ return an int16 column. 
+ self.series._column.get_dt_field("millisecond").astype("int32") * cudf.Scalar(1000, dtype="int32") ) + self.series._column.get_dt_field("microsecond"), diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 2229bcc1938..abbb7a3bd89 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -877,6 +877,67 @@ def test_binop_bool_uint(func, rhs): ) +@pytest.mark.parametrize( + "series_dtype", (np.bool_, np.int8, np.uint8, np.int64, np.uint64) +) +@pytest.mark.parametrize( + "divisor_dtype", + ( + pytest.param( + np.bool_, + marks=pytest.mark.xfail( + reason=( + "Pandas handling of division by zero-bool is too strange" + ) + ), + ), + np.int8, + np.uint8, + np.int64, + np.uint64, + ), +) +@pytest.mark.parametrize("scalar_divisor", [False, True]) +def test_floordiv_zero_float64(series_dtype, divisor_dtype, scalar_divisor): + sr = pd.Series([1, 2, 3], dtype=series_dtype) + cr = cudf.from_pandas(sr) + + if scalar_divisor: + pd_div = divisor_dtype(0) + cudf_div = cudf.Scalar(0, dtype=divisor_dtype) + else: + pd_div = pd.Series([0], dtype=divisor_dtype) + cudf_div = cudf.from_pandas(pd_div) + utils.assert_eq((sr // pd_div), (cr // cudf_div)) + + +@pytest.mark.parametrize( + "dtype", + ( + pytest.param( + np.bool_, + marks=pytest.mark.xfail( + reason=( + "Pandas handling of division by zero-bool is too strange" + ) + ), + ), + np.int8, + np.uint8, + np.int64, + np.uint64, + np.float32, + np.float64, + ), +) +def test_rmod_zero_nan(dtype): + sr = pd.Series([1, 1, 0], dtype=dtype) + cr = cudf.from_pandas(sr) + utils.assert_eq(1 % sr, 1 % cr) + expected_dtype = np.float64 if cr.dtype.kind != "f" else dtype + utils.assert_eq(1 % cr, cudf.Series([0, 0, None], dtype=expected_dtype)) + + def test_series_misc_binop(): pds = pd.Series([1, 2, 4], name="abc xyz") gds = cudf.Series([1, 2, 4], name="abc xyz") diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 
c902bcb8b47..2525f055738 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1995,6 +1995,12 @@ def test_set_bool_error(dtype, bool_scalar): ) +def test_int64_equality(): + s = cudf.Series(np.asarray([2**63 - 10, 2**63 - 100], dtype=np.int64)) + assert (s != np.int64(2**63 - 1)).all() + assert (s != cudf.Scalar(2**63 - 1, dtype=np.int64)).all() + + @pytest.mark.parametrize("into", [dict, OrderedDict, defaultdict(list)]) def test_series_to_dict(into): gs = cudf.Series(["ab", "de", "zx"], index=[10, 20, 100]) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 23270875a92..c1b603e34f2 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -400,12 +400,7 @@ def test_timedelta_dataframe_ops(df, op): [1], [12, 11, 232, 223432411, 2343241, 234324, 23234], [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - pytest.param( - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/5938" - ), - ), + [1.321, 1132.324, 23223231.11, 233.41, 332, 323], [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], ], ) @@ -492,6 +487,36 @@ def test_timedelta_series_ops_with_scalars(data, other_scalars, dtype, op): assert_eq(expected, actual) +@pytest.mark.parametrize( + "reverse", + [ + False, + pytest.param( + True, + marks=pytest.mark.xfail( + strict=True, + reason=( + "timedelta modulo by zero is dubiously defined in " + "both pandas and cuDF " + "(see https://github.com/rapidsai/cudf/issues/5938)" + ), + ), + ), + ], +) +def test_timedelta_series_mod_with_scalar_zero(reverse): + gsr = cudf.Series(data=[0.2434], dtype=np.timedelta64(1, "ns")) + psr = gsr.to_pandas() + scalar = datetime.timedelta(days=768) + if reverse: + expected = scalar % psr + actual = scalar % gsr + else: + expected = psr % scalar + actual = gsr % scalar + assert_eq(expected, actual) + + 
@pytest.mark.parametrize( "data", [ @@ -597,6 +622,37 @@ def test_timedelta_series_ops_with_cudf_scalars(data, cpu_scalar, dtype, op): assert_eq(expected, actual) +@pytest.mark.parametrize( + "reverse", + [ + False, + pytest.param( + True, + marks=pytest.mark.xfail( + strict=True, + reason=( + "timedelta modulo by zero is dubiously defined in " + "both pandas and cuDF " + "(see https://github.com/rapidsai/cudf/issues/5938)" + ), + ), + ), + ], +) +def test_timedelta_series_mod_with_cudf_scalar_zero(reverse): + gsr = cudf.Series(data=[0.2434], dtype=np.timedelta64(1, "ns")) + psr = gsr.to_pandas() + scalar = datetime.timedelta(days=768) + gpu_scalar = cudf.Scalar(scalar) + if reverse: + expected = scalar % psr + actual = gpu_scalar % gsr + else: + expected = psr % scalar + actual = gsr % gpu_scalar + assert_eq(expected, actual) + + @pytest.mark.parametrize( "data", [ @@ -812,7 +868,8 @@ def test_timedelta_datetime_index_ops_misc( pytest.param( "floordiv", marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/35529" + condition=not PANDAS_GE_120, + reason="https://github.com/pandas-dev/pandas/issues/35529", ), ), ], @@ -850,7 +907,35 @@ def test_timedelta_index_ops_with_scalars(data, other_scalars, dtype, op): expected = other_scalars // ptdi actual = other_scalars // gtdi - assert_eq(expected, actual) + if op == "floordiv": + # Hand-coding pytest.xfail behaviour for certain combinations + if ( + 0 in ptdi.astype("int") + and np.timedelta64(other_scalars).item() is not None + ): + with pytest.raises(AssertionError): + # Related to https://github.com/rapidsai/cudf/issues/5938 + # + # Division by zero for datetime or timedelta is + # dubiously defined in both pandas (Any // 0 -> 0 in + # pandas) and cuDF (undefined behaviour) + assert_eq(expected, actual) + elif ( + (None not in ptdi) + and np.nan not in expected + and ( + expected.astype("float64").astype("int64") + != expected.astype("int64") + ).any() + ): + with 
pytest.raises(AssertionError): + # Incorrect implementation of floordiv in cuDF: + # https://github.com/rapidsai/cudf/issues/12120 + assert_eq(expected, actual) + else: + assert_eq(expected, actual) + else: + assert_eq(expected, actual) @pytest.mark.parametrize("data", _TIMEDELTA_DATA_NON_OVERFLOW) @@ -876,12 +961,12 @@ def test_timedelta_index_ops_with_scalars(data, other_scalars, dtype, op): pytest.param( "floordiv", marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/5938" + condition=not PANDAS_GE_120, + reason="https://github.com/pandas-dev/pandas/issues/35529", ), ), ], ) -@pytest.mark.filterwarnings("ignore:divide by zero:RuntimeWarning:pandas") def test_timedelta_index_ops_with_cudf_scalars(data, cpu_scalar, dtype, op): gtdi = cudf.Index(data=data, dtype=dtype) ptdi = gtdi.to_pandas() @@ -916,7 +1001,35 @@ def test_timedelta_index_ops_with_cudf_scalars(data, cpu_scalar, dtype, op): expected = cpu_scalar // ptdi actual = gpu_scalar // gtdi - assert_eq(expected, actual) + if op == "floordiv": + # Hand-coding pytest.xfail behaviour for certain combinations + if ( + 0 in ptdi.astype("int") + and np.timedelta64(cpu_scalar).item() is not None + ): + with pytest.raises(AssertionError): + # Related to https://github.com/rapidsai/cudf/issues/5938 + # + # Division by zero for datetime or timedelta is + # dubiously defined in both pandas (Any // 0 -> 0 in + # pandas) and cuDF (undefined behaviour) + assert_eq(expected, actual) + elif ( + (None not in ptdi) + and np.nan not in expected + and ( + expected.astype("float64").astype("int64") + != expected.astype("int64") + ).any() + ): + with pytest.raises(AssertionError): + # Incorrect implementation of floordiv in cuDF: + # https://github.com/rapidsai/cudf/issues/12120 + assert_eq(expected, actual) + else: + assert_eq(expected, actual) + else: + assert_eq(expected, actual) @pytest.mark.parametrize("data", _TIMEDELTA_DATA)