From 11b875bbd97053c29e7bbd4e9d2d1e528f9f221b Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 4 Nov 2022 23:57:07 +0000 Subject: [PATCH] Fix type casting in Series.__setitem__ (#11904) To mimic pandas, we must upcast a column to the numpy result_type of the column itself and the input value dtype. This previously occurred in all relevant cases except when the index provided to __setitem__ was a single integer (originally introduced in #2442). Closes #11901. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/11904 --- python/cudf/cudf/core/column/column.py | 2 ++ python/cudf/cudf/core/scalar.py | 2 ++ python/cudf/cudf/core/series.py | 18 ++++++----- python/cudf/cudf/tests/test_setitem.py | 45 ++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 22f8d27f9e8..6c17b492f8a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -847,6 +847,8 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: raise NotImplementedError() def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: + if self.dtype == dtype: + return self if is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index e05e8662fe4..e516177ad29 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -392,4 +392,6 @@ def _dispatch_scalar_unaop(self, op): return getattr(self.value, op)() def astype(self, dtype): + if self.dtype == dtype: + return self return Scalar(self.value, dtype) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f54f4b385e6..8c30ae258db 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -35,6 +35,7 @@ is_integer_dtype, is_list_dtype, is_scalar, + is_string_dtype, is_struct_dtype, ) from cudf.core.abc import Serializable @@ -214,19 +215,20 @@ def __setitem__(self, key, value): value = column.as_column(value) if ( - not isinstance( - self._frame._column.dtype, - (cudf.core.dtypes.DecimalDtype, cudf.CategoricalDtype), + ( + _is_non_decimal_numeric_dtype(self._frame._column.dtype) + or is_string_dtype(self._frame._column.dtype) ) and hasattr(value, "dtype") and _is_non_decimal_numeric_dtype(value.dtype) ): # normalize types if necessary: - if not is_integer(key): - to_dtype = np.result_type( - value.dtype, self._frame._column.dtype - ) - value = value.astype(to_dtype) + # In contrast to Column.__setitem__ (which downcasts the value to + # the dtype of the column) here we upcast the series to the + # larger data type mimicking pandas + to_dtype = np.result_type(value.dtype, self._frame._column.dtype) + value = value.astype(to_dtype) + if to_dtype != self._frame._column.dtype: self._frame._column._mimic_inplace( self._frame._column.astype(to_dtype), inplace=True ) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 13b342e6c3b..ac9dbecda65 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -297,3 +297,48 @@ def test_series_slice_setitem_struct(): actual[0:3] = cudf.Scalar({"a": {"b": 5050}, "b": 101}) assert_eq(actual, expected) + + +@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) +@pytest.mark.parametrize("indices", [0, [1, 2]]) +def test_series_setitem_upcasting(dtype, indices): + sr = pd.Series([0, 0, 0], dtype=dtype) + cr = cudf.from_pandas(sr) + assert_eq(sr, cr) + # Must be a non-integral floating point value that can't be losslessly + # converted to float32, otherwise pandas will try and match the source + # column dtype. + new_value = np.float64(np.pi) + col_ref = cr._column + sr[indices] = new_value + cr[indices] = new_value + if PANDAS_GE_150: + assert_eq(sr, cr) + else: + # pandas bug, incorrectly fails to upcast from float32 to float64 + assert_eq(sr.values, cr.values) + if dtype == np.float64: + # no-op type cast should not modify backing column + assert col_ref == cr._column + + +# TODO: these two tests could perhaps be changed once specifics of +# pandas compat wrt upcasting are decided on; this is just baking in +# status-quo. +def test_series_setitem_upcasting_string_column(): + sr = pd.Series([0, 0, 0], dtype=str) + cr = cudf.from_pandas(sr) + new_value = np.float64(10.5) + sr[0] = str(new_value) + cr[0] = new_value + assert_eq(sr, cr) + + +def test_series_setitem_upcasting_string_value(): + sr = cudf.Series([0, 0, 0], dtype=int) + # This is a distinction with pandas, which lets you instead make an + # object column with ["10", 0, 0] + sr[0] = "10" + assert_eq(pd.Series([10, 0, 0], dtype=int), sr) + with pytest.raises(ValueError): + sr[0] = "non-integer"