From 11b875bbd97053c29e7bbd4e9d2d1e528f9f221b Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <wence@gmx.li>
Date: Fri, 4 Nov 2022 23:57:07 +0000
Subject: [PATCH] Fix type casting in Series.__setitem__ (#11904)

To mimic pandas, we must upcast a column to the numpy result_type of the column itself and the input value dtype. This previously occurred in all relevant cases except when the index provided to __setitem__ was a single integer (originally introduced in #2442). Closes #11901.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/11904
---
 python/cudf/cudf/core/column/column.py |  2 ++
 python/cudf/cudf/core/scalar.py        |  2 ++
 python/cudf/cudf/core/series.py        | 18 ++++++-----
 python/cudf/cudf/tests/test_setitem.py | 45 ++++++++++++++++++++++++++
 4 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 22f8d27f9e8..6c17b492f8a 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -847,6 +847,8 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool:
         raise NotImplementedError()
 
     def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
+        if self.dtype == dtype:
+            return self
         if is_categorical_dtype(dtype):
             return self.as_categorical_column(dtype, **kwargs)
 
diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py
index e05e8662fe4..e516177ad29 100644
--- a/python/cudf/cudf/core/scalar.py
+++ b/python/cudf/cudf/core/scalar.py
@@ -392,4 +392,6 @@ def _dispatch_scalar_unaop(self, op):
         return getattr(self.value, op)()
 
     def astype(self, dtype):
+        if self.dtype == dtype:
+            return self
         return Scalar(self.value, dtype)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index f54f4b385e6..8c30ae258db 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -35,6 +35,7 @@
     is_integer_dtype,
     is_list_dtype,
     is_scalar,
+    is_string_dtype,
     is_struct_dtype,
 )
 from cudf.core.abc import Serializable
@@ -214,19 +215,20 @@ def __setitem__(self, key, value):
             value = column.as_column(value)
 
         if (
-            not isinstance(
-                self._frame._column.dtype,
-                (cudf.core.dtypes.DecimalDtype, cudf.CategoricalDtype),
+            (
+                _is_non_decimal_numeric_dtype(self._frame._column.dtype)
+                or is_string_dtype(self._frame._column.dtype)
             )
             and hasattr(value, "dtype")
             and _is_non_decimal_numeric_dtype(value.dtype)
         ):
             # normalize types if necessary:
-            if not is_integer(key):
-                to_dtype = np.result_type(
-                    value.dtype, self._frame._column.dtype
-                )
-                value = value.astype(to_dtype)
+            # In contrast to Column.__setitem__ (which downcasts the value to
+            # the dtype of the column) here we upcast the series to the
+            # larger data type mimicking pandas
+            to_dtype = np.result_type(value.dtype, self._frame._column.dtype)
+            value = value.astype(to_dtype)
+            if to_dtype != self._frame._column.dtype:
                 self._frame._column._mimic_inplace(
                     self._frame._column.astype(to_dtype), inplace=True
                 )
diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py
index 13b342e6c3b..ac9dbecda65 100644
--- a/python/cudf/cudf/tests/test_setitem.py
+++ b/python/cudf/cudf/tests/test_setitem.py
@@ -297,3 +297,48 @@ def test_series_slice_setitem_struct():
     actual[0:3] = cudf.Scalar({"a": {"b": 5050}, "b": 101})
 
     assert_eq(actual, expected)
+
+
+@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
+@pytest.mark.parametrize("indices", [0, [1, 2]])
+def test_series_setitem_upcasting(dtype, indices):
+    sr = pd.Series([0, 0, 0], dtype=dtype)
+    cr = cudf.from_pandas(sr)
+    assert_eq(sr, cr)
+    # Must be a non-integral floating point value that can't be losslessly
+    # converted to float32, otherwise pandas will try and match the source
+    # column dtype.
+    new_value = np.float64(np.pi)
+    col_ref = cr._column
+    sr[indices] = new_value
+    cr[indices] = new_value
+    if PANDAS_GE_150:
+        assert_eq(sr, cr)
+    else:
+        # pandas bug, incorrectly fails to upcast from float32 to float64
+        assert_eq(sr.values, cr.values)
+    if dtype == np.float64:
+        # no-op type cast should not modify backing column
+        assert col_ref == cr._column
+
+
+# TODO: these two tests could perhaps be changed once specifics of
+# pandas compat wrt upcasting are decided on; this is just baking in
+# status-quo.
+def test_series_setitem_upcasting_string_column():
+    sr = pd.Series([0, 0, 0], dtype=str)
+    cr = cudf.from_pandas(sr)
+    new_value = np.float64(10.5)
+    sr[0] = str(new_value)
+    cr[0] = new_value
+    assert_eq(sr, cr)
+
+
+def test_series_setitem_upcasting_string_value():
+    sr = cudf.Series([0, 0, 0], dtype=int)
+    # This is a distinction with pandas, which lets you instead make an
+    # object column with ["10", 0, 0]
+    sr[0] = "10"
+    assert_eq(pd.Series([10, 0, 0], dtype=int), sr)
+    with pytest.raises(ValueError):
+        sr[0] = "non-integer"