Skip to content

Commit

Permalink
Fix type casting in Series.__setitem__ (#11904)
Browse files Browse the repository at this point in the history
To mimic pandas, we must upcast a column to the numpy result_type of the column itself and the input value dtype. This previously occurred in all relevant cases except when the index provided to __setitem__ was a single integer (originally introduced in #2442). Closes #11901.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

URL: #11904
  • Loading branch information
wence- authored Nov 4, 2022
1 parent 9df2eba commit 11b875b
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 8 deletions.
2 changes: 2 additions & 0 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -847,6 +847,8 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool:
raise NotImplementedError()

def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
if self.dtype == dtype:
return self
if is_categorical_dtype(dtype):
return self.as_categorical_column(dtype, **kwargs)

Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/core/scalar.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,4 +392,6 @@ def _dispatch_scalar_unaop(self, op):
return getattr(self.value, op)()

def astype(self, dtype):
if self.dtype == dtype:
return self
return Scalar(self.value, dtype)
18 changes: 10 additions & 8 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
is_integer_dtype,
is_list_dtype,
is_scalar,
is_string_dtype,
is_struct_dtype,
)
from cudf.core.abc import Serializable
Expand Down Expand Up @@ -214,19 +215,20 @@ def __setitem__(self, key, value):
value = column.as_column(value)

if (
not isinstance(
self._frame._column.dtype,
(cudf.core.dtypes.DecimalDtype, cudf.CategoricalDtype),
(
_is_non_decimal_numeric_dtype(self._frame._column.dtype)
or is_string_dtype(self._frame._column.dtype)
)
and hasattr(value, "dtype")
and _is_non_decimal_numeric_dtype(value.dtype)
):
# normalize types if necessary:
if not is_integer(key):
to_dtype = np.result_type(
value.dtype, self._frame._column.dtype
)
value = value.astype(to_dtype)
# In contrast to Column.__setitem__ (which downcasts the value to
# the dtype of the column) here we upcast the series to the
# larger data type mimicking pandas
to_dtype = np.result_type(value.dtype, self._frame._column.dtype)
value = value.astype(to_dtype)
if to_dtype != self._frame._column.dtype:
self._frame._column._mimic_inplace(
self._frame._column.astype(to_dtype), inplace=True
)
Expand Down
45 changes: 45 additions & 0 deletions python/cudf/cudf/tests/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,3 +297,48 @@ def test_series_slice_setitem_struct():
actual[0:3] = cudf.Scalar({"a": {"b": 5050}, "b": 101})

assert_eq(actual, expected)


@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
@pytest.mark.parametrize("indices", [0, [1, 2]])
def test_series_setitem_upcasting(dtype, indices):
sr = pd.Series([0, 0, 0], dtype=dtype)
cr = cudf.from_pandas(sr)
assert_eq(sr, cr)
# Must be a non-integral floating point value that can't be losslessly
# converted to float32, otherwise pandas will try and match the source
# column dtype.
new_value = np.float64(np.pi)
col_ref = cr._column
sr[indices] = new_value
cr[indices] = new_value
if PANDAS_GE_150:
assert_eq(sr, cr)
else:
# pandas bug, incorrectly fails to upcast from float32 to float64
assert_eq(sr.values, cr.values)
if dtype == np.float64:
# no-op type cast should not modify backing column
assert col_ref == cr._column


# TODO: these two tests could perhaps be changed once specifics of
# pandas compat wrt upcasting are decided on; this is just baking in
# status-quo.
def test_series_setitem_upcasting_string_column():
sr = pd.Series([0, 0, 0], dtype=str)
cr = cudf.from_pandas(sr)
new_value = np.float64(10.5)
sr[0] = str(new_value)
cr[0] = new_value
assert_eq(sr, cr)


def test_series_setitem_upcasting_string_value():
sr = cudf.Series([0, 0, 0], dtype=int)
# This is a distinction with pandas, which lets you instead make an
# object column with ["10", 0, 0]
sr[0] = "10"
assert_eq(pd.Series([10, 0, 0], dtype=int), sr)
with pytest.raises(ValueError):
sr[0] = "non-integer"

0 comments on commit 11b875b

Please sign in to comment.