diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 4d9ffc7cd81..dd82a9244b6 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -353,3 +353,18 @@ def test_scatter_by_slice_with_start_and_step(): target[1::2] = source ctarget[1::2] = csource assert_eq(target, ctarget) + + +@pytest.mark.parametrize("n", [1, 3]) +def test_setitem_str_trailing_null(n): + trailing_nulls = "\x00" * n + s = cudf.Series(["a", "b", "c" + trailing_nulls]) + assert s[2] == "c" + trailing_nulls + s[0] = "a" + trailing_nulls + assert s[0] == "a" + trailing_nulls + s[1] = trailing_nulls + assert s[1] == trailing_nulls + s[0] = "" + assert s[0] == "" + s[0] = "\x00" + assert s[0] == "\x00" diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index acf00b3a3d5..2484003bd38 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -260,6 +260,15 @@ def to_cudf_compatible_scalar(val, dtype=None): ) or cudf.api.types.is_string_dtype(dtype): dtype = "str" + if isinstance(val, str) and val.endswith("\x00"): + # Numpy string dtypes are fixed width and use NULL to + # indicate the end of the string, so they cannot + # distinguish between "abc\x00" and "abc". + # https://github.com/numpy/numpy/issues/20118 + # In this case, don't try going through numpy and just use + # the string value directly (cudf.DeviceScalar will DTRT) + return val + if isinstance(val, datetime.datetime): val = np.datetime64(val) elif isinstance(val, datetime.timedelta):