diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 11dd7556812..de2df9b50d7 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -75,10 +75,6 @@
     is_space as cpp_isspace,
     is_upper as cpp_is_upper,
 )
-from cudf._lib.strings.convert.convert_integers import (
-    is_integer as cpp_is_integer,
-)
-from cudf._lib.strings.convert.convert_floats import is_float as cpp_is_float
 from cudf._lib.strings.combine import (
     concatenate as cpp_concatenate,
     join as cpp_join,
@@ -91,6 +87,10 @@
 from cudf._lib.strings.convert.convert_fixed_point import (
     to_decimal as cpp_to_decimal,
 )
+from cudf._lib.strings.convert.convert_floats import is_float as cpp_is_float
+from cudf._lib.strings.convert.convert_integers import (
+    is_integer as cpp_is_integer,
+)
 from cudf._lib.strings.convert.convert_urls import (
     url_decode as cpp_url_decode,
     url_encode as cpp_url_encode,
@@ -4760,10 +4760,7 @@ def base_size(self) -> int:
         if len(self.base_children) == 0:
             return 0
         else:
-            return int(
-                (self.base_children[0].size - 1)
-                / self.base_children[0].dtype.itemsize
-            )
+            return self.base_children[0].size - 1
 
     @property
     def data_array_view(self) -> cuda.devicearray.DeviceNDArray:
diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py
index 656b66bf793..0e9c61b634d 100644
--- a/python/cudf/cudf/tests/test_serialize.py
+++ b/python/cudf/cudf/tests/test_serialize.py
@@ -296,3 +296,24 @@ def test_deserialize_cudf_0_16(datadir):
     actual = pickle.load(open(fname, "rb"))
 
     assert_eq(expected, actual)
+
+
+def test_serialize_sliced_string():
+    # https://github.com/rapidsai/cudf/issues/7735
+    data = ["hi", "hello", None]
+    pd_series = pd.Series(data, dtype=pd.StringDtype())
+    gd_series = cudf.Series(data, dtype="str")
+    sliced = gd_series[0:3]
+    serialized_gd_series = gd_series.serialize()
+    serialized_sliced = sliced.serialize()
+
+    # validate frames are equal or not
+    # because both should be identical
+    for i in range(3):
+        assert_eq(
+            serialized_gd_series[1][i].to_host_array(),
+            serialized_sliced[1][i].to_host_array(),
+        )
+
+    recreated = cudf.Series.deserialize(*sliced.serialize())
+    assert_eq(recreated.to_pandas(nullable=True), pd_series)
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 98b8bfb870d..8b1ad696f04 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -2922,3 +2922,14 @@ def test_string_std():
     assert_exceptions_equal(
         lfunc=psr.std, rfunc=sr.std, compare_error_message=False
     )
+
+
+def test_string_slice_with_mask():
+    actual = cudf.Series(["hi", "hello", None])
+    expected = actual[0:3]
+
+    assert actual._column.base_size == 3
+    assert_eq(actual._column.base_size, expected._column.base_size)
+    assert_eq(actual._column.null_count, expected._column.null_count)
+
+    assert_eq(actual, expected)
diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py
index cb3c696adc3..85354704902 100644
--- a/python/dask_cudf/dask_cudf/tests/test_distributed.py
+++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py
@@ -6,11 +6,11 @@
 from dask.distributed import Client
 from distributed.utils_test import loop  # noqa: F401
 
-import dask_cudf
-
 import cudf
 from cudf.tests.utils import assert_eq
 
+import dask_cudf
+
 dask_cuda = pytest.importorskip("dask_cuda")
 
 
@@ -65,3 +65,13 @@ def test_ucx_seriesgroupby():
             dask_df_g = dask_df.groupby(["a"]).b.sum().compute()
 
             assert dask_df_g.name == "b"
+
+
+def test_str_series_roundtrip():
+    with dask_cuda.LocalCUDACluster(n_workers=1) as cluster:
+        with Client(cluster):
+            expected = cudf.Series(["hi", "hello", None])
+            dask_series = dask_cudf.from_cudf(expected, npartitions=2)
+
+            actual = dask_series.compute()
+            assert_eq(actual, expected)