diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e59b395ec0f..0922761eeb9 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1510,7 +1510,7 @@ def column_empty( dtype="int32", ), ) - elif dtype.kind in "OU": + elif dtype.kind in "OU" and not is_decimal_dtype(dtype): data = None children = ( full(row_count + 1, 0, dtype="int32"), @@ -1641,6 +1641,7 @@ def build_column( return cudf.core.column.DecimalColumn( data=data, size=size, + offset=offset, dtype=dtype, mask=mask, null_count=null_count, diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 96e09a5abb5..131b1729de9 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -35,6 +35,7 @@ def from_arrow(cls, data: pa.Array): data=Buffer(data_64.view("uint8")), size=len(data), dtype=dtype, + offset=data.offset, mask=mask, ) @@ -56,6 +57,7 @@ def to_arrow(self): ) return pa.Array.from_buffers( type=self.dtype.to_arrow(), + offset=self._offset, length=self.size, buffers=[mask_buf, data_buf], ) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 9509cabc117..51745c9bd03 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -54,7 +54,7 @@ def test_column_offset_and_size(pandas_input, offset, size): if cudf.utils.dtypes.is_categorical_dtype(col.dtype): assert col.size == col.codes.size assert col.size == (col.codes.data.size / col.codes.dtype.itemsize) - elif pd.api.types.is_string_dtype(col.dtype): + elif cudf.utils.dtypes.is_string_dtype(col.dtype): if col.size > 0: assert col.size == (col.children[0].size - 1) assert col.size == ( @@ -79,6 +79,49 @@ def test_column_offset_and_size(pandas_input, offset, size): assert_eq(expect, got) +def column_slicing_test(col, offset, size, cast_to_float=False): + sl = slice(offset, offset + size) + col_slice = col[sl] + series = cudf.Series(col) + sliced_series = cudf.Series(col_slice) + + if cast_to_float: + pd_series = series.astype(float).to_pandas() + sliced_series = sliced_series.astype(float) + else: + pd_series = series.to_pandas() + + if cudf.utils.dtypes.is_categorical_dtype(col.dtype): + # The cudf.Series is constructed from an already sliced column, whereas + # the pandas.Series is constructed from the unsliced series and then + # sliced, so the indexes should be different and we must ignore it. + # However, we must compare these as frames, not raw arrays, because + # numpy comparison of categorical values won't work. + assert_eq( + pd_series[sl].reset_index(drop=True), + sliced_series.reset_index(drop=True), + ) + else: + assert_eq(np.asarray(pd_series[sl]), sliced_series.to_array()) + + +@pytest.mark.parametrize("offset", [0, 1, 15]) +@pytest.mark.parametrize("size", [50, 10, 0]) +def test_column_slicing(pandas_input, offset, size): + col = cudf.core.column.as_column(pandas_input) + column_slicing_test(col, offset, size) + + +@pytest.mark.parametrize("offset", [0, 1, 15]) +@pytest.mark.parametrize("size", [50, 10, 0]) +@pytest.mark.parametrize("precision", [2, 3, 5]) +@pytest.mark.parametrize("scale", [0, 1, 2]) +def test_decimal_column_slicing(offset, size, precision, scale): + col = cudf.core.column.as_column(pd.Series(np.random.rand(1000))) + col = col.astype(cudf.Decimal64Dtype(precision, scale)) + column_slicing_test(col, offset, size, True) + + @pytest.mark.parametrize( "data", [