From 93050d46fd34a6335849fd240ed1439a788e85d4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 1 Apr 2021 09:06:08 -0700 Subject: [PATCH] Fix slicing and arrow representations of decimal columns (#7755) Slices of decimal columns were not passing the appropriate offset to the new column views being created. Additionally, the conversion of decimal columns to and from arrow did not include the offset. This made all slices of decimal columns appear invalid. Thanks to @shwina for helping me trace these bugs back to their source! **Edit** This PR also now includes an additional fix for the behavior of slices of decimal columns that should generate empty data sets (e.g. starting a slice past the end of a column). Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/7755 --- python/cudf/cudf/core/column/column.py | 3 +- python/cudf/cudf/core/column/decimal.py | 2 ++ python/cudf/cudf/tests/test_column.py | 45 ++++++++++++++++++++++++- 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 8531ec18edc..c9d110d7e3a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1512,7 +1512,7 @@ def column_empty( dtype="int32", ), ) - elif dtype.kind in "OU": + elif dtype.kind in "OU" and not is_decimal_dtype(dtype): data = None children = ( full(row_count + 1, 0, dtype="int32"), @@ -1643,6 +1643,7 @@ def build_column( return cudf.core.column.DecimalColumn( data=data, size=size, + offset=offset, dtype=dtype, mask=mask, null_count=null_count, diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index e93c5824817..ddcb5859bbc 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -37,6 +37,7 @@ def from_arrow(cls, data: pa.Array): data=Buffer(data_64.view("uint8")), size=len(data), dtype=dtype, + offset=data.offset, mask=mask, ) @@ -58,6 +59,7 @@ def to_arrow(self): ) return pa.Array.from_buffers( type=self.dtype.to_arrow(), + offset=self._offset, length=self.size, buffers=[mask_buf, data_buf], ) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 9509cabc117..51745c9bd03 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -54,7 +54,7 @@ def test_column_offset_and_size(pandas_input, offset, size): if cudf.utils.dtypes.is_categorical_dtype(col.dtype): assert col.size == col.codes.size assert col.size == (col.codes.data.size / col.codes.dtype.itemsize) - elif pd.api.types.is_string_dtype(col.dtype): + elif cudf.utils.dtypes.is_string_dtype(col.dtype): if col.size > 0: assert col.size == (col.children[0].size - 1) assert col.size == ( @@ -79,6 +79,49 @@ def test_column_offset_and_size(pandas_input, offset, size): assert_eq(expect, got) +def column_slicing_test(col, offset, size, cast_to_float=False): + sl = slice(offset, offset + size) + col_slice = col[sl] + series = cudf.Series(col) + sliced_series = cudf.Series(col_slice) + + if cast_to_float: + pd_series = series.astype(float).to_pandas() + sliced_series = sliced_series.astype(float) + else: + pd_series = series.to_pandas() + + if cudf.utils.dtypes.is_categorical_dtype(col.dtype): + # The cudf.Series is constructed from an already sliced column, whereas + # the pandas.Series is constructed from the unsliced series and then + # sliced, so the indexes should be different and we must ignore it. + # However, we must compare these as frames, not raw arrays, because + # numpy comparison of categorical values won't work. + assert_eq( + pd_series[sl].reset_index(drop=True), + sliced_series.reset_index(drop=True), + ) + else: + assert_eq(np.asarray(pd_series[sl]), sliced_series.to_array()) + + +@pytest.mark.parametrize("offset", [0, 1, 15]) +@pytest.mark.parametrize("size", [50, 10, 0]) +def test_column_slicing(pandas_input, offset, size): + col = cudf.core.column.as_column(pandas_input) + column_slicing_test(col, offset, size) + + +@pytest.mark.parametrize("offset", [0, 1, 15]) +@pytest.mark.parametrize("size", [50, 10, 0]) +@pytest.mark.parametrize("precision", [2, 3, 5]) +@pytest.mark.parametrize("scale", [0, 1, 2]) +def test_decimal_column_slicing(offset, size, precision, scale): + col = cudf.core.column.as_column(pd.Series(np.random.rand(1000))) + col = col.astype(cudf.Decimal64Dtype(precision, scale)) + column_slicing_test(col, offset, size, True) + + @pytest.mark.parametrize( "data", [