Skip to content

Commit

Permalink
Fix slicing and arrow representations of decimal columns (#7755)
Browse files Browse the repository at this point in the history
Slices of decimal columns were not passing the appropriate offset to the new column views being created. Additionally, the conversion of decimal columns to and from arrow did not include the offset. This made all slices of decimal columns appear invalid.

Thanks to @shwina for helping me trace these bugs back to their source!

**Edit**
This PR also now includes an additional fix for the behavior of slices of decimal columns that should generate empty data sets (e.g. starting a slice past the end of a column).

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - https://github.com/brandon-b-miller

URL: #7755
  • Loading branch information
vyasr authored Apr 1, 2021
1 parent f4ab813 commit 93050d4
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 2 deletions.
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1512,7 +1512,7 @@ def column_empty(
dtype="int32",
),
)
elif dtype.kind in "OU":
elif dtype.kind in "OU" and not is_decimal_dtype(dtype):
data = None
children = (
full(row_count + 1, 0, dtype="int32"),
Expand Down Expand Up @@ -1643,6 +1643,7 @@ def build_column(
return cudf.core.column.DecimalColumn(
data=data,
size=size,
offset=offset,
dtype=dtype,
mask=mask,
null_count=null_count,
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def from_arrow(cls, data: pa.Array):
data=Buffer(data_64.view("uint8")),
size=len(data),
dtype=dtype,
offset=data.offset,
mask=mask,
)

Expand All @@ -58,6 +59,7 @@ def to_arrow(self):
)
return pa.Array.from_buffers(
type=self.dtype.to_arrow(),
offset=self._offset,
length=self.size,
buffers=[mask_buf, data_buf],
)
Expand Down
45 changes: 44 additions & 1 deletion python/cudf/cudf/tests/test_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def test_column_offset_and_size(pandas_input, offset, size):
if cudf.utils.dtypes.is_categorical_dtype(col.dtype):
assert col.size == col.codes.size
assert col.size == (col.codes.data.size / col.codes.dtype.itemsize)
elif pd.api.types.is_string_dtype(col.dtype):
elif cudf.utils.dtypes.is_string_dtype(col.dtype):
if col.size > 0:
assert col.size == (col.children[0].size - 1)
assert col.size == (
Expand All @@ -79,6 +79,49 @@ def test_column_offset_and_size(pandas_input, offset, size):
assert_eq(expect, got)


def column_slicing_test(col, offset, size, cast_to_float=False):
sl = slice(offset, offset + size)
col_slice = col[sl]
series = cudf.Series(col)
sliced_series = cudf.Series(col_slice)

if cast_to_float:
pd_series = series.astype(float).to_pandas()
sliced_series = sliced_series.astype(float)
else:
pd_series = series.to_pandas()

if cudf.utils.dtypes.is_categorical_dtype(col.dtype):
# The cudf.Series is constructed from an already sliced column, whereas
# the pandas.Series is constructed from the unsliced series and then
# sliced, so the indexes should be different and we must ignore it.
# However, we must compare these as frames, not raw arrays, because
# numpy comparison of categorical values won't work.
assert_eq(
pd_series[sl].reset_index(drop=True),
sliced_series.reset_index(drop=True),
)
else:
assert_eq(np.asarray(pd_series[sl]), sliced_series.to_array())


@pytest.mark.parametrize("offset", [0, 1, 15])
@pytest.mark.parametrize("size", [50, 10, 0])
def test_column_slicing(pandas_input, offset, size):
col = cudf.core.column.as_column(pandas_input)
column_slicing_test(col, offset, size)


@pytest.mark.parametrize("offset", [0, 1, 15])
@pytest.mark.parametrize("size", [50, 10, 0])
@pytest.mark.parametrize("precision", [2, 3, 5])
@pytest.mark.parametrize("scale", [0, 1, 2])
def test_decimal_column_slicing(offset, size, precision, scale):
col = cudf.core.column.as_column(pd.Series(np.random.rand(1000)))
col = col.astype(cudf.Decimal64Dtype(precision, scale))
column_slicing_test(col, offset, size, True)


@pytest.mark.parametrize(
"data",
[
Expand Down

0 comments on commit 93050d4

Please sign in to comment.