Skip to content

Commit

Permalink
Match pandas scalar result types in reductions (#9717)
Browse files Browse the repository at this point in the history
Move this casting logic to Python and update it so that integer `sum` and `product` reductions return an `int64` result, while floating-point columns keep their original dtype. This is a breaking change.

Closes #8449

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Ashwin Srinath (https://github.com/shwina)

URL: #9717
  • Loading branch information
brandon-b-miller authored Jan 11, 2022
1 parent 7ec4271 commit cc25f3d
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 11 deletions.
11 changes: 4 additions & 7 deletions python/cudf/cudf/_lib/reduce.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,10 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
to the same type as the input column
"""

col_dtype = incol.dtype
if (
reduction_op in ['sum', 'sum_of_squares', 'product']
and not is_decimal_dtype(col_dtype)
):
col_dtype = np.find_common_type([col_dtype], [np.uint64])
col_dtype = col_dtype if dtype is None else dtype
col_dtype = (
dtype if dtype is not None
else incol._reduction_result_dtype(reduction_op)
)

cdef column_view c_incol_view = incol.view()
cdef unique_ptr[scalar] c_result
Expand Down
7 changes: 7 additions & 0 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1235,6 +1235,13 @@ def _process_for_reduction(
)
return result_col

def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
    """
    Return the dtype to request from libcudf for a reduction.

    Parameters
    ----------
    reduction_op : str
        Name of the reduction being performed. This implementation
        does not inspect it.

    Returns
    -------
    Dtype
        The column's own dtype, unchanged.
    """
    # No op-specific promotion here: the result dtype is simply the
    # dtype of the column being reduced.
    return self.dtype

def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
"""
Copies type metadata from self onto other, returning a new column.
Expand Down
11 changes: 11 additions & 0 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,17 @@ def to_pandas(
pd_series.index = index
return pd_series

def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
    """
    Determine the dtype to request from libcudf for a reduction over
    this column.

    Parameters
    ----------
    reduction_op : str
        Name of the reduction. ``"sum"``, ``"product"`` and
        ``"sum_of_squares"`` are handled specially; any other name
        leaves the dtype untouched.

    Returns
    -------
    Dtype
        * ``"sum"`` / ``"product"``: the column dtype when its kind is
          floating point (``"f"``), otherwise ``int64``.
        * ``"sum_of_squares"``: ``uint64`` for boolean columns, the
          column dtype otherwise.
        * anything else: the column dtype.
    """
    col_dtype = self.dtype
    if reduction_op in {"sum", "product"}:
        if col_dtype.kind != "f":
            col_dtype = np.dtype("int64")
    elif reduction_op == "sum_of_squares":
        # This used to be
        # ``np.find_common_type([col_dtype], [np.dtype("uint64")])``.
        # That function is deprecated in NumPy 1.25 and removed in
        # NumPy 2.0. With one array dtype and a ``uint64`` scalar dtype
        # it only ever changed the result for boolean input (the sole
        # kind ranked below unsigned integers), so an explicit kind
        # check gives the same answer without the removed API.
        if col_dtype.kind == "b":
            col_dtype = np.dtype("uint64")

    return col_dtype


def _normalize_find_and_replace_input(
input_column_dtype: DtypeObj, col_to_normalize: Union[ColumnBase, list]
Expand Down
6 changes: 2 additions & 4 deletions python/cudf/cudf/tests/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ def test_sum(dtype, nelem):
sr = Series(data)

got = sr.sum()
expect = dtype(data.sum())

expect = data.sum()
significant = 4 if dtype == np.float32 else 6
np.testing.assert_approx_equal(expect, got, significant=significant)

Expand Down Expand Up @@ -83,8 +82,7 @@ def test_product(dtype, nelem):
sr = Series(data)

got = sr.product()
expect = np.product(data)

expect = pd.Series(data).product()
significant = 4 if dtype == np.float32 else 6
np.testing.assert_approx_equal(expect, got, significant=significant)

Expand Down

0 comments on commit cc25f3d

Please sign in to comment.