From cc25f3d30ce6f4e4939d5b913ca1a66ab32ac75e Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 11 Jan 2022 12:28:59 -0600 Subject: [PATCH] Match pandas scalar result types in reductions (#9717) This moves the reduction result-dtype casting logic out of the Cython `reduce` function and into Python, and updates it so that `sum` and `product` reductions return `int64` for integer columns and the original column dtype for float columns. This is a breaking change. Closes https://github.com/rapidsai/cudf/issues/8449 Authors: - https://github.com/brandon-b-miller Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9717 --- python/cudf/cudf/_lib/reduce.pyx | 11 ++++------- python/cudf/cudf/core/column/column.py | 7 +++++++ python/cudf/cudf/core/column/numerical.py | 11 +++++++++++ python/cudf/cudf/tests/test_reductions.py | 6 ++---- 4 files changed, 24 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 21a039dbf78..ecb787703d2 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -43,13 +43,10 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): to the same type as the input column """ - col_dtype = incol.dtype - if ( - reduction_op in ['sum', 'sum_of_squares', 'product'] - and not is_decimal_dtype(col_dtype) - ): - col_dtype = np.find_common_type([col_dtype], [np.uint64]) - col_dtype = col_dtype if dtype is None else dtype + col_dtype = ( + dtype if dtype is not None + else incol._reduction_result_dtype(reduction_op) + ) cdef column_view c_incol_view = incol.view() cdef unique_ptr[scalar] c_result diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a3a8b0c91d1..c1e037499fc 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1235,6 +1235,13 @@ def _process_for_reduction( ) return 
result_col + def _reduction_result_dtype(self, reduction_op: str) -> Dtype: + """ + Determine the correct dtype to pass to libcudf based on + the input dtype, data dtype, and specific reduction op + """ + return self.dtype + def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: """ Copies type metadata from self onto other, returning a new column. diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index c947440edb1..8f0a858ee34 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -641,6 +641,17 @@ def to_pandas( pd_series.index = index return pd_series + def _reduction_result_dtype(self, reduction_op: str) -> Dtype: + col_dtype = self.dtype + if reduction_op in {"sum", "product"}: + col_dtype = ( + col_dtype if col_dtype.kind == "f" else np.dtype("int64") + ) + elif reduction_op == "sum_of_squares": + col_dtype = np.find_common_type([col_dtype], [np.dtype("uint64")]) + + return col_dtype + def _normalize_find_and_replace_input( input_column_dtype: DtypeObj, col_to_normalize: Union[ColumnBase, list] diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 89d665382d3..4ed6448de50 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -30,8 +30,7 @@ def test_sum(dtype, nelem): sr = Series(data) got = sr.sum() - expect = dtype(data.sum()) - + expect = data.sum() significant = 4 if dtype == np.float32 else 6 np.testing.assert_approx_equal(expect, got, significant=significant) @@ -83,8 +82,7 @@ def test_product(dtype, nelem): sr = Series(data) got = sr.product() - expect = np.product(data) - + expect = pd.Series(data).product() significant = 4 if dtype == np.float32 else 6 np.testing.assert_approx_equal(expect, got, significant=significant)