diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 2185cb089a7..62013ea88ae 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -1,6 +1,8 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. import cudf +from cudf.utils.dtypes import is_decimal_dtype +from cudf.core.dtypes import Decimal64Dtype from cudf._lib.cpp.reduce cimport cpp_reduce, cpp_scan, scan_type, cpp_minmax from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport data_type, type_id @@ -9,12 +11,14 @@ from cudf._lib.cpp.column.column cimport column from cudf._lib.scalar cimport DeviceScalar from cudf._lib.column cimport Column from cudf._lib.types import np_to_cudf_types -from cudf._lib.types cimport underlying_type_t_type_id +from cudf._lib.types cimport underlying_type_t_type_id, dtype_to_data_type from cudf._lib.aggregation cimport make_aggregation, aggregation from libcpp.memory cimport unique_ptr from libcpp.utility cimport move, pair import numpy as np +cimport cudf._lib.cpp.types as libcudf_types + def reduce(reduction_op, Column incol, dtype=None, **kwargs): """ @@ -32,7 +36,10 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): """ col_dtype = incol.dtype - if reduction_op in ['sum', 'sum_of_squares', 'product']: + if ( + reduction_op in ['sum', 'sum_of_squares', 'product'] + and not is_decimal_dtype(col_dtype) + ): col_dtype = np.find_common_type([col_dtype], [np.uint64]) col_dtype = col_dtype if dtype is None else dtype @@ -41,15 +48,8 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): cdef unique_ptr[aggregation] c_agg = move(make_aggregation( reduction_op, kwargs )) - cdef type_id tid = ( - ( - ( - np_to_cudf_types[np.dtype(col_dtype)] - ) - ) - ) - cdef data_type c_out_dtype = data_type(tid) + cdef data_type c_out_dtype = dtype_to_data_type(col_dtype) # check empty case if len(incol) <= incol.null_count: @@ -69,7 +69,14 @@ def 
def _reduce_precision(dtype, op, nrows, max_precision=None):
    """
    Return the result precision when performing the reduce
    operation `op` for the given dtype and column size.

    The growth rules follow the T-SQL rules for decimal arithmetic
    (an addition needs one extra digit, a multiplication of operands
    with precisions ``p1`` and ``p2`` needs ``p1 + p2 + 1`` digits),
    applied once per combined pair of rows.

    See: https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql

    Parameters
    ----------
    dtype
        Decimal dtype of the column being reduced. Only its
        ``precision`` attribute is read.
    op : str
        Name of the reduction: ``"min"``, ``"max"``, ``"sum"``,
        ``"product"`` or ``"sum_of_squares"``.
    nrows : int
        Number of rows taking part in the reduction.
    max_precision : int, optional
        Upper bound applied to the result. When omitted,
        ``Decimal64Dtype.MAX_PRECISION`` is used.

    Returns
    -------
    int
        The computed precision, clamped to ``[0, max_precision]``.

    Raises
    ------
    NotImplementedError
        If `op` is not one of the supported reductions.
    """  # noqa: E501
    p = dtype.precision
    if op in ("min", "max"):
        # Selecting an existing value never needs more digits.
        new_p = p
    elif op == "sum":
        # nrows - 1 additions, one extra digit each.
        new_p = p + nrows - 1
    elif op == "product":
        # nrows factors of p digits plus one digit per multiplication.
        new_p = p * nrows + nrows - 1
    elif op == "sum_of_squares":
        # A square needs 2p + 1 digits; summing nrows of them adds
        # nrows - 1 more.
        new_p = 2 * p + nrows
    else:
        raise NotImplementedError(
            "Cannot determine decimal precision for reduction "
            "{!r}".format(op)
        )
    # Resolve the default lazily so an explicit bound never needs
    # the Decimal64Dtype lookup.
    if max_precision is None:
        max_precision = Decimal64Dtype.MAX_PRECISION
    return max(min(new_p, max_precision), 0)
def product(
    self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
) -> Decimal:
    """Run the ``"product"`` reduction over this column.

    Thin wrapper: ``skipna``, ``dtype`` and ``min_count`` are passed
    through unchanged as keyword arguments to ``self.reduce``, and the
    value returned by that call is returned as-is.
    """
    reduce_options = {
        "skipna": skipna,
        "dtype": dtype,
        "min_count": min_count,
    }
    return self.reduce("product", **reduce_options)
@pytest.mark.parametrize(
    "dtype",
    [Decimal64Dtype(6, 3), Decimal64Dtype(10, 6), Decimal64Dtype(16, 7)],
)
@pytest.mark.parametrize("nelem", params_sizes)
def test_max_decimal(dtype, nelem):
    """Compare ``max()`` of a decimal cudf Series with pandas.

    The same list of numeric strings feeds both sides: pandas receives
    ``decimal.Decimal`` objects built from the strings, cudf receives
    the strings and casts them to the parametrized decimal ``dtype``.
    """
    scaled_values = gen_rand("int64", nelem) / 100
    data = [str(value) for value in scaled_values]

    # Host-side reference result.
    host_series = pd.Series([Decimal(text) for text in data])
    expected = host_series.max()

    # Result under test.
    device_series = cudf.Series(data)
    got = device_series.astype(dtype).max()

    assert_eq(expected, got)