diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index bf488621d52..936c6d9307c 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -142,7 +142,11 @@ void gather_helper(InputItr source_itr, // Error case when no other overload or specialization is available template struct column_gatherer_impl { - std::unique_ptr operator()(...) { CUDF_FAIL("Unsupported type in gather."); } + template + std::unique_ptr operator()(Args&&...) + { + CUDF_FAIL("Unsupported type in gather."); + } }; /** diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index 30764b9b89f..b51c44772b5 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -81,7 +81,11 @@ auto scatter_to_gather(MapIterator scatter_map_begin, template struct column_scatterer_impl { - std::unique_ptr operator()(...) const { CUDF_FAIL("Unsupported type for scatter."); } + template + std::unique_ptr operator()(Args&&...) const + { + CUDF_FAIL("Unsupported type for scatter."); + } }; template diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index e6adc027acc..fecf7d18d46 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -31,7 +31,11 @@ namespace { template struct copy_if_else_functor_impl { - std::unique_ptr operator()(...) { CUDF_FAIL("Unsupported type for copy_if_else."); } + template + std::unique_ptr operator()(Args&&...) 
+ { + CUDF_FAIL("Unsupported type for copy_if_else."); + } }; template diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 65750deaa57..4cf6899116d 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -424,7 +424,7 @@ std::unique_ptr replace_nulls(cudf::column_view const& input, CUDF_EXPECTS(replacement.size() == input.size(), "Column size mismatch"); if (input.is_empty()) { return cudf::empty_like(input); } - if (!input.has_nulls()) { return std::make_unique(input); } + if (!input.has_nulls()) { return std::make_unique(input, stream, mr); } return cudf::type_dispatcher( input.type(), replace_nulls_column_kernel_forwarder{}, input, replacement, stream, mr); diff --git a/docs/cudf/source/basics.rst b/docs/cudf/source/basics.rst index e270708df90..15b4b43662b 100644 --- a/docs/cudf/source/basics.rst +++ b/docs/cudf/source/basics.rst @@ -34,6 +34,8 @@ The following table lists all of cudf types. For methods requiring dtype argumen +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ | Boolean | | np.bool_ | ``'bool'`` | +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ +| Decimal | Decimal64Dtype | (none) | (none) | ++------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ **Note: All dtypes above are Nullable** diff --git a/docs/cudf/source/groupby.md b/docs/cudf/source/groupby.md index 5376df261e7..8a0e5dddba0 100644 --- a/docs/cudf/source/groupby.md +++ b/docs/cudf/source/groupby.md @@ -120,24 +120,24 @@ a The following table summarizes the available aggregations and the types that support them: -| Aggregations\dtypes | Numeric | Datetime | String | Categorical 
| List | Struct | -| ------------------- | -------- | ------- | -------- | ----------- | ---- | ------ | -| count | ✅ | ✅ | ✅ | ✅ | | | -| size | ✅ | ✅ | ✅ | ✅ | | | -| sum | ✅ | ✅ | | | | | -| idxmin | ✅ | ✅ | | | | | -| idxmax | ✅ | ✅ | | | | | -| min | ✅ | ✅ | ✅ | | | | -| max | ✅ | ✅ | ✅ | | | | -| mean | ✅ | ✅ | | | | | -| var | ✅ | ✅ | | | | | -| std | ✅ | ✅ | | | | | -| quantile | ✅ | ✅ | | | | | -| median | ✅ | ✅ | | | | | -| nunique | ✅ | ✅ | ✅ | ✅ | | | -| nth | ✅ | ✅ | ✅ | | | | -| collect | ✅ | ✅ | ✅ | | ✅ | | -| unique | ✅ | ✅ | ✅ | ✅ | | | +| Aggregations\dtypes | Numeric | Datetime | String | Categorical | List | Struct | Interval | Decimal | +| ------------------- | -------- | ------- | -------- | ----------- | ---- | ------ | -------- | ------- | +| count | ✅ | ✅ | ✅ | ✅ | | | | ✅ | +| size | ✅ | ✅ | ✅ | ✅ | | | | ✅ | +| sum | ✅ | ✅ | | | | | | ✅ | +| idxmin | ✅ | ✅ | | | | | | ✅ | +| idxmax | ✅ | ✅ | | | | | | ✅ | +| min | ✅ | ✅ | ✅ | | | | | ✅ | +| max | ✅ | ✅ | ✅ | | | | | ✅ | +| mean | ✅ | ✅ | | | | | | | +| var | ✅ | ✅ | | | | | | | +| std | ✅ | ✅ | | | | | | | +| quantile | ✅ | ✅ | | | | | | | +| median | ✅ | ✅ | | | | | | | +| nunique | ✅ | ✅ | ✅ | ✅ | | | | ✅ | +| nth | ✅ | ✅ | ✅ | | | | | ✅ | +| collect | ✅ | ✅ | ✅ | | ✅ | | | ✅ | +| unique | ✅ | ✅ | ✅ | ✅ | | | | | ## GroupBy apply diff --git a/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd b/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd index a73e6e0151d..9de23fb2595 100644 --- a/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd +++ b/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport int64_t, int32_t cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil: + # cython type stub to help resolve to numeric::decimal64 ctypedef int64_t decimal64 cdef cppclass scale_type: diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 713a2274a77..4584841dd33 100644 --- a/python/cudf/cudf/_lib/groupby.pyx 
+++ b/python/cudf/cudf/_lib/groupby.pyx @@ -3,6 +3,7 @@ from collections import defaultdict import numpy as np +import rmm from libcpp.pair cimport pair from libcpp.memory cimport unique_ptr @@ -20,25 +21,9 @@ cimport cudf._lib.cpp.groupby as libcudf_groupby cimport cudf._lib.cpp.aggregation as libcudf_aggregation -_GROUPBY_AGGS = { - "count", - "size", - "sum", - "idxmin", - "idxmax", - "min", - "max", - "mean", - "var", - "std", - "quantile", - "median", - "nunique", - "nth", - "collect", - "unique", -} - +# The sets below define the possible aggregations that can be performed on +# different dtypes. The uppercased versions of these strings correspond to +# elements of the AggregationKind enum. _CATEGORICAL_AGGS = { "count", "size", @@ -61,6 +46,24 @@ _LIST_AGGS = { "collect", } +_STRUCT_AGGS = { +} + +_INTERVAL_AGGS = { +} + +_DECIMAL_AGGS = { + "count", + "sum", + "argmin", + "argmax", + "min", + "max", + "nunique", + "nth", + "collect" +} + cdef class GroupBy: cdef unique_ptr[libcudf_groupby.groupby] c_obj @@ -197,7 +200,10 @@ def _drop_unsupported_aggs(Table values, aggs): from cudf.utils.dtypes import ( is_categorical_dtype, is_string_dtype, - is_list_dtype + is_list_dtype, + is_interval_dtype, + is_struct_dtype, + is_decimal_dtype, ) result = aggs.copy() @@ -220,6 +226,29 @@ def _drop_unsupported_aggs(Table values, aggs): for i, agg_name in enumerate(aggs[col_name]): if Aggregation(agg_name).kind not in _CATEGORICAL_AGGS: del result[col_name][i] + elif ( + is_struct_dtype(values._data[col_name].dtype) + ): + for i, agg_name in enumerate(aggs[col_name]): + if Aggregation(agg_name).kind not in _STRUCT_AGGS: + del result[col_name][i] + elif ( + is_interval_dtype(values._data[col_name].dtype) + ): + for i, agg_name in enumerate(aggs[col_name]): + if Aggregation(agg_name).kind not in _INTERVAL_AGGS: + del result[col_name][i] + elif ( + is_decimal_dtype(values._data[col_name].dtype) + ): + if rmm._cuda.gpu.runtimeGetVersion() < 11000: + raise RuntimeError( + 
"Decimal aggregations are only supported on CUDA >= 11 " + "due to an nvcc compiler bug." + ) + for i, agg_name in enumerate(aggs[col_name]): + if Aggregation(agg_name).kind not in _DECIMAL_AGGS: + del result[col_name][i] if all(len(v) == 0 for v in result.values()): raise DataError("No numeric types to aggregate") diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 3969c1b16e7..f80bef79c79 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -426,6 +426,8 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: array.type, pd.core.arrays._arrow_utils.ArrowIntervalType ): return cudf.core.column.IntervalColumn.from_arrow(array) + elif isinstance(array.type, pa.Decimal128Type): + return cudf.core.column.DecimalColumn.from_arrow(array) return libcudf.interop.from_arrow(data, data.column_names)._data[ "None" @@ -1853,10 +1855,14 @@ def as_column( cupy.asarray(arbitrary), nan_as_null=nan_as_null, dtype=dtype ) else: - data = as_column( - pa.array(arbitrary, from_pandas=nan_as_null), - dtype=arbitrary.dtype, - ) + pyarrow_array = pa.array(arbitrary, from_pandas=nan_as_null) + if isinstance(pyarrow_array.type, pa.Decimal128Type): + pyarrow_type = cudf.Decimal64Dtype.from_arrow( + pyarrow_array.type + ) + else: + pyarrow_type = arbitrary.dtype + data = as_column(pyarrow_array, dtype=pyarrow_type) if dtype is not None: data = data.astype(dtype) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index d32ef97d372..d5568df6468 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -6,6 +6,7 @@ import cupy as cp import numpy as np import pyarrow as pa +from pandas.api.types import is_integer_dtype import cudf from cudf import _lib as libcudf @@ -66,17 +67,47 @@ def to_arrow(self): def binary_operator(self, op, other, reflect=False): if reflect: self, other = other, self - scale = 
_binop_scale(self.dtype, other.dtype, op) - output_type = Decimal64Dtype( - scale=scale, precision=Decimal64Dtype.MAX_PRECISION - ) # precision will be ignored, libcudf has no notion of precision - result = libcudf.binaryop.binaryop(self, other, op, output_type) - result.dtype.precision = _binop_precision(self.dtype, other.dtype, op) + + # Binary arithmetic between decimal columns. `Scale` and `precision` + # are computed outside of libcudf + if op in ("add", "sub", "mul"): + scale = _binop_scale(self.dtype, other.dtype, op) + output_type = Decimal64Dtype( + scale=scale, precision=Decimal64Dtype.MAX_PRECISION + ) # precision will be ignored, libcudf has no notion of precision + result = libcudf.binaryop.binaryop(self, other, op, output_type) + result.dtype.precision = _binop_precision( + self.dtype, other.dtype, op + ) + elif op in ("eq", "lt", "gt", "le", "ge"): + if not isinstance( + other, + (DecimalColumn, cudf.core.column.NumericalColumn, cudf.Scalar), + ): + raise TypeError( + f"Operator {op} not supported between" + f"{str(type(self))} and {str(type(other))}" + ) + if isinstance( + other, cudf.core.column.NumericalColumn + ) and not is_integer_dtype(other.dtype): + raise TypeError( + f"Only decimal and integer column is supported for {op}." 
+ ) + if isinstance(other, cudf.core.column.NumericalColumn): + other = other.as_decimal_column( + Decimal64Dtype(Decimal64Dtype.MAX_PRECISION, 0) + ) + result = libcudf.binaryop.binaryop(self, other, op, bool) return result def normalize_binop_value(self, other): if is_scalar(other) and isinstance(other, (int, np.int, Decimal)): return cudf.Scalar(Decimal(other)) + elif isinstance(other, cudf.Scalar) and isinstance( + other.dtype, cudf.Decimal64Dtype + ): + return other else: raise TypeError(f"cannot normalize {type(other)}") diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index f58a47a918c..10a9ffbfbae 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -22,6 +22,7 @@ column, string, ) +from cudf.core.dtypes import Decimal64Dtype from cudf.utils import cudautils, utils from cudf.utils.dtypes import ( min_column_type, @@ -103,11 +104,23 @@ def binary_operator( out_dtype = self.dtype else: if not ( - isinstance(rhs, (NumericalColumn, cudf.Scalar,),) + isinstance( + rhs, + ( + NumericalColumn, + cudf.Scalar, + cudf.core.column.DecimalColumn, + ), + ) or np.isscalar(rhs) ): msg = "{!r} operator not supported between {} and {}" raise TypeError(msg.format(binop, type(self), type(rhs))) + if isinstance(rhs, cudf.core.column.DecimalColumn): + lhs = self.as_decimal_column( + Decimal64Dtype(Decimal64Dtype.MAX_PRECISION, 0) + ) + return lhs.binary_operator(binop, rhs) out_dtype = np.result_type(self.dtype, rhs.dtype) if binop in ["mod", "floordiv"]: tmp = self if reflect else rhs diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 86e1f5cfe30..cc94548d9a2 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -13,6 +13,8 @@ from cudf.utils.utils import cached_property +# Note that all valid aggregation methods (e.g. 
GroupBy.min) are bound to the +# class after its definition (see below). class GroupBy(Serializable): _MAX_GROUPS_BEFORE_WARN = 100 @@ -58,14 +60,6 @@ def __init__( else: self.grouping = _Grouping(obj, by, level) - def __getattribute__(self, key): - try: - return super().__getattribute__(key) - except AttributeError: - if key in libgroupby._GROUPBY_AGGS: - return functools.partial(self._agg_func_name_with_args, key) - raise - def __iter__(self): group_names, offsets, _, grouped_values = self._grouped() if isinstance(group_names, cudf.Index): @@ -267,19 +261,6 @@ def _grouped(self): group_names = grouped_keys.unique() return (group_names, offsets, grouped_keys, grouped_values) - def _agg_func_name_with_args(self, func_name, *args, **kwargs): - """ - Aggregate given an aggregate function name - and arguments to the function, e.g., - `_agg_func_name_with_args("quantile", 0.5)` - """ - - def func(x): - return getattr(x, func_name)(*args, **kwargs) - - func.__name__ = func_name - return self.agg(func) - def _normalize_aggs(self, aggs): """ Normalize aggs to a dict mapping column names @@ -590,6 +571,48 @@ def rolling(self, *args, **kwargs): return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs) +# Set of valid groupby aggregations that are monkey-patched into the GroupBy +# namespace. +_VALID_GROUPBY_AGGS = { + "count", + "sum", + "idxmin", + "idxmax", + "min", + "max", + "mean", + "var", + "std", + "quantile", + "median", + "nunique", + "collect", + "unique", +} + + +# Dynamically bind the different aggregation methods. +def _agg_func_name_with_args(self, func_name, *args, **kwargs): + """ + Aggregate given an aggregate function name and arguments to the + function, e.g., `_agg_func_name_with_args("quantile", 0.5)`. The named + aggregations must be members of _AggregationFactory. 
+ """ + + def func(x): + """Compute the {} of the group.""".format(func_name) + return getattr(x, func_name)(*args, **kwargs) + + func.__name__ = func_name + return self.agg(func) + + +for key in _VALID_GROUPBY_AGGS: + setattr( + GroupBy, key, functools.partialmethod(_agg_func_name_with_args, key) + ) + + class DataFrameGroupBy(GroupBy): def __init__( self, obj, by=None, level=None, sort=False, as_index=True, dropna=True @@ -685,15 +708,16 @@ def __init__( dropna=dropna, ) - def __getattribute__(self, key): + def __getattr__(self, key): + # Without this check, copying can trigger a RecursionError. See + # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html # noqa: E501 + # for an explanation. + if key == "obj": + raise AttributeError try: - return super().__getattribute__(key) - except AttributeError: - if key in self.obj: - return self.obj[key].groupby( - self.grouping, dropna=self._dropna, sort=self._sort - ) - raise + return self[key] + except KeyError: + raise AttributeError def __getitem__(self, key): return self.obj[key].groupby( diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 2e2992fc524..ac80071c8e4 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1615,6 +1615,12 @@ def test_binops_with_NA_consistent(dtype, op): assert result._column.null_count == len(data) +def _decimal_series(input, dtype): + return cudf.Series( + [x if x is None else decimal.Decimal(x) for x in input], dtype=dtype, + ) + + @pytest.mark.parametrize( "args", [ @@ -1753,26 +1759,311 @@ def test_binops_with_NA_consistent(dtype, op): ["10.0", None], cudf.Decimal64Dtype(scale=1, precision=8), ), + ( + operator.eq, + ["0.18", "0.42"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.18", "0.21"], + cudf.Decimal64Dtype(scale=2, precision=3), + [True, False], + bool, + ), + ( + operator.eq, + ["0.18", "0.42"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.1800", "0.2100"], 
+ cudf.Decimal64Dtype(scale=4, precision=5), + [True, False], + bool, + ), + ( + operator.eq, + ["100", None], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["100", "200"], + cudf.Decimal64Dtype(scale=-1, precision=4), + [True, None], + bool, + ), + ( + operator.lt, + ["0.18", "0.42", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.10", "0.87", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + [False, True, False], + bool, + ), + ( + operator.lt, + ["0.18", "0.42", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.1000", "0.8700", "1.0000"], + cudf.Decimal64Dtype(scale=4, precision=5), + [False, True, False], + bool, + ), + ( + operator.lt, + ["200", None, "100"], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["100", "200", "100"], + cudf.Decimal64Dtype(scale=-1, precision=4), + [False, None, False], + bool, + ), + ( + operator.gt, + ["0.18", "0.42", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.10", "0.87", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + [True, False, False], + bool, + ), + ( + operator.gt, + ["0.18", "0.42", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.1000", "0.8700", "1.0000"], + cudf.Decimal64Dtype(scale=4, precision=5), + [True, False, False], + bool, + ), + ( + operator.gt, + ["300", None, "100"], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["100", "200", "100"], + cudf.Decimal64Dtype(scale=-1, precision=4), + [True, None, False], + bool, + ), + ( + operator.le, + ["0.18", "0.42", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.10", "0.87", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + [False, True, True], + bool, + ), + ( + operator.le, + ["0.18", "0.42", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.1000", "0.8700", "1.0000"], + cudf.Decimal64Dtype(scale=4, precision=5), + [False, True, True], + bool, + ), + ( + operator.le, + ["300", None, "100"], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["100", "200", "100"], + 
cudf.Decimal64Dtype(scale=-1, precision=4), + [False, None, True], + bool, + ), + ( + operator.ge, + ["0.18", "0.42", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.10", "0.87", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + [True, False, True], + bool, + ), + ( + operator.ge, + ["0.18", "0.42", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.1000", "0.8700", "1.0000"], + cudf.Decimal64Dtype(scale=4, precision=5), + [True, False, True], + bool, + ), + ( + operator.ge, + ["300", None, "100"], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["100", "200", "100"], + cudf.Decimal64Dtype(scale=-1, precision=4), + [True, None, True], + bool, + ), ], ) def test_binops_decimal(args): op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype = args - def decimal_series(input, dtype): - return cudf.Series( - [x if x is None else decimal.Decimal(x) for x in input], - dtype=dtype, - ) - - a = decimal_series(lhs, l_dtype) - b = decimal_series(rhs, r_dtype) - expect = decimal_series(expect, expect_dtype) + a = _decimal_series(lhs, l_dtype) + b = _decimal_series(rhs, r_dtype) + expect = ( + _decimal_series(expect, expect_dtype) + if isinstance(expect_dtype, cudf.Decimal64Dtype) + else cudf.Series(expect, dtype=expect_dtype) + ) got = op(a, b) assert expect.dtype == got.dtype utils.assert_eq(expect, got) +@pytest.mark.parametrize( + "args", + [ + ( + operator.eq, + ["100", "41", None], + cudf.Decimal64Dtype(scale=0, precision=5), + [100, 42, 12], + cudf.Series([True, False, None], dtype=bool), + cudf.Series([True, False, None], dtype=bool), + ), + ( + operator.eq, + ["100.000", "42.001", None], + cudf.Decimal64Dtype(scale=3, precision=6), + [100, 42, 12], + cudf.Series([True, False, None], dtype=bool), + cudf.Series([True, False, None], dtype=bool), + ), + ( + operator.eq, + ["100", "40", None], + cudf.Decimal64Dtype(scale=-1, precision=3), + [100, 42, 12], + cudf.Series([True, False, None], dtype=bool), + cudf.Series([True, False, None], dtype=bool), 
+ ), + ( + operator.lt, + ["100", "40", "28", None], + cudf.Decimal64Dtype(scale=0, precision=3), + [100, 42, 24, 12], + cudf.Series([False, True, False, None], dtype=bool), + cudf.Series([False, False, True, None], dtype=bool), + ), + ( + operator.lt, + ["100.000", "42.002", "23.999", None], + cudf.Decimal64Dtype(scale=3, precision=6), + [100, 42, 24, 12], + cudf.Series([False, False, True, None], dtype=bool), + cudf.Series([False, True, False, None], dtype=bool), + ), + ( + operator.lt, + ["100", "40", "10", None], + cudf.Decimal64Dtype(scale=-1, precision=3), + [100, 42, 8, 12], + cudf.Series([False, True, False, None], dtype=bool), + cudf.Series([False, False, True, None], dtype=bool), + ), + ( + operator.gt, + ["100", "42", "20", None], + cudf.Decimal64Dtype(scale=0, precision=3), + [100, 40, 24, 12], + cudf.Series([False, True, False, None], dtype=bool), + cudf.Series([False, False, True, None], dtype=bool), + ), + ( + operator.gt, + ["100.000", "42.002", "23.999", None], + cudf.Decimal64Dtype(scale=3, precision=6), + [100, 42, 24, 12], + cudf.Series([False, True, False, None], dtype=bool), + cudf.Series([False, False, True, None], dtype=bool), + ), + ( + operator.gt, + ["100", "40", "10", None], + cudf.Decimal64Dtype(scale=-1, precision=3), + [100, 42, 8, 12], + cudf.Series([False, False, True, None], dtype=bool), + cudf.Series([False, True, False, None], dtype=bool), + ), + ( + operator.le, + ["100", "40", "28", None], + cudf.Decimal64Dtype(scale=0, precision=3), + [100, 42, 24, 12], + cudf.Series([True, True, False, None], dtype=bool), + cudf.Series([True, False, True, None], dtype=bool), + ), + ( + operator.le, + ["100.000", "42.002", "23.999", None], + cudf.Decimal64Dtype(scale=3, precision=6), + [100, 42, 24, 12], + cudf.Series([True, False, True, None], dtype=bool), + cudf.Series([True, True, False, None], dtype=bool), + ), + ( + operator.le, + ["100", "40", "10", None], + cudf.Decimal64Dtype(scale=-1, precision=3), + [100, 42, 8, 12], + 
cudf.Series([True, True, False, None], dtype=bool), + cudf.Series([True, False, True, None], dtype=bool), + ), + ( + operator.ge, + ["100", "42", "20", None], + cudf.Decimal64Dtype(scale=0, precision=3), + [100, 40, 24, 12], + cudf.Series([True, True, False, None], dtype=bool), + cudf.Series([True, False, True, None], dtype=bool), + ), + ( + operator.ge, + ["100.000", "42.002", "23.999", None], + cudf.Decimal64Dtype(scale=3, precision=6), + [100, 42, 24, 12], + cudf.Series([True, True, False, None], dtype=bool), + cudf.Series([True, False, True, None], dtype=bool), + ), + ( + operator.ge, + ["100", "40", "10", None], + cudf.Decimal64Dtype(scale=-1, precision=3), + [100, 42, 8, 12], + cudf.Series([True, False, True, None], dtype=bool), + cudf.Series([True, True, False, None], dtype=bool), + ), + ], +) +@pytest.mark.parametrize("integer_dtype", cudf.tests.utils.INTEGER_TYPES) +@pytest.mark.parametrize("reflected", [True, False]) +def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected): + """ + Tested compare operations: + eq, lt, gt, le, ge + Each operation has 3 decimal data setups, with scale from {==0, >0, <0}. + Decimal precisions are sufficient to hold the digits. + For each decimal data setup, there is at least one row that lead to one + of the following compare results: {True, False, None}. 
+ """ + if not reflected: + op, ldata, ldtype, rdata, expected, _ = args + else: + op, ldata, ldtype, rdata, _, expected = args + + lhs = _decimal_series(ldata, ldtype) + rhs = cudf.Series(rdata, dtype=integer_dtype) + + if reflected: + rhs, lhs = lhs, rhs + + actual = op(lhs, rhs) + + utils.assert_eq(expected, actual) + + @pytest.mark.parametrize( "args", [ @@ -1803,6 +2094,15 @@ def decimal_series(input, dtype): cudf.Decimal64Dtype(scale=1, precision=7), False, ), + ( + operator.add, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + cudf.Scalar(decimal.Decimal("1.5")), + ["101.5", "201.5"], + cudf.Decimal64Dtype(scale=1, precision=7), + False, + ), ( operator.add, ["100", "200"], @@ -1830,6 +2130,15 @@ def decimal_series(input, dtype): cudf.Decimal64Dtype(scale=1, precision=7), True, ), + ( + operator.add, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + cudf.Scalar(decimal.Decimal("1.5")), + ["101.5", "201.5"], + cudf.Decimal64Dtype(scale=1, precision=7), + True, + ), ( operator.mul, ["100", "200"], @@ -1857,6 +2166,15 @@ def decimal_series(input, dtype): cudf.Decimal64Dtype(scale=-1, precision=6), False, ), + ( + operator.mul, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + cudf.Scalar(decimal.Decimal("1.5")), + ["150", "300"], + cudf.Decimal64Dtype(scale=-1, precision=6), + False, + ), ( operator.mul, ["100", "200"], @@ -1884,6 +2202,15 @@ def decimal_series(input, dtype): cudf.Decimal64Dtype(scale=-1, precision=6), True, ), + ( + operator.mul, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + cudf.Scalar(decimal.Decimal("1.5")), + ["150", "300"], + cudf.Decimal64Dtype(scale=-1, precision=6), + True, + ), ( operator.sub, ["100", "200"], @@ -1911,6 +2238,15 @@ def decimal_series(input, dtype): cudf.Decimal64Dtype(scale=0, precision=6), False, ), + ( + operator.sub, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + cudf.Scalar(decimal.Decimal("2.5")), + ["97.5", "197.5"], + 
cudf.Decimal64Dtype(scale=1, precision=7), + False, + ), ( operator.sub, ["100", "200"], @@ -1938,6 +2274,15 @@ def decimal_series(input, dtype): cudf.Decimal64Dtype(scale=1, precision=7), True, ), + ( + operator.sub, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + cudf.Scalar(decimal.Decimal("2.5")), + ["-97.5", "-197.5"], + cudf.Decimal64Dtype(scale=1, precision=7), + True, + ), ], ) def test_binops_decimal_scalar(args): @@ -1960,6 +2305,157 @@ def decimal_series(input, dtype): utils.assert_eq(expect, got) +@pytest.mark.parametrize( + "args", + [ + ( + operator.eq, + ["100.00", "41", None], + cudf.Decimal64Dtype(scale=0, precision=5), + 100, + cudf.Series([True, False, None], dtype=bool), + cudf.Series([True, False, None], dtype=bool), + ), + ( + operator.eq, + ["100.123", "41", None], + cudf.Decimal64Dtype(scale=3, precision=6), + decimal.Decimal("100.123"), + cudf.Series([True, False, None], dtype=bool), + cudf.Series([True, False, None], dtype=bool), + ), + ( + operator.eq, + ["100.123", "41", None], + cudf.Decimal64Dtype(scale=3, precision=6), + cudf.Scalar(decimal.Decimal("100.123")), + cudf.Series([True, False, None], dtype=bool), + cudf.Series([True, False, None], dtype=bool), + ), + ( + operator.gt, + ["100.00", "41", "120.21", None], + cudf.Decimal64Dtype(scale=2, precision=5), + 100, + cudf.Series([False, False, True, None], dtype=bool), + cudf.Series([False, True, False, None], dtype=bool), + ), + ( + operator.gt, + ["100.123", "41", "120.21", None], + cudf.Decimal64Dtype(scale=3, precision=6), + decimal.Decimal("100.123"), + cudf.Series([False, False, True, None], dtype=bool), + cudf.Series([False, True, False, None], dtype=bool), + ), + ( + operator.gt, + ["100.123", "41", "120.21", None], + cudf.Decimal64Dtype(scale=3, precision=6), + cudf.Scalar(decimal.Decimal("100.123")), + cudf.Series([False, False, True, None], dtype=bool), + cudf.Series([False, True, False, None], dtype=bool), + ), + ( + operator.ge, + ["100.00", "41", 
"120.21", None], + cudf.Decimal64Dtype(scale=2, precision=5), + 100, + cudf.Series([True, False, True, None], dtype=bool), + cudf.Series([True, True, False, None], dtype=bool), + ), + ( + operator.ge, + ["100.123", "41", "120.21", None], + cudf.Decimal64Dtype(scale=3, precision=6), + decimal.Decimal("100.123"), + cudf.Series([True, False, True, None], dtype=bool), + cudf.Series([True, True, False, None], dtype=bool), + ), + ( + operator.ge, + ["100.123", "41", "120.21", None], + cudf.Decimal64Dtype(scale=3, precision=6), + cudf.Scalar(decimal.Decimal("100.123")), + cudf.Series([True, False, True, None], dtype=bool), + cudf.Series([True, True, False, None], dtype=bool), + ), + ( + operator.lt, + ["100.00", "41", "120.21", None], + cudf.Decimal64Dtype(scale=2, precision=5), + 100, + cudf.Series([False, True, False, None], dtype=bool), + cudf.Series([False, False, True, None], dtype=bool), + ), + ( + operator.lt, + ["100.123", "41", "120.21", None], + cudf.Decimal64Dtype(scale=3, precision=6), + decimal.Decimal("100.123"), + cudf.Series([False, True, False, None], dtype=bool), + cudf.Series([False, False, True, None], dtype=bool), + ), + ( + operator.lt, + ["100.123", "41", "120.21", None], + cudf.Decimal64Dtype(scale=3, precision=6), + cudf.Scalar(decimal.Decimal("100.123")), + cudf.Series([False, True, False, None], dtype=bool), + cudf.Series([False, False, True, None], dtype=bool), + ), + ( + operator.le, + ["100.00", "41", "120.21", None], + cudf.Decimal64Dtype(scale=2, precision=5), + 100, + cudf.Series([True, True, False, None], dtype=bool), + cudf.Series([True, False, True, None], dtype=bool), + ), + ( + operator.le, + ["100.123", "41", "120.21", None], + cudf.Decimal64Dtype(scale=3, precision=6), + decimal.Decimal("100.123"), + cudf.Series([True, True, False, None], dtype=bool), + cudf.Series([True, False, True, None], dtype=bool), + ), + ( + operator.le, + ["100.123", "41", "120.21", None], + cudf.Decimal64Dtype(scale=3, precision=6), + 
cudf.Scalar(decimal.Decimal("100.123")), + cudf.Series([True, True, False, None], dtype=bool), + cudf.Series([True, False, True, None], dtype=bool), + ), + ], +) +@pytest.mark.parametrize("reflected", [True, False]) +def test_binops_decimal_scalar_compare(args, reflected): + """ + Tested compare operations: + eq, lt, gt, le, ge + Each operation has 3 data setups: pyints, Decimal, and + decimal cudf.Scalar + For each data setup, there is at least one row that lead to one of the + following compare results: {True, False, None}. + """ + if not reflected: + op, ldata, ldtype, rdata, expected, _ = args + else: + op, ldata, ldtype, rdata, _, expected = args + + lhs = _decimal_series(ldata, ldtype) + rhs = rdata + + if reflected: + rhs, lhs = lhs, rhs + + actual = op(lhs, rhs) + + utils.assert_eq(expected, actual) + + @pytest.mark.parametrize( "dtype", [ diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index a96db59dee3..84b52b1befb 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -8,6 +8,7 @@ import pytest from numba import cuda from numpy.testing import assert_array_equal +from decimal import Decimal import cudf from cudf.core import DataFrame, Series @@ -20,6 +21,8 @@ assert_exceptions_equal, ) +import rmm + _now = np.datetime64("now") _tomorrow = _now + np.timedelta64(1, "D") _now = np.int64(_now.astype("datetime64[ns]")) @@ -148,26 +151,6 @@ def test_groupby_agg_min_max_dictlist(nelem): assert_eq(got_df, expect_df) -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -@pytest.mark.parametrize( - "func", ["mean", "min", "max", "idxmin", "idxmax", "count", "sum"] -) -def test_groupby_2keys_agg(nelem, func): - # gdf (Note: lack of multiIndex) - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .agg(func) - ) - got_df = ( - make_frame(DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .agg(func) - ) - check_dtype = False if func 
in _index_type_aggs else True - assert_eq(got_df, expect_df, check_dtype=check_dtype) - - @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_as_index_single_agg(pdf, gdf, as_index): gdf = gdf.groupby("y", as_index=as_index, sort=True).agg({"x": "mean"}) @@ -331,28 +314,90 @@ def emulate(df): assert_eq(expect, got) -@pytest.mark.parametrize("nelem", [100, 500]) +@pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000]) @pytest.mark.parametrize( "func", ["mean", "std", "var", "min", "max", "idxmin", "idxmax", "count", "sum"], ) -def test_groupby_cudf_2keys_agg(nelem, func): - got_df = ( - make_frame(DataFrame, nelem=nelem) +def test_groupby_2keys_agg(nelem, func): + # gdf (Note: lack of multiIndex) + expect_df = ( + make_frame(pd.DataFrame, nelem=nelem) .groupby(["x", "y"], sort=True) .agg(func) ) - - # pandas - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem) + got_df = ( + make_frame(DataFrame, nelem=nelem) .groupby(["x", "y"], sort=True) .agg(func) ) + check_dtype = False if func in _index_type_aggs else True assert_eq(got_df, expect_df, check_dtype=check_dtype) +@pytest.mark.parametrize("num_groups", [2, 3, 10, 50, 100]) +@pytest.mark.parametrize("nelem_per_group", [1, 10, 100]) +@pytest.mark.parametrize( + "func", + ["min", "max", "count", "sum"], + # TODO: Replace the above line with the one below once + # https://github.com/pandas-dev/pandas/issues/40685 is resolved. + # "func", ["min", "max", "idxmin", "idxmax", "count", "sum"], +) +def test_groupby_agg_decimal(num_groups, nelem_per_group, func): + # The number of digits after the decimal to use. + decimal_digits = 2 + # The number of digits before the decimal to use. + whole_digits = 2 + + scale = 10 ** whole_digits + nelem = num_groups * nelem_per_group + + # The unique is necessary because otherwise if there are duplicates idxmin + # and idxmax may return different results than pandas (see + # https://github.com/rapidsai/cudf/issues/7756). 
This is not relevant to + # the current version of the test, because idxmin and idxmax simply don't + # work with pandas Series composed of Decimal objects (see + # https://github.com/pandas-dev/pandas/issues/40685). However, if that is + # ever enabled, then this issue will crop up again so we may as well have + # it fixed now. + x = np.unique((np.random.rand(nelem) * scale).round(decimal_digits)) + y = np.unique((np.random.rand(nelem) * scale).round(decimal_digits)) + + if x.size < y.size: + total_elements = x.size + y = y[: x.size] + else: + total_elements = y.size + x = x[: y.size] + + # Note that this filtering can lead to one group with fewer elements, but + # that shouldn't be a problem and is probably useful to test. + idx_col = np.tile(np.arange(num_groups), nelem_per_group)[:total_elements] + + decimal_x = pd.Series([Decimal(str(d)) for d in x]) + decimal_y = pd.Series([Decimal(str(d)) for d in y]) + + pdf = pd.DataFrame({"idx": idx_col, "x": decimal_x, "y": decimal_y}) + gdf = DataFrame( + { + "idx": idx_col, + "x": cudf.Series(decimal_x), + "y": cudf.Series(decimal_y), + } + ) + + expect_df = pdf.groupby("idx", sort=True).agg(func) + if rmm._cuda.gpu.runtimeGetVersion() < 11000: + with pytest.raises(RuntimeError): + got_df = gdf.groupby("idx", sort=True).agg(func) + else: + got_df = gdf.groupby("idx", sort=True).agg(func) + assert_eq(expect_df["x"], got_df["x"], check_dtype=False) + assert_eq(expect_df["y"], got_df["y"], check_dtype=False) + + @pytest.mark.parametrize( "agg", ["min", "max", "idxmin", "idxmax", "count", "sum", "mean"] ) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 61cae792bb5..5cb0391d76f 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -295,9 +295,11 @@ def cudf_dtype_to_pa_type(dtype): """ if is_categorical_dtype(dtype): raise NotImplementedError() - elif is_list_dtype(dtype): - return dtype.to_arrow() - elif is_struct_dtype(dtype): + elif ( + 
is_list_dtype(dtype) + or is_struct_dtype(dtype) + or is_decimal_dtype(dtype) + ): return dtype.to_arrow() else: return np_to_pa_dtype(np.dtype(dtype))