From 2d9fd5fc38b32329bcc07046f4c522de88ec3b6d Mon Sep 17 00:00:00 2001 From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com> Date: Tue, 29 Jun 2021 09:02:17 -0700 Subject: [PATCH] Expose a Decimal32Dtype in cuDF Python (#8438) Fixes: #8218 Similarly to libcudf's 64-bit decimal type, this PR exposes the `Decimal32Dtype` and its corresponding `Decimal32Column` type. Following this implementation, users can create a Series or DataFrame with `decimal32` dtype. Note: Only `to_arrow` and `from_arrow` methods are currently supported. **Example:** ``` >>> import cudf >>> s = cudf.Series([1,2,3,4], dtype=cudf.Decimal32Dtype(precision=8, scale=2)) >>> s 0 1.00 1 2.00 2 3.00 3 4.00 dtype: decimal32 ``` Authors: - Sheilah Kirui (https://github.com/skirui-source) Approvers: - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/8438 --- python/cudf/cudf/__init__.py | 1 + .../strings/convert/convert_fixed_point.pyx | 4 +- python/cudf/cudf/_lib/types.pyx | 40 +++-- python/cudf/cudf/api/types.py | 10 +- python/cudf/cudf/core/column/__init__.py | 5 +- python/cudf/cudf/core/column/column.py | 68 +++++++-- python/cudf/cudf/core/column/decimal.py | 78 ++++++++-- python/cudf/cudf/core/column/numerical.py | 6 +- python/cudf/cudf/core/column/string.py | 2 +- python/cudf/cudf/core/dtypes.py | 144 ++++++++++++++++-- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/tests/test_decimal.py | 80 +++++----- python/cudf/cudf/utils/dtypes.py | 12 +- 14 files changed, 360 insertions(+), 94 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index b26d95e7951..84e612c1cbe 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -47,6 +47,7 @@ from cudf.core.dtypes import ( CategoricalDtype, Decimal64Dtype, + Decimal32Dtype, IntervalDtype, ListDtype, StructDtype, diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx 
b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx index 38d238b8266..e002d630fc3 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx @@ -28,7 +28,7 @@ from libcpp.string cimport string def from_decimal(Column input_col): """ - Converts a `DecimalColumn` to a `StringColumn`. + Converts a `Decimal64Column` to a `StringColumn`. Parameters ---------- @@ -50,7 +50,7 @@ def from_decimal(Column input_col): def to_decimal(Column input_col, object out_type): """ - Returns a `DecimalColumn` from the provided `StringColumn` + Returns a `Decimal64Column` from the provided `StringColumn` using the scale in the `out_type`. Parameters diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index e9ed4f21ddd..43e5c213947 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -14,9 +14,19 @@ from cudf._lib.types cimport ( ) from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view -from cudf.core.dtypes import ListDtype, StructDtype, Decimal64Dtype -from cudf.utils.dtypes import is_decimal_dtype, is_list_dtype, is_struct_dtype - +from cudf.core.dtypes import ( + ListDtype, + StructDtype, + Decimal64Dtype, + Decimal32Dtype +) +from cudf.utils.dtypes import ( + is_decimal_dtype, + is_list_dtype, + is_struct_dtype, + is_decimal64_dtype, + is_decimal32_dtype +) cimport cudf._lib.cpp.types as libcudf_types @@ -191,10 +201,6 @@ cdef dtype_from_structs_column_view(column_view cv): } return StructDtype(fields) -cdef dtype_from_decimal_column_view(column_view cv): - scale = -cv.type().scale() - return Decimal64Dtype(precision=Decimal64Dtype.MAX_PRECISION, scale=scale) - cdef dtype_from_column_view(column_view cv): cdef libcudf_types.type_id tid = cv.type().id() if tid == libcudf_types.type_id.LIST: @@ -202,10 +208,15 @@ cdef dtype_from_column_view(column_view cv): 
elif tid == libcudf_types.type_id.STRUCT: return dtype_from_structs_column_view(cv) elif tid == libcudf_types.type_id.DECIMAL64: - return dtype_from_decimal_column_view(cv) + return Decimal64Dtype( + precision=Decimal64Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) elif tid == libcudf_types.type_id.DECIMAL32: - raise NotImplementedError("decimal32 types are not supported yet. " - "Use decimal64 instead") + return Decimal32Dtype( + precision=Decimal32Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) else: return cudf_to_np_types[(tid)] @@ -214,14 +225,19 @@ cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: tid = libcudf_types.type_id.LIST elif is_struct_dtype(dtype): tid = libcudf_types.type_id.STRUCT - elif is_decimal_dtype(dtype): + elif is_decimal64_dtype(dtype): tid = libcudf_types.type_id.DECIMAL64 + elif is_decimal32_dtype(dtype): + tid = libcudf_types.type_id.DECIMAL32 else: tid = ( ( np_to_cudf_types[np.dtype(dtype)])) - if tid == libcudf_types.type_id.DECIMAL64: + if tid in ( + libcudf_types.type_id.DECIMAL64, + libcudf_types.type_id.DECIMAL32 + ): return libcudf_types.data_type(tid, -dtype.scale) else: return libcudf_types.data_type(tid) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index a985efeca51..56398bd4f13 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -15,9 +15,11 @@ import cudf from cudf._lib.scalar import DeviceScalar -from cudf.core.dtypes import ( +from cudf.core.dtypes import ( # noqa: F401 _BaseDtype, is_categorical_dtype, + is_decimal32_dtype, + is_decimal64_dtype, is_decimal_dtype, is_interval_dtype, is_list_dtype, @@ -39,11 +41,15 @@ def is_numeric_dtype(obj): Whether or not the array or dtype is of a numeric dtype. 
""" if isclass(obj): - if issubclass(obj, cudf.Decimal64Dtype): + if issubclass(obj, (cudf.Decimal32Dtype, cudf.Decimal64Dtype)): return True if issubclass(obj, _BaseDtype): return False else: + if isinstance(obj, cudf.Decimal32Dtype) or isinstance( + getattr(obj, "dtype", None), cudf.Decimal32Dtype + ): + return True if isinstance(obj, cudf.Decimal64Dtype) or isinstance( getattr(obj, "dtype", None), cudf.Decimal64Dtype ): diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 76d38e00790..18d48e16480 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -23,4 +23,7 @@ from cudf.core.column.struct import StructColumn # noqa: F401 from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401 from cudf.core.column.interval import IntervalColumn # noqa: F401 -from cudf.core.column.decimal import DecimalColumn # noqa: F401 +from cudf.core.column.decimal import ( # noqa: F401 + Decimal32Column, + Decimal64Column, +) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 50367651146..111b96c6da7 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -47,28 +47,30 @@ ) from cudf.utils import ioutils, utils from cudf.utils.dtypes import ( - _is_non_decimal_numeric_dtype, - _is_scalar_or_zero_d_array, check_cast_unsupported_dtype, cudf_dtype_from_pa_type, get_time_unit, - is_categorical_dtype, - is_decimal_dtype, - is_interval_dtype, - is_list_dtype, - is_scalar, - is_string_dtype, - is_struct_dtype, min_unsigned_type, np_to_pa_dtype, ) from cudf.utils.utils import mask_dtype from ...api.types import ( + _is_non_decimal_numeric_dtype, + _is_scalar_or_zero_d_array, infer_dtype, is_bool_dtype, + is_categorical_dtype, + is_decimal32_dtype, + is_decimal64_dtype, + is_decimal_dtype, is_dtype_equal, is_integer_dtype, + is_interval_dtype, + is_list_dtype, + is_scalar, + is_string_dtype, + 
is_struct_dtype, pandas_dtype, ) @@ -279,7 +281,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: ): return cudf.core.column.IntervalColumn.from_arrow(array) elif isinstance(array.type, pa.Decimal128Type): - return cudf.core.column.DecimalColumn.from_arrow(array) + return cudf.core.column.Decimal64Column.from_arrow(array) result = libcudf.interop.from_arrow(data, data.column_names)._data[ "None" @@ -973,7 +975,19 @@ def as_string_column( def as_decimal_column( self, dtype: Dtype, **kwargs - ) -> "cudf.core.column.DecimalColumn": + ) -> Union[ + "cudf.core.column.Decimal32Column", "cudf.core.column.Decimal64Column" + ]: + raise NotImplementedError + + def as_decimal64_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.Decimal64Column": + raise NotImplementedError + + def as_decimal32_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.Decimal32Column": raise NotImplementedError def apply_boolean_mask(self, mask) -> ColumnBase: @@ -1468,10 +1482,22 @@ def build_column( null_count=null_count, children=children, ) - elif is_decimal_dtype(dtype): + elif is_decimal64_dtype(dtype): if size is None: raise TypeError("Must specify size") - return cudf.core.column.DecimalColumn( + return cudf.core.column.Decimal64Column( + data=data, + size=size, + offset=offset, + dtype=dtype, + mask=mask, + null_count=null_count, + children=children, + ) + elif is_decimal32_dtype(dtype): + if size is None: + raise TypeError("Must specify size") + return cudf.core.column.Decimal32Column( data=data, size=size, offset=offset, @@ -2020,8 +2046,20 @@ def as_column( precision=dtype.precision, scale=dtype.scale ), ) - return cudf.core.column.DecimalColumn.from_arrow(data) - dtype = pandas_dtype(dtype) + return cudf.core.column.Decimal64Column.from_arrow( + data + ) + if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): + data = pa.array( + arbitrary, + type=pa.decimal128( + precision=dtype.precision, scale=dtype.scale + ), + ) + return 
cudf.core.column.Decimal32Column.from_arrow( + data + ) + dtype = pd.api.types.pandas_dtype(dtype) if is_categorical_dtype(dtype) or is_interval_dtype(dtype): raise TypeError else: diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 2f0ddb78987..acb8c02a220 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -16,8 +16,8 @@ ) from cudf._typing import Dtype from cudf.core.buffer import Buffer -from cudf.core.column import ColumnBase, NumericalColumn, as_column -from cudf.core.dtypes import Decimal64Dtype +from cudf.core.column import ColumnBase, as_column +from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype from cudf.utils.dtypes import is_scalar from cudf.utils.utils import pa_mask_buffer_to_mask @@ -25,7 +25,60 @@ from .numerical_base import NumericalBaseColumn -class DecimalColumn(NumericalBaseColumn): +class Decimal32Column(NumericalBaseColumn): + dtype: Decimal32Dtype + + @classmethod + def from_arrow(cls, data: pa.Array): + dtype = Decimal32Dtype.from_arrow(data.type) + mask_buf = data.buffers()[0] + mask = ( + mask_buf + if mask_buf is None + else pa_mask_buffer_to_mask(mask_buf, len(data)) + ) + data_128 = cp.array(np.frombuffer(data.buffers()[1]).view("int32")) + data_32 = data_128[::4].copy() + return cls( + data=Buffer(data_32.view("uint8")), + size=len(data), + dtype=dtype, + offset=data.offset, + mask=mask, + ) + + def to_arrow(self): + data_buf_32 = self.base_data.to_host_array().view("int32") + data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32") + + # use striding to set the first 32 bits of each 128-bit chunk: + data_buf_128[::4] = data_buf_32 + # use striding again to set the remaining bits of each 128-bit chunk: + # 0 for non-negative values, -1 for negative values: + data_buf_128[1::4] = np.piecewise( + data_buf_32, [data_buf_32 < 0], [-1, 0] + ) + data_buf_128[2::4] = np.piecewise( + data_buf_32, [data_buf_32 < 0], [-1, 0] + ) + 
data_buf_128[3::4] = np.piecewise( + data_buf_32, [data_buf_32 < 0], [-1, 0] + ) + data_buf = pa.py_buffer(data_buf_128) + mask_buf = ( + self.base_mask + if self.base_mask is None + else pa.py_buffer(self.base_mask.to_host_array()) + ) + return pa.Array.from_buffers( + type=self.dtype.to_arrow(), + offset=self._offset, + length=self.size, + buffers=[mask_buf, data_buf], + ) + + +class Decimal64Column(NumericalBaseColumn): dtype: Decimal64Dtype def __truediv__(self, other): @@ -61,6 +114,7 @@ def from_arrow(cls, data: pa.Array): def to_arrow(self): data_buf_64 = self.base_data.to_host_array().view("int64") data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64") + # use striding to set the first 64 bits of each 128-bit chunk: data_buf_128[::2] = data_buf_64 # use striding again to set the remaining bits of each 128-bit chunk: @@ -99,7 +153,11 @@ def binary_operator(self, op, other, reflect=False): elif op in ("eq", "ne", "lt", "gt", "le", "ge"): if not isinstance( other, - (DecimalColumn, cudf.core.column.NumericalColumn, cudf.Scalar), + ( + Decimal64Column, + cudf.core.column.NumericalColumn, + cudf.Scalar, + ), ): raise TypeError( f"Operator {op} not supported between" @@ -146,7 +204,9 @@ def _decimal_quantile( def as_decimal_column( self, dtype: Dtype, **kwargs - ) -> "cudf.core.column.DecimalColumn": + ) -> Union[ + "cudf.core.column.Decimal32Column", "cudf.core.column.Decimal64Column" + ]: if ( isinstance(dtype, Decimal64Dtype) and dtype.scale < self.dtype.scale @@ -185,8 +245,8 @@ def fillna( if isinstance(value, (int, Decimal)): value = cudf.Scalar(value, dtype=self.dtype) elif ( - isinstance(value, DecimalColumn) - or isinstance(value, NumericalColumn) + isinstance(value, Decimal64Column) + or isinstance(value, cudf.core.column.NumericalColumn) and is_integer_dtype(value.dtype) ): value = value.astype(self.dtype) @@ -220,8 +280,8 @@ def __cuda_array_interface__(self): ) def _with_type_metadata( - self: "cudf.core.column.DecimalColumn", dtype: Dtype - ) 
-> "cudf.core.column.DecimalColumn": + self: "cudf.core.column.Decimal64Column", dtype: Dtype + ) -> "cudf.core.column.Decimal64Column": if isinstance(dtype, Decimal64Dtype): self.dtype.precision = dtype.precision diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 64a0780e9f9..cee9b693bdf 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -139,14 +139,14 @@ def binary_operator( ( NumericalColumn, cudf.Scalar, - cudf.core.column.DecimalColumn, + cudf.core.column.Decimal64Column, ), ) or np.isscalar(rhs) ): msg = "{!r} operator not supported between {} and {}" raise TypeError(msg.format(binop, type(self), type(rhs))) - if isinstance(rhs, cudf.core.column.DecimalColumn): + if isinstance(rhs, cudf.core.column.Decimal64Column): lhs: Union[ScalarLike, ColumnBase] = self.as_decimal_column( Decimal64Dtype(Decimal64Dtype.MAX_PRECISION, 0) ) @@ -249,7 +249,7 @@ def as_timedelta_column( def as_decimal_column( self, dtype: Dtype, **kwargs - ) -> "cudf.core.column.DecimalColumn": + ) -> "cudf.core.column.Decimal64Column": return libcudf.unary.cast(self, dtype) def as_numerical_column(self, dtype: Dtype, **kwargs) -> NumericalColumn: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c1d98ac5600..af5b77124a1 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5192,7 +5192,7 @@ def as_timedelta_column( def as_decimal_column( self, dtype: Dtype, **kwargs - ) -> "cudf.core.column.DecimalColumn": + ) -> "cudf.core.column.Decimal64Column": return cpp_to_decimal(self, dtype) def as_string_column( diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index e63c538c108..6dbe55d0bb8 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -308,9 +308,117 @@ def deserialize(cls, header: dict, frames: list): return cls(fields) +class 
Decimal32Dtype(_BaseDtype): + + name = "decimal32" + _metadata = ("precision", "scale") + MAX_PRECISION = np.floor(np.log10(np.iinfo("int32").max)) + + def __init__(self, precision, scale=0): + """ + Parameters + ---------- + precision : int + The total number of digits in each value of this dtype + scale : int, optional + The scale of the Decimal32Dtype. See Notes below. + + Notes + ----- + When the scale is positive: + - numbers with fractional parts (e.g., 0.0042) can be represented + - the scale is the total number of digits to the right of the + decimal point + When the scale is negative: + - only multiples of powers of 10 (including 10**0) can be + represented (e.g., 1729, 4200, 1000000) + - the scale represents the number of trailing zeros in the value. + For example, 42 is representable with precision=2 and scale=0. + 13.0051 is representable with precision=6 and scale=4, + and *not* representable with precision<6 or scale<4. + """ + self._validate(precision, scale) + self._typ = pa.decimal128(precision, scale) + + @property + def str(self): + return f"decimal32({self.precision}, {self.scale})" + + @property + def precision(self): + return self._typ.precision + + @precision.setter + def precision(self, value): + self._validate(value, self.scale) + self._typ = pa.decimal128(precision=value, scale=self.scale) + + @property + def scale(self): + return self._typ.scale + + @property + def type(self): + # might need to account for precision and scale here + return decimal.Decimal + + def to_arrow(self): + return self._typ + + @classmethod + def from_arrow(cls, typ): + return cls(typ.precision, typ.scale) + + @property + def itemsize(self): + return 4 + + def __repr__(self): + return ( + f"{self.__class__.__name__}" + f"(precision={self.precision}, scale={self.scale})" + ) + + def __hash__(self): + return hash(self._typ) + + @classmethod + def _validate(cls, precision, scale=0): + if precision > Decimal32Dtype.MAX_PRECISION: + raise ValueError( + f"Cannot 
construct a {cls.__name__}" + f" with precision > {cls.MAX_PRECISION}" + ) + if abs(scale) > precision: + raise ValueError(f"scale={scale} exceeds precision={precision}") + + @classmethod + def _from_decimal(cls, decimal): + """ + Create a cudf.Decimal32Dtype from a decimal.Decimal object + """ + metadata = decimal.as_tuple() + precision = max(len(metadata.digits), -metadata.exponent) + return cls(precision, -metadata.exponent) + + def serialize(self) -> Tuple[dict, list]: + return ( + { + "type-serialized": pickle.dumps(type(self)), + "precision": self.precision, + "scale": self.scale, + }, + [], + ) + + @classmethod + def deserialize(cls, header: dict, frames: list): + return cls(header["precision"], header["scale"]) + + class Decimal64Dtype(_BaseDtype): - name = "decimal" + name = "decimal64" _metadata = ("precision", "scale") MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max)) @@ -578,15 +686,7 @@ def is_decimal_dtype(obj): bool Whether or not the array-like or dtype is of the decimal dtype. 
""" - return ( - type(obj) is cudf.core.dtypes.Decimal64Dtype - or obj is cudf.core.dtypes.Decimal64Dtype - or ( - isinstance(obj, str) - and obj == cudf.core.dtypes.Decimal64Dtype.name - ) - or (hasattr(obj, "dtype") and is_decimal_dtype(obj.dtype)) - ) + return is_decimal32_dtype(obj) or is_decimal64_dtype(obj) def is_interval_dtype(obj): @@ -613,3 +713,27 @@ def is_interval_dtype(obj): ) or (hasattr(obj, "dtype") and is_interval_dtype(obj.dtype)) ) + + +def is_decimal32_dtype(obj): + return ( + type(obj) is cudf.core.dtypes.Decimal32Dtype + or obj is cudf.core.dtypes.Decimal32Dtype + or ( + isinstance(obj, str) + and obj == cudf.core.dtypes.Decimal32Dtype.name + ) + or (hasattr(obj, "dtype") and is_decimal32_dtype(obj.dtype)) + ) + + +def is_decimal64_dtype(obj): + return ( + type(obj) is cudf.core.dtypes.Decimal64Dtype + or obj is cudf.core.dtypes.Decimal64Dtype + or ( + isinstance(obj, str) + and obj == cudf.core.dtypes.Decimal64Dtype.name + ) + or (hasattr(obj, "dtype") and is_decimal64_dtype(obj.dtype)) + ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5e4293f8f8b..3629358ee9f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -506,7 +506,7 @@ def _concat( # Reassign precision for any decimal cols for name, col in out._data.items(): - if isinstance(col, cudf.core.column.DecimalColumn): + if isinstance(col, cudf.core.column.Decimal64Column): col = col._with_type_metadata(tables[0]._data[name].dtype) # Reassign index and column names diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 31ebf90b3c2..2dada48be4d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2414,7 +2414,7 @@ def _concat(cls, objs, axis=0, index=True): col = _concat_columns([o._column for o in objs]) - if isinstance(col, cudf.core.column.DecimalColumn): + if isinstance(col, cudf.core.column.Decimal64Column): col = 
col._with_type_metadata(objs[0]._column.dtype) return cls(data=col, index=index, name=name) diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index bf1845b9315..4816094814a 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -8,7 +8,7 @@ import pytest import cudf -from cudf.core.column import DecimalColumn, NumericalColumn +from cudf.core.column import Decimal32Column, Decimal64Column, NumericalColumn from cudf.core.dtypes import Decimal64Dtype from cudf.tests.utils import ( FLOAT_TYPES, @@ -18,39 +18,51 @@ assert_eq, ) - -@pytest.mark.parametrize( - "data", - [ - [Decimal("1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], - [Decimal("-1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], - [1], - [-1], - [1, 2, 3, 4], - [42, 1729, 4104], - [1, 2, None, 4], - [None, None, None], - [], - ], -) -@pytest.mark.parametrize( - "typ", - [ - pa.decimal128(precision=4, scale=2), - pa.decimal128(precision=5, scale=3), - pa.decimal128(precision=6, scale=4), - ], -) -def test_round_trip_decimal_column(data, typ): - pa_arr = pa.array(data, type=typ) - col = DecimalColumn.from_arrow(pa_arr) - assert pa_arr.equals(col.to_arrow()) +data_ = [ + [Decimal("1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], + [Decimal("-1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], + [1], + [-1], + [1, 2, 3, 4], + [42, 1729, 4104], + [1, 2, None, 4], + [None, None, None], + [], +] +typ_ = [ + pa.decimal128(precision=4, scale=2), + pa.decimal128(precision=5, scale=3), + pa.decimal128(precision=6, scale=4), +] + + +@pytest.mark.parametrize("data_", data_) +@pytest.mark.parametrize("typ_", typ_) +def test_round_trip_decimal64_column(data_, typ_): + pa_arr = pa.array(data_, type=typ_) + col_64 = Decimal64Column.from_arrow(pa_arr) + assert pa_arr.equals(col_64.to_arrow()) + + +@pytest.mark.parametrize("data_", data_) +@pytest.mark.parametrize("typ_", typ_) +def 
test_round_trip_decimal32_column(data_, typ_): + pa_arr = pa.array(data_, type=typ_) + col_32 = Decimal32Column.from_arrow(pa_arr) + assert pa_arr.equals(col_32.to_arrow()) + + +def test_from_arrow_max_precision_decimal64(): + with pytest.raises(ValueError): + Decimal64Column.from_arrow( + pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=19)) + ) -def test_from_arrow_max_precision(): +def test_from_arrow_max_precision_decimal32(): with pytest.raises(ValueError): - DecimalColumn.from_arrow( - pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=19)) + Decimal32Column.from_arrow( + pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=10)) ) @@ -84,7 +96,7 @@ def test_typecast_from_float_to_decimal(data, from_dtype, to_dtype): pa_arr = got.to_arrow().cast( pa.decimal128(to_dtype.precision, to_dtype.scale) ) - expected = cudf.Series(DecimalColumn.from_arrow(pa_arr)) + expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) got = got.astype(to_dtype) @@ -124,7 +136,7 @@ def test_typecast_from_int_to_decimal(data, from_dtype, to_dtype): .cast("float64") .cast(pa.decimal128(to_dtype.precision, to_dtype.scale)) ) - expected = cudf.Series(DecimalColumn.from_arrow(pa_arr)) + expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) got = got.astype(to_dtype) @@ -164,7 +176,7 @@ def test_typecast_to_from_decimal(data, from_dtype, to_dtype): pa_arr = got.to_arrow().cast( pa.decimal128(to_dtype.precision, to_dtype.scale), safe=False ) - expected = cudf.Series(DecimalColumn.from_arrow(pa_arr)) + expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) got = got.astype(to_dtype) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 4f0c02f5002..e1ae87e5089 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -14,8 +14,11 @@ from cudf.api.types import ( # noqa: F401 _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, + infer_dtype, is_categorical_dtype, is_datetime_dtype as 
is_datetime_dtype, + is_decimal32_dtype, + is_decimal64_dtype, is_decimal_dtype, is_integer, is_integer_dtype, @@ -27,6 +30,7 @@ is_string_dtype, is_struct_dtype, is_timedelta_dtype, + pandas_dtype, ) from cudf.core._compat import PANDAS_GE_120 @@ -175,7 +179,9 @@ def cudf_dtype_from_pydata_dtype(dtype): if is_categorical_dtype(dtype): return cudf.core.dtypes.CategoricalDtype - elif is_decimal_dtype(dtype): + elif is_decimal32_dtype(dtype): + return cudf.core.dtypes.Decimal32Dtype + elif is_decimal64_dtype(dtype): return cudf.core.dtypes.Decimal64Dtype elif dtype in cudf._lib.types.np_to_cudf_types: return dtype.type @@ -210,7 +216,7 @@ def cudf_dtype_from_pa_type(typ): elif pa.types.is_decimal(typ): return cudf.core.dtypes.Decimal64Dtype.from_arrow(typ) else: - return pd.api.types.pandas_dtype(typ.to_pandas_dtype()) + return pandas_dtype(typ.to_pandas_dtype()) def to_cudf_compatible_scalar(val, dtype=None): @@ -250,7 +256,7 @@ def to_cudf_compatible_scalar(val, dtype=None): elif isinstance(val, pd.Timedelta): val = val.to_timedelta64() - val = pd.api.types.pandas_dtype(type(val)).type(val) + val = pandas_dtype(type(val)).type(val) if dtype is not None: val = val.astype(dtype)