From 08e119bdf5e47840e0f021f829e9837e75a8be48 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Thu, 3 Jun 2021 16:56:39 -0700 Subject: [PATCH 01/13] create pull request --- python/cudf/cudf/core/dtypes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index f0b0dbba4a5..d5b78a872df 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -237,6 +237,8 @@ def __repr__(self): def __hash__(self): return hash(self._typ) +class Decimal32Dtype(_BaseDtype): + pass class Decimal64Dtype(_BaseDtype): From ad4a10b72c0418b17e6dc2d0c8ac2a4e64f4f7f9 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Thu, 3 Jun 2021 17:03:03 -0700 Subject: [PATCH 02/13] create pull request --- python/cudf/cudf/core/dtypes.py | 100 +++++++++++++++++++++++++++++++- 1 file changed, 99 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index d5b78a872df..41aa87a24bf 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -238,7 +238,105 @@ def __hash__(self): return hash(self._typ) class Decimal32Dtype(_BaseDtype): - pass + + name = "decimal" + _metadata = ("precision", "scale") + MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max)) + + def __init__(self, precision, scale=0): + """ + Parameters + ---------- + precision : int + The total number of digits in each value of this dtype + scale : int, optional + The scale of the Decimal64Dtype. See Notes below. + + Notes + ----- + When the scale is positive: + - numbers with fractional parts (e.g., 0.0042) can be represented + - the scale is the total number of digits to the right of the + decimal point + When the scale is negative: + - only multiples of powers of 10 (including 10**0) can be + represented (e.g., 1729, 4200, 1000000) + - the scale represents the number of trailing zeros in the value. + For example, 42 is representable with precision=2 and scale=0. + 13.0051 is representable with precision=6 and scale=4, + and *not* representable with precision<6 or scale<4. + """ + self._validate(precision, scale) + self._typ = pa.decimal128(precision, scale) + + @property + def str(self): + return f"decimal32({self.precision}, {self.scale})" + + @property + def precision(self): + return self._typ.precision + + @precision.setter + def precision(self, value): + self._validate(value, self.scale) + self._typ = pa.decimal128(precision=value, scale=self.scale) + + @property + def scale(self): + return self._typ.scale + + @property + def type(self): + # might need to account for precision and scale here + return decimal.Decimal + + def to_arrow(self): + return self._typ + + @classmethod + def from_arrow(cls, typ): + return cls(typ.precision, typ.scale) + + @property + def itemsize(self): + return 8 + + def __repr__(self): + return ( + f"{self.__class__.__name__}" + f"(precision={self.precision}, scale={self.scale})" + ) + + def __hash__(self): + return hash(self._typ) + + @classmethod + def _validate(cls, precision, scale=0): + if precision > Decimal64Dtype.MAX_PRECISION: + raise ValueError( + f"Cannot construct a {cls.__name__}" + f" with precision > {cls.MAX_PRECISION}" + ) + if abs(scale) > precision: + raise ValueError(f"scale={scale} exceeds precision={precision}") + + @classmethod + def _from_decimal(cls, decimal): + """ + Create a cudf.Decimal32Dtype from a decimal.Decimal object + """ + metadata = decimal.as_tuple() + precision = max(len(metadata.digits), -metadata.exponent) + return cls(precision, -metadata.exponent) + + def serialize(self) -> Tuple[dict, list]: + return {"precision": self.precision, "scale": self.scale}, [] + + @classmethod + def deserialize(cls, header: dict, frames: list): + return cls(header["precision"], header["scale"]) + class Decimal64Dtype(_BaseDtype): From 02ab5548e5ac0e9592554d43a87e02b52403a9c7 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Tue, 8 Jun 2021 22:53:18 -0700 Subject: [PATCH 03/13] . --- python/cudf/cudf/_lib/types.pyx | 21 +++++++++------------ python/cudf/cudf/core/dtypes.py | 13 ++++++++++--- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index e9ed4f21ddd..9c16c593a6c 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -191,21 +191,16 @@ cdef dtype_from_structs_column_view(column_view cv): } return StructDtype(fields) -cdef dtype_from_decimal_column_view(column_view cv): - scale = -cv.type().scale() - return Decimal64Dtype(precision=Decimal64Dtype.MAX_PRECISION, scale=scale) - cdef dtype_from_column_view(column_view cv): cdef libcudf_types.type_id tid = cv.type().id() if tid == libcudf_types.type_id.LIST: return dtype_from_lists_column_view(cv) elif tid == libcudf_types.type_id.STRUCT: - return dtype_from_structs_column_view(cv) - elif tid == libcudf_types.type_id.DECIMAL64: - return dtype_from_decimal_column_view(cv) - elif tid == libcudf_types.type_id.DECIMAL32: - raise NotImplementedError("decimal32 types are not supported yet. " - "Use decimal64 instead") + sreturn dtype_from_structs_column_view(cv) + elif tid == libcudf_types.type_id.DECIMAL64: + Decimal64Dtype(precision=Decimal64Dtype.MAX_PRECISION, scale=-cv.type().scale()) + elif tid == libcudf_types.type_id.DECIMAL32: + Decimal32Dtype(precision=Decimal32Dtype.MAX_PRECISION, scale=-cv.type().scale()) else: return cudf_to_np_types[(tid)] @@ -214,14 +209,16 @@ cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: tid = libcudf_types.type_id.LIST elif is_struct_dtype(dtype): tid = libcudf_types.type_id.STRUCT - elif is_decimal_dtype(dtype): + elif is_decimal64_dtype(dtype): tid = libcudf_types.type_id.DECIMAL64 + elif is_decimal32_dtype(dtype): + tid = libcudf_types.type_id.DECIMAL32 else: tid = ( ( np_to_cudf_types[np.dtype(dtype)])) - if tid == libcudf_types.type_id.DECIMAL64: + if isinstance(tid, [libcudf_types.type_id.DECIMAL32, libcudf_types.type_id.DECIMAL64]): return libcudf_types.data_type(tid, -dtype.scale) else: return libcudf_types.data_type(tid) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 41aa87a24bf..9aebfbaf6a9 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -241,7 +241,7 @@ class Decimal32Dtype(_BaseDtype): name = "decimal" _metadata = ("precision", "scale") - MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max)) + MAX_PRECISION = np.floor(np.log10(np.iinfo("int32").max)) def __init__(self, precision, scale=0): """ @@ -250,7 +250,7 @@ def __init__(self, precision, scale=0): precision : int The total number of digits in each value of this dtype scale : int, optional - The scale of the Decimal64Dtype. See Notes below. + The scale of the Decimal32Dtype. See Notes below. Notes ----- @@ -313,7 +313,7 @@ def __hash__(self): @classmethod def _validate(cls, precision, scale=0): - if precision > Decimal64Dtype.MAX_PRECISION: + if precision > Decimal32Dtype.MAX_PRECISION: raise ValueError( f"Cannot construct a {cls.__name__}" f" with precision > {cls.MAX_PRECISION}" @@ -337,6 +337,10 @@ def serialize(self) -> Tuple[dict, list]: def deserialize(cls, header: dict, frames: list): return cls(header["precision"], header["scale"]) + @classmethod + def is_decimal32_dtype(): + pass + class Decimal64Dtype(_BaseDtype): @@ -438,6 +442,9 @@ def serialize(self) -> Tuple[dict, list]: def deserialize(cls, header: dict, frames: list): return cls(header["precision"], header["scale"]) + @classmethod + def is_decimal64_dtype(): + pass class IntervalDtype(StructDtype): name = "interval" From 0cf41a3e7bfeedf7ff7c0f92ad425db9e6c2a4b1 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 9 Jun 2021 00:38:51 -0700 Subject: [PATCH 04/13] added is_decimal32dtype/ is_decimal64_dtype --- python/cudf/cudf/_lib/types.pyx | 4 ++-- python/cudf/cudf/core/dtypes.py | 8 +------- python/cudf/cudf/utils/dtypes.py | 26 +++++++++++++++++++++++++- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 9c16c593a6c..d448fc31425 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -15,7 +15,7 @@ from cudf._lib.types cimport ( from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf.core.dtypes import ListDtype, StructDtype, Decimal64Dtype -from cudf.utils.dtypes import is_decimal_dtype, is_list_dtype, is_struct_dtype +from cudf.utils.dtypes import is_decimal_dtype, is_list_dtype, is_struct_dtype, is_decimal32_dtype, is_decimal64_dtype cimport cudf._lib.cpp.types as libcudf_types @@ -196,7 +196,7 @@ cdef dtype_from_column_view(column_view cv): if tid == libcudf_types.type_id.LIST: return dtype_from_lists_column_view(cv) elif tid == libcudf_types.type_id.STRUCT: - sreturn dtype_from_structs_column_view(cv) + return dtype_from_structs_column_view(cv) elif tid == libcudf_types.type_id.DECIMAL64: Decimal64Dtype(precision=Decimal64Dtype.MAX_PRECISION, scale=-cv.type().scale()) elif tid == libcudf_types.type_id.DECIMAL32: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 9aebfbaf6a9..fcf09e593d1 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -237,6 +237,7 @@ def __repr__(self): def __hash__(self): return hash(self._typ) + class Decimal32Dtype(_BaseDtype): name = "decimal" @@ -337,10 +338,6 @@ def serialize(self) -> Tuple[dict, list]: def deserialize(cls, header: dict, frames: list): return cls(header["precision"], header["scale"]) - @classmethod - def is_decimal32_dtype(): - pass - class Decimal64Dtype(_BaseDtype): @@ -442,9 +439,6 @@ def serialize(self) -> Tuple[dict, list]: def deserialize(cls, header: dict, frames: list): return cls(header["precision"], header["scale"]) - @classmethod - def is_decimal64_dtype(): - pass class IntervalDtype(StructDtype): name = "interval" diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 0b59116f8e6..4311271130b 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -10,7 +10,7 @@ import numpy as np import pandas as pd import pyarrow as pa -from pandas.core.dtypes.common import infer_dtype_from_object +from pandas.core.dtypes.common import infer_dtype_from_object, pandas_dtype from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType import cudf @@ -290,6 +290,30 @@ def is_decimal_dtype(obj): ) +def is_decimal32_dtype(obj): + return ( + type(obj) is cudf.core.dtypes.Decimal32Dtype + or obj is cudf.core.dtypes.Decimal32Dtype + or ( + isinstance(obj, str) + and obj == cudf.core.dtypes.Decimal32Dtype.name + ) + or (hasattr(obj, "dtype") and is_decimal32_dtype(obj.dtype)) + ) + + +def is_decimal64_dtype(obj): + return ( + type(obj) is cudf.core.dtypes.Decimal64Dtype + or obj is cudf.core.dtypes.Decimal64Dtype + or ( + isinstance(obj, str) + and obj == cudf.core.dtypes.Decimal64Dtype.name + ) + or (hasattr(obj, "dtype") and is_decimal64_dtype(obj.dtype)) + ) + + def _find_common_type_decimal(dtypes): # Find the largest scale and the largest difference between # precision and scale of the columns to be concatenated From 376766dd17ba05b308325312b351a344eee25f69 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 9 Jun 2021 10:33:20 -0700 Subject: [PATCH 05/13] replaced DecimalColumn with Decimal64Column in all cudf-python files --- python/cudf/cudf/__init__.py | 1 + .../_lib/strings/convert/convert_fixed_point.pyx | 4 ++-- python/cudf/cudf/_lib/types.pyx | 6 +++--- python/cudf/cudf/core/column/__init__.py | 2 +- python/cudf/cudf/core/column/column.py | 10 +++++----- python/cudf/cudf/core/column/decimal.py | 10 +++++----- python/cudf/cudf/core/column/numerical.py | 6 +++--- python/cudf/cudf/core/column/string.py | 2 +- python/cudf/cudf/core/dtypes.py | 2 +- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/tests/test_decimal.py | 12 ++++++------ python/cudf/cudf/utils/dtypes.py | 8 +------- 13 files changed, 31 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index c8a4894f4be..5dfc241e54c 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -45,6 +45,7 @@ from cudf.core.dtypes import ( CategoricalDtype, Decimal64Dtype, + Decimal32Dtype, ListDtype, StructDtype, ) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx index 38d238b8266..e002d630fc3 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx @@ -28,7 +28,7 @@ from libcpp.string cimport string def from_decimal(Column input_col): """ - Converts a `DecimalColumn` to a `StringColumn`. + Converts a `Decimal64Column` to a `StringColumn`. Parameters ---------- @@ -50,7 +50,7 @@ def from_decimal(Column input_col): def to_decimal(Column input_col, object out_type): """ - Returns a `DecimalColumn` from the provided `StringColumn` + Returns a `Decimal64Column` from the provided `StringColumn` using the scale in the `out_type`. Parameters diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index d448fc31425..0cc56352c5f 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -14,8 +14,8 @@ from cudf._lib.types cimport ( ) from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view -from cudf.core.dtypes import ListDtype, StructDtype, Decimal64Dtype -from cudf.utils.dtypes import is_decimal_dtype, is_list_dtype, is_struct_dtype, is_decimal32_dtype, is_decimal64_dtype +from cudf.core.dtypes import ListDtype, StructDtype, Decimal64Dtype, Decimal32Dtype +from cudf.utils.dtypes import is_decimal_dtype, is_list_dtype, is_struct_dtype, is_decimal64_dtype, is_decimal32_dtype cimport cudf._lib.cpp.types as libcudf_types @@ -218,7 +218,7 @@ cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: ( np_to_cudf_types[np.dtype(dtype)])) - if isinstance(tid, [libcudf_types.type_id.DECIMAL32, libcudf_types.type_id.DECIMAL64]): + if tid in (libcudf_types.type_id.DECIMAL64, libcudf_types.type_id.DECIMAL32): return libcudf_types.data_type(tid, -dtype.scale) else: return libcudf_types.data_type(tid) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 32cb557548f..6a494e8885c 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -21,4 +21,4 @@ from cudf.core.column.struct import StructColumn # noqa: F401 from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401 from cudf.core.column.interval import IntervalColumn # noqa: F401 -from cudf.core.column.decimal import DecimalColumn # noqa: F401 +from cudf.core.column.decimal import Decimal64Column # noqa: F401 diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index fd3a47aea64..42f231e28c9 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -289,7 +289,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: ): return cudf.core.column.IntervalColumn.from_arrow(array) elif isinstance(array.type, pa.Decimal128Type): - return cudf.core.column.DecimalColumn.from_arrow(array) + return cudf.core.column.Decimal64Column.from_arrow(array) result = libcudf.interop.from_arrow(data, data.column_names)._data[ "None" @@ -978,7 +978,7 @@ def as_string_column( def as_decimal_column( self, dtype: Dtype, **kwargs - ) -> "cudf.core.column.DecimalColumn": + ) -> "cudf.core.column.Decimal64Column": raise NotImplementedError def apply_boolean_mask(self, mask) -> ColumnBase: @@ -1490,7 +1490,7 @@ def build_column( elif is_decimal_dtype(dtype): if size is None: raise TypeError("Must specify size") - return cudf.core.column.DecimalColumn( + return cudf.core.column.Decimal64Column( data=data, size=size, offset=offset, @@ -1963,7 +1963,7 @@ def as_column( precision=dtype.precision, scale=dtype.scale ), ) - return cudf.core.column.DecimalColumn.from_arrow(data) + return cudf.core.column.Decimal64Column.from_arrow(data) dtype = pd.api.types.pandas_dtype(dtype) if is_categorical_dtype(dtype) or is_interval_dtype(dtype): raise TypeError @@ -2212,7 +2212,7 @@ def _copy_type_metadata_from_arrow( Decimal64Dtype, copy precisions. """ if pa.types.is_decimal(arrow_array.type) and isinstance( - cudf_column, cudf.core.column.DecimalColumn + cudf_column, cudf.core.column.Decimal64Column ): cudf_column.dtype.precision = arrow_array.type.precision elif pa.types.is_struct(arrow_array.type) and isinstance( diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 459cfae6fdb..d8c27ae58ed 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -24,7 +24,7 @@ from .numerical_base import NumericalBaseColumn -class DecimalColumn(NumericalBaseColumn): +class Decimal64Column(NumericalBaseColumn): dtype: Decimal64Dtype def __truediv__(self, other): @@ -98,7 +98,7 @@ def binary_operator(self, op, other, reflect=False): elif op in ("eq", "ne", "lt", "gt", "le", "ge"): if not isinstance( other, - (DecimalColumn, cudf.core.column.NumericalColumn, cudf.Scalar), + (Decimal64Dtype, cudf.core.column.NumericalColumn, cudf.Scalar), ): raise TypeError( f"Operator {op} not supported between" @@ -145,7 +145,7 @@ def _decimal_quantile( def as_decimal_column( self, dtype: Dtype, **kwargs - ) -> "cudf.core.column.DecimalColumn": + ) -> "cudf.core.column.Decimal64Column": if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype) @@ -175,7 +175,7 @@ def fillna( if isinstance(value, (int, Decimal)): value = cudf.Scalar(value, dtype=self.dtype) elif ( - isinstance(value, DecimalColumn) + isinstance(value, Decimal64Column) or isinstance(value, NumericalColumn) and is_integer_dtype(value.dtype) ): @@ -215,7 +215,7 @@ def _copy_type_metadata(self: ColumnBase, other: ColumnBase) -> ColumnBase: In addition to the default behavior, if `other` is also a decimal column the precision is copied over. """ - if isinstance(other, DecimalColumn): + if isinstance(other, Decimal64Column): other.dtype.precision = self.dtype.precision # type: ignore # Have to ignore typing here because it misdiagnoses super(). return super()._copy_type_metadata(other) # type: ignore diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index e35cc744434..9fce3c61d2f 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -139,14 +139,14 @@ def binary_operator( ( NumericalColumn, cudf.Scalar, - cudf.core.column.DecimalColumn, + cudf.core.column.Decimal64Column, ), ) or np.isscalar(rhs) ): msg = "{!r} operator not supported between {} and {}" raise TypeError(msg.format(binop, type(self), type(rhs))) - if isinstance(rhs, cudf.core.column.DecimalColumn): + if isinstance(rhs, cudf.core.column.Decimal64Column): lhs: Union[ScalarLike, ColumnBase] = self.as_decimal_column( Decimal64Dtype(Decimal64Dtype.MAX_PRECISION, 0) ) @@ -249,7 +249,7 @@ def as_timedelta_column( def as_decimal_column( self, dtype: Dtype, **kwargs - ) -> "cudf.core.column.DecimalColumn": + ) -> "cudf.core.column.Decimal64Column": return libcudf.unary.cast(self, dtype) def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 0b83548a92d..44f8ab7451f 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5146,7 +5146,7 @@ def as_timedelta_column( def as_decimal_column( self, dtype: Dtype, **kwargs - ) -> "cudf.core.column.DecimalColumn": + ) -> "cudf.core.column.Decimal64Column": return cpp_to_decimal(self, dtype) def as_string_column(self, dtype: Dtype, format=None) -> StringColumn: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index fcf09e593d1..e58e4ef9d39 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -301,7 +301,7 @@ def from_arrow(cls, typ): @property def itemsize(self): - return 8 + return 4 def __repr__(self): return ( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 4ea3decdc50..b989e9f5731 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -504,7 +504,7 @@ def _concat( # Reassign precision for any decimal cols for name, col in out._data.items(): - if isinstance(col, cudf.core.column.DecimalColumn): + if isinstance(col, cudf.core.column.Decimal64Column): col = tables[0]._data[name]._copy_type_metadata(col) # Reassign index and column names diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c5a7b07d778..4ed528f3e1e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2406,7 +2406,7 @@ def _concat(cls, objs, axis=0, index=True): col = _concat_columns([o._column for o in objs]) - if isinstance(col, cudf.core.column.DecimalColumn): + if isinstance(col, cudf.core.column.Decimal64Column): col = objs[0]._column._copy_type_metadata(col) return cls(data=col, index=index, name=name) diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index 073a8e443c7..b93762c8e24 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -7,7 +7,7 @@ import pytest import cudf -from cudf.core.column import DecimalColumn, NumericalColumn +from cudf.core.column import Decimal64Column, NumericalColumn from cudf.core.dtypes import Decimal64Dtype from cudf.tests.utils import ( FLOAT_TYPES, @@ -42,13 +42,13 @@ ) def test_round_trip_decimal_column(data, typ): pa_arr = pa.array(data, type=typ) - col = DecimalColumn.from_arrow(pa_arr) + col = Decimal64Column.from_arrow(pa_arr) assert pa_arr.equals(col.to_arrow()) def test_from_arrow_max_precision(): with pytest.raises(ValueError): - DecimalColumn.from_arrow( + Decimal64Column.from_arrow( pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=19)) ) @@ -83,7 +83,7 @@ def test_typecast_from_float_to_decimal(data, from_dtype, to_dtype): pa_arr = got.to_arrow().cast( pa.decimal128(to_dtype.precision, to_dtype.scale) ) - expected = cudf.Series(DecimalColumn.from_arrow(pa_arr)) + expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) got = got.astype(to_dtype) @@ -123,7 +123,7 @@ def test_typecast_from_int_to_decimal(data, from_dtype, to_dtype): .cast("float64") .cast(pa.decimal128(to_dtype.precision, to_dtype.scale)) ) - expected = cudf.Series(DecimalColumn.from_arrow(pa_arr)) + expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) got = got.astype(to_dtype) @@ -163,7 +163,7 @@ def test_typecast_to_from_decimal(data, from_dtype, to_dtype): pa_arr = got.to_arrow().cast( pa.decimal128(to_dtype.precision, to_dtype.scale), safe=False ) - expected = cudf.Series(DecimalColumn.from_arrow(pa_arr)) + expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) got = got.astype(to_dtype) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 4311271130b..3033cb2f328 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -280,13 +280,7 @@ def is_interval_dtype(obj): def is_decimal_dtype(obj): return ( - type(obj) is cudf.core.dtypes.Decimal64Dtype - or obj is cudf.core.dtypes.Decimal64Dtype - or ( - isinstance(obj, str) - and obj == cudf.core.dtypes.Decimal64Dtype.name - ) - or (hasattr(obj, "dtype") and is_decimal_dtype(obj.dtype)) + is_decimal32_dtype(obj) or is_decimal64_dtype(obj) ) From 911a7c89392b483fc68da75c5efaaf73a65129fa Mon Sep 17 00:00:00 2001 From: Sheilah Date: Fri, 11 Jun 2021 12:34:44 -0700 Subject: [PATCH 06/13] introduced Decimal32Column in decimal.py --- python/cudf/cudf/_lib/types.pyx | 35 ++++++++++---- python/cudf/cudf/core/column/column.py | 5 +- python/cudf/cudf/core/column/decimal.py | 62 ++++++++++++++++++++++++- python/cudf/cudf/core/dtypes.py | 4 +- 4 files changed, 93 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 0cc56352c5f..43e5c213947 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -14,9 +14,19 @@ from cudf._lib.types cimport ( ) from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view -from cudf.core.dtypes import ListDtype, StructDtype, Decimal64Dtype, Decimal32Dtype -from cudf.utils.dtypes import is_decimal_dtype, is_list_dtype, is_struct_dtype, is_decimal64_dtype, is_decimal32_dtype - +from cudf.core.dtypes import ( + ListDtype, + StructDtype, + Decimal64Dtype, + Decimal32Dtype +) +from cudf.utils.dtypes import ( + is_decimal_dtype, + is_list_dtype, + is_struct_dtype, + is_decimal64_dtype, + is_decimal32_dtype +) cimport cudf._lib.cpp.types as libcudf_types @@ -197,10 +207,16 @@ cdef dtype_from_column_view(column_view cv): return dtype_from_lists_column_view(cv) elif tid == libcudf_types.type_id.STRUCT: return dtype_from_structs_column_view(cv) - elif tid == libcudf_types.type_id.DECIMAL64: - Decimal64Dtype(precision=Decimal64Dtype.MAX_PRECISION, scale=-cv.type().scale()) - elif tid == libcudf_types.type_id.DECIMAL32: - Decimal32Dtype(precision=Decimal32Dtype.MAX_PRECISION, scale=-cv.type().scale()) + elif tid == libcudf_types.type_id.DECIMAL64: + return Decimal64Dtype( + precision=Decimal64Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + elif tid == libcudf_types.type_id.DECIMAL32: + return Decimal32Dtype( + precision=Decimal32Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) else: return cudf_to_np_types[(tid)] @@ -218,7 +234,10 @@ cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: ( np_to_cudf_types[np.dtype(dtype)])) - if tid in (libcudf_types.type_id.DECIMAL64, libcudf_types.type_id.DECIMAL32): + if tid in ( + libcudf_types.type_id.DECIMAL64, + libcudf_types.type_id.DECIMAL32 + ): return libcudf_types.data_type(tid, -dtype.scale) else: return libcudf_types.data_type(tid) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 42f231e28c9..489f5f68049 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1956,6 +1956,7 @@ def as_column( "Cannot create list column from given data" ) return as_column(data, nan_as_null=nan_as_null) + # breakpoint() if isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): data = pa.array( arbitrary, @@ -1963,7 +1964,9 @@ def as_column( precision=dtype.precision, scale=dtype.scale ), ) - return cudf.core.column.Decimal64Column.from_arrow(data) + return cudf.core.column.Decimal64Column.from_arrow( + data + ) dtype = pd.api.types.pandas_dtype(dtype) if is_categorical_dtype(dtype) or is_interval_dtype(dtype): raise TypeError diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index d8c27ae58ed..b5acc1f151a 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -17,13 +17,66 @@ from cudf._typing import Dtype from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, NumericalColumn, as_column -from cudf.core.dtypes import Decimal64Dtype +from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype from cudf.utils.dtypes import is_scalar from cudf.utils.utils import pa_mask_buffer_to_mask from .numerical_base import NumericalBaseColumn +class Decimal32Column(NumericalBaseColumn): + dtype: Decimal32Dtype + + @classmethod + def from_arrow(cls, data: pa.Array): + dtype = Decimal32Dtype.from_arrow(data.type) + mask_buf = data.buffers()[0] + mask = ( + mask_buf + if mask_buf is None + else pa_mask_buffer_to_mask(mask_buf, len(data)) + ) + data_128 = cp.array(np.frombuffer(data.buffers()[1]).view("int32")) + data_32 = data_128[::4].copy() + return cls( + data=Buffer(data_32.view("uint8")), + size=len(data), + dtype=dtype, + offset=data.offset, + mask=mask, + ) + + def to_arrow(self): + data_buf_32 = self.base_data.to_host_array().view("int32") + data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32") + + # use striding to set the first 32 bits of each 128-bit chunk: + data_buf_128[::4] = data_buf_32 + # use striding again to set the remaining bits of each 128-bit chunk: + # 0 for non-negative values, -1 for negative values: + data_buf_128[1::4] = np.piecewise( + data_buf_32, [data_buf_32 < 0], [-1, 0] + ) + data_buf_128[2::4] = np.piecewise( + data_buf_32, [data_buf_32 < 0], [-1, 0] + ) + data_buf_128[3::4] = np.piecewise( + data_buf_32, [data_buf_32 < 0], [-1, 0] + ) + data_buf = pa.py_buffer(data_buf_128) + mask_buf = ( + self.base_mask + if self.base_mask is None + else pa.py_buffer(self.base_mask.to_host_array()) + ) + return pa.Array.from_buffers( + type=self.dtype.to_arrow(), + offset=self._offset, + length=self.size, + buffers=[mask_buf, data_buf], + ) + + class Decimal64Column(NumericalBaseColumn): dtype: Decimal64Dtype @@ -60,6 +113,7 @@ def from_arrow(cls, data: pa.Array): def to_arrow(self): data_buf_64 = self.base_data.to_host_array().view("int64") data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64") + # use striding to set the first 64 bits of each 128-bit chunk: data_buf_128[::2] = data_buf_64 # use striding again to set the remaining bits of each 128-bit chunk: @@ -98,7 +152,11 @@ def binary_operator(self, op, other, reflect=False): elif op in ("eq", "ne", "lt", "gt", "le", "ge"): if not isinstance( other, - (Decimal64Dtype, cudf.core.column.NumericalColumn, cudf.Scalar), + ( + Decimal64Dtype, + cudf.core.column.NumericalColumn, + cudf.Scalar, + ), ): raise TypeError( f"Operator {op} not supported between" diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index e58e4ef9d39..0b43d6d7135 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -240,7 +240,7 @@ def __hash__(self): class Decimal32Dtype(_BaseDtype): - name = "decimal" + name = "decimal32" _metadata = ("precision", "scale") MAX_PRECISION = np.floor(np.log10(np.iinfo("int32").max)) @@ -341,7 +341,7 @@ def deserialize(cls, header: dict, frames: list): class Decimal64Dtype(_BaseDtype): - name = "decimal" + name = "decimal64" _metadata = ("precision", "scale") MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max)) From 494e987ad73ba3311db08ab5024eed5ae50dc7e2 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Mon, 14 Jun 2021 10:05:51 -0700 Subject: [PATCH 07/13] fixed construct series with decimal32column --- python/cudf/cudf/core/column/__init__.py | 7 +++++-- python/cudf/cudf/core/column/column.py | 26 +++++++++++++++++++++++- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 6a494e8885c..388d998e479 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -15,10 +15,13 @@ serialize_columns, ) from cudf.core.column.datetime import DatetimeColumn # noqa: F401 +from cudf.core.column.decimal import ( # noqa: F401 + Decimal32Column, + Decimal64Column, +) +from cudf.core.column.interval import IntervalColumn # noqa: F401 from cudf.core.column.lists import ListColumn # noqa: F401 from cudf.core.column.numerical import NumericalColumn # noqa: F401 from cudf.core.column.string import StringColumn # noqa: F401 from cudf.core.column.struct import StructColumn # noqa: F401 from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401 -from cudf.core.column.interval import IntervalColumn # noqa: F401 -from cudf.core.column.decimal import Decimal64Column # noqa: F401 diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b080a196190..4e122443caa 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -50,6 +50,8 @@ check_cast_unsupported_dtype, get_time_unit, is_categorical_dtype, + is_decimal32_dtype, + is_decimal64_dtype, is_decimal_dtype, is_interval_dtype, is_list_dtype, @@ -1487,7 +1489,7 @@ def build_column( null_count=null_count, children=children, ) - elif is_decimal_dtype(dtype): + elif is_decimal64_dtype(dtype): if size is None: raise TypeError("Must specify size") return cudf.core.column.Decimal64Column( @@ -1499,6 +1501,18 @@ def build_column( null_count=null_count, children=children, ) + elif is_decimal32_dtype(dtype): + if size is None: + raise TypeError("Must specify size") + return cudf.core.column.Decimal32Column( + data=data, + size=size, + offset=offset, + dtype=dtype, + mask=mask, + null_count=null_count, + children=children, + ) elif is_interval_dtype(dtype): return cudf.core.column.IntervalColumn( dtype=dtype, @@ -1967,6 +1981,16 @@ def as_column( return cudf.core.column.Decimal64Column.from_arrow( data ) + if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): + data = pa.array( + arbitrary, + type=pa.decimal128( + precision=dtype.precision, scale=dtype.scale + ), + ) + return cudf.core.column.Decimal32Column.from_arrow( + data + ) dtype = pd.api.types.pandas_dtype(dtype) if is_categorical_dtype(dtype) or is_interval_dtype(dtype): raise TypeError From 95d329140112b66abbfa153595427e84106a8695 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 16 Jun 2021 16:28:04 -0700 Subject: [PATCH 08/13] . --- python/cudf/cudf/utils/dtypes.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 07b5389ccea..6d1078d987e 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -8,12 +8,8 @@ import numpy as np import pandas as pd import pyarrow as pa -<<<<<<< HEAD from pandas.core.dtypes.common import infer_dtype_from_object, pandas_dtype from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType -======= -from pandas.core.dtypes.common import infer_dtype_from_object ->>>>>>> 716dc12437a7b3bb33e8a2ccfa6ecb2c592568c7 import cudf from cudf.core._compat import PANDAS_GE_120 From 00ea7960ab6145ba1b7ff069cc90ab41051818e8 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 16 Jun 2021 18:11:52 -0700 Subject: [PATCH 09/13] added to/from arrow tests for decimal32 --- python/cudf/cudf/core/column/__init__.py | 10 ++-- python/cudf/cudf/core/column/column.py | 1 - python/cudf/cudf/core/column/decimal.py | 4 +- python/cudf/cudf/tests/test_decimal.py | 74 ++++++++++++++---------- 4 files changed, 50 insertions(+), 39 deletions(-) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 388d998e479..34974b54106 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -15,13 +15,13 @@ serialize_columns, ) from cudf.core.column.datetime import DatetimeColumn # noqa: F401 -from cudf.core.column.decimal import ( # noqa: F401 - Decimal32Column, - Decimal64Column, -) -from cudf.core.column.interval import IntervalColumn # noqa: F401 from cudf.core.column.lists import ListColumn # noqa: F401 from cudf.core.column.numerical import NumericalColumn # noqa: F401 from cudf.core.column.string import StringColumn # noqa: F401 from cudf.core.column.struct import StructColumn # noqa: F401 from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401 +from cudf.core.column.interval import IntervalColumn # noqa: F401 +from cudf.core.column.decimal import ( # noqa: F401 + Decimal32Column, + Decimal64Column, +) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 4e122443caa..6776c8d195d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1970,7 +1970,6 @@ def as_column( "Cannot create list column from given data" ) return as_column(data, nan_as_null=nan_as_null) - # breakpoint() if isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): data = pa.array( arbitrary, diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index b5acc1f151a..0a8c2bae5ef 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -16,7 +16,7 @@ ) from cudf._typing import Dtype from cudf.core.buffer import Buffer -from cudf.core.column import ColumnBase, NumericalColumn, as_column +from cudf.core.column import ColumnBase, as_column from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype from cudf.utils.dtypes import is_scalar from cudf.utils.utils import pa_mask_buffer_to_mask @@ -234,7 +234,7 @@ def fillna( value = cudf.Scalar(value, dtype=self.dtype) elif ( isinstance(value, Decimal64Column) - or isinstance(value, NumericalColumn) + or isinstance(value, cudf.core.column.NumericalColumn) and is_integer_dtype(value.dtype) ): value = value.astype(self.dtype) diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index 70fd22778bb..2301c218d77 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -8,7 +8,7 @@ import pytest import cudf -from cudf.core.column import Decimal64Column, NumericalColumn +from cudf.core.column import Decimal32Column, Decimal64Column, NumericalColumn from cudf.core.dtypes import Decimal64Dtype from cudf.tests.utils import ( FLOAT_TYPES, @@ -18,42 +18,54 @@ assert_eq, ) - -@pytest.mark.parametrize( - "data", - [ - [Decimal("1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], - [Decimal("-1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], - [1], - [-1], - [1, 2, 3, 4], - [42, 1729, 4104], - [1, 2, None, 4], - [None, None, None], - [], - ], -) -@pytest.mark.parametrize( - "typ", - [ - pa.decimal128(precision=4, scale=2), - pa.decimal128(precision=5, scale=3), - pa.decimal128(precision=6, scale=4), - ], -) -def test_round_trip_decimal_column(data, typ): - pa_arr = pa.array(data, type=typ) - col = Decimal64Column.from_arrow(pa_arr) - assert pa_arr.equals(col.to_arrow()) - - -def test_from_arrow_max_precision(): +data_ = [ + [Decimal("1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], + [Decimal("-1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], + [1], + [-1], + [1, 2, 3, 4], + [42, 1729, 4104], + [1, 2, None, 4], + [None, None, None], + [], +] +typ_ = [ + pa.decimal128(precision=4, scale=2), + pa.decimal128(precision=5, scale=3), + pa.decimal128(precision=6, scale=4), +] + + +@pytest.mark.parametrize("data_", data_) +@pytest.mark.parametrize("typ_", typ_) +def test_round_trip_decimal64_column(data_, typ_): + pa_arr = pa.array(data_, type=typ_) + col_64 = Decimal64Column.from_arrow(pa_arr) + assert pa_arr.equals(col_64.to_arrow()) + + +@pytest.mark.parametrize("data_", data_) +@pytest.mark.parametrize("typ_", typ_) +def test_round_trip_decimal32_column(data_, typ_): + pa_arr = pa.array(data_, type=typ_) + col_32 = Decimal32Column.from_arrow(pa_arr) + assert pa_arr.equals(col_32.to_arrow()) + + +def test_from_arrow_max_precision_decimal64(): with pytest.raises(ValueError): Decimal64Column.from_arrow( pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=19)) ) +def test_from_arrow_max_precision_decimal32(): + with pytest.raises(ValueError): + Decimal32Column.from_arrow( + pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=10)) + ) + + @pytest.mark.parametrize( "data", [ From 4cb5b57ecc05ed8d9a6e97e65e80b0ee3d1a33f3 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Fri, 18 Jun 2021 14:43:21 -0700 Subject: [PATCH 10/13] addressed review comments --- python/cudf/cudf/api/types.py | 10 +- python/cudf/cudf/core/column/column.py | 34 +++-- python/cudf/cudf/core/column/decimal.py | 6 +- python/cudf/cudf/core/dtypes.py | 34 +++-- python/cudf/cudf/utils/dtypes.py | 180 ++---------------------- 5 files changed, 69 insertions(+), 195 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index a985efeca51..56398bd4f13 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -15,9 +15,11 @@ import cudf from cudf._lib.scalar import DeviceScalar -from cudf.core.dtypes import ( +from cudf.core.dtypes import ( # noqa: F401 _BaseDtype, is_categorical_dtype, + is_decimal32_dtype, + is_decimal64_dtype, is_decimal_dtype, is_interval_dtype, is_list_dtype, @@ -39,11 +41,15 @@ def is_numeric_dtype(obj): Whether or not the array or dtype is of a numeric dtype. """ if isclass(obj): - if issubclass(obj, cudf.Decimal64Dtype): + if issubclass(obj, (cudf.Decimal32Dtype, cudf.Decimal64Dtype)): return True if issubclass(obj, _BaseDtype): return False else: + if isinstance(obj, cudf.Decimal32Dtype) or isinstance( + getattr(obj, "dtype", None), cudf.Decimal32Dtype + ): + return True if isinstance(obj, cudf.Decimal64Dtype) or isinstance( getattr(obj, "dtype", None), cudf.Decimal64Dtype ): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d6fe020e5e6..576eb0fcf6b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -47,30 +47,30 @@ ) from cudf.utils import ioutils, utils from cudf.utils.dtypes import ( - _is_non_decimal_numeric_dtype, - _is_scalar_or_zero_d_array, check_cast_unsupported_dtype, cudf_dtype_from_pa_type, get_time_unit, - is_categorical_dtype, - is_decimal32_dtype, - is_decimal64_dtype, - is_decimal_dtype, - is_interval_dtype, - is_list_dtype, - is_scalar, - is_string_dtype, - is_struct_dtype, min_unsigned_type, np_to_pa_dtype, ) from cudf.utils.utils import mask_dtype from ...api.types import ( + _is_non_decimal_numeric_dtype, + _is_scalar_or_zero_d_array, infer_dtype, is_bool_dtype, + is_categorical_dtype, + is_decimal32_dtype, + is_decimal64_dtype, + is_decimal_dtype, is_dtype_equal, is_integer_dtype, + is_interval_dtype, + is_list_dtype, + is_scalar, + is_string_dtype, + is_struct_dtype, pandas_dtype, ) @@ -975,9 +975,21 @@ def as_string_column( def as_decimal_column( self, dtype: Dtype, **kwargs + ) -> Union[ + "cudf.core.column.Decimal32Column", "cudf.core.column.Decimal64Column" + ]: + raise NotImplementedError + + def as_decimal64_column( + self, dtype: Dtype, **kwargs ) -> "cudf.core.column.Decimal64Column": raise NotImplementedError + def as_decimal32_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.Decimal32Column": + raise NotImplementedError + def apply_boolean_mask(self, mask) -> ColumnBase: mask = as_column(mask, dtype="bool") result = ( diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 80999d77a71..40fc8a9d720 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -20,8 +20,8 @@ from cudf.utils.dtypes import is_scalar from cudf.utils.utils import pa_mask_buffer_to_mask -from .numerical_base import NumericalBaseColumn from ...api.types import is_integer_dtype +from .numerical_base import NumericalBaseColumn class Decimal32Column(NumericalBaseColumn): @@ -268,8 +268,8 @@ def __cuda_array_interface__(self): ) def _with_type_metadata( - self: "cudf.core.column.DecimalColumn", dtype: Dtype - ) -> "cudf.core.column.DecimalColumn": + self: "cudf.core.column.Decimal64Column", dtype: Dtype + ) -> "cudf.core.column.Decimal64Column": if isinstance(dtype, Decimal64Dtype): self.dtype.precision = dtype.precision diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 1f4c576cd04..a27d60890f0 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -607,15 +607,7 @@ def is_decimal_dtype(obj): bool Whether or not the array-like or dtype is of the decimal dtype. """ - return ( - type(obj) is cudf.core.dtypes.Decimal64Dtype - or obj is cudf.core.dtypes.Decimal64Dtype - or ( - isinstance(obj, str) - and obj == cudf.core.dtypes.Decimal64Dtype.name - ) - or (hasattr(obj, "dtype") and is_decimal_dtype(obj.dtype)) - ) + return is_decimal32_dtype(obj) or is_decimal64_dtype(obj) def is_interval_dtype(obj): @@ -642,3 +634,27 @@ def is_interval_dtype(obj): ) or (hasattr(obj, "dtype") and is_interval_dtype(obj.dtype)) ) + + +def is_decimal32_dtype(obj): + return ( + type(obj) is cudf.core.dtypes.Decimal32Dtype + or obj is cudf.core.dtypes.Decimal32Dtype + or ( + isinstance(obj, str) + and obj == cudf.core.dtypes.Decimal32Dtype.name + ) + or (hasattr(obj, "dtype") and is_decimal32_dtype(obj.dtype)) + ) + + +def is_decimal64_dtype(obj): + return ( + type(obj) is cudf.core.dtypes.Decimal64Dtype + or obj is cudf.core.dtypes.Decimal64Dtype + or ( + isinstance(obj, str) + and obj == cudf.core.dtypes.Decimal64Dtype.name + ) + or (hasattr(obj, "dtype") and is_decimal64_dtype(obj.dtype)) + ) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 6d1078d987e..98f45002bbc 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -8,8 +8,6 @@ import numpy as np import pandas as pd import pyarrow as pa -from pandas.core.dtypes.common import infer_dtype_from_object, pandas_dtype -from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType import cudf from cudf.core._compat import PANDAS_GE_120 @@ -17,8 +15,11 @@ from ..api.types import ( # noqa: F401 _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, + infer_dtype, is_categorical_dtype, is_datetime_dtype as is_datetime_dtype, + is_decimal32_dtype, + is_decimal64_dtype, is_decimal_dtype, is_integer, is_integer_dtype, @@ -30,6 +31,7 @@ is_string_dtype, is_struct_dtype, is_timedelta_dtype, + pandas_dtype, ) _NA_REP = "" @@ -159,170 +161,6 @@ def numeric_normalize_types(*args): return [a.astype(dtype) for a in args] -def is_numerical_dtype(obj): - # TODO: we should handle objects with a `.dtype` attribute, - # e.g., arrays, here. - try: - dtype = np.dtype(obj) - except TypeError: - return False - return dtype.kind in "biuf" - - -def is_integer_dtype(obj): - try: - dtype = np.dtype(obj) - except TypeError: - return pd.api.types.is_integer_dtype(obj) - return dtype.kind in "iu" - - -def is_integer(obj): - if isinstance(obj, cudf.Scalar): - return is_integer_dtype(obj.dtype) - return pd.api.types.is_integer(obj) - - -def is_string_dtype(obj): - return ( - pd.api.types.is_string_dtype(obj) - # Reject all cudf extension types. - and not is_categorical_dtype(obj) - and not is_decimal_dtype(obj) - and not is_list_dtype(obj) - and not is_struct_dtype(obj) - and not is_interval_dtype(obj) - ) - - -def is_datetime_dtype(obj): - if obj is None: - return False - if not hasattr(obj, "str"): - return False - return "M8" in obj.str - - -def is_timedelta_dtype(obj): - if obj is None: - return False - if not hasattr(obj, "str"): - return False - return "m8" in obj.str - - -def is_categorical_dtype(obj): - """Infer whether a given pandas, numpy, or cuDF Column, Series, or dtype - is a pandas CategoricalDtype. - """ - if obj is None: - return False - if isinstance(obj, cudf.CategoricalDtype): - return True - if obj is cudf.CategoricalDtype: - return True - if isinstance(obj, np.dtype): - return False - if isinstance(obj, CategoricalDtype): - return True - if obj is CategoricalDtype: - return True - if obj is CategoricalDtypeType: - return True - if isinstance(obj, str) and obj == "category": - return True - if isinstance( - obj, - ( - CategoricalDtype, - cudf.core.index.CategoricalIndex, - cudf.core.column.CategoricalColumn, - pd.Categorical, - pd.CategoricalIndex, - ), - ): - return True - if isinstance(obj, np.ndarray): - return False - if isinstance( - obj, - ( - cudf.Index, - cudf.Series, - cudf.core.column.ColumnBase, - pd.Index, - pd.Series, - ), - ): - return is_categorical_dtype(obj.dtype) - if hasattr(obj, "type"): - if obj.type is CategoricalDtypeType: - return True - return pd.api.types.is_categorical_dtype(obj) - - -def is_list_dtype(obj): - return ( - type(obj) is cudf.core.dtypes.ListDtype - or obj is cudf.core.dtypes.ListDtype - or type(obj) is cudf.core.column.ListColumn - or obj is cudf.core.column.ListColumn - or (isinstance(obj, str) and obj == cudf.core.dtypes.ListDtype.name) - or (hasattr(obj, "dtype") and is_list_dtype(obj.dtype)) - ) - - -def is_struct_dtype(obj): - return ( - isinstance(obj, cudf.core.dtypes.StructDtype) - or obj is cudf.core.dtypes.StructDtype - or (isinstance(obj, str) and obj == cudf.core.dtypes.StructDtype.name) - or (hasattr(obj, "dtype") and is_struct_dtype(obj.dtype)) - ) - - -def is_interval_dtype(obj): - return ( - isinstance(obj, cudf.core.dtypes.IntervalDtype) - or isinstance(obj, pd.core.dtypes.dtypes.IntervalDtype) - or obj is cudf.core.dtypes.IntervalDtype - or ( - isinstance(obj, str) and obj == cudf.core.dtypes.IntervalDtype.name - ) - or (hasattr(obj, "dtype") and is_interval_dtype(obj.dtype)) - ) - - -def is_decimal_dtype(obj): - return ( - is_decimal32_dtype(obj) or is_decimal64_dtype(obj) - ) - - -def is_decimal32_dtype(obj): - return ( - type(obj) is cudf.core.dtypes.Decimal32Dtype - or obj is cudf.core.dtypes.Decimal32Dtype - or ( - isinstance(obj, str) - and obj == cudf.core.dtypes.Decimal32Dtype.name - ) - or (hasattr(obj, "dtype") and is_decimal32_dtype(obj.dtype)) - ) - - -def is_decimal64_dtype(obj): - return ( - type(obj) is cudf.core.dtypes.Decimal64Dtype - or obj is cudf.core.dtypes.Decimal64Dtype - or ( - isinstance(obj, str) - and obj == cudf.core.dtypes.Decimal64Dtype.name - ) - or (hasattr(obj, "dtype") and is_decimal64_dtype(obj.dtype)) - ) - - def _find_common_type_decimal(dtypes): # Find the largest scale and the largest difference between # precision and scale of the columns to be concatenated @@ -341,12 +179,14 @@ def cudf_dtype_from_pydata_dtype(dtype): if is_categorical_dtype(dtype): return cudf.core.dtypes.CategoricalDtype - elif is_decimal_dtype(dtype): + elif is_decimal32_dtype(dtype): + return cudf.core.dtypes.Decimal32Dtype + elif is_decimal64_dtype(dtype): return cudf.core.dtypes.Decimal64Dtype elif dtype in cudf._lib.types.np_to_cudf_types: return dtype.type - return infer_dtype_from_object(dtype) + return infer_dtype(dtype) def cudf_dtype_to_pa_type(dtype): @@ -376,7 +216,7 @@ def cudf_dtype_from_pa_type(typ): elif pa.types.is_decimal(typ): return cudf.core.dtypes.Decimal64Dtype.from_arrow(typ) else: - return pd.api.types.pandas_dtype(typ.to_pandas_dtype()) + return pandas_dtype(typ.to_pandas_dtype()) def to_cudf_compatible_scalar(val, dtype=None): @@ -416,7 +256,7 @@ def to_cudf_compatible_scalar(val, dtype=None): elif isinstance(val, pd.Timedelta): val = val.to_timedelta64() - val = pd.api.types.pandas_dtype(type(val)).type(val) + val = pandas_dtype(type(val)).type(val) if dtype is not None: val = val.astype(dtype) From 53009c0a874c626a2942e7d93861f0a813190f41 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 23 Jun 2021 12:34:05 -0700 Subject: [PATCH 11/13] fixed CI tests --- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/series.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5da6f7d8f2f..062ad4001de 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -507,7 +507,7 @@ def _concat( # Reassign precision for any decimal cols for name, col in out._data.items(): - if isinstance(col, cudf.core.column.DecimalColumn): + if isinstance(col, cudf.core.column.Decimal64Column): col = col._with_type_metadata(tables[0]._data[name].dtype) # Reassign index and column names diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index ad2df0849c3..5eab470cb4f 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2415,7 +2415,7 @@ def _concat(cls, objs, axis=0, index=True): col = _concat_columns([o._column for o in objs]) - if isinstance(col, cudf.core.column.DecimalColumn): + if isinstance(col, cudf.core.column.Decimal64Column): col = col._with_type_metadata(objs[0]._column.dtype) return cls(data=col, index=index, name=name) From 4d66196d4beebdbc434d60cd55360e3f8c572207 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Thu, 24 Jun 2021 13:08:55 -0700 Subject: [PATCH 12/13] fixed failing tests-- infer_dtype --- python/cudf/cudf/utils/dtypes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 2225852fbd4..2edb94d5778 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -184,6 +184,8 @@ def cudf_dtype_from_pydata_dtype(dtype): return cudf.core.dtypes.Decimal64Dtype elif dtype in cudf._lib.types.np_to_cudf_types: return dtype.type + elif isinstance(pandas_dtype(dtype), np.dtype): + return dtype return infer_dtype(dtype) From c69ab3f7f3a4c512cffc1b04fd49515ce07865a6 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Mon, 28 Jun 2021 17:31:50 -0700 Subject: [PATCH 13/13] reverted to infer_dtype_from_object --- python/cudf/cudf/utils/dtypes.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 2edb94d5778..e1ae87e5089 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd import pyarrow as pa +from pandas.core.dtypes.common import infer_dtype_from_object import cudf from cudf.api.types import ( # noqa: F401 @@ -184,10 +185,8 @@ def cudf_dtype_from_pydata_dtype(dtype): return cudf.core.dtypes.Decimal64Dtype elif dtype in cudf._lib.types.np_to_cudf_types: return dtype.type - elif isinstance(pandas_dtype(dtype), np.dtype): - return dtype - return infer_dtype(dtype) + return infer_dtype_from_object(dtype) def cudf_dtype_to_pa_type(dtype):