From 4f17e211af17ddb94368bf77c7ba93ac19e819ad Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 31 Mar 2021 11:10:20 -0400 Subject: [PATCH 1/7] Make all cudf dtypes inherit from a _BaseDtype --- python/cudf/cudf/core/dtypes.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index a18aad3872b..6ea21cdfcd5 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -14,7 +14,11 @@ from cudf._typing import Dtype -class CategoricalDtype(ExtensionDtype): +class _BaseDtype(ExtensionDtype): + pass + + +class CategoricalDtype(_BaseDtype): ordered: Optional[bool] @@ -121,7 +125,7 @@ def deserialize(cls, header, frames): return cls(categories=categories, ordered=ordered) -class ListDtype(ExtensionDtype): +class ListDtype(_BaseDtype): _typ: pa.ListType name: str = "list" @@ -180,7 +184,7 @@ def __hash__(self): return hash(self._typ) -class StructDtype(ExtensionDtype): +class StructDtype(_BaseDtype): name = "struct" @@ -231,7 +235,7 @@ def __hash__(self): return hash(self._typ) -class Decimal64Dtype(ExtensionDtype): +class Decimal64Dtype(_BaseDtype): name = "decimal" _metadata = ("precision", "scale") From 86be1ae740707e17af5cf0cd7c830ceb47d5fff8 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 31 Mar 2021 11:18:47 -0400 Subject: [PATCH 2/7] Test dtypes in assert_eq --- python/cudf/cudf/__init__.py | 7 ++++++- python/cudf/cudf/tests/utils.py | 7 +++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 2d9438b515f..94649069060 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -40,7 +40,12 @@ merge, ) from cudf.core.algorithms import factorize -from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype +from cudf.core.dtypes import ( + CategoricalDtype, + Decimal64Dtype, + ListDtype, + StructDtype, +) from cudf.core.groupby import Grouper from cudf.core.ops import ( add, diff --git a/python/cudf/cudf/tests/utils.py b/python/cudf/cudf/tests/utils.py index 1163c3085e4..32436d5b871 100644 --- a/python/cudf/cudf/tests/utils.py +++ b/python/cudf/cudf/tests/utils.py @@ -74,6 +74,13 @@ def assert_eq(left, right, **kwargs): without switching between assert_frame_equal/assert_series_equal/... functions. """ + # dtypes that we support but Pandas doesn't will convert to + # `object`. Check equality before that happens: + if kwargs.get("check_dtype", True): + if hasattr(left, "dtype") and hasattr(right, "dtype"): + if isinstance(left.dtype, cudf.core.dtypes._BaseDtype): + assert_eq(left.dtype, right.dtype) + if hasattr(left, "to_pandas"): left = left.to_pandas() if hasattr(right, "to_pandas"): From a7fe2ce201c4ad93527a9e491dddfd58cbdc5dce Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 31 Mar 2021 11:24:49 -0400 Subject: [PATCH 3/7] Copy type metadata for decimal columns --- python/cudf/cudf/core/column/column.py | 7 +++++++ python/cudf/cudf/core/column/decimal.py | 19 ++++++++++--------- python/cudf/cudf/tests/test_indexing.py | 11 +++++++++++ 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index dd06d97d105..665f7e06c7b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1402,6 +1402,8 @@ def _copy_type_metadata(self: T, other: ColumnBase) -> ColumnBase: of `other` and the categories of `self`. * when both `self` and `other` are StructColumns, rename the fields of `other` to the field names of `self`. + * when both `self` and `other` are DecimalColumns, copy the precision + from self.dtype to other.dtype * when `self` and `other` are nested columns of the same type, recursively apply this function on the children of `self` to the and the children of `other`. @@ -1425,6 +1427,11 @@ def _copy_type_metadata(self: T, other: ColumnBase) -> ColumnBase: ): other = other._rename_fields(self.dtype.fields.keys()) + if isinstance(other, cudf.core.column.DecimalColumn) and isinstance( + self, cudf.core.column.DecimalColumn + ): + other.dtype.precision = self.dtype.precision + if type(self) is type(other): if self.base_children and other.base_children: base_children = tuple( diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 7fbe602f07a..7d89fcc7d92 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -1,26 +1,27 @@ # Copyright (c) 2021, NVIDIA CORPORATION. -import cudf +from typing import cast + import cupy as cp import numpy as np import pyarrow as pa from pandas.api.types import is_integer_dtype -from typing import cast +import cudf from cudf import _lib as libcudf -from cudf.core.buffer import Buffer -from cudf.core.column import ColumnBase -from cudf.core.dtypes import Decimal64Dtype -from cudf.utils.utils import pa_mask_buffer_to_mask - -from cudf._typing import Dtype from cudf._lib.strings.convert.convert_fixed_point import ( from_decimal as cpp_from_decimal, ) -from cudf.core.column import as_column +from cudf._typing import Dtype +from cudf.core.buffer import Buffer +from cudf.core.column import ColumnBase, as_column +from cudf.core.dtypes import Decimal64Dtype +from cudf.utils.utils import pa_mask_buffer_to_mask class DecimalColumn(ColumnBase): + dtype: Decimal64Dtype + @classmethod def from_arrow(cls, data: pa.Array): dtype = Decimal64Dtype.from_arrow(data.type) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index cec2623027f..086d59ab0f2 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1401,3 +1401,14 @@ def test_iloc_before_zero_terminate(arg, pobj): gobj = cudf.from_pandas(pobj) assert_eq(pobj.iloc[arg], gobj.iloc[arg]) + + +def test_iloc_decimal(): + sr = cudf.Series(["1.00", "2.00", "3.00", "4.00"]).astype( + cudf.Decimal64Dtype(scale=2, precision=3) + ) + got = sr.iloc[[3, 2, 1, 0]] + expect = cudf.Series(["4.00", "3.00", "2.00", "1.00"],).astype( + cudf.Decimal64Dtype(scale=2, precision=3) + ) + assert_eq(expect.reset_index(drop=True), got.reset_index(drop=True)) From 07184593ad300f25cf79007bdf1d669b25288131 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 31 Mar 2021 12:36:52 -0400 Subject: [PATCH 4/7] Remove explicit dtype equality checks as assert_eq now does that --- python/cudf/cudf/tests/test_decimal.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index 80ff9d5734c..70fc63baba8 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -5,15 +5,14 @@ import numpy as np import pyarrow as pa import pytest -import cudf -from cudf.core.dtypes import Decimal64Dtype +import cudf from cudf.core.column import DecimalColumn, NumericalColumn - +from cudf.core.dtypes import Decimal64Dtype from cudf.tests.utils import ( - NUMERIC_TYPES, FLOAT_TYPES, INTEGER_TYPES, + NUMERIC_TYPES, assert_eq, ) @@ -88,7 +87,6 @@ def test_typecast_from_float_to_decimal(data, from_dtype, to_dtype): got = got.astype(to_dtype) assert_eq(got, expected) - assert_eq(got.dtype, expected.dtype) @pytest.mark.parametrize( @@ -129,7 +127,6 @@ def test_typecast_from_int_to_decimal(data, from_dtype, to_dtype): got = got.astype(to_dtype) assert_eq(got, expected) - assert_eq(got.dtype, expected.dtype) @pytest.mark.parametrize( @@ -170,7 +167,6 @@ def test_typecast_to_from_decimal(data, from_dtype, to_dtype): got = got.astype(to_dtype) assert_eq(got, expected) - assert_eq(got.dtype, expected.dtype) @pytest.mark.parametrize( @@ -205,4 +201,3 @@ def test_typecast_from_decimal(data, from_dtype, to_dtype): expected = cudf.Series(NumericalColumn.from_arrow(pa_arr)) assert_eq(got, expected) - assert_eq(got.dtype, expected.dtype) From 142ef8ef01b82c804389de9a02bdd286007bf59b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 31 Mar 2021 12:37:25 -0400 Subject: [PATCH 5/7] Comment --- python/cudf/cudf/core/dtypes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 6ea21cdfcd5..edfaa753325 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -15,6 +15,7 @@ class _BaseDtype(ExtensionDtype): + # Base type for all cudf-specific dtypes pass From bf1102f87450ed472af623545d863403f56bcf83 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 31 Mar 2021 14:31:59 -0400 Subject: [PATCH 6/7] Copy type metadata after doing a scan op --- python/cudf/cudf/core/column/decimal.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 276281ffa61..769df083f7e 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -73,7 +73,8 @@ def binary_operator(self, op, other, reflect=False): return result def _apply_scan_op(self, op: str) -> ColumnBase: - return libcudf.reduce.scan(op, self, True) + result = libcudf.reduce.scan(op, self, True) + return self._copy_type_metadata(result) def as_decimal_column( self, dtype: Dtype, **kwargs From 3c1a8b6526de998d682d58464ec172b6e4e8ec79 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 31 Mar 2021 14:36:08 -0400 Subject: [PATCH 7/7] Leave categorical comparison to Pandas --- python/cudf/cudf/tests/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/utils.py b/python/cudf/cudf/tests/utils.py index 32436d5b871..37a74ab4760 100644 --- a/python/cudf/cudf/tests/utils.py +++ b/python/cudf/cudf/tests/utils.py @@ -78,7 +78,11 @@ def assert_eq(left, right, **kwargs): # `object`. Check equality before that happens: if kwargs.get("check_dtype", True): if hasattr(left, "dtype") and hasattr(right, "dtype"): - if isinstance(left.dtype, cudf.core.dtypes._BaseDtype): + if isinstance( + left.dtype, cudf.core.dtypes._BaseDtype + ) and not isinstance( + left.dtype, cudf.CategoricalDtype + ): # leave categorical comparison to Pandas assert_eq(left.dtype, right.dtype) if hasattr(left, "to_pandas"):