Add decimal column handling in copy_type_metadata (#7788)

* All decimal columns returned by libcudf have a precision of 18 (`Decimal64Dtype.MAX_PRECISION`). The `copy_type_metadata` method is extended to copy the precision from the input column to the result as often we want the returned column to have the same precision as the input column. * In adding a test for this, I realized that the `assert_eq` utility is broken: it doesn't correctly check for dtype equality when the dtype is a cudf-specific type. Tacked on a fix. Authors: - Ashwin Srinath (https://github.com/shwina) - Keith Kraus (https://github.com/kkraus14) - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) Approvers: - Michael Wang (https://github.com/isVoid) - GALI PREM SAGAR (https://github.com/galipremsagar) - Keith Kraus (https://github.com/kkraus14) URL: #7788
rapidsai · Apr 1, 2021 · ed469ca · ed469ca
1 parent f4d5bde
commit ed469ca
Show file tree

Hide file tree

Showing 7 changed files with 51 additions and 14 deletions.
diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
@@ -40,7 +40,12 @@
     merge,
 )
 from cudf.core.algorithms import factorize
-from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype
+from cudf.core.dtypes import (
+    CategoricalDtype,
+    Decimal64Dtype,
+    ListDtype,
+    StructDtype,
+)
 from cudf.core.groupby import Grouper
 from cudf.core.ops import (
     add,

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -1414,6 +1414,8 @@ def _copy_type_metadata(self: T, other: ColumnBase) -> ColumnBase:
           of `other`  and the categories of `self`.
         * when both `self` and `other` are StructColumns, rename the fields
           of `other` to the field names of `self`.
+        * when both `self` and `other` are DecimalColumns, copy the precision
+          from self.dtype to other.dtype
         * when `self` and `other` are nested columns of the same type,
           recursively apply this function on the children of `self` to the
           and the children of `other`.
@@ -1437,6 +1439,11 @@ def _copy_type_metadata(self: T, other: ColumnBase) -> ColumnBase:
         ):
             other = other._rename_fields(self.dtype.fields.keys())
 
+        if isinstance(other, cudf.core.column.DecimalColumn) and isinstance(
+            self, cudf.core.column.DecimalColumn
+        ):
+            other.dtype.precision = self.dtype.precision
+
         if type(self) is type(other):
             if self.base_children and other.base_children:
                 base_children = tuple(

diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
@@ -22,6 +22,8 @@
 
 
 class DecimalColumn(ColumnBase):
+    dtype: Decimal64Dtype
+
     @classmethod
     def from_arrow(cls, data: pa.Array):
         dtype = Decimal64Dtype.from_arrow(data.type)
@@ -112,7 +114,8 @@ def normalize_binop_value(self, other):
             raise TypeError(f"cannot normalize {type(other)}")
 
     def _apply_scan_op(self, op: str) -> ColumnBase:
-        return libcudf.reduce.scan(op, self, True)
+        result = libcudf.reduce.scan(op, self, True)
+        return self._copy_type_metadata(result)
 
     def as_decimal_column(
         self, dtype: Dtype, **kwargs

diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
@@ -14,7 +14,12 @@
 from cudf._typing import Dtype
 
 
-class CategoricalDtype(ExtensionDtype):
+class _BaseDtype(ExtensionDtype):
+    # Base type for all cudf-specific dtypes
+    pass
+
+
+class CategoricalDtype(_BaseDtype):
 
     ordered: Optional[bool]
 
@@ -121,7 +126,7 @@ def deserialize(cls, header, frames):
         return cls(categories=categories, ordered=ordered)
 
 
-class ListDtype(ExtensionDtype):
+class ListDtype(_BaseDtype):
     _typ: pa.ListType
     name: str = "list"
 
@@ -180,7 +185,7 @@ def __hash__(self):
         return hash(self._typ)
 
 
-class StructDtype(ExtensionDtype):
+class StructDtype(_BaseDtype):
 
     name = "struct"
 
@@ -231,7 +236,7 @@ def __hash__(self):
         return hash(self._typ)
 
 
-class Decimal64Dtype(ExtensionDtype):
+class Decimal64Dtype(_BaseDtype):
 
     name = "decimal"
     _metadata = ("precision", "scale")

diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py
@@ -5,15 +5,14 @@
 import numpy as np
 import pyarrow as pa
 import pytest
-import cudf
 
-from cudf.core.dtypes import Decimal64Dtype
+import cudf
 from cudf.core.column import DecimalColumn, NumericalColumn
-
+from cudf.core.dtypes import Decimal64Dtype
 from cudf.tests.utils import (
-    NUMERIC_TYPES,
     FLOAT_TYPES,
     INTEGER_TYPES,
+    NUMERIC_TYPES,
     assert_eq,
 )
 
@@ -88,7 +87,6 @@ def test_typecast_from_float_to_decimal(data, from_dtype, to_dtype):
     got = got.astype(to_dtype)
 
     assert_eq(got, expected)
-    assert_eq(got.dtype, expected.dtype)
 
 
 @pytest.mark.parametrize(
@@ -129,7 +127,6 @@ def test_typecast_from_int_to_decimal(data, from_dtype, to_dtype):
     got = got.astype(to_dtype)
 
     assert_eq(got, expected)
-    assert_eq(got.dtype, expected.dtype)
 
 
 @pytest.mark.parametrize(
@@ -170,7 +167,6 @@ def test_typecast_to_from_decimal(data, from_dtype, to_dtype):
     got = got.astype(to_dtype)
 
     assert_eq(got, expected)
-    assert_eq(got.dtype, expected.dtype)
 
 
 @pytest.mark.parametrize(
@@ -205,4 +201,3 @@ def test_typecast_from_decimal(data, from_dtype, to_dtype):
     expected = cudf.Series(NumericalColumn.from_arrow(pa_arr))
 
     assert_eq(got, expected)
-    assert_eq(got.dtype, expected.dtype)
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
@@ -1401,3 +1401,14 @@ def test_iloc_before_zero_terminate(arg, pobj):
     gobj = cudf.from_pandas(pobj)
 
     assert_eq(pobj.iloc[arg], gobj.iloc[arg])
+
+
+def test_iloc_decimal():
+    sr = cudf.Series(["1.00", "2.00", "3.00", "4.00"]).astype(
+        cudf.Decimal64Dtype(scale=2, precision=3)
+    )
+    got = sr.iloc[[3, 2, 1, 0]]
+    expect = cudf.Series(["4.00", "3.00", "2.00", "1.00"],).astype(
+        cudf.Decimal64Dtype(scale=2, precision=3)
+    )
+    assert_eq(expect.reset_index(drop=True), got.reset_index(drop=True))
diff --git a/python/cudf/cudf/tests/utils.py b/python/cudf/cudf/tests/utils.py
@@ -74,6 +74,17 @@ def assert_eq(left, right, **kwargs):
     without switching between assert_frame_equal/assert_series_equal/...
     functions.
     """
+    # dtypes that we support but Pandas doesn't will convert to
+    # `object`. Check equality before that happens:
+    if kwargs.get("check_dtype", True):
+        if hasattr(left, "dtype") and hasattr(right, "dtype"):
+            if isinstance(
+                left.dtype, cudf.core.dtypes._BaseDtype
+            ) and not isinstance(
+                left.dtype, cudf.CategoricalDtype
+            ):  # leave categorical comparison to Pandas
+                assert_eq(left.dtype, right.dtype)
+
     if hasattr(left, "to_pandas"):
         left = left.to_pandas()
     if hasattr(right, "to_pandas"):