Skip to content

Commit

Permalink
Add decimal column handling in copy_type_metadata (#7788)
Browse files Browse the repository at this point in the history
* All decimal columns returned by libcudf have a precision of 18 (`Decimal64Dtype.MAX_PRECISION`). The `copy_type_metadata` method is extended to copy the precision from the input column to the result as often we want the returned column to have the same precision as the input column.

* In adding a test for this, I realized that the `assert_eq` utility is broken: it doesn't correctly check for dtype equality when the dtype is a cudf-specific type. Tacked on a fix.

Authors:
  - Ashwin Srinath (https://github.com/shwina)
  - Keith Kraus (https://github.com/kkraus14)
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)

Approvers:
  - Michael Wang (https://github.com/isVoid)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Keith Kraus (https://github.com/kkraus14)

URL: #7788
  • Loading branch information
shwina authored Apr 1, 2021
1 parent f4d5bde commit ed469ca
Show file tree
Hide file tree
Showing 7 changed files with 51 additions and 14 deletions.
7 changes: 6 additions & 1 deletion python/cudf/cudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,12 @@
merge,
)
from cudf.core.algorithms import factorize
from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype
from cudf.core.dtypes import (
CategoricalDtype,
Decimal64Dtype,
ListDtype,
StructDtype,
)
from cudf.core.groupby import Grouper
from cudf.core.ops import (
add,
Expand Down
7 changes: 7 additions & 0 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1414,6 +1414,8 @@ def _copy_type_metadata(self: T, other: ColumnBase) -> ColumnBase:
of `other` and the categories of `self`.
* when both `self` and `other` are StructColumns, rename the fields
of `other` to the field names of `self`.
* when both `self` and `other` are DecimalColumns, copy the precision
from self.dtype to other.dtype
* when `self` and `other` are nested columns of the same type,
recursively apply this function on the children of `self` to the
and the children of `other`.
Expand All @@ -1437,6 +1439,11 @@ def _copy_type_metadata(self: T, other: ColumnBase) -> ColumnBase:
):
other = other._rename_fields(self.dtype.fields.keys())

if isinstance(other, cudf.core.column.DecimalColumn) and isinstance(
self, cudf.core.column.DecimalColumn
):
other.dtype.precision = self.dtype.precision

if type(self) is type(other):
if self.base_children and other.base_children:
base_children = tuple(
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@


class DecimalColumn(ColumnBase):
dtype: Decimal64Dtype

@classmethod
def from_arrow(cls, data: pa.Array):
dtype = Decimal64Dtype.from_arrow(data.type)
Expand Down Expand Up @@ -112,7 +114,8 @@ def normalize_binop_value(self, other):
raise TypeError(f"cannot normalize {type(other)}")

def _apply_scan_op(self, op: str) -> ColumnBase:
return libcudf.reduce.scan(op, self, True)
result = libcudf.reduce.scan(op, self, True)
return self._copy_type_metadata(result)

def as_decimal_column(
self, dtype: Dtype, **kwargs
Expand Down
13 changes: 9 additions & 4 deletions python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@
from cudf._typing import Dtype


class CategoricalDtype(ExtensionDtype):
class _BaseDtype(ExtensionDtype):
# Base type for all cudf-specific dtypes
pass


class CategoricalDtype(_BaseDtype):

ordered: Optional[bool]

Expand Down Expand Up @@ -121,7 +126,7 @@ def deserialize(cls, header, frames):
return cls(categories=categories, ordered=ordered)


class ListDtype(ExtensionDtype):
class ListDtype(_BaseDtype):
_typ: pa.ListType
name: str = "list"

Expand Down Expand Up @@ -180,7 +185,7 @@ def __hash__(self):
return hash(self._typ)


class StructDtype(ExtensionDtype):
class StructDtype(_BaseDtype):

name = "struct"

Expand Down Expand Up @@ -231,7 +236,7 @@ def __hash__(self):
return hash(self._typ)


class Decimal64Dtype(ExtensionDtype):
class Decimal64Dtype(_BaseDtype):

name = "decimal"
_metadata = ("precision", "scale")
Expand Down
11 changes: 3 additions & 8 deletions python/cudf/cudf/tests/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,14 @@
import numpy as np
import pyarrow as pa
import pytest
import cudf

from cudf.core.dtypes import Decimal64Dtype
import cudf
from cudf.core.column import DecimalColumn, NumericalColumn

from cudf.core.dtypes import Decimal64Dtype
from cudf.tests.utils import (
NUMERIC_TYPES,
FLOAT_TYPES,
INTEGER_TYPES,
NUMERIC_TYPES,
assert_eq,
)

Expand Down Expand Up @@ -88,7 +87,6 @@ def test_typecast_from_float_to_decimal(data, from_dtype, to_dtype):
got = got.astype(to_dtype)

assert_eq(got, expected)
assert_eq(got.dtype, expected.dtype)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -129,7 +127,6 @@ def test_typecast_from_int_to_decimal(data, from_dtype, to_dtype):
got = got.astype(to_dtype)

assert_eq(got, expected)
assert_eq(got.dtype, expected.dtype)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -170,7 +167,6 @@ def test_typecast_to_from_decimal(data, from_dtype, to_dtype):
got = got.astype(to_dtype)

assert_eq(got, expected)
assert_eq(got.dtype, expected.dtype)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -205,4 +201,3 @@ def test_typecast_from_decimal(data, from_dtype, to_dtype):
expected = cudf.Series(NumericalColumn.from_arrow(pa_arr))

assert_eq(got, expected)
assert_eq(got.dtype, expected.dtype)
11 changes: 11 additions & 0 deletions python/cudf/cudf/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1401,3 +1401,14 @@ def test_iloc_before_zero_terminate(arg, pobj):
gobj = cudf.from_pandas(pobj)

assert_eq(pobj.iloc[arg], gobj.iloc[arg])


def test_iloc_decimal():
sr = cudf.Series(["1.00", "2.00", "3.00", "4.00"]).astype(
cudf.Decimal64Dtype(scale=2, precision=3)
)
got = sr.iloc[[3, 2, 1, 0]]
expect = cudf.Series(["4.00", "3.00", "2.00", "1.00"],).astype(
cudf.Decimal64Dtype(scale=2, precision=3)
)
assert_eq(expect.reset_index(drop=True), got.reset_index(drop=True))
11 changes: 11 additions & 0 deletions python/cudf/cudf/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,17 @@ def assert_eq(left, right, **kwargs):
without switching between assert_frame_equal/assert_series_equal/...
functions.
"""
# dtypes that we support but Pandas doesn't will convert to
# `object`. Check equality before that happens:
if kwargs.get("check_dtype", True):
if hasattr(left, "dtype") and hasattr(right, "dtype"):
if isinstance(
left.dtype, cudf.core.dtypes._BaseDtype
) and not isinstance(
left.dtype, cudf.CategoricalDtype
): # leave categorical comparison to Pandas
assert_eq(left.dtype, right.dtype)

if hasattr(left, "to_pandas"):
left = left.to_pandas()
if hasattr(right, "to_pandas"):
Expand Down

0 comments on commit ed469ca

Please sign in to comment.