Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable type conversion from float to decimal type #7450

Merged
merged 16 commits into from
Mar 10, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 16 additions & 6 deletions python/cudf/cudf/_lib/unary.pyx
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
ChrisJar marked this conversation as resolved.
Show resolved Hide resolved

from enum import IntEnum
from cudf.utils.dtypes import is_decimal_dtype

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
Expand All @@ -27,6 +28,7 @@ from cudf._lib.cpp.unary cimport (
from cudf._lib.types cimport underlying_type_t_type_id

cimport cudf._lib.cpp.unary as libcudf_unary
cimport cudf._lib.cpp.types as libcudf_types


class UnaryOp(IntEnum):
Expand Down Expand Up @@ -93,14 +95,22 @@ def is_valid(Column input):

def cast(Column input, object dtype=np.float64):
cdef column_view c_input = input.view()
cdef type_id tid = (
<type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(dtype)]
cdef type_id tid
cdef data_type c_dtype

if is_decimal_dtype(dtype):
tid = libcudf_types.type_id.DECIMAL64
c_dtype = data_type(tid, -dtype.scale)
else:
tid = (
<type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(dtype)]
)
)
)
ChrisJar marked this conversation as resolved.
Show resolved Hide resolved
)
cdef data_type c_dtype = data_type(tid)
c_dtype = data_type(tid)

cdef unique_ptr[column] c_result

with nogil:
Expand Down
7 changes: 7 additions & 0 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1026,6 +1026,8 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
"Casting interval columns not currently supported"
)
return self
elif is_decimal_dtype(dtype):
return self.as_decimal_column(dtype, **kwargs)
elif np.issubdtype(dtype, np.datetime64):
return self.as_datetime_column(dtype, **kwargs)
elif np.issubdtype(dtype, np.timedelta64):
Expand Down Expand Up @@ -1081,6 +1083,11 @@ def as_numerical_column(
) -> "cudf.core.column.NumericalColumn":
raise NotImplementedError

def as_decimal_column(
    self, dtype: Dtype, **kwargs
) -> "cudf.core.column.DecimalColumn":
    """Cast this column to a decimal column of type ``dtype``.

    This implementation is a placeholder that unconditionally raises.
    NOTE(review): presumably column subclasses that support the
    conversion provide their own version -- confirm against them.

    Parameters
    ----------
    dtype : Dtype
        The requested decimal dtype (unused here).
    **kwargs
        Accepted for signature compatibility; ignored here.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError()

def as_datetime_column(
self, dtype: Dtype, **kwargs
) -> "cudf.core.column.DatetimeColumn":
Expand Down
23 changes: 23 additions & 0 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@
import cupy as cp
import numpy as np
import pyarrow as pa
from pandas.api.types import is_integer_dtype

import cudf
from cudf import _lib as libcudf
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase
from cudf.core.dtypes import Decimal64Dtype
from cudf.utils.utils import pa_mask_buffer_to_mask
from cudf._typing import Dtype


class DecimalColumn(ColumnBase):
Expand Down Expand Up @@ -59,6 +62,26 @@ def binary_operator(self, op, other, reflect=False):
result.dtype.precision = _binop_precision(self.dtype, other.dtype, op)
return result

def as_decimal_column(
    self, dtype: Dtype, **kwargs
) -> "cudf.core.column.DecimalColumn":
    """Cast this decimal column to another decimal ``dtype``.

    Parameters
    ----------
    dtype : Dtype
        Target dtype. When it compares equal to ``self.dtype`` the
        column itself is returned and no cast is performed.
    **kwargs
        Accepted for signature compatibility; not used.

    Returns
    -------
    The column produced by ``libcudf.unary.cast`` (or ``self`` when no
    conversion is needed).
    """
    # Nothing to do when the column already has the requested dtype.
    if dtype == self.dtype:
        return self

    converted = libcudf.unary.cast(self, dtype)

    # Per the review discussion on this change, the libcudf cast reports
    # a precision of 18 regardless of the request, so the precision the
    # caller asked for is written back onto the result's dtype.
    if isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
        converted.dtype.precision = dtype.precision
    return converted

def as_numerical_column(
    self, dtype: Dtype
) -> "cudf.core.column.NumericalColumn":
    """Cast this decimal column to a non-integer numerical ``dtype``.

    Parameters
    ----------
    dtype : Dtype
        Target numerical dtype.

    Returns
    -------
    The column produced by ``libcudf.unary.cast``.

    Raises
    ------
    NotImplementedError
        If ``dtype`` is an integer dtype.
    """
    # Any non-integer target is handed straight to the libcudf cast.
    if not is_integer_dtype(dtype):
        return libcudf.unary.cast(self, dtype)

    raise NotImplementedError(
        "Casting from decimal types to integer "
        "types not currently supported"
    )


def _binop_precision(l_dtype, r_dtype, op):
"""
Expand Down
13 changes: 13 additions & 0 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,19 @@ def as_timedelta_column(
),
)

def as_decimal_column(
self, dtype: Dtype, **kwargs
) -> "cudf.core.column.DecimalColumn":
if is_integer_dtype(self.dtype):
raise NotImplementedError(
"Casting from integer types to decimal "
"types not currently supported"
)
result = libcudf.unary.cast(self, dtype)
if isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
result.dtype.precision = dtype.precision
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is it required to update the precision in the result?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The libcudf unary cast always returns a precision of 18, so the result needs to be patched with the correct precision on the Python side.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, we are pretty much stuck with always patching the precision attribute after getting a decimal column back from libcudf :(

return result

def as_numerical_column(self, dtype: Dtype) -> NumericalColumn:
dtype = np.dtype(dtype)
if dtype == self.dtype:
Expand Down
121 changes: 120 additions & 1 deletion python/cudf/cudf/tests/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,18 @@

from decimal import Decimal

import numpy as np
import pyarrow as pa
import pytest
import cudf

from cudf.core.column import DecimalColumn
from cudf.core.dtypes import Decimal64Dtype
from cudf.core.column import DecimalColumn, NumericalColumn

from cudf.tests.utils import (
FLOAT_TYPES,
assert_eq,
)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -41,3 +49,114 @@ def test_from_arrow_max_precision():
DecimalColumn.from_arrow(
pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=19))
)


@pytest.mark.parametrize(
    "data",
    [
        cudf.Series(
            [
                14.12302,
                97938.2,
                np.nan,
                0.0,
                -8.302014,
                np.nan,
                94.31304,
                -112.2314,
                0.3333333,
                np.nan,
            ]
        ),
    ],
)
@pytest.mark.parametrize("from_dtype", FLOAT_TYPES)
@pytest.mark.parametrize(
    "to_dtype",
    [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(18, 9)],
)
def test_typecast_to_decimal(data, from_dtype, to_dtype):
    """Float -> decimal ``astype`` agrees with the equivalent pyarrow cast."""
    # Start from the float representation under test.
    source = data.astype(from_dtype)

    # Result produced by cudf's own cast.
    got = source.astype(to_dtype)

    # Reference: perform the same conversion through pyarrow and bring the
    # resulting decimal128 array back as a cudf decimal column.
    arrow_type = pa.decimal128(to_dtype.precision, to_dtype.scale)
    arrow_reference = source.to_arrow().cast(arrow_type)
    want = cudf.Series(DecimalColumn.from_arrow(arrow_reference))

    assert_eq(got, want)


@pytest.mark.parametrize(
    "data",
    [
        cudf.Series(
            [
                14.12309,
                2.343942,
                np.nan,
                0.0,
                -8.302082,
                np.nan,
                94.31308,
                -112.2364,
                -8.029972,
                np.nan,
            ]
        ),
    ],
)
@pytest.mark.parametrize(
    "from_dtype",
    [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(18, 10)],
)
@pytest.mark.parametrize(
    "to_dtype",
    [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(18, 10)],
)
def test_typecast_to_from_decimal(data, from_dtype, to_dtype):
    """Decimal -> decimal ``astype`` agrees with the equivalent pyarrow cast."""
    # Build the decimal input column from the float fixture.
    source = data.astype(from_dtype)

    # Result produced by cudf's own decimal-to-decimal cast.
    got = source.astype(to_dtype)

    # Reference: the same conversion through pyarrow; ``safe=False`` is
    # passed to the pyarrow cast exactly as the comparison requires.
    arrow_type = pa.decimal128(to_dtype.precision, to_dtype.scale)
    arrow_reference = source.to_arrow().cast(arrow_type, safe=False)
    want = cudf.Series(DecimalColumn.from_arrow(arrow_reference))

    assert_eq(got, want)


@pytest.mark.parametrize(
    "data",
    [
        cudf.Series(
            [
                14.12309,
                2.343942,
                np.nan,
                0.0,
                -8.302082,
                np.nan,
                94.31308,
                -112.2364,
                -8.029972,
                np.nan,
            ]
        ),
    ],
)
@pytest.mark.parametrize(
    "from_dtype",
    [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(18, 10)],
)
@pytest.mark.parametrize("to_dtype", FLOAT_TYPES)
def test_typecast_from_decimal(data, from_dtype, to_dtype):
    """Decimal -> float ``astype`` agrees with the equivalent pyarrow cast."""
    # Build the decimal input column from the float fixture.
    source = data.astype(from_dtype)

    # Reference values: cast the decimal data to the float type in pyarrow.
    arrow_reference = source.to_arrow().cast(to_dtype, safe=False)

    # Result produced by cudf's own decimal-to-float cast.
    got = source.astype(to_dtype)
    want = cudf.Series(NumericalColumn.from_arrow(arrow_reference))

    assert_eq(got, want)