Skip to content

Commit

Permalink
Expose a Decimal32Dtype in cuDF Python (#8438)
Browse files Browse the repository at this point in the history
Fixes: #8218

Similarly to libcudf's 64-bit decimal type, this PR exposes the `Decimal32Dtype` and its corresponding `Decimal32Column` type. With this implementation, users can create a series or dataframe with the `decimal32` dtype.
Note: Only the `to_arrow` and `from_arrow` methods are currently supported.



**Example:** 
```
>>> import cudf
>>> s = cudf.Series([1,2,3,4], dtype=cudf.Decimal32Dtype(precision=8, scale=2))
>>> s
0    1.00
1    2.00
2    3.00
3    4.00
dtype: decimal32
```

Authors:
  - Sheilah Kirui (https://github.com/skirui-source)

Approvers:
  - Michael Wang (https://github.com/isVoid)

URL: #8438
  • Loading branch information
skirui-source authored Jun 29, 2021
1 parent e6a0fe3 commit 2d9fd5f
Show file tree
Hide file tree
Showing 14 changed files with 360 additions and 94 deletions.
1 change: 1 addition & 0 deletions python/cudf/cudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from cudf.core.dtypes import (
CategoricalDtype,
Decimal64Dtype,
Decimal32Dtype,
IntervalDtype,
ListDtype,
StructDtype,
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ from libcpp.string cimport string

def from_decimal(Column input_col):
"""
Converts a `DecimalColumn` to a `StringColumn`.
Converts a `Decimal64Column` to a `StringColumn`.
Parameters
----------
Expand All @@ -50,7 +50,7 @@ def from_decimal(Column input_col):

def to_decimal(Column input_col, object out_type):
"""
Returns a `DecimalColumn` from the provided `StringColumn`
Returns a `Decimal64Column` from the provided `StringColumn`
using the scale in the `out_type`.
Parameters
Expand Down
40 changes: 28 additions & 12 deletions python/cudf/cudf/_lib/types.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,19 @@ from cudf._lib.types cimport (
)
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
from cudf.core.dtypes import ListDtype, StructDtype, Decimal64Dtype
from cudf.utils.dtypes import is_decimal_dtype, is_list_dtype, is_struct_dtype

from cudf.core.dtypes import (
ListDtype,
StructDtype,
Decimal64Dtype,
Decimal32Dtype
)
from cudf.utils.dtypes import (
is_decimal_dtype,
is_list_dtype,
is_struct_dtype,
is_decimal64_dtype,
is_decimal32_dtype
)
cimport cudf._lib.cpp.types as libcudf_types


Expand Down Expand Up @@ -191,21 +201,22 @@ cdef dtype_from_structs_column_view(column_view cv):
}
return StructDtype(fields)

cdef dtype_from_decimal_column_view(column_view cv):
scale = -cv.type().scale()
return Decimal64Dtype(precision=Decimal64Dtype.MAX_PRECISION, scale=scale)

cdef dtype_from_column_view(column_view cv):
cdef libcudf_types.type_id tid = cv.type().id()
if tid == libcudf_types.type_id.LIST:
return dtype_from_lists_column_view(cv)
elif tid == libcudf_types.type_id.STRUCT:
return dtype_from_structs_column_view(cv)
elif tid == libcudf_types.type_id.DECIMAL64:
return dtype_from_decimal_column_view(cv)
return Decimal64Dtype(
precision=Decimal64Dtype.MAX_PRECISION,
scale=-cv.type().scale()
)
elif tid == libcudf_types.type_id.DECIMAL32:
raise NotImplementedError("decimal32 types are not supported yet. "
"Use decimal64 instead")
return Decimal32Dtype(
precision=Decimal32Dtype.MAX_PRECISION,
scale=-cv.type().scale()
)
else:
return cudf_to_np_types[<underlying_type_t_type_id>(tid)]

Expand All @@ -214,14 +225,19 @@ cdef libcudf_types.data_type dtype_to_data_type(dtype) except *:
tid = libcudf_types.type_id.LIST
elif is_struct_dtype(dtype):
tid = libcudf_types.type_id.STRUCT
elif is_decimal_dtype(dtype):
elif is_decimal64_dtype(dtype):
tid = libcudf_types.type_id.DECIMAL64
elif is_decimal32_dtype(dtype):
tid = libcudf_types.type_id.DECIMAL32
else:
tid = <libcudf_types.type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(dtype)]))

if tid == libcudf_types.type_id.DECIMAL64:
if tid in (
libcudf_types.type_id.DECIMAL64,
libcudf_types.type_id.DECIMAL32
):
return libcudf_types.data_type(tid, -dtype.scale)
else:
return libcudf_types.data_type(tid)
10 changes: 8 additions & 2 deletions python/cudf/cudf/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@

import cudf
from cudf._lib.scalar import DeviceScalar
from cudf.core.dtypes import (
from cudf.core.dtypes import ( # noqa: F401
_BaseDtype,
is_categorical_dtype,
is_decimal32_dtype,
is_decimal64_dtype,
is_decimal_dtype,
is_interval_dtype,
is_list_dtype,
Expand All @@ -39,11 +41,15 @@ def is_numeric_dtype(obj):
Whether or not the array or dtype is of a numeric dtype.
"""
if isclass(obj):
if issubclass(obj, cudf.Decimal64Dtype):
if issubclass(obj, (cudf.Decimal32Dtype, cudf.Decimal64Dtype)):
return True
if issubclass(obj, _BaseDtype):
return False
else:
if isinstance(obj, cudf.Decimal32Dtype) or isinstance(
getattr(obj, "dtype", None), cudf.Decimal32Dtype
):
return True
if isinstance(obj, cudf.Decimal64Dtype) or isinstance(
getattr(obj, "dtype", None), cudf.Decimal64Dtype
):
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/core/column/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,7 @@
from cudf.core.column.struct import StructColumn # noqa: F401
from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401
from cudf.core.column.interval import IntervalColumn # noqa: F401
from cudf.core.column.decimal import DecimalColumn # noqa: F401
from cudf.core.column.decimal import ( # noqa: F401
Decimal32Column,
Decimal64Column,
)
68 changes: 53 additions & 15 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,28 +47,30 @@
)
from cudf.utils import ioutils, utils
from cudf.utils.dtypes import (
_is_non_decimal_numeric_dtype,
_is_scalar_or_zero_d_array,
check_cast_unsupported_dtype,
cudf_dtype_from_pa_type,
get_time_unit,
is_categorical_dtype,
is_decimal_dtype,
is_interval_dtype,
is_list_dtype,
is_scalar,
is_string_dtype,
is_struct_dtype,
min_unsigned_type,
np_to_pa_dtype,
)
from cudf.utils.utils import mask_dtype

from ...api.types import (
_is_non_decimal_numeric_dtype,
_is_scalar_or_zero_d_array,
infer_dtype,
is_bool_dtype,
is_categorical_dtype,
is_decimal32_dtype,
is_decimal64_dtype,
is_decimal_dtype,
is_dtype_equal,
is_integer_dtype,
is_interval_dtype,
is_list_dtype,
is_scalar,
is_string_dtype,
is_struct_dtype,
pandas_dtype,
)

Expand Down Expand Up @@ -279,7 +281,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
):
return cudf.core.column.IntervalColumn.from_arrow(array)
elif isinstance(array.type, pa.Decimal128Type):
return cudf.core.column.DecimalColumn.from_arrow(array)
return cudf.core.column.Decimal64Column.from_arrow(array)

result = libcudf.interop.from_arrow(data, data.column_names)._data[
"None"
Expand Down Expand Up @@ -973,7 +975,19 @@ def as_string_column(

def as_decimal_column(
self, dtype: Dtype, **kwargs
) -> "cudf.core.column.DecimalColumn":
) -> Union[
"cudf.core.column.Decimal32Column", "cudf.core.column.Decimal64Column"
]:
raise NotImplementedError

def as_decimal64_column(
self, dtype: Dtype, **kwargs
) -> "cudf.core.column.Decimal64Column":
# Placeholder for conversion to a 64-bit decimal column: this version
# unconditionally raises NotImplementedError and ignores ``dtype`` and
# ``kwargs``.
# NOTE(review): presumably concrete column types override this — confirm.
raise NotImplementedError

def as_decimal32_column(
self, dtype: Dtype, **kwargs
) -> "cudf.core.column.Decimal32Column":
# Placeholder for conversion to a 32-bit decimal column: this version
# unconditionally raises NotImplementedError and ignores ``dtype`` and
# ``kwargs``.
# NOTE(review): presumably concrete column types override this — confirm.
raise NotImplementedError

def apply_boolean_mask(self, mask) -> ColumnBase:
Expand Down Expand Up @@ -1468,10 +1482,22 @@ def build_column(
null_count=null_count,
children=children,
)
elif is_decimal_dtype(dtype):
elif is_decimal64_dtype(dtype):
if size is None:
raise TypeError("Must specify size")
return cudf.core.column.DecimalColumn(
return cudf.core.column.Decimal64Column(
data=data,
size=size,
offset=offset,
dtype=dtype,
mask=mask,
null_count=null_count,
children=children,
)
elif is_decimal32_dtype(dtype):
if size is None:
raise TypeError("Must specify size")
return cudf.core.column.Decimal32Column(
data=data,
size=size,
offset=offset,
Expand Down Expand Up @@ -2020,8 +2046,20 @@ def as_column(
precision=dtype.precision, scale=dtype.scale
),
)
return cudf.core.column.DecimalColumn.from_arrow(data)
dtype = pandas_dtype(dtype)
return cudf.core.column.Decimal64Column.from_arrow(
data
)
if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype):
data = pa.array(
arbitrary,
type=pa.decimal128(
precision=dtype.precision, scale=dtype.scale
),
)
return cudf.core.column.Decimal32Column.from_arrow(
data
)
dtype = pd.api.types.pandas_dtype(dtype)
if is_categorical_dtype(dtype) or is_interval_dtype(dtype):
raise TypeError
else:
Expand Down
78 changes: 69 additions & 9 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,69 @@
)
from cudf._typing import Dtype
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase, NumericalColumn, as_column
from cudf.core.dtypes import Decimal64Dtype
from cudf.core.column import ColumnBase, as_column
from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype
from cudf.utils.dtypes import is_scalar
from cudf.utils.utils import pa_mask_buffer_to_mask

from ...api.types import is_integer_dtype
from .numerical_base import NumericalBaseColumn


class DecimalColumn(NumericalBaseColumn):
class Decimal32Column(NumericalBaseColumn):
    """Decimal column whose values are stored as one 32-bit integer per row.

    Only Arrow interop (``from_arrow`` / ``to_arrow``) is implemented on
    this class itself. Arrow data is handled as 128-bit chunks, so both
    methods convert between one 32-bit word and four 32-bit words per value.
    """

    dtype: Decimal32Dtype

    @classmethod
    def from_arrow(cls, data: pa.Array):
        """Build a column from an Arrow decimal array.

        Parameters
        ----------
        data : pa.Array
            Arrow array whose data buffer is read as 128-bit chunks.

        Returns
        -------
        Decimal32Column
            Column holding the first 32-bit word of every 128-bit chunk;
            the remaining three words of each chunk are discarded.
        """
        dtype = Decimal32Dtype.from_arrow(data.type)
        mask_buf = data.buffers()[0]
        mask = (
            mask_buf
            if mask_buf is None
            else pa_mask_buffer_to_mask(mask_buf, len(data))
        )
        # Read the Arrow data buffer directly as 32-bit words instead of
        # going through frombuffer's default float64 dtype and re-viewing.
        words = cp.array(np.frombuffer(data.buffers()[1], dtype="int32"))
        # Every 4th word is the first 32 bits of a 128-bit chunk; copy so
        # the result is contiguous before it is wrapped in a Buffer.
        data_32 = words[::4].copy()
        return cls(
            data=Buffer(data_32.view("uint8")),
            size=len(data),
            dtype=dtype,
            offset=data.offset,
            mask=mask,
        )

    def to_arrow(self):
        """Export the column as an Arrow array of 128-bit chunks.

        Each stored 32-bit value is widened to four 32-bit words: the value
        itself followed by three sign-extension words.

        Returns
        -------
        pa.Array
            Array built from the (optional) mask buffer and the widened
            data buffer, using ``self.dtype.to_arrow()`` as its type.
        """
        data_buf_32 = self.base_data.to_host_array().view("int32")
        data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32")

        # use striding to set the first 32 bits of each 128-bit chunk:
        data_buf_128[::4] = data_buf_32
        # the remaining three words of each chunk are sign extension:
        # 0 for non-negative values, -1 for negative values. The fill is
        # identical for all three words, so compute it only once.
        sign_fill = np.piecewise(data_buf_32, [data_buf_32 < 0], [-1, 0])
        for word in range(1, 4):
            data_buf_128[word::4] = sign_fill

        data_buf = pa.py_buffer(data_buf_128)
        mask_buf = (
            self.base_mask
            if self.base_mask is None
            else pa.py_buffer(self.base_mask.to_host_array())
        )
        return pa.Array.from_buffers(
            type=self.dtype.to_arrow(),
            offset=self._offset,
            length=self.size,
            buffers=[mask_buf, data_buf],
        )


class Decimal64Column(NumericalBaseColumn):
dtype: Decimal64Dtype

def __truediv__(self, other):
Expand Down Expand Up @@ -61,6 +114,7 @@ def from_arrow(cls, data: pa.Array):
def to_arrow(self):
data_buf_64 = self.base_data.to_host_array().view("int64")
data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64")

# use striding to set the first 64 bits of each 128-bit chunk:
data_buf_128[::2] = data_buf_64
# use striding again to set the remaining bits of each 128-bit chunk:
Expand Down Expand Up @@ -99,7 +153,11 @@ def binary_operator(self, op, other, reflect=False):
elif op in ("eq", "ne", "lt", "gt", "le", "ge"):
if not isinstance(
other,
(DecimalColumn, cudf.core.column.NumericalColumn, cudf.Scalar),
(
Decimal64Column,
cudf.core.column.NumericalColumn,
cudf.Scalar,
),
):
raise TypeError(
f"Operator {op} not supported between"
Expand Down Expand Up @@ -146,7 +204,9 @@ def _decimal_quantile(

def as_decimal_column(
self, dtype: Dtype, **kwargs
) -> "cudf.core.column.DecimalColumn":
) -> Union[
"cudf.core.column.Decimal32Column", "cudf.core.column.Decimal64Column"
]:
if (
isinstance(dtype, Decimal64Dtype)
and dtype.scale < self.dtype.scale
Expand Down Expand Up @@ -185,8 +245,8 @@ def fillna(
if isinstance(value, (int, Decimal)):
value = cudf.Scalar(value, dtype=self.dtype)
elif (
isinstance(value, DecimalColumn)
or isinstance(value, NumericalColumn)
isinstance(value, Decimal64Column)
or isinstance(value, cudf.core.column.NumericalColumn)
and is_integer_dtype(value.dtype)
):
value = value.astype(self.dtype)
Expand Down Expand Up @@ -220,8 +280,8 @@ def __cuda_array_interface__(self):
)

def _with_type_metadata(
self: "cudf.core.column.DecimalColumn", dtype: Dtype
) -> "cudf.core.column.DecimalColumn":
self: "cudf.core.column.Decimal64Column", dtype: Dtype
) -> "cudf.core.column.Decimal64Column":
if isinstance(dtype, Decimal64Dtype):
self.dtype.precision = dtype.precision

Expand Down
Loading

0 comments on commit 2d9fd5f

Please sign in to comment.