Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose a Decimal32Dtype in cuDF Python #8438

Merged
merged 25 commits into from
Jun 29, 2021
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
08e119b
create pull request
skirui-source Jun 3, 2021
ad4a10b
create pull request
skirui-source Jun 4, 2021
725950d
Merge branch 'branch-21.08' of https://github.com/rapidsai/cudf into …
skirui-source Jun 7, 2021
2dc39e0
Merge branch 'branch-21.08' of https://github.com/rapidsai/cudf into …
skirui-source Jun 8, 2021
02ab554
.
skirui-source Jun 9, 2021
0cf41a3
added is_decimal32dtype/ is_decimal64_dtype
skirui-source Jun 9, 2021
376766d
replaced DecimalColumn with Decimal64Column in all cudf-python files
skirui-source Jun 9, 2021
911a7c8
introduced Decimal32Column in decimal.py
skirui-source Jun 11, 2021
c43d6a5
Merge branch 'branch-21.08' of https://github.com/rapidsai/cudf into …
skirui-source Jun 11, 2021
494e987
fixed construct series with decimal32column
skirui-source Jun 14, 2021
8d23df3
fixed merge conflict
skirui-source Jun 15, 2021
044fd25
fixed merge conflict
skirui-source Jun 16, 2021
95d3291
.
skirui-source Jun 16, 2021
00ea796
added to/from arrow tests for decimal32
skirui-source Jun 17, 2021
485ba03
Merge branch 'decimal32' of github.com:skirui-source/cudf into decimal32
skirui-source Jun 17, 2021
c279f26
Merge branch 'branch-21.08' of https://github.com/rapidsai/cudf into …
skirui-source Jun 18, 2021
4cb5b57
addressed review comments
skirui-source Jun 18, 2021
3425eae
added type-serialized, fixed merge conflict
skirui-source Jun 21, 2021
3dfeca6
Merge branch 'branch-21.08' of https://github.com/rapidsai/cudf into …
skirui-source Jun 21, 2021
53009c0
fixed CI tests
skirui-source Jun 23, 2021
8ad6e9b
Merge branch 'branch-21.08' of https://github.com/rapidsai/cudf into …
skirui-source Jun 23, 2021
4d66196
fixed failing tests-- infer_dtype
skirui-source Jun 24, 2021
0a134c9
Merge branch 'branch-21.08' of https://github.com/rapidsai/cudf into …
skirui-source Jun 28, 2021
c69ab3f
reverted to infer_dtype_from_object
skirui-source Jun 29, 2021
62ad755
fixed merge conflict in decimal.py
skirui-source Jun 29, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions python/cudf/cudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from cudf.core.dtypes import (
CategoricalDtype,
Decimal64Dtype,
Decimal32Dtype,
IntervalDtype,
ListDtype,
StructDtype,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ from libcpp.string cimport string

def from_decimal(Column input_col):
"""
Converts a `DecimalColumn` to a `StringColumn`.
Converts a `Decimal64Column` to a `StringColumn`.

Parameters
----------
Expand All @@ -50,7 +50,7 @@ def from_decimal(Column input_col):

def to_decimal(Column input_col, object out_type):
"""
Returns a `DecimalColumn` from the provided `StringColumn`
Returns a `Decimal64Column` from the provided `StringColumn`
using the scale in the `out_type`.

Parameters
Expand Down
40 changes: 28 additions & 12 deletions python/cudf/cudf/_lib/types.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,19 @@ from cudf._lib.types cimport (
)
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
from cudf.core.dtypes import ListDtype, StructDtype, Decimal64Dtype
from cudf.utils.dtypes import is_decimal_dtype, is_list_dtype, is_struct_dtype

from cudf.core.dtypes import (
ListDtype,
StructDtype,
Decimal64Dtype,
Decimal32Dtype
)
from cudf.utils.dtypes import (
is_decimal_dtype,
is_list_dtype,
is_struct_dtype,
is_decimal64_dtype,
is_decimal32_dtype
)
cimport cudf._lib.cpp.types as libcudf_types


Expand Down Expand Up @@ -191,21 +201,22 @@ cdef dtype_from_structs_column_view(column_view cv):
}
return StructDtype(fields)

cdef dtype_from_decimal_column_view(column_view cv):
scale = -cv.type().scale()
return Decimal64Dtype(precision=Decimal64Dtype.MAX_PRECISION, scale=scale)

cdef dtype_from_column_view(column_view cv):
cdef libcudf_types.type_id tid = cv.type().id()
if tid == libcudf_types.type_id.LIST:
return dtype_from_lists_column_view(cv)
elif tid == libcudf_types.type_id.STRUCT:
return dtype_from_structs_column_view(cv)
elif tid == libcudf_types.type_id.DECIMAL64:
return dtype_from_decimal_column_view(cv)
return Decimal64Dtype(
precision=Decimal64Dtype.MAX_PRECISION,
scale=-cv.type().scale()
)
elif tid == libcudf_types.type_id.DECIMAL32:
raise NotImplementedError("decimal32 types are not supported yet. "
"Use decimal64 instead")
return Decimal32Dtype(
precision=Decimal32Dtype.MAX_PRECISION,
scale=-cv.type().scale()
)
else:
return cudf_to_np_types[<underlying_type_t_type_id>(tid)]

Expand All @@ -214,14 +225,19 @@ cdef libcudf_types.data_type dtype_to_data_type(dtype) except *:
tid = libcudf_types.type_id.LIST
elif is_struct_dtype(dtype):
tid = libcudf_types.type_id.STRUCT
elif is_decimal_dtype(dtype):
elif is_decimal64_dtype(dtype):
tid = libcudf_types.type_id.DECIMAL64
elif is_decimal32_dtype(dtype):
tid = libcudf_types.type_id.DECIMAL32
else:
tid = <libcudf_types.type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(dtype)]))

if tid == libcudf_types.type_id.DECIMAL64:
if tid in (
libcudf_types.type_id.DECIMAL64,
libcudf_types.type_id.DECIMAL32
):
return libcudf_types.data_type(tid, -dtype.scale)
else:
return libcudf_types.data_type(tid)
10 changes: 8 additions & 2 deletions python/cudf/cudf/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@

import cudf
from cudf._lib.scalar import DeviceScalar
from cudf.core.dtypes import (
from cudf.core.dtypes import ( # noqa: F401
_BaseDtype,
is_categorical_dtype,
is_decimal32_dtype,
is_decimal64_dtype,
is_decimal_dtype,
is_interval_dtype,
is_list_dtype,
Expand All @@ -39,11 +41,15 @@ def is_numeric_dtype(obj):
Whether or not the array or dtype is of a numeric dtype.
"""
if isclass(obj):
if issubclass(obj, cudf.Decimal64Dtype):
if issubclass(obj, (cudf.Decimal32Dtype, cudf.Decimal64Dtype)):
return True
if issubclass(obj, _BaseDtype):
return False
else:
if isinstance(obj, cudf.Decimal32Dtype) or isinstance(
getattr(obj, "dtype", None), cudf.Decimal32Dtype
):
return True
if isinstance(obj, cudf.Decimal64Dtype) or isinstance(
getattr(obj, "dtype", None), cudf.Decimal64Dtype
):
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/core/column/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,7 @@
from cudf.core.column.struct import StructColumn # noqa: F401
from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401
from cudf.core.column.interval import IntervalColumn # noqa: F401
from cudf.core.column.decimal import DecimalColumn # noqa: F401
from cudf.core.column.decimal import ( # noqa: F401
Decimal32Column,
Decimal64Column,
)
68 changes: 53 additions & 15 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,28 +47,30 @@
)
from cudf.utils import ioutils, utils
from cudf.utils.dtypes import (
_is_non_decimal_numeric_dtype,
_is_scalar_or_zero_d_array,
check_cast_unsupported_dtype,
cudf_dtype_from_pa_type,
get_time_unit,
is_categorical_dtype,
is_decimal_dtype,
is_interval_dtype,
is_list_dtype,
is_scalar,
is_string_dtype,
is_struct_dtype,
min_unsigned_type,
np_to_pa_dtype,
)
from cudf.utils.utils import mask_dtype

from ...api.types import (
_is_non_decimal_numeric_dtype,
_is_scalar_or_zero_d_array,
infer_dtype,
is_bool_dtype,
is_categorical_dtype,
is_decimal32_dtype,
is_decimal64_dtype,
is_decimal_dtype,
is_dtype_equal,
is_integer_dtype,
is_interval_dtype,
is_list_dtype,
is_scalar,
is_string_dtype,
is_struct_dtype,
pandas_dtype,
)

Expand Down Expand Up @@ -279,7 +281,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
):
return cudf.core.column.IntervalColumn.from_arrow(array)
elif isinstance(array.type, pa.Decimal128Type):
return cudf.core.column.DecimalColumn.from_arrow(array)
return cudf.core.column.Decimal64Column.from_arrow(array)

result = libcudf.interop.from_arrow(data, data.column_names)._data[
"None"
Expand Down Expand Up @@ -973,7 +975,19 @@ def as_string_column(

def as_decimal_column(
skirui-source marked this conversation as resolved.
Show resolved Hide resolved
self, dtype: Dtype, **kwargs
) -> "cudf.core.column.DecimalColumn":
) -> Union[
"cudf.core.column.Decimal32Column", "cudf.core.column.Decimal64Column"
]:
raise NotImplementedError

# Placeholder for converting this column to a ``Decimal64Column``.
# ``dtype`` and ``kwargs`` are accepted but not used here: this
# implementation unconditionally raises ``NotImplementedError``.
# NOTE(review): presumably overridden by column types that support the
# cast -- confirm against the subclasses.
def as_decimal64_column(
self, dtype: Dtype, **kwargs
) -> "cudf.core.column.Decimal64Column":
raise NotImplementedError

# Placeholder for converting this column to a ``Decimal32Column``.
# ``dtype`` and ``kwargs`` are accepted but not used here: this
# implementation unconditionally raises ``NotImplementedError``.
# NOTE(review): presumably overridden by column types that support the
# cast -- confirm against the subclasses.
def as_decimal32_column(
self, dtype: Dtype, **kwargs
) -> "cudf.core.column.Decimal32Column":
raise NotImplementedError

def apply_boolean_mask(self, mask) -> ColumnBase:
Expand Down Expand Up @@ -1468,10 +1482,22 @@ def build_column(
null_count=null_count,
children=children,
)
elif is_decimal_dtype(dtype):
elif is_decimal64_dtype(dtype):
if size is None:
raise TypeError("Must specify size")
return cudf.core.column.DecimalColumn(
return cudf.core.column.Decimal64Column(
data=data,
size=size,
offset=offset,
dtype=dtype,
mask=mask,
null_count=null_count,
children=children,
)
elif is_decimal32_dtype(dtype):
if size is None:
raise TypeError("Must specify size")
return cudf.core.column.Decimal32Column(
data=data,
size=size,
offset=offset,
Expand Down Expand Up @@ -2020,8 +2046,20 @@ def as_column(
precision=dtype.precision, scale=dtype.scale
),
)
return cudf.core.column.DecimalColumn.from_arrow(data)
dtype = pandas_dtype(dtype)
return cudf.core.column.Decimal64Column.from_arrow(
data
)
if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype):
data = pa.array(
arbitrary,
type=pa.decimal128(
precision=dtype.precision, scale=dtype.scale
),
)
return cudf.core.column.Decimal32Column.from_arrow(
data
)
dtype = pd.api.types.pandas_dtype(dtype)
if is_categorical_dtype(dtype) or is_interval_dtype(dtype):
raise TypeError
else:
Expand Down
78 changes: 68 additions & 10 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,69 @@
)
from cudf._typing import Dtype
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase, NumericalColumn, as_column
from cudf.core.dtypes import Decimal64Dtype
from cudf.core.column import ColumnBase, as_column
from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype
from cudf.utils.dtypes import is_scalar
from cudf.utils.utils import pa_mask_buffer_to_mask

from .numerical_base import NumericalBaseColumn
from ...api.types import is_integer_dtype
from .numerical_base import NumericalBaseColumn


class Decimal32Column(NumericalBaseColumn):
    """Decimal column that stores one 32-bit integer per row.

    Arrow interchange goes through a 128-bit-per-row buffer, handled here
    as four consecutive 32-bit words per row. Only the first
    (lowest-addressed) word of each row is kept on import; on export the
    other three words are filled with the sign of the stored value.
    """

    dtype: Decimal32Dtype

    @classmethod
    def from_arrow(cls, data: pa.Array):
        """Build a column from an Arrow decimal array.

        Parameters
        ----------
        data : pa.Array
            Arrow array whose type is accepted by
            ``Decimal32Dtype.from_arrow`` and whose data buffer holds four
            32-bit words per row.

        Returns
        -------
        Decimal32Column
            Column holding the first 32-bit word of every row, with the
            validity mask and offset carried over from ``data``.

        Notes
        -----
        The upper three words of each row are dropped without checking
        that they are a pure sign extension of the first word.
        """
        dtype = Decimal32Dtype.from_arrow(data.type)
        buffers = data.buffers()
        mask_buf = buffers[0]
        # Arrow may omit the validity buffer; keep None in that case.
        mask = (
            None
            if mask_buf is None
            else pa_mask_buffer_to_mask(mask_buf, len(data))
        )
        # Read the raw bytes directly as 32-bit words and move them to the
        # device, then keep word 0 of every group of four.
        words = cp.array(np.frombuffer(buffers[1], dtype="int32"))
        data_32 = words[::4].copy()
        return cls(
            data=Buffer(data_32.view("uint8")),
            size=len(data),
            dtype=dtype,
            offset=data.offset,
            mask=mask,
        )

    def to_arrow(self):
        """Export the column as an Arrow array of type ``dtype.to_arrow()``.

        Returns
        -------
        pa.Array
            Array built from the (optional) validity buffer and a data
            buffer in which every row is widened to four 32-bit words.
        """
        data_buf_32 = self.base_data.to_host_array().view("int32")

        # Sign-extension word: -1 (all bits set) for negative values,
        # 0 otherwise. Computed once and reused for words 1..3.
        sign_words = np.piecewise(data_buf_32, [data_buf_32 < 0], [-1, 0])

        data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32")
        # use striding to set the first 32 bits of each 128-bit chunk:
        data_buf_128[::4] = data_buf_32
        # use striding again to set the remaining bits of each 128-bit chunk:
        for word in (1, 2, 3):
            data_buf_128[word::4] = sign_words

        data_buf = pa.py_buffer(data_buf_128)
        mask_buf = (
            None
            if self.base_mask is None
            else pa.py_buffer(self.base_mask.to_host_array())
        )
        return pa.Array.from_buffers(
            type=self.dtype.to_arrow(),
            offset=self._offset,
            length=self.size,
            buffers=[mask_buf, data_buf],
        )


class DecimalColumn(NumericalBaseColumn):
class Decimal64Column(NumericalBaseColumn):
dtype: Decimal64Dtype

def __truediv__(self, other):
Expand Down Expand Up @@ -60,6 +113,7 @@ def from_arrow(cls, data: pa.Array):
def to_arrow(self):
data_buf_64 = self.base_data.to_host_array().view("int64")
data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64")

# use striding to set the first 64 bits of each 128-bit chunk:
data_buf_128[::2] = data_buf_64
# use striding again to set the remaining bits of each 128-bit chunk:
Expand Down Expand Up @@ -98,7 +152,11 @@ def binary_operator(self, op, other, reflect=False):
elif op in ("eq", "ne", "lt", "gt", "le", "ge"):
if not isinstance(
other,
(DecimalColumn, cudf.core.column.NumericalColumn, cudf.Scalar),
(
Decimal64Dtype,
skirui-source marked this conversation as resolved.
Show resolved Hide resolved
cudf.core.column.NumericalColumn,
cudf.Scalar,
),
):
raise TypeError(
f"Operator {op} not supported between"
Expand Down Expand Up @@ -145,7 +203,7 @@ def _decimal_quantile(

def as_decimal_column(
self, dtype: Dtype, **kwargs
) -> "cudf.core.column.DecimalColumn":
) -> "cudf.core.column.Decimal64Column":
if dtype == self.dtype:
return self
return libcudf.unary.cast(self, dtype)
Expand Down Expand Up @@ -175,8 +233,8 @@ def fillna(
if isinstance(value, (int, Decimal)):
value = cudf.Scalar(value, dtype=self.dtype)
elif (
isinstance(value, DecimalColumn)
or isinstance(value, NumericalColumn)
isinstance(value, Decimal64Column)
or isinstance(value, cudf.core.column.NumericalColumn)
and is_integer_dtype(value.dtype)
):
value = value.astype(self.dtype)
Expand Down Expand Up @@ -210,8 +268,8 @@ def __cuda_array_interface__(self):
)

def _with_type_metadata(
self: "cudf.core.column.DecimalColumn", dtype: Dtype
) -> "cudf.core.column.DecimalColumn":
self: "cudf.core.column.Decimal64Column", dtype: Dtype
) -> "cudf.core.column.Decimal64Column":
if isinstance(dtype, Decimal64Dtype):
self.dtype.precision = dtype.precision

Expand Down
Loading