Expose a Decimal32Dtype in cuDF Python (#8438)

Fixes: #8218

Similarly to libcudf's 64-bit decimal type, this PR exposes the `Decimal32Dtype` and its corresponding `Decimal32Column` type. With this implementation, users can create a Series or DataFrame with a `decimal32` dtype.

Note: Only the `to_arrow` and `from_arrow` methods are currently supported.

**Example:**
```
>>> import cudf
>>> s = cudf.Series([1,2,3,4], dtype=cudf.Decimal32Dtype(precision=8, scale=2))
>>> s
0    1.00
1    2.00
2    3.00
3    4.00
dtype: decimal32
```

Authors:
- Sheilah Kirui (https://github.com/skirui-source)

Approvers:
- Michael Wang (https://github.com/isVoid)

URL: #8438
rapidsai · Jun 29, 2021 · 2d9fd5f · 2d9fd5f
1 parent e6a0fe3
commit 2d9fd5f
Show file tree

Hide file tree

Showing 14 changed files with 360 additions and 94 deletions.
diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
@@ -47,6 +47,7 @@
 from cudf.core.dtypes import (
     CategoricalDtype,
     Decimal64Dtype,
+    Decimal32Dtype,
     IntervalDtype,
     ListDtype,
     StructDtype,

diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
@@ -28,7 +28,7 @@ from libcpp.string cimport string
 
 def from_decimal(Column input_col):
     """
-    Converts a `DecimalColumn` to a `StringColumn`.
+    Converts a `Decimal64Column` to a `StringColumn`.
 
     Parameters
     ----------
@@ -50,7 +50,7 @@ def from_decimal(Column input_col):
 
 def to_decimal(Column input_col, object out_type):
     """
-    Returns a `DecimalColumn` from the provided `StringColumn`
+    Returns a `Decimal64Column` from the provided `StringColumn`
     using the scale in the `out_type`.
 
     Parameters

diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx
@@ -14,9 +14,19 @@ from cudf._lib.types cimport (
 )
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
-from cudf.core.dtypes import ListDtype, StructDtype, Decimal64Dtype
-from cudf.utils.dtypes import is_decimal_dtype, is_list_dtype, is_struct_dtype
-
+from cudf.core.dtypes import (
+    ListDtype,
+    StructDtype,
+    Decimal64Dtype,
+    Decimal32Dtype
+)
+from cudf.utils.dtypes import (
+    is_decimal_dtype,
+    is_list_dtype,
+    is_struct_dtype,
+    is_decimal64_dtype,
+    is_decimal32_dtype
+)
 cimport cudf._lib.cpp.types as libcudf_types
 
 
@@ -191,21 +201,22 @@ cdef dtype_from_structs_column_view(column_view cv):
     }
     return StructDtype(fields)
 
-cdef dtype_from_decimal_column_view(column_view cv):
-    scale = -cv.type().scale()
-    return Decimal64Dtype(precision=Decimal64Dtype.MAX_PRECISION, scale=scale)
-
 cdef dtype_from_column_view(column_view cv):
     cdef libcudf_types.type_id tid = cv.type().id()
     if tid == libcudf_types.type_id.LIST:
         return dtype_from_lists_column_view(cv)
     elif tid == libcudf_types.type_id.STRUCT:
         return dtype_from_structs_column_view(cv)
     elif tid == libcudf_types.type_id.DECIMAL64:
-        return dtype_from_decimal_column_view(cv)
+        return Decimal64Dtype(
+            precision=Decimal64Dtype.MAX_PRECISION,
+            scale=-cv.type().scale()
+        )
     elif tid == libcudf_types.type_id.DECIMAL32:
-        raise NotImplementedError("decimal32 types are not supported yet. "
-                                  "Use decimal64 instead")
+        return Decimal32Dtype(
+            precision=Decimal32Dtype.MAX_PRECISION,
+            scale=-cv.type().scale()
+        )
     else:
         return cudf_to_np_types[<underlying_type_t_type_id>(tid)]
 
@@ -214,14 +225,19 @@ cdef libcudf_types.data_type dtype_to_data_type(dtype) except *:
         tid = libcudf_types.type_id.LIST
     elif is_struct_dtype(dtype):
         tid = libcudf_types.type_id.STRUCT
-    elif is_decimal_dtype(dtype):
+    elif is_decimal64_dtype(dtype):
         tid = libcudf_types.type_id.DECIMAL64
+    elif is_decimal32_dtype(dtype):
+        tid = libcudf_types.type_id.DECIMAL32
     else:
         tid = <libcudf_types.type_id> (
             <underlying_type_t_type_id> (
                 np_to_cudf_types[np.dtype(dtype)]))
 
-    if tid == libcudf_types.type_id.DECIMAL64:
+    if tid in (
+        libcudf_types.type_id.DECIMAL64,
+        libcudf_types.type_id.DECIMAL32
+    ):
         return libcudf_types.data_type(tid, -dtype.scale)
     else:
         return libcudf_types.data_type(tid)
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
@@ -15,9 +15,11 @@
 
 import cudf
 from cudf._lib.scalar import DeviceScalar
-from cudf.core.dtypes import (
+from cudf.core.dtypes import (  # noqa: F401
     _BaseDtype,
     is_categorical_dtype,
+    is_decimal32_dtype,
+    is_decimal64_dtype,
     is_decimal_dtype,
     is_interval_dtype,
     is_list_dtype,
@@ -39,11 +41,15 @@ def is_numeric_dtype(obj):
         Whether or not the array or dtype is of a numeric dtype.
     """
     if isclass(obj):
-        if issubclass(obj, cudf.Decimal64Dtype):
+        if issubclass(obj, (cudf.Decimal32Dtype, cudf.Decimal64Dtype)):
             return True
         if issubclass(obj, _BaseDtype):
             return False
     else:
+        if isinstance(obj, cudf.Decimal32Dtype) or isinstance(
+            getattr(obj, "dtype", None), cudf.Decimal32Dtype
+        ):
+            return True
         if isinstance(obj, cudf.Decimal64Dtype) or isinstance(
             getattr(obj, "dtype", None), cudf.Decimal64Dtype
         ):

diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py
@@ -23,4 +23,7 @@
 from cudf.core.column.struct import StructColumn  # noqa: F401
 from cudf.core.column.timedelta import TimeDeltaColumn  # noqa: F401
 from cudf.core.column.interval import IntervalColumn  # noqa: F401
-from cudf.core.column.decimal import DecimalColumn  # noqa: F401
+from cudf.core.column.decimal import (  # noqa: F401
+    Decimal32Column,
+    Decimal64Column,
+)
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -47,28 +47,30 @@
 )
 from cudf.utils import ioutils, utils
 from cudf.utils.dtypes import (
-    _is_non_decimal_numeric_dtype,
-    _is_scalar_or_zero_d_array,
     check_cast_unsupported_dtype,
     cudf_dtype_from_pa_type,
     get_time_unit,
-    is_categorical_dtype,
-    is_decimal_dtype,
-    is_interval_dtype,
-    is_list_dtype,
-    is_scalar,
-    is_string_dtype,
-    is_struct_dtype,
     min_unsigned_type,
     np_to_pa_dtype,
 )
 from cudf.utils.utils import mask_dtype
 
 from ...api.types import (
+    _is_non_decimal_numeric_dtype,
+    _is_scalar_or_zero_d_array,
     infer_dtype,
     is_bool_dtype,
+    is_categorical_dtype,
+    is_decimal32_dtype,
+    is_decimal64_dtype,
+    is_decimal_dtype,
     is_dtype_equal,
     is_integer_dtype,
+    is_interval_dtype,
+    is_list_dtype,
+    is_scalar,
+    is_string_dtype,
+    is_struct_dtype,
     pandas_dtype,
 )
 
@@ -279,7 +281,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
         ):
             return cudf.core.column.IntervalColumn.from_arrow(array)
         elif isinstance(array.type, pa.Decimal128Type):
-            return cudf.core.column.DecimalColumn.from_arrow(array)
+            return cudf.core.column.Decimal64Column.from_arrow(array)
 
         result = libcudf.interop.from_arrow(data, data.column_names)._data[
             "None"
@@ -973,7 +975,19 @@ def as_string_column(
 
     def as_decimal_column(
         self, dtype: Dtype, **kwargs
-    ) -> "cudf.core.column.DecimalColumn":
+    ) -> Union[
+        "cudf.core.column.Decimal32Column", "cudf.core.column.Decimal64Column"
+    ]:
+        raise NotImplementedError
+
+    def as_decimal64_column(
+        self, dtype: Dtype, **kwargs
+    ) -> "cudf.core.column.Decimal64Column":
+        raise NotImplementedError
+
+    def as_decimal32_column(
+        self, dtype: Dtype, **kwargs
+    ) -> "cudf.core.column.Decimal32Column":
         raise NotImplementedError
 
     def apply_boolean_mask(self, mask) -> ColumnBase:
@@ -1468,10 +1482,22 @@ def build_column(
             null_count=null_count,
             children=children,
         )
-    elif is_decimal_dtype(dtype):
+    elif is_decimal64_dtype(dtype):
         if size is None:
             raise TypeError("Must specify size")
-        return cudf.core.column.DecimalColumn(
+        return cudf.core.column.Decimal64Column(
+            data=data,
+            size=size,
+            offset=offset,
+            dtype=dtype,
+            mask=mask,
+            null_count=null_count,
+            children=children,
+        )
+    elif is_decimal32_dtype(dtype):
+        if size is None:
+            raise TypeError("Must specify size")
+        return cudf.core.column.Decimal32Column(
             data=data,
             size=size,
             offset=offset,
@@ -2020,8 +2046,20 @@ def as_column(
                                 precision=dtype.precision, scale=dtype.scale
                             ),
                         )
-                        return cudf.core.column.DecimalColumn.from_arrow(data)
-                    dtype = pandas_dtype(dtype)
+                        return cudf.core.column.Decimal64Column.from_arrow(
+                            data
+                        )
+                    if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype):
+                        data = pa.array(
+                            arbitrary,
+                            type=pa.decimal128(
+                                precision=dtype.precision, scale=dtype.scale
+                            ),
+                        )
+                        return cudf.core.column.Decimal32Column.from_arrow(
+                            data
+                        )
+                    dtype = pd.api.types.pandas_dtype(dtype)
                     if is_categorical_dtype(dtype) or is_interval_dtype(dtype):
                         raise TypeError
                     else:

diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
@@ -16,16 +16,69 @@
 )
 from cudf._typing import Dtype
 from cudf.core.buffer import Buffer
-from cudf.core.column import ColumnBase, NumericalColumn, as_column
-from cudf.core.dtypes import Decimal64Dtype
+from cudf.core.column import ColumnBase, as_column
+from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype
 from cudf.utils.dtypes import is_scalar
 from cudf.utils.utils import pa_mask_buffer_to_mask
 
 from ...api.types import is_integer_dtype
 from .numerical_base import NumericalBaseColumn
 
 
-class DecimalColumn(NumericalBaseColumn):
+class Decimal32Column(NumericalBaseColumn):
+    dtype: Decimal32Dtype
+
+    @classmethod
+    def from_arrow(cls, data: pa.Array):
+        dtype = Decimal32Dtype.from_arrow(data.type)
+        mask_buf = data.buffers()[0]
+        mask = (
+            mask_buf
+            if mask_buf is None
+            else pa_mask_buffer_to_mask(mask_buf, len(data))
+        )
+        data_128 = cp.array(np.frombuffer(data.buffers()[1]).view("int32"))
+        data_32 = data_128[::4].copy()
+        return cls(
+            data=Buffer(data_32.view("uint8")),
+            size=len(data),
+            dtype=dtype,
+            offset=data.offset,
+            mask=mask,
+        )
+
+    def to_arrow(self):
+        data_buf_32 = self.base_data.to_host_array().view("int32")
+        data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32")
+
+        # use striding to set the first 32 bits of each 128-bit chunk:
+        data_buf_128[::4] = data_buf_32
+        # use striding again to set the remaining bits of each 128-bit chunk:
+        # 0 for non-negative values, -1 for negative values:
+        data_buf_128[1::4] = np.piecewise(
+            data_buf_32, [data_buf_32 < 0], [-1, 0]
+        )
+        data_buf_128[2::4] = np.piecewise(
+            data_buf_32, [data_buf_32 < 0], [-1, 0]
+        )
+        data_buf_128[3::4] = np.piecewise(
+            data_buf_32, [data_buf_32 < 0], [-1, 0]
+        )
+        data_buf = pa.py_buffer(data_buf_128)
+        mask_buf = (
+            self.base_mask
+            if self.base_mask is None
+            else pa.py_buffer(self.base_mask.to_host_array())
+        )
+        return pa.Array.from_buffers(
+            type=self.dtype.to_arrow(),
+            offset=self._offset,
+            length=self.size,
+            buffers=[mask_buf, data_buf],
+        )
+
+
+class Decimal64Column(NumericalBaseColumn):
     dtype: Decimal64Dtype
 
     def __truediv__(self, other):
@@ -61,6 +114,7 @@ def from_arrow(cls, data: pa.Array):
     def to_arrow(self):
         data_buf_64 = self.base_data.to_host_array().view("int64")
         data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64")
+
         # use striding to set the first 64 bits of each 128-bit chunk:
         data_buf_128[::2] = data_buf_64
         # use striding again to set the remaining bits of each 128-bit chunk:
@@ -99,7 +153,11 @@ def binary_operator(self, op, other, reflect=False):
         elif op in ("eq", "ne", "lt", "gt", "le", "ge"):
             if not isinstance(
                 other,
-                (DecimalColumn, cudf.core.column.NumericalColumn, cudf.Scalar),
+                (
+                    Decimal64Column,
+                    cudf.core.column.NumericalColumn,
+                    cudf.Scalar,
+                ),
             ):
                 raise TypeError(
                     f"Operator {op} not supported between"
@@ -146,7 +204,9 @@ def _decimal_quantile(
 
     def as_decimal_column(
         self, dtype: Dtype, **kwargs
-    ) -> "cudf.core.column.DecimalColumn":
+    ) -> Union[
+        "cudf.core.column.Decimal32Column", "cudf.core.column.Decimal64Column"
+    ]:
         if (
             isinstance(dtype, Decimal64Dtype)
             and dtype.scale < self.dtype.scale
@@ -185,8 +245,8 @@ def fillna(
         if isinstance(value, (int, Decimal)):
             value = cudf.Scalar(value, dtype=self.dtype)
         elif (
-            isinstance(value, DecimalColumn)
-            or isinstance(value, NumericalColumn)
+            isinstance(value, Decimal64Column)
+            or isinstance(value, cudf.core.column.NumericalColumn)
             and is_integer_dtype(value.dtype)
         ):
             value = value.astype(self.dtype)
@@ -220,8 +280,8 @@ def __cuda_array_interface__(self):
         )
 
     def _with_type_metadata(
-        self: "cudf.core.column.DecimalColumn", dtype: Dtype
-    ) -> "cudf.core.column.DecimalColumn":
+        self: "cudf.core.column.Decimal64Column", dtype: Dtype
+    ) -> "cudf.core.column.Decimal64Column":
         if isinstance(dtype, Decimal64Dtype):
             self.dtype.precision = dtype.precision