Add cython for converting strings/fixed-point functions (rapidsai#7429)

Reference rapidsai#7285.

This PR adds Cython wrappers for the `cudf::strings::to_fixed_point`, `cudf::strings::from_fixed_point`, and `cudf::strings::is_fixed_point` libcudf functions.

Authors:
- David (@davidwendt)

Approvers:
- GALI PREM SAGAR (@galipremsagar)
- Ashwin Srinath (@shwina)
- Conor Hoekstra (@codereport)

URL: rapidsai#7429
hyperbolic2346 · Mar 23, 2021 · d77a393 · d77a393
1 parent 213f1ad
commit d77a393
Show file tree

Hide file tree

Showing 6 changed files with 216 additions and 0 deletions.
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_fixed_point.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_fixed_point.pxd
@@ -0,0 +1,21 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.types cimport data_type
+
+from libcpp.memory cimport unique_ptr
+
+cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \
+        "cudf::strings" nogil:  # declarations are callable without the GIL
+    cdef unique_ptr[column] to_fixed_point(  # strings -> fixed-point column
+        column_view input_col,
+        data_type output_type) except +  # C++ exceptions raise in Python
+
+    cdef unique_ptr[column] from_fixed_point(  # fixed-point -> strings column
+        column_view input_col) except +
+
+    cdef unique_ptr[column] is_fixed_point(  # per-row validity as booleans
+        column_view source_strings,
+        data_type output_type
+    ) except +
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
@@ -0,0 +1,107 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+import numpy as np
+
+from cudf._lib.column cimport Column
+from cudf._lib.types import np_to_cudf_types
+from cudf._lib.types cimport underlying_type_t_type_id
+from cudf._lib.cpp.types cimport DECIMAL64
+
+from cudf.core.column.column import as_column
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.strings.convert.convert_fixed_point cimport (
+    to_fixed_point as cpp_to_fixed_point,
+    from_fixed_point as cpp_from_fixed_point,
+    is_fixed_point as cpp_is_fixed_point
+)
+from cudf._lib.cpp.types cimport (
+    type_id,
+    data_type,
+)
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from libcpp.string cimport string
+
+
+def from_decimal(Column input_col):
+    """
+    Convert a `DecimalColumn` to a `StringColumn` using libcudf.
+
+    Parameters
+    ----------
+    input_col : input column of type decimal
+
+    Returns
+    -------
+    A column of strings representing the input decimal values.
+    """
+    cdef column_view input_column_view = input_col.view()
+    cdef unique_ptr[column] c_result
+    with nogil:  # the libcudf call runs with the GIL released
+        c_result = move(
+            cpp_from_fixed_point(
+                input_column_view))
+
+    return Column.from_unique_ptr(move(c_result))  # build Column from result
+
+
+def to_decimal(Column input_col, object out_type):
+    """
+    Parse a `StringColumn` into a `DecimalColumn` whose scale and
+    precision are taken from `out_type`.
+
+    Parameters
+    ----------
+    input_col : input column of type string
+    out_type : decimal dtype; its `scale` and `precision` are used
+
+    Returns
+    -------
+    A DECIMAL64 column of decimals parsed from the string values.
+    """
+    cdef column_view input_column_view = input_col.view()
+    cdef unique_ptr[column] c_result
+    cdef int scale = out_type.scale
+    cdef data_type c_out_type = data_type(DECIMAL64, -scale)  # sign flipped
+    with nogil:  # the libcudf call runs with the GIL released
+        c_result = move(
+            cpp_to_fixed_point(
+                input_column_view,
+                c_out_type))
+
+    result = Column.from_unique_ptr(move(c_result))
+    result.dtype.precision = out_type.precision  # keep requested precision
+    return result
+
+
+def is_fixed_point(Column input_col, object dtype):
+    """
+    Returns a Column of booleans that is True for each string in
+    `input_col` made up of valid fixed-point characters. A row is
+    also False if converting its string would cause an integer
+    overflow; the scale of `dtype` is used to determine overflow
+    for each row.
+
+    Parameters
+    ----------
+    input_col : input column of type string
+    dtype : decimal dtype; only its `scale` is read here
+
+    Returns
+    -------
+    A Column of booleans indicating valid decimal conversion.
+    """
+    cdef unique_ptr[column] c_result
+    cdef column_view source_view = input_col.view()
+    cdef int scale = dtype.scale
+    cdef data_type c_dtype = data_type(DECIMAL64, -scale)  # sign flipped
+    with nogil:  # the libcudf call runs with the GIL released
+        c_result = move(cpp_is_fixed_point(
+            source_view,
+            c_dtype
+        ))
+
+    return Column.from_unique_ptr(move(c_result))  # build Column from result
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -1036,6 +1036,8 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
                     "Casting interval columns not currently supported"
                 )
             return self
+        elif is_decimal_dtype(dtype):
+            return self.as_decimal_column(dtype, **kwargs)
         elif np.issubdtype(dtype, np.datetime64):
             return self.as_datetime_column(dtype, **kwargs)
         elif np.issubdtype(dtype, np.timedelta64):
@@ -1106,6 +1108,11 @@ def as_string_column(
     ) -> "cudf.core.column.StringColumn":
         raise NotImplementedError
 
+    def as_decimal_column(
+        self, dtype: Dtype, **kwargs
+    ) -> "cudf.core.column.DecimalColumn":
+        raise NotImplementedError  # overridden by columns that support it
+
     def apply_boolean_mask(self, mask) -> ColumnBase:
         mask = as_column(mask, dtype="bool")
         result = (

diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
@@ -1,14 +1,22 @@
 # Copyright (c) 2021, NVIDIA CORPORATION.
 
+import cudf
 import cupy as cp
 import numpy as np
 import pyarrow as pa
 
+from typing import cast
+
 from cudf import _lib as libcudf
 from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase
 from cudf.core.dtypes import Decimal64Dtype
 from cudf.utils.utils import pa_mask_buffer_to_mask
+from cudf._lib.strings.convert.convert_fixed_point import (
+    from_decimal as cpp_from_decimal,
+)
+from cudf._typing import Dtype
+from cudf.core.column import as_column
 
 
 class DecimalColumn(ColumnBase):
@@ -59,6 +67,16 @@ def binary_operator(self, op, other, reflect=False):
         result.dtype.precision = _binop_precision(self.dtype, other.dtype, op)
         return result
 
+    def as_string_column(
+        self, dtype: Dtype, format=None  # neither argument is used here
+    ) -> "cudf.core.column.StringColumn":
+        if len(self) > 0:  # non-empty: convert through the libcudf wrapper
+            return cpp_from_decimal(self)
+        else:  # empty: return an empty object-dtype column instead
+            return cast(
+                "cudf.core.column.StringColumn", as_column([], dtype="object")
+            )
+
 
 def _binop_precision(l_dtype, r_dtype, op):
     """

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -86,6 +86,9 @@
     count_re as cpp_count_re,
     match_re as cpp_match_re,
 )
+from cudf._lib.strings.convert.convert_fixed_point import (
+    to_decimal as cpp_to_decimal,
+)
 from cudf._lib.strings.convert.convert_urls import (
     url_decode as cpp_url_decode,
     url_encode as cpp_url_encode,
@@ -4887,6 +4890,11 @@ def as_timedelta_column(
         format = "%D days %H:%M:%S"
         return self._as_datetime_or_timedelta_column(out_dtype, format)
 
+    def as_decimal_column(
+        self, dtype: Dtype, **kwargs  # kwargs are accepted but not used
+    ) -> "cudf.core.column.DecimalColumn":
+        return cpp_to_decimal(self, dtype)  # parse strings per the dtype
+
     def as_string_column(self, dtype: Dtype, format=None) -> StringColumn:
         return self
 

diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
@@ -3,6 +3,7 @@
 import re
 import urllib.parse
 from contextlib import ExitStack as does_not_raise
+from decimal import Decimal
 from sys import getsizeof
 
 import cupy
@@ -209,6 +210,60 @@ def test_string_astype(dtype):
     assert_eq(expect, got)
 
 
+@pytest.mark.parametrize(
+    "data, scale, precision",
+    [
+        (["1.11", "2.22", "3.33"], 2, 3),  # fractional digits
+        (["111", "222", "33"], 0, 3),  # integers, scale 0
+        (["111000", "22000", "3000"], -3, 3),  # negative scale
+        ([None, None, None], 0, 5),  # all nulls
+        ([None, "-2345", None], 0, 5),  # nulls around a negative value
+        ([], 0, 5),  # empty input
+    ],
+)
+def test_string_to_decimal(data, scale, precision):
+    gs = cudf.Series(data, dtype="str")
+    fp = gs.astype(cudf.Decimal64Dtype(scale=scale, precision=precision))
+    got = fp.astype("str")  # convert back to strings
+    assert_eq(gs, got)  # round trip must reproduce the input strings
+
+
+def test_string_empty_to_decimal():
+    gs = cudf.Series(["", "-85", ""], dtype="str")  # "" is empty, not null
+    got = gs.astype(cudf.Decimal64Dtype(scale=0, precision=5))
+    expected = cudf.Series(  # empty strings are expected to parse as 0
+        [0, -85, 0], dtype=cudf.Decimal64Dtype(scale=0, precision=5),
+    )
+    assert_eq(expected, got)
+
+
+@pytest.mark.parametrize(
+    "data, scale, precision",
+    [
+        (["1.23", "-2.34", "3.45"], 2, 3),  # fractional digits
+        (["123", "-234", "345"], 0, 3),  # integers, scale 0
+        (["12300", "-400", "5000.0"], -2, 5),  # negative scale
+        ([None, None, None], 0, 5),  # all nulls
+        ([None, "-100", None], 0, 5),  # nulls around a negative value
+        ([], 0, 5),  # empty input
+    ],
+)
+def test_string_from_decimal(data, scale, precision):
+    decimal_data = []
+    for d in data:  # wrap non-null entries in Decimal, keep None as-is
+        if d is None:
+            decimal_data.append(None)
+        else:
+            decimal_data.append(Decimal(d))
+    fp = cudf.Series(
+        decimal_data,
+        dtype=cudf.Decimal64Dtype(scale=scale, precision=precision),
+    )
+    gs = fp.astype("str")  # decimal -> string
+    got = gs.astype(cudf.Decimal64Dtype(scale=scale, precision=precision))
+    assert_eq(fp, got)  # round trip must reproduce the decimal series
+
+
 @pytest.mark.parametrize(
     "dtype", NUMERIC_TYPES + DATETIME_TYPES + ["bool", "object", "str"]
 )