Skip to content

Commit

Permalink
Add cython for converting strings/fixed-point functions (rapidsai#7429)
Browse files Browse the repository at this point in the history
Reference rapidsai#7285 

This PR adds Cython wrappers for `cudf::strings::to_fixed_point`, `cudf::strings::from_fixed_point`, and `cudf::strings::is_fixed_point` libcudf functions.

Authors:
  - David (@davidwendt)

Approvers:
  - GALI PREM SAGAR (@galipremsagar)
  - Ashwin Srinath (@shwina)
  - Conor Hoekstra (@codereport)

URL: rapidsai#7429
  • Loading branch information
davidwendt authored and hyperbolic2346 committed Mar 23, 2021
1 parent 213f1ad commit d77a393
Show file tree
Hide file tree
Showing 6 changed files with 216 additions and 0 deletions.
21 changes: 21 additions & 0 deletions python/cudf/cudf/_lib/cpp/strings/convert/convert_fixed_point.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.types cimport data_type

from libcpp.memory cimport unique_ptr

cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \
        "cudf::strings" nogil:
    # Declarations for the fixed-point/string conversion functions exposed by
    # the libcudf header named above. Every function hands back an owning
    # pointer to a newly created column, and ``except +`` asks Cython to turn
    # a thrown C++ exception into a Python exception.

    # strings -> fixed-point column of the requested ``output_type``
    cdef unique_ptr[column] to_fixed_point(
        column_view input_col, data_type output_type
    ) except +

    # fixed-point column -> strings
    cdef unique_ptr[column] from_fixed_point(column_view input_col) except +

    # per-row check of the strings against the given fixed-point type
    cdef unique_ptr[column] is_fixed_point(
        column_view source_strings, data_type output_type
    ) except +
107 changes: 107 additions & 0 deletions python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

import numpy as np

from cudf._lib.column cimport Column
from cudf._lib.types import np_to_cudf_types
from cudf._lib.types cimport underlying_type_t_type_id
from cudf._lib.cpp.types cimport DECIMAL64

from cudf.core.column.column import as_column

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.strings.convert.convert_fixed_point cimport (
to_fixed_point as cpp_to_fixed_point,
from_fixed_point as cpp_from_fixed_point,
is_fixed_point as cpp_is_fixed_point
)
from cudf._lib.cpp.types cimport (
type_id,
data_type,
)

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from libcpp.string cimport string


def from_decimal(Column input_col):
    """
    Convert a `DecimalColumn` into a `StringColumn`.

    Parameters
    ----------
    input_col : Column
        Input column of type decimal.

    Returns
    -------
    Column
        A column of strings representing the input decimal values.
    """
    cdef column_view decimal_view = input_col.view()
    cdef unique_ptr[column] c_strings

    # Only C++ objects are touched by the libcudf call, so it runs
    # with the GIL released.
    with nogil:
        c_strings = move(cpp_from_fixed_point(decimal_view))

    return Column.from_unique_ptr(move(c_strings))


def to_decimal(Column input_col, object out_type):
    """
    Build a `DecimalColumn` from a `StringColumn`, using the scale
    carried by `out_type`.

    Parameters
    ----------
    input_col : Column
        Input column of type string.
    out_type : object
        The type and scale of the decimal column expected; its `scale`
        and `precision` attributes are read.

    Returns
    -------
    Column
        A column of decimals parsed from the string values.
    """
    cdef column_view strings_view = input_col.view()
    cdef unique_ptr[column] c_decimals
    cdef int c_scale = out_type.scale
    # The libcudf data_type is constructed with the negated Python-side scale.
    cdef data_type c_decimal_type = data_type(DECIMAL64, -c_scale)

    with nogil:
        c_decimals = move(cpp_to_fixed_point(strings_view, c_decimal_type))

    decimals = Column.from_unique_ptr(move(c_decimals))
    # The data_type built above was given only a type id and a scale, so the
    # requested precision is copied onto the resulting column's dtype here.
    decimals.dtype.precision = out_type.precision
    return decimals


def is_fixed_point(Column input_col, object dtype):
    """
    Flag which strings of `input_col` can be converted to fixed-point.

    A row is True when its string has fixed-point characters. A row is
    False if the corresponding string would cause an integer overflow;
    the scale of `dtype` is used to determine overflow.

    Parameters
    ----------
    input_col : Column
        Input column of type string.
    dtype : object
        The type and scale of a decimal column; its `scale` attribute
        is read.

    Returns
    -------
    Column
        A Column of booleans indicating valid decimal conversion.
    """
    cdef unique_ptr[column] c_flags
    cdef column_view strings_view = input_col.view()
    cdef int c_scale = dtype.scale
    # The libcudf data_type is constructed with the negated Python-side scale.
    cdef data_type c_decimal_type = data_type(DECIMAL64, -c_scale)

    with nogil:
        c_flags = move(cpp_is_fixed_point(strings_view, c_decimal_type))

    return Column.from_unique_ptr(move(c_flags))
7 changes: 7 additions & 0 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1036,6 +1036,8 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
"Casting interval columns not currently supported"
)
return self
elif is_decimal_dtype(dtype):
return self.as_decimal_column(dtype, **kwargs)
elif np.issubdtype(dtype, np.datetime64):
return self.as_datetime_column(dtype, **kwargs)
elif np.issubdtype(dtype, np.timedelta64):
Expand Down Expand Up @@ -1106,6 +1108,11 @@ def as_string_column(
) -> "cudf.core.column.StringColumn":
raise NotImplementedError

def as_decimal_column(
    self, dtype: Dtype, **kwargs
) -> "cudf.core.column.DecimalColumn":
    """Cast this column to a decimal column.

    This implementation always raises ``NotImplementedError``.
    """
    raise NotImplementedError()

def apply_boolean_mask(self, mask) -> ColumnBase:
mask = as_column(mask, dtype="bool")
result = (
Expand Down
18 changes: 18 additions & 0 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

import cudf
import cupy as cp
import numpy as np
import pyarrow as pa

from typing import cast

from cudf import _lib as libcudf
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase
from cudf.core.dtypes import Decimal64Dtype
from cudf.utils.utils import pa_mask_buffer_to_mask
from cudf._lib.strings.convert.convert_fixed_point import (
from_decimal as cpp_from_decimal,
)
from cudf._typing import Dtype
from cudf.core.column import as_column


class DecimalColumn(ColumnBase):
Expand Down Expand Up @@ -59,6 +67,16 @@ def binary_operator(self, op, other, reflect=False):
result.dtype.precision = _binop_precision(self.dtype, other.dtype, op)
return result

def as_string_column(
    self, dtype: Dtype, format=None
) -> "cudf.core.column.StringColumn":
    """Return the values of this decimal column as a column of strings.

    ``dtype`` and ``format`` are accepted but not used by this
    implementation.
    """
    # An empty column never reaches the conversion routine; an empty
    # object-dtype column is built directly instead.
    if not len(self) > 0:
        empty_strings = as_column([], dtype="object")
        return cast("cudf.core.column.StringColumn", empty_strings)

    return cpp_from_decimal(self)


def _binop_precision(l_dtype, r_dtype, op):
"""
Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@
count_re as cpp_count_re,
match_re as cpp_match_re,
)
from cudf._lib.strings.convert.convert_fixed_point import (
to_decimal as cpp_to_decimal,
)
from cudf._lib.strings.convert.convert_urls import (
url_decode as cpp_url_decode,
url_encode as cpp_url_encode,
Expand Down Expand Up @@ -4887,6 +4890,11 @@ def as_timedelta_column(
format = "%D days %H:%M:%S"
return self._as_datetime_or_timedelta_column(out_dtype, format)

def as_decimal_column(
    self, dtype: Dtype, **kwargs
) -> "cudf.core.column.DecimalColumn":
    """Convert this string column to a decimal column of type ``dtype``.

    Extra keyword arguments are accepted but not forwarded.
    """
    decimal_column = cpp_to_decimal(self, dtype)
    return decimal_column

def as_string_column(self, dtype: Dtype, format=None) -> StringColumn:
return self

Expand Down
55 changes: 55 additions & 0 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re
import urllib.parse
from contextlib import ExitStack as does_not_raise
from decimal import Decimal
from sys import getsizeof

import cupy
Expand Down Expand Up @@ -209,6 +210,60 @@ def test_string_astype(dtype):
assert_eq(expect, got)


@pytest.mark.parametrize(
    "data, scale, precision",
    [
        (["1.11", "2.22", "3.33"], 2, 3),
        (["111", "222", "33"], 0, 3),
        (["111000", "22000", "3000"], -3, 3),
        ([None, None, None], 0, 5),
        ([None, "-2345", None], 0, 5),
        ([], 0, 5),
    ],
)
def test_string_to_decimal(data, scale, precision):
    """Strings cast to decimal and back again should equal the input."""
    strings = cudf.Series(data, dtype="str")
    decimal_dtype = cudf.Decimal64Dtype(scale=scale, precision=precision)

    # string -> decimal -> string round trip
    round_tripped = strings.astype(decimal_dtype).astype("str")

    assert_eq(strings, round_tripped)


def test_string_empty_to_decimal():
    """Empty strings are expected to convert to a decimal value of 0."""
    strings = cudf.Series(["", "-85", ""], dtype="str")

    actual = strings.astype(cudf.Decimal64Dtype(scale=0, precision=5))

    expected_dtype = cudf.Decimal64Dtype(scale=0, precision=5)
    expected = cudf.Series([0, -85, 0], dtype=expected_dtype)
    assert_eq(expected, actual)


@pytest.mark.parametrize(
    "data, scale, precision",
    [
        (["1.23", "-2.34", "3.45"], 2, 3),
        (["123", "-234", "345"], 0, 3),
        (["12300", "-400", "5000.0"], -2, 5),
        ([None, None, None], 0, 5),
        ([None, "-100", None], 0, 5),
        ([], 0, 5),
    ],
)
def test_string_from_decimal(data, scale, precision):
    """Decimals cast to strings and back again should equal the input."""
    # Nulls stay as None; every other entry becomes a Decimal.
    decimal_values = [None if d is None else Decimal(d) for d in data]
    decimals = cudf.Series(
        decimal_values,
        dtype=cudf.Decimal64Dtype(scale=scale, precision=precision),
    )

    # decimal -> string -> decimal round trip
    as_strings = decimals.astype("str")
    round_tripped = as_strings.astype(
        cudf.Decimal64Dtype(scale=scale, precision=precision)
    )

    assert_eq(decimals, round_tripped)


@pytest.mark.parametrize(
"dtype", NUMERIC_TYPES + DATETIME_TYPES + ["bool", "object", "str"]
)
Expand Down

0 comments on commit d77a393

Please sign in to comment.