Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cython for converting strings/fixed-point functions #7429

Merged
merged 16 commits into from
Mar 4, 2021
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.types cimport data_type

from libcpp.memory cimport unique_ptr

# Cython declarations for the C++ functions found in the header
# "cudf/strings/convert/convert_fixed_point.hpp" (C++ namespace
# cudf::strings). The whole block is declared `nogil`, so these functions
# may be called inside a `with nogil:` section, and `except +` asks Cython
# to translate a thrown C++ exception into a Python exception.
cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \
"cudf::strings" nogil:
# Takes a column view plus the requested output data_type and returns a
# newly owned column.
# NOTE(review): presumably parses strings into fixed-point values --
# confirm against the C++ header.
cdef unique_ptr[column] to_fixed_point(
column_view input_col,
data_type output_type) except +

# Takes a single column view and returns a newly owned column.
# NOTE(review): presumably formats fixed-point values as strings --
# confirm against the C++ header.
cdef unique_ptr[column] from_fixed_point(
column_view input_col) except +

# Takes a column view of source strings plus a data_type and returns a
# newly owned column.
# NOTE(review): presumably a per-row validity check for conversion to the
# given type -- confirm against the C++ header.
cdef unique_ptr[column] is_fixed_point(
column_view source_strings,
data_type output_type
) except +
107 changes: 107 additions & 0 deletions python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

import numpy as np

from cudf._lib.column cimport Column
from cudf._lib.types import np_to_cudf_types
from cudf._lib.types cimport underlying_type_t_type_id
from cudf._lib.cpp.types cimport DECIMAL64

from cudf.core.column.column import as_column

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.strings.convert.convert_fixed_point cimport (
to_fixed_point as cpp_to_fixed_point,
from_fixed_point as cpp_from_fixed_point,
is_fixed_point as cpp_is_fixed_point
)
from cudf._lib.cpp.types cimport (
type_id,
data_type,
)

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from libcpp.string cimport string


def from_decimal(Column input_col):
    """
    Render the values of a `DecimalColumn` as a `StringColumn`.

    Parameters
    ----------
    input_col : Column
        The decimal column to convert.

    Returns
    -------
    Column
        A column of strings, one per input decimal value.
    """
    cdef unique_ptr[column] c_strings
    cdef column_view decimal_view = input_col.view()

    # The C++ routine is declared nogil, so the GIL is released around it.
    with nogil:
        c_strings = move(cpp_from_fixed_point(decimal_view))

    return Column.from_unique_ptr(move(c_strings))


def to_decimal(Column input_col, object out_type):
    """
    Parse a `StringColumn` into a `DecimalColumn`.

    Parameters
    ----------
    input_col : Column
        The string column to parse.
    out_type : object
        Decimal dtype describing the expected result; its ``scale`` drives
        the conversion and its ``precision`` is copied onto the result's
        dtype.

    Returns
    -------
    Column
        A decimal column built from the string values.
    """
    cdef unique_ptr[column] c_decimals
    cdef column_view strings_view = input_col.view()
    cdef int py_scale = out_type.scale
    # The scale is negated before being handed to the C++ data_type.
    cdef data_type c_decimal_type = data_type(DECIMAL64, -py_scale)

    # The C++ routine is declared nogil, so the GIL is released around it.
    with nogil:
        c_decimals = move(
            cpp_to_fixed_point(strings_view, c_decimal_type)
        )

    decimal_col = Column.from_unique_ptr(move(c_decimals))
    # Only the scale travels through the C++ data_type above, so the
    # requested precision is assigned on the Python side.
    decimal_col.dtype.precision = out_type.precision
    return decimal_col


def is_fixed_point(Column input_col, object dtype):
    """
    Flag which rows of a string column can be converted to a decimal.

    A row is True when its string consists of fixed-point characters.
    A row is False when the string would cause an integer overflow; the
    scale of `dtype` is used to determine overflow.

    Parameters
    ----------
    input_col : Column
        The string column to check.
    dtype : object
        Decimal dtype whose ``scale`` is used for the overflow check.

    Returns
    -------
    Column
        A column of booleans indicating valid decimal conversion.
    """
    cdef unique_ptr[column] c_flags
    cdef column_view strings_view = input_col.view()
    cdef int py_scale = dtype.scale
    # The scale is negated before being handed to the C++ data_type.
    cdef data_type c_decimal_type = data_type(DECIMAL64, -py_scale)

    # The C++ routine is declared nogil, so the GIL is released around it.
    with nogil:
        c_flags = move(cpp_is_fixed_point(strings_view, c_decimal_type))

    return Column.from_unique_ptr(move(c_flags))
7 changes: 7 additions & 0 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1026,6 +1026,8 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
"Casting interval columns not currently supported"
)
return self
elif is_decimal_dtype(dtype):
return self.as_decimal_column(dtype, **kwargs)
elif np.issubdtype(dtype, np.datetime64):
return self.as_datetime_column(dtype, **kwargs)
elif np.issubdtype(dtype, np.timedelta64):
Expand Down Expand Up @@ -1096,6 +1098,11 @@ def as_string_column(
) -> "cudf.core.column.StringColumn":
raise NotImplementedError

def as_decimal_column(
    self,
    dtype: Dtype,
    **kwargs,
) -> "cudf.core.column.DecimalColumn":
    """Convert this column to a decimal column of type ``dtype``.

    This implementation always raises ``NotImplementedError``.
    NOTE(review): presumably column types that support the conversion
    override this method -- confirm against the subclasses.
    """
    raise NotImplementedError()

def apply_boolean_mask(self, mask) -> ColumnBase:
mask = as_column(mask, dtype="bool")
result = (
Expand Down
18 changes: 18 additions & 0 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

import cudf
import cupy as cp
import numpy as np
import pyarrow as pa

from typing import cast

from cudf import _lib as libcudf
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase
from cudf.core.dtypes import Decimal64Dtype
from cudf.utils.utils import pa_mask_buffer_to_mask
from cudf._lib.strings.convert.convert_fixed_point import (
from_decimal as cpp_from_decimal,
)
from cudf._typing import Dtype
from cudf.core.column import as_column


class DecimalColumn(ColumnBase):
Expand Down Expand Up @@ -59,6 +67,16 @@ def binary_operator(self, op, other, reflect=False):
result.dtype.precision = _binop_precision(self.dtype, other.dtype, op)
return result

def as_string_column(
    self,
    dtype: Dtype,
    format=None,
) -> "cudf.core.column.StringColumn":
    """Return the string form of this decimal column.

    An empty column yields an empty ``object``-dtype column built with
    ``as_column``; otherwise the values go through ``cpp_from_decimal``.
    ``dtype`` and ``format`` are not used by this implementation.
    """
    # Guard clause: an empty column never reaches the conversion routine.
    if len(self) == 0:
        empty_strings = as_column([], dtype="object")
        return cast("cudf.core.column.StringColumn", empty_strings)
    return cpp_from_decimal(self)


def _binop_precision(l_dtype, r_dtype, op):
"""
Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@
count_re as cpp_count_re,
match_re as cpp_match_re,
)
from cudf._lib.strings.convert.convert_fixed_point import (
to_decimal as cpp_to_decimal,
)
from cudf._lib.strings.convert.convert_urls import (
url_decode as cpp_url_decode,
url_encode as cpp_url_encode,
Expand Down Expand Up @@ -4887,6 +4890,11 @@ def as_timedelta_column(
format = "%D days %H:%M:%S"
return self._as_datetime_or_timedelta_column(out_dtype, format)

def as_decimal_column(
    self,
    dtype: Dtype,
    **kwargs,
) -> "cudf.core.column.DecimalColumn":
    """Convert this string column to a decimal column of type ``dtype``.

    The work is delegated to ``cpp_to_decimal``; extra keyword arguments
    are accepted but not used.
    """
    decimal_column = cpp_to_decimal(self, dtype)
    return decimal_column

def as_string_column(
    self,
    dtype: Dtype,
    format=None,
) -> StringColumn:
    """Return this column itself; ``dtype`` and ``format`` are ignored."""
    return self

Expand Down
55 changes: 55 additions & 0 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re
import urllib.parse
from contextlib import ExitStack as does_not_raise
from decimal import Decimal
from sys import getsizeof

import cupy
Expand Down Expand Up @@ -208,6 +209,60 @@ def test_string_astype(dtype):
assert_eq(expect, got)


@pytest.mark.parametrize(
    "data, scale, precision",
    [
        (["1.11", "2.22", "3.33"], 2, 3),
        (["111", "222", "33"], 0, 3),
        (["111000", "22000", "3000"], -3, 3),
        ([None, None, None], 0, 5),
        ([None, "-2345", None], 0, 5),
        ([], 0, 5),
    ],
)
def test_string_to_decimal(data, scale, precision):
    """Round-trip strings -> decimal -> strings and expect the input back.

    Covers positive, zero and negative scales, all-null and partially-null
    input, and an empty series.
    """
    gs = cudf.Series(data, dtype="str")
    fp = gs.astype(cudf.Decimal64Dtype(scale=scale, precision=precision))
    got = fp.astype("str")
    assert_eq(gs, got)


def test_string_empty_to_decimal():
    """Empty strings are expected to convert to a decimal zero."""
    source = cudf.Series(["", "-85", ""], dtype="str")
    converted = source.astype(cudf.Decimal64Dtype(scale=0, precision=5))

    expected_dtype = cudf.Decimal64Dtype(scale=0, precision=5)
    expected = cudf.Series([0, -85, 0], dtype=expected_dtype)

    assert_eq(expected, converted)


@pytest.mark.parametrize(
    "data, scale, precision",
    [
        (["1.23", "-2.34", "3.45"], 2, 3),
        (["123", "-234", "345"], 0, 3),
        (["12300", "-400", "5000.0"], -2, 5),
        ([None, None, None], 0, 5),
        ([None, "-100", None], 0, 5),
        ([], 0, 5),
    ],
)
def test_string_from_decimal(data, scale, precision):
    """Round-trip decimal -> strings -> decimal and expect the input back."""
    # Nulls stay as None; every other entry becomes a Decimal.
    decimal_values = [
        None if text is None else Decimal(text) for text in data
    ]
    source_dtype = cudf.Decimal64Dtype(scale=scale, precision=precision)
    fp = cudf.Series(decimal_values, dtype=source_dtype)

    as_strings = fp.astype("str")
    round_tripped = as_strings.astype(
        cudf.Decimal64Dtype(scale=scale, precision=precision)
    )
    assert_eq(fp, round_tripped)


Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@codereport For input on pytest test cases.

@pytest.mark.parametrize(
"dtype", NUMERIC_TYPES + DATETIME_TYPES + ["bool", "object", "str"]
)
Expand Down