Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cython for converting strings/fixed-point functions #7429

Merged
merged 16 commits into from
Mar 4, 2021
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.types cimport data_type

from libcpp.memory cimport unique_ptr

# Cython declarations for the functions exposed by the libcudf header
# "cudf/strings/convert/convert_fixed_point.hpp" in the `cudf::strings`
# namespace.  The whole block is declared `nogil`, and every function is
# marked `except +` so that a C++ exception is translated into a Python
# exception instead of terminating the process.
cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \
"cudf::strings" nogil:
# Takes a column view plus the requested output `data_type` and returns an
# owning pointer to a new column.
# NOTE(review): presumably parses strings into fixed-point values -- confirm
# against the libcudf header.
cdef unique_ptr[column] to_fixed_point(
column_view input_col,
data_type output_type) except +

# Takes a column view and returns an owning pointer to a new column.
# NOTE(review): presumably formats fixed-point values as strings -- confirm
# against the libcudf header.
cdef unique_ptr[column] from_fixed_point(
column_view input_col) except +

# Takes a column view plus a `data_type` and returns an owning pointer to a
# new column.
# NOTE(review): presumably a boolean column flagging which strings are valid
# fixed-point values for `output_type` -- confirm against the libcudf header.
cdef unique_ptr[column] is_fixed_point(
column_view source_strings,
data_type output_type
) except +
53 changes: 52 additions & 1 deletion python/cudf/cudf/_lib/string_casting.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

import numpy as np

Expand All @@ -7,6 +7,7 @@ from cudf._lib.scalar import as_device_scalar
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.types import np_to_cudf_types
from cudf._lib.types cimport underlying_type_t_type_id
from cudf._lib.cpp.types cimport DECIMAL64

from cudf.core.column.column import as_column

Expand Down Expand Up @@ -45,6 +46,11 @@ from cudf._lib.cpp.strings.convert.convert_durations cimport (
to_durations as cpp_to_durations,
from_durations as cpp_from_durations
)
from cudf._lib.cpp.strings.convert.convert_fixed_point cimport (
to_fixed_point as cpp_to_fixed_point,
from_fixed_point as cpp_from_fixed_point,
is_fixed_point as cpp_is_fixed_point
)
from cudf._lib.cpp.types cimport (
type_id,
data_type,
Expand Down Expand Up @@ -771,3 +777,48 @@ def is_hex(Column source_strings):
))

return Column.from_unique_ptr(move(c_result))


def from_decimal(Column input_col):
    """
    Returns a new Column produced by passing the view of `input_col`
    to libcudf's `cudf::strings::from_fixed_point`.
    """
    cdef unique_ptr[column] c_output
    cdef column_view c_input = input_col.view()

    # The libcudf call does not touch Python objects, so release the GIL.
    with nogil:
        c_output = move(cpp_from_fixed_point(c_input))

    return Column.from_unique_ptr(move(c_output))


def to_decimal(Column input_col, object out_type):
    """
    Returns a new DECIMAL64 Column produced by passing the view of
    `input_col` to libcudf's `cudf::strings::to_fixed_point`.

    The libcudf output type is built from the negation of
    `out_type.scale`, and `out_type.precision` is copied onto the dtype
    of the returned Column.
    """
    cdef unique_ptr[column] c_output
    cdef column_view c_input = input_col.view()
    cdef int requested_scale = out_type.scale
    # The scale handed to libcudf has the opposite sign of `out_type.scale`.
    cdef data_type c_decimal_type = data_type(DECIMAL64, -requested_scale)

    with nogil:
        c_output = move(cpp_to_fixed_point(c_input, c_decimal_type))

    decimal_col = Column.from_unique_ptr(move(c_output))
    decimal_col.dtype.precision = out_type.precision
    return decimal_col


def is_fixed_point(Column input_col, object out_type):
    """
    Returns a Column of boolean values with True for the rows of
    `input_col` that have fixed-point characters.

    The check is delegated to libcudf's `cudf::strings::is_fixed_point`,
    called with a DECIMAL64 type built from the negation of
    `out_type.scale`.
    """
    cdef unique_ptr[column] c_output
    cdef column_view c_input = input_col.view()
    cdef int requested_scale = out_type.scale
    cdef data_type c_decimal_type = data_type(DECIMAL64, -requested_scale)

    with nogil:
        c_output = move(cpp_is_fixed_point(c_input, c_decimal_type))

    return Column.from_unique_ptr(move(c_output))
7 changes: 7 additions & 0 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1026,6 +1026,8 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
"Casting interval columns not currently supported"
)
return self
elif is_decimal_dtype(dtype):
return self.as_decimal_column(dtype, **kwargs)
elif np.issubdtype(dtype, np.datetime64):
return self.as_datetime_column(dtype, **kwargs)
elif np.issubdtype(dtype, np.timedelta64):
Expand Down Expand Up @@ -1096,6 +1098,11 @@ def as_string_column(
) -> "cudf.core.column.StringColumn":
raise NotImplementedError

def as_decimal_column(
    self,
    dtype: Dtype,
    **kwargs
) -> "cudf.core.column.DecimalColumn":
    """
    Cast this column to a decimal column of type ``dtype``.

    This implementation always raises ``NotImplementedError``; a column
    type that supports the conversion must provide its own version.
    """
    raise NotImplementedError

def apply_boolean_mask(self, mask) -> ColumnBase:
mask = as_column(mask, dtype="bool")
result = (
Expand Down
16 changes: 16 additions & 0 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

import cudf
import cupy as cp
import numpy as np
import pyarrow as pa

from typing import cast

from cudf import _lib as libcudf
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase
from cudf.core.dtypes import Decimal64Dtype
from cudf.utils.utils import pa_mask_buffer_to_mask
from cudf._lib import string_casting as str_cast
from cudf._typing import Dtype
from cudf.core.column import as_column


class DecimalColumn(ColumnBase):
Expand Down Expand Up @@ -59,6 +65,16 @@ def binary_operator(self, op, other, reflect=False):
result.dtype.precision = _binop_precision(self.dtype, other.dtype, op)
return result

def as_string_column(
    self, dtype: Dtype, format=None
) -> "cudf.core.column.StringColumn":
    """
    Return the contents of this decimal column as a string column.

    An empty column yields an empty ``object``-dtype column without
    calling into ``str_cast``.  ``dtype`` and ``format`` are accepted
    but not used by this implementation.
    """
    if len(self) == 0:
        empty_strings = as_column([], dtype="object")
        return cast("cudf.core.column.StringColumn", empty_strings)
    return str_cast.from_decimal(self)


def _binop_precision(l_dtype, r_dtype, op):
"""
Expand Down
5 changes: 5 additions & 0 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -4887,6 +4887,11 @@ def as_timedelta_column(
format = "%D days %H:%M:%S"
return self._as_datetime_or_timedelta_column(out_dtype, format)

def as_decimal_column(
    self, dtype: Dtype, **kwargs
) -> "cudf.core.column.DecimalColumn":
    """
    Convert this string column to a decimal column of type ``dtype``.

    The work is delegated to ``str_cast.to_decimal``; any extra keyword
    arguments are accepted but ignored.
    """
    decimal_col = str_cast.to_decimal(self, dtype)
    return decimal_col

def as_string_column(self, dtype: Dtype, format=None) -> StringColumn:
return self

Expand Down
34 changes: 34 additions & 0 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re
import urllib.parse
from contextlib import ExitStack as does_not_raise
from decimal import Decimal
from sys import getsizeof

import cupy
Expand Down Expand Up @@ -208,6 +209,39 @@ def test_string_astype(dtype):
assert_eq(expect, got)


@pytest.mark.parametrize(
    "data, scale, precision",
    [
        (["1.11", "2.22", "3.33"], 2, 3),
        (["111", "222", "33"], 0, 3),
        (["111000", "22000", "3000"], -3, 3),
    ],
)
def test_string_to_decimal(data, scale, precision):
    """Strings cast to decimal and back to strings must be unchanged."""
    expected = cudf.Series(data, dtype="str")
    decimal_dtype = cudf.Decimal64Dtype(scale=scale, precision=precision)
    round_tripped = expected.astype(decimal_dtype).astype("str")
    assert_eq(expected, round_tripped)


@pytest.mark.parametrize(
    "data, scale, precision",
    [
        (["1.23", "-2.34", "3.45"], 2, 3),
        (["123", "-234", "345"], 0, 3),
        (["12300", "-400", "5000.0"], -2, 5),
    ],
)
def test_string_from_decimal(data, scale, precision):
    """Decimals cast to strings and back to decimal must be unchanged."""
    decimal_dtype = cudf.Decimal64Dtype(scale=scale, precision=precision)
    expected = cudf.Series(
        [Decimal(text) for text in data], dtype=decimal_dtype
    )
    as_strings = expected.astype("str")
    round_tripped = as_strings.astype(decimal_dtype)
    assert_eq(expected, round_tripped)


Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@codereport For input on pytest test cases.

@pytest.mark.parametrize(
"dtype", NUMERIC_TYPES + DATETIME_TYPES + ["bool", "object", "str"]
)
Expand Down