From 2dd15b05f144358b2c26e291b7a8231c110d9bca Mon Sep 17 00:00:00 2001 From: David <45795991+davidwendt@users.noreply.github.com> Date: Thu, 4 Mar 2021 16:05:10 -0500 Subject: [PATCH] Add cython for converting strings/fixed-point functions (#7429) Reference #7285 This PR adds Cython wrappers for `cudf::strings::to_fixed_point`, `cudf::strings::from_fixed_point`, and `cudf::strings::is_fixed_point` libcudf functions. Authors: - David (@davidwendt) Approvers: - GALI PREM SAGAR (@galipremsagar) - Ashwin Srinath (@shwina) - Conor Hoekstra (@codereport) URL: https://github.com/rapidsai/cudf/pull/7429 --- .../strings/convert/convert_fixed_point.pxd | 21 ++++ .../strings/convert/convert_fixed_point.pyx | 107 ++++++++++++++++++ python/cudf/cudf/core/column/column.py | 7 ++ python/cudf/cudf/core/column/decimal.py | 18 +++ python/cudf/cudf/core/column/string.py | 8 ++ python/cudf/cudf/tests/test_string.py | 55 +++++++++ 6 files changed, 216 insertions(+) create mode 100644 python/cudf/cudf/_lib/cpp/strings/convert/convert_fixed_point.pxd create mode 100644 python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_fixed_point.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_fixed_point.pxd new file mode 100644 index 00000000000..77d72acb670 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_fixed_point.pxd @@ -0,0 +1,21 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.types cimport data_type + +from libcpp.memory cimport unique_ptr + +cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \ + "cudf::strings" nogil: + cdef unique_ptr[column] to_fixed_point( + column_view input_col, + data_type output_type) except + + + cdef unique_ptr[column] from_fixed_point( + column_view input_col) except + + + cdef unique_ptr[column] is_fixed_point( + column_view source_strings, + data_type output_type + ) except + diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx new file mode 100644 index 00000000000..38d238b8266 --- /dev/null +++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx @@ -0,0 +1,107 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +import numpy as np + +from cudf._lib.column cimport Column +from cudf._lib.types import np_to_cudf_types +from cudf._lib.types cimport underlying_type_t_type_id +from cudf._lib.cpp.types cimport DECIMAL64 + +from cudf.core.column.column import as_column + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.strings.convert.convert_fixed_point cimport ( + to_fixed_point as cpp_to_fixed_point, + from_fixed_point as cpp_from_fixed_point, + is_fixed_point as cpp_is_fixed_point +) +from cudf._lib.cpp.types cimport ( + type_id, + data_type, +) + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from libcpp.string cimport string + + +def from_decimal(Column input_col): + """ + Converts a `DecimalColumn` to a `StringColumn`. + + Parameters + ---------- + input_col : input column of type decimal + + Returns + ------- + A column of strings representing the input decimal values. + """ + cdef column_view input_column_view = input_col.view() + cdef unique_ptr[column] c_result + with nogil: + c_result = move( + cpp_from_fixed_point( + input_column_view)) + + return Column.from_unique_ptr(move(c_result)) + + +def to_decimal(Column input_col, object out_type): + """ + Returns a `DecimalColumn` from the provided `StringColumn` + using the scale in the `out_type`. + + Parameters + ---------- + input_col : input column of type string + out_type : The type and scale of the decimal column expected + + Returns + ------- + A column of decimals parsed from the string values. + """ + cdef column_view input_column_view = input_col.view() + cdef unique_ptr[column] c_result + cdef int scale = out_type.scale + cdef data_type c_out_type = data_type(DECIMAL64, -scale) + with nogil: + c_result = move( + cpp_to_fixed_point( + input_column_view, + c_out_type)) + + result = Column.from_unique_ptr(move(c_result)) + result.dtype.precision = out_type.precision + return result + + +def is_fixed_point(Column input_col, object dtype): + """ + Returns a Column of boolean values with True for `input_col` + that have fixed-point characters. The output row also has a + False value if the corresponding string would cause an integer + overflow. The scale of the `dtype` is used to determine overflow + in the output row. + + Parameters + ---------- + input_col : input column of type string + dtype : The type and scale of a decimal column + + Returns + ------- + A Column of booleans indicating valid decimal conversion. + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = input_col.view() + cdef int scale = dtype.scale + cdef data_type c_dtype = data_type(DECIMAL64, -scale) + with nogil: + c_result = move(cpp_is_fixed_point( + source_view, + c_dtype + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1bad2c3a451..7e7b39816d8 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1026,6 +1026,8 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: "Casting interval columns not currently supported" ) return self + elif is_decimal_dtype(dtype): + return self.as_decimal_column(dtype, **kwargs) elif np.issubdtype(dtype, np.datetime64): return self.as_datetime_column(dtype, **kwargs) elif np.issubdtype(dtype, np.timedelta64): @@ -1096,6 +1098,11 @@ def as_string_column( ) -> "cudf.core.column.StringColumn": raise NotImplementedError + def as_decimal_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.DecimalColumn": + raise NotImplementedError + def apply_boolean_mask(self, mask) -> ColumnBase: mask = as_column(mask, dtype="bool") result = ( diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 58156c3826c..0056b3a8454 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -1,14 +1,22 @@ # Copyright (c) 2021, NVIDIA CORPORATION. +import cudf import cupy as cp import numpy as np import pyarrow as pa +from typing import cast + from cudf import _lib as libcudf from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase from cudf.core.dtypes import Decimal64Dtype from cudf.utils.utils import pa_mask_buffer_to_mask +from cudf._lib.strings.convert.convert_fixed_point import ( + from_decimal as cpp_from_decimal, +) +from cudf._typing import Dtype +from cudf.core.column import as_column class DecimalColumn(ColumnBase): @@ -59,6 +67,16 @@ def binary_operator(self, op, other, reflect=False): result.dtype.precision = _binop_precision(self.dtype, other.dtype, op) return result + def as_string_column( + self, dtype: Dtype, format=None + ) -> "cudf.core.column.StringColumn": + if len(self) > 0: + return cpp_from_decimal(self) + else: + return cast( + "cudf.core.column.StringColumn", as_column([], dtype="object") + ) + def _binop_precision(l_dtype, r_dtype, op): """ diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index aa5172a9a89..0a1f6529cc7 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -86,6 +86,9 @@ count_re as cpp_count_re, match_re as cpp_match_re, ) +from cudf._lib.strings.convert.convert_fixed_point import ( + to_decimal as cpp_to_decimal, +) from cudf._lib.strings.convert.convert_urls import ( url_decode as cpp_url_decode, url_encode as cpp_url_encode, @@ -4887,6 +4890,11 @@ def as_timedelta_column( format = "%D days %H:%M:%S" return self._as_datetime_or_timedelta_column(out_dtype, format) + def as_decimal_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.DecimalColumn": + return cpp_to_decimal(self, dtype) + def as_string_column(self, dtype: Dtype, format=None) -> StringColumn: return self diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index a015f3387b4..98b8bfb870d 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -3,6 +3,7 @@ import re import urllib.parse from contextlib import ExitStack as does_not_raise +from decimal import Decimal from sys import getsizeof import cupy @@ -208,6 +209,60 @@ def test_string_astype(dtype): assert_eq(expect, got) +@pytest.mark.parametrize( + "data, scale, precision", + [ + (["1.11", "2.22", "3.33"], 2, 3), + (["111", "222", "33"], 0, 3), + (["111000", "22000", "3000"], -3, 3), + ([None, None, None], 0, 5), + ([None, "-2345", None], 0, 5), + ([], 0, 5), + ], +) +def test_string_to_decimal(data, scale, precision): + gs = cudf.Series(data, dtype="str") + fp = gs.astype(cudf.Decimal64Dtype(scale=scale, precision=precision)) + got = fp.astype("str") + assert_eq(gs, got) + + +def test_string_empty_to_decimal(): + gs = cudf.Series(["", "-85", ""], dtype="str") + got = gs.astype(cudf.Decimal64Dtype(scale=0, precision=5)) + expected = cudf.Series( + [0, -85, 0], dtype=cudf.Decimal64Dtype(scale=0, precision=5), + ) + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "data, scale, precision", + [ + (["1.23", "-2.34", "3.45"], 2, 3), + (["123", "-234", "345"], 0, 3), + (["12300", "-400", "5000.0"], -2, 5), + ([None, None, None], 0, 5), + ([None, "-100", None], 0, 5), + ([], 0, 5), + ], +) +def test_string_from_decimal(data, scale, precision): + decimal_data = [] + for d in data: + if d is None: + decimal_data.append(None) + else: + decimal_data.append(Decimal(d)) + fp = cudf.Series( + decimal_data, + dtype=cudf.Decimal64Dtype(scale=scale, precision=precision), + ) + gs = fp.astype("str") + got = gs.astype(cudf.Decimal64Dtype(scale=scale, precision=precision)) + assert_eq(fp, got) + + @pytest.mark.parametrize( "dtype", NUMERIC_TYPES + DATETIME_TYPES + ["bool", "object", "str"] )