Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add string.convert.convert_datetime/convert_booleans APIs to pylibcudf #16971

Merged
Merged
110 changes: 18 additions & 92 deletions python/cudf/cudf/_lib/string_casting.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,6 @@
from cudf._lib.column cimport Column

from cudf._lib.scalar import as_device_scalar

from cudf._lib.scalar cimport DeviceScalar

from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES

from libcpp.memory cimport unique_ptr
Expand All @@ -14,14 +11,6 @@ from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.convert.convert_booleans cimport (
from_booleans as cpp_from_booleans,
to_booleans as cpp_to_booleans,
)
from pylibcudf.libcudf.strings.convert.convert_datetime cimport (
is_timestamp as cpp_is_timestamp,
)
from pylibcudf.libcudf.strings.convert.convert_floats cimport (
from_floats as cpp_from_floats,
to_floats as cpp_to_floats,
Expand Down Expand Up @@ -427,77 +416,21 @@ def stoul(Column input_col):
return string_to_integer(input_col, cudf.dtype("uint64"))


def _to_booleans(Column input_col, object string_true="True"):
"""
Converting/Casting input column of type string to boolean column

Parameters
----------
input_col : input column of type string
string_true : string that represents True

Returns
-------
A Column with string values cast to boolean
"""

cdef DeviceScalar str_true = as_device_scalar(string_true)
cdef column_view input_column_view = input_col.view()
cdef const string_scalar* string_scalar_true = <const string_scalar*>(
str_true.get_raw_ptr())
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_to_booleans(
input_column_view,
string_scalar_true[0]))

return Column.from_unique_ptr(move(c_result))


def to_booleans(Column input_col):

return _to_booleans(input_col)


def _from_booleans(
Column input_col,
object string_true="True",
object string_false="False"):
"""
Converting/Casting input column of type boolean to string column

Parameters
----------
input_col : input column of type boolean
string_true : string that represents True
string_false : string that represents False

Returns
-------
A Column with boolean values cast to string
"""

cdef DeviceScalar str_true = as_device_scalar(string_true)
cdef DeviceScalar str_false = as_device_scalar(string_false)
cdef column_view input_column_view = input_col.view()
cdef const string_scalar* string_scalar_true = <const string_scalar*>(
str_true.get_raw_ptr())
cdef const string_scalar* string_scalar_false = <const string_scalar*>(
str_false.get_raw_ptr())
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_from_booleans(
input_column_view,
string_scalar_true[0],
string_scalar_false[0]))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.convert.convert_booleans.to_booleans(
input_col.to_pylibcudf(mode="read"),
as_device_scalar("True").c_value,
)
return Column.from_pylibcudf(plc_column)


def from_booleans(Column input_col):
return _from_booleans(input_col)
plc_column = plc.strings.convert.convert_booleans.from_booleans(
input_col.to_pylibcudf(mode="read"),
as_device_scalar("True").c_value,
as_device_scalar("False").c_value,
)
return Column.from_pylibcudf(plc_column)


def int2timestamp(
Expand All @@ -520,11 +453,10 @@ def int2timestamp(
A Column with date-time represented in string format

"""
cdef string c_timestamp_format = format.encode("UTF-8")
return Column.from_pylibcudf(
plc.strings.convert.convert_datetime.from_timestamps(
input_col.to_pylibcudf(mode="read"),
c_timestamp_format,
format,
names.to_pylibcudf(mode="read")
)
)
Expand All @@ -545,12 +477,11 @@ def timestamp2int(Column input_col, dtype, format):

"""
dtype = dtype_to_pylibcudf_type(dtype)
cdef string c_timestamp_format = format.encode('UTF-8')
return Column.from_pylibcudf(
plc.strings.convert.convert_datetime.to_timestamps(
input_col.to_pylibcudf(mode="read"),
dtype,
c_timestamp_format
format
)
)

Expand All @@ -572,16 +503,11 @@ def istimestamp(Column input_col, str format):
"""
if input_col.size == 0:
return cudf.core.column.column_empty(0, dtype=cudf.dtype("bool"))
cdef column_view input_column_view = input_col.view()
cdef string c_timestamp_format = <string>str(format).encode('UTF-8')
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_is_timestamp(
input_column_view,
c_timestamp_format))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.convert.convert_datetime.is_timestamp(
input_col.to_pylibcudf(mode="read"),
format
)
return Column.from_pylibcudf(plc_column)


def timedelta2int(Column input_col, dtype, format):
Expand Down
4 changes: 2 additions & 2 deletions python/cudf_polars/cudf_polars/dsl/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -914,7 +914,7 @@ def do_evaluate(
col = self.children[0].evaluate(df, context=context, mapping=mapping)

is_timestamps = plc.strings.convert.convert_datetime.is_timestamp(
col.obj, format.encode()
col.obj, format
)

if strict:
Expand All @@ -937,7 +937,7 @@ def do_evaluate(
)
return Column(
plc.strings.convert.convert_datetime.to_timestamps(
res.columns()[0], self.dtype, format.encode()
res.columns()[0], self.dtype, format
)
)
elif self.name == pl_expr.StringFunction.Replace:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar
cdef extern from "cudf/strings/convert/convert_booleans.hpp" namespace \
"cudf::strings" nogil:
cdef unique_ptr[column] to_booleans(
column_view input_col,
column_view input,
string_scalar true_string) except +

cdef unique_ptr[column] from_booleans(
column_view input_col,
column_view booleans,
string_scalar true_string,
string_scalar false_string) except +
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@ from pylibcudf.libcudf.types cimport data_type
cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \
"cudf::strings" nogil:
cdef unique_ptr[column] to_timestamps(
column_view input_col,
column_view input,
data_type timestamp_type,
string format) except +

cdef unique_ptr[column] from_timestamps(
column_view input_col,
column_view timestamps,
string format,
column_view input_strings_names) except +
column_view names) except +

cdef unique_ptr[column] is_timestamp(
column_view input_col,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources convert_durations.pyx convert_datetime.pyx)
set(cython_sources convert_booleans.pyx convert_durations.pyx convert_datetime.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/strings/convert/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from . cimport convert_datetime, convert_durations
from . cimport convert_booleans, convert_datetime, convert_durations
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/strings/convert/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from . import convert_datetime, convert_durations
from . import convert_booleans, convert_datetime, convert_durations
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.scalar cimport Scalar


# Convert a strings column to a new boolean column; ``true_string`` is the
# string expected for true.
cpdef Column to_booleans(Column input, Scalar true_string)

# Convert a boolean column to a new strings column, using ``true_string`` and
# ``false_string`` as the output text for true and false values.
cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string)
91 changes: 91 additions & 0 deletions python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.convert cimport (
convert_booleans as cpp_convert_booleans,
)
from pylibcudf.scalar cimport Scalar

from cython.operator import dereference


cpdef Column to_booleans(Column input, Scalar true_string):
    """
    Parse the strings in a strings column into a new bool column.

    For details, see :cpp:func:`cudf::strings::to_booleans`.

    Parameters
    ----------
    input : Column
        Strings column to convert.

    true_string : Scalar
        String to expect for true. Strings that do not match it
        are converted to false.

    Returns
    -------
    Column
        New bool column converted from strings.
    """
    # View the scalar owned by ``true_string`` as a libcudf string_scalar.
    # This is a plain pointer cast; nothing here verifies the scalar's type.
    cdef const string_scalar* true_ptr = <const string_scalar*>(
        true_string.c_obj.get()
    )
    cdef unique_ptr[column] result

    with nogil:
        result = move(
            cpp_convert_booleans.to_booleans(
                input.view(),
                dereference(true_ptr),
            )
        )

    return Column.from_libcudf(move(result))

cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string):
    """
    Convert the boolean values of a column into a new strings column.

    For details, see :cpp:func:`cudf::strings::from_booleans`.

    Parameters
    ----------
    booleans : Column
        Boolean column to convert.

    true_string : Scalar
        String written to the output for true values.

    false_string : Scalar
        String written to the output for false values.

    Returns
    -------
    Column
        New strings column.
    """
    # View both scalars as libcudf string_scalars. These are plain pointer
    # casts; nothing here verifies the scalars' types.
    cdef const string_scalar* true_ptr = <const string_scalar*>(
        true_string.c_obj.get()
    )
    cdef const string_scalar* false_ptr = <const string_scalar*>(
        false_string.c_obj.get()
    )
    cdef unique_ptr[column] result

    with nogil:
        result = move(
            cpp_convert_booleans.from_booleans(
                booleans.view(),
                dereference(true_ptr),
                dereference(false_ptr),
            )
        )

    return Column.from_libcudf(move(result))
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,16 @@ from pylibcudf.types cimport DataType
cpdef Column to_timestamps(
Column input,
DataType timestamp_type,
const string& format
str format
)

cpdef Column from_timestamps(
Column input,
const string& format,
Column timestamps,
str format,
Column input_strings_names
)

cpdef Column is_timestamp(
Column input,
str format,
)
Loading
Loading