Skip to content

Commit

Permalink
Add string.convert.convert_datetime/convert_booleans APIs to pylibcudf (
Browse files Browse the repository at this point in the history
#16971)

Contributes to #15162

Also address a review in #16935 (comment)

This also modifies some `format` arguments in `convert_datetime.pyx` to accept `str` instead of `bytes` (`const string&`) to align more closely with Python conventions. Let me know if you would prefer to change this back.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #16971
  • Loading branch information
mroeschke authored Oct 4, 2024
1 parent 04c17de commit efaa0b5
Show file tree
Hide file tree
Showing 15 changed files with 286 additions and 116 deletions.
110 changes: 18 additions & 92 deletions python/cudf/cudf/_lib/string_casting.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,6 @@
from cudf._lib.column cimport Column

from cudf._lib.scalar import as_device_scalar

from cudf._lib.scalar cimport DeviceScalar

from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES

from libcpp.memory cimport unique_ptr
Expand All @@ -14,14 +11,6 @@ from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.convert.convert_booleans cimport (
from_booleans as cpp_from_booleans,
to_booleans as cpp_to_booleans,
)
from pylibcudf.libcudf.strings.convert.convert_datetime cimport (
is_timestamp as cpp_is_timestamp,
)
from pylibcudf.libcudf.strings.convert.convert_floats cimport (
from_floats as cpp_from_floats,
to_floats as cpp_to_floats,
Expand Down Expand Up @@ -427,77 +416,21 @@ def stoul(Column input_col):
return string_to_integer(input_col, cudf.dtype("uint64"))


def _to_booleans(Column input_col, object string_true="True"):
"""
Converting/Casting input column of type string to boolean column
Parameters
----------
input_col : input column of type string
string_true : string that represents True
Returns
-------
A Column with string values cast to boolean
"""

cdef DeviceScalar str_true = as_device_scalar(string_true)
cdef column_view input_column_view = input_col.view()
cdef const string_scalar* string_scalar_true = <const string_scalar*>(
str_true.get_raw_ptr())
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_to_booleans(
input_column_view,
string_scalar_true[0]))

return Column.from_unique_ptr(move(c_result))


def to_booleans(Column input_col):

return _to_booleans(input_col)


def _from_booleans(
Column input_col,
object string_true="True",
object string_false="False"):
"""
Converting/Casting input column of type boolean to string column
Parameters
----------
input_col : input column of type boolean
string_true : string that represents True
string_false : string that represents False
Returns
-------
A Column with boolean values cast to string
"""

cdef DeviceScalar str_true = as_device_scalar(string_true)
cdef DeviceScalar str_false = as_device_scalar(string_false)
cdef column_view input_column_view = input_col.view()
cdef const string_scalar* string_scalar_true = <const string_scalar*>(
str_true.get_raw_ptr())
cdef const string_scalar* string_scalar_false = <const string_scalar*>(
str_false.get_raw_ptr())
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_from_booleans(
input_column_view,
string_scalar_true[0],
string_scalar_false[0]))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.convert.convert_booleans.to_booleans(
input_col.to_pylibcudf(mode="read"),
as_device_scalar("True").c_value,
)
return Column.from_pylibcudf(plc_column)


def from_booleans(Column input_col):
return _from_booleans(input_col)
plc_column = plc.strings.convert.convert_booleans.from_booleans(
input_col.to_pylibcudf(mode="read"),
as_device_scalar("True").c_value,
as_device_scalar("False").c_value,
)
return Column.from_pylibcudf(plc_column)


def int2timestamp(
Expand All @@ -520,11 +453,10 @@ def int2timestamp(
A Column with date-time represented in string format
"""
cdef string c_timestamp_format = format.encode("UTF-8")
return Column.from_pylibcudf(
plc.strings.convert.convert_datetime.from_timestamps(
input_col.to_pylibcudf(mode="read"),
c_timestamp_format,
format,
names.to_pylibcudf(mode="read")
)
)
Expand All @@ -545,12 +477,11 @@ def timestamp2int(Column input_col, dtype, format):
"""
dtype = dtype_to_pylibcudf_type(dtype)
cdef string c_timestamp_format = format.encode('UTF-8')
return Column.from_pylibcudf(
plc.strings.convert.convert_datetime.to_timestamps(
input_col.to_pylibcudf(mode="read"),
dtype,
c_timestamp_format
format
)
)

Expand All @@ -572,16 +503,11 @@ def istimestamp(Column input_col, str format):
"""
if input_col.size == 0:
return cudf.core.column.column_empty(0, dtype=cudf.dtype("bool"))
cdef column_view input_column_view = input_col.view()
cdef string c_timestamp_format = <string>str(format).encode('UTF-8')
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_is_timestamp(
input_column_view,
c_timestamp_format))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.convert.convert_datetime.is_timestamp(
input_col.to_pylibcudf(mode="read"),
format
)
return Column.from_pylibcudf(plc_column)


def timedelta2int(Column input_col, dtype, format):
Expand Down
4 changes: 2 additions & 2 deletions python/cudf_polars/cudf_polars/dsl/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -914,7 +914,7 @@ def do_evaluate(
col = self.children[0].evaluate(df, context=context, mapping=mapping)

is_timestamps = plc.strings.convert.convert_datetime.is_timestamp(
col.obj, format.encode()
col.obj, format
)

if strict:
Expand All @@ -937,7 +937,7 @@ def do_evaluate(
)
return Column(
plc.strings.convert.convert_datetime.to_timestamps(
res.columns()[0], self.dtype, format.encode()
res.columns()[0], self.dtype, format
)
)
elif self.name == pl_expr.StringFunction.Replace:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar
cdef extern from "cudf/strings/convert/convert_booleans.hpp" namespace \
"cudf::strings" nogil:
cdef unique_ptr[column] to_booleans(
column_view input_col,
column_view input,
string_scalar true_string) except +

cdef unique_ptr[column] from_booleans(
column_view input_col,
column_view booleans,
string_scalar true_string,
string_scalar false_string) except +
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@ from pylibcudf.libcudf.types cimport data_type
cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \
"cudf::strings" nogil:
cdef unique_ptr[column] to_timestamps(
column_view input_col,
column_view input,
data_type timestamp_type,
string format) except +

cdef unique_ptr[column] from_timestamps(
column_view input_col,
column_view timestamps,
string format,
column_view input_strings_names) except +
column_view names) except +

cdef unique_ptr[column] is_timestamp(
column_view input_col,
Expand Down
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources convert_durations.pyx convert_datetime.pyx)
set(cython_sources convert_booleans.pyx convert_durations.pyx convert_datetime.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/strings/convert/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from . cimport convert_datetime, convert_durations
from . cimport convert_booleans, convert_datetime, convert_durations
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/strings/convert/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from . import convert_datetime, convert_durations
from . import convert_booleans, convert_datetime, convert_durations
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.scalar cimport Scalar


cpdef Column to_booleans(Column input, Scalar true_string)

cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string)
91 changes: 91 additions & 0 deletions python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.convert cimport (
convert_booleans as cpp_convert_booleans,
)
from pylibcudf.scalar cimport Scalar

from cython.operator import dereference


cpdef Column to_booleans(Column input, Scalar true_string):
    """
    Parse the strings of a strings column into a new bool column.

    For details, see :cpp:func:`cudf::strings::to_booleans`.

    Parameters
    ----------
    input : Column
        Strings instance for this operation
    true_string : Scalar
        String to expect for true. Non-matching strings are false

    Returns
    -------
    Column
        New bool column converted from strings.
    """
    # The Scalar owns the underlying libcudf scalar; reinterpret it as the
    # string_scalar that the libcudf API takes.
    cdef const string_scalar* true_ptr = <const string_scalar*>(
        true_string.c_obj.get()
    )
    cdef unique_ptr[column] converted

    with nogil:
        converted = move(
            cpp_convert_booleans.to_booleans(input.view(), true_ptr[0])
        )

    return Column.from_libcudf(move(converted))


cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string):
    """
    Convert the boolean values of a column into a new strings column.

    For details, see :cpp:func:`cudf::strings::from_booleans`.

    Parameters
    ----------
    booleans : Column
        Boolean column to convert.
    true_string : Scalar
        String to use for true in the output column.
    false_string : Scalar
        String to use for false in the output column.

    Returns
    -------
    Column
        New strings column.
    """
    # Reinterpret both scalars as the string_scalar type that the libcudf
    # API takes; the Scalar arguments keep ownership of the pointees.
    cdef const string_scalar* false_ptr = <const string_scalar*>(
        false_string.c_obj.get()
    )
    cdef const string_scalar* true_ptr = <const string_scalar*>(
        true_string.c_obj.get()
    )
    cdef unique_ptr[column] converted

    with nogil:
        converted = move(
            cpp_convert_booleans.from_booleans(
                booleans.view(),
                true_ptr[0],
                false_ptr[0],
            )
        )

    return Column.from_libcudf(move(converted))
11 changes: 8 additions & 3 deletions python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,16 @@ from pylibcudf.types cimport DataType
cpdef Column to_timestamps(
Column input,
DataType timestamp_type,
const string& format
str format
)

cpdef Column from_timestamps(
Column input,
const string& format,
Column timestamps,
str format,
Column input_strings_names
)

cpdef Column is_timestamp(
Column input,
str format,
)
Loading

0 comments on commit efaa0b5

Please sign in to comment.