Skip to content

Commit

Permalink
Add strings.convert.convert_integers APIs to pylibcudf (#16991)
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke authored Oct 15, 2024
1 parent 319ec3b commit c141ca5
Show file tree
Hide file tree
Showing 10 changed files with 354 additions and 101 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
================
convert_integers
================

.. automodule:: pylibcudf.strings.convert.convert_integers
:members:
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ convert
convert_durations
convert_fixed_point
convert_floats
convert_integers
convert_ipv4
convert_lists
convert_urls
128 changes: 37 additions & 91 deletions python/cudf/cudf/_lib/string_casting.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,10 @@

from cudf._lib.column cimport Column

from cudf._lib.scalar import as_device_scalar
from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.strings.convert.convert_integers cimport (
from_integers as cpp_from_integers,
hex_to_integers as cpp_hex_to_integers,
integers_to_hex as cpp_integers_to_hex,
is_hex as cpp_is_hex,
to_integers as cpp_to_integers,
)
from pylibcudf.libcudf.types cimport data_type, type_id

from cudf._lib.types cimport underlying_type_t_type_id

import pylibcudf as plc
from pylibcudf.types cimport DataType

import cudf
from cudf._lib.scalar import as_device_scalar

from cudf._lib.types cimport dtype_to_pylibcudf_type

Expand All @@ -35,10 +17,10 @@ def floating_to_string(Column input_col):
return Column.from_pylibcudf(plc_column)


def string_to_floating(Column input_col, object out_type):
def string_to_floating(Column input_col, DataType out_type):
plc_column = plc.strings.convert.convert_floats.to_floats(
input_col.to_pylibcudf(mode="read"),
dtype_to_pylibcudf_type(out_type)
out_type
)
return Column.from_pylibcudf(plc_column)

Expand Down Expand Up @@ -72,7 +54,7 @@ def stod(Column input_col):
A Column with strings cast to double
"""

return string_to_floating(input_col, cudf.dtype("float64"))
return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT64))


def ftos(Column input_col):
Expand Down Expand Up @@ -104,36 +86,22 @@ def stof(Column input_col):
A Column with strings cast to float
"""

return string_to_floating(input_col, cudf.dtype("float32"))
return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT32))


def integer_to_string(Column input_col):
cdef column_view input_column_view = input_col.view()
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_from_integers(
input_column_view))

return Column.from_unique_ptr(move(c_result))


def string_to_integer(Column input_col, object out_type):
cdef column_view input_column_view = input_col.view()
cdef unique_ptr[column] c_result
cdef type_id tid = <type_id> (
<underlying_type_t_type_id> (
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type]
)
plc_column = plc.strings.convert.convert_integers.from_integers(
input_col.to_pylibcudf(mode="read"),
)
cdef data_type c_out_type = data_type(tid)
with nogil:
c_result = move(
cpp_to_integers(
input_column_view,
c_out_type))
return Column.from_pylibcudf(plc_column)

return Column.from_unique_ptr(move(c_result))

def string_to_integer(Column input_col, DataType out_type):
plc_column = plc.strings.convert.convert_integers.to_integers(
input_col.to_pylibcudf(mode="read"),
out_type
)
return Column.from_pylibcudf(plc_column)


def i8tos(Column input_col):
Expand Down Expand Up @@ -165,7 +133,7 @@ def stoi8(Column input_col):
A Column with strings cast to int8
"""

return string_to_integer(input_col, cudf.dtype("int8"))
return string_to_integer(input_col, plc.DataType(plc.TypeId.INT8))


def i16tos(Column input_col):
Expand Down Expand Up @@ -197,7 +165,7 @@ def stoi16(Column input_col):
A Column with strings cast to int16
"""

return string_to_integer(input_col, cudf.dtype("int16"))
return string_to_integer(input_col, plc.DataType(plc.TypeId.INT16))


def itos(Column input_col):
Expand Down Expand Up @@ -229,7 +197,7 @@ def stoi(Column input_col):
A Column with strings cast to int32
"""

return string_to_integer(input_col, cudf.dtype("int32"))
return string_to_integer(input_col, plc.DataType(plc.TypeId.INT32))


def ltos(Column input_col):
Expand Down Expand Up @@ -261,7 +229,7 @@ def stol(Column input_col):
A Column with strings cast to int64
"""

return string_to_integer(input_col, cudf.dtype("int64"))
return string_to_integer(input_col, plc.DataType(plc.TypeId.INT64))


def ui8tos(Column input_col):
Expand Down Expand Up @@ -293,7 +261,7 @@ def stoui8(Column input_col):
A Column with strings cast to uint8
"""

return string_to_integer(input_col, cudf.dtype("uint8"))
return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT8))


def ui16tos(Column input_col):
Expand Down Expand Up @@ -325,7 +293,7 @@ def stoui16(Column input_col):
A Column with strings cast to uint16
"""

return string_to_integer(input_col, cudf.dtype("uint16"))
return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT16))


def uitos(Column input_col):
Expand Down Expand Up @@ -357,7 +325,7 @@ def stoui(Column input_col):
A Column with strings cast to uint32
"""

return string_to_integer(input_col, cudf.dtype("uint32"))
return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT32))


def ultos(Column input_col):
Expand Down Expand Up @@ -389,7 +357,7 @@ def stoul(Column input_col):
A Column with strings cast to uint64
"""

return string_to_integer(input_col, cudf.dtype("uint64"))
return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT64))


def to_booleans(Column input_col):
Expand Down Expand Up @@ -477,8 +445,6 @@ def istimestamp(Column input_col, str format):
A Column of boolean values identifying strings that matched the format.
"""
if input_col.size == 0:
return cudf.core.column.column_empty(0, dtype=cudf.dtype("bool"))
plc_column = plc.strings.convert.convert_datetime.is_timestamp(
input_col.to_pylibcudf(mode="read"),
format
Expand Down Expand Up @@ -582,7 +548,7 @@ def is_ipv4(Column source_strings):
return Column.from_pylibcudf(plc_column)


def htoi(Column input_col, **kwargs):
def htoi(Column input_col):
"""
Convert an input column of strings holding hexadecimal values
to a column of int64 integers.
Expand All @@ -595,38 +561,22 @@ def htoi(Column input_col, **kwargs):
-------
A Column of integers parsed from hexadecimal string values.
"""

cdef column_view input_column_view = input_col.view()
cdef type_id tid = <type_id> (
<underlying_type_t_type_id> (
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[cudf.dtype("int64")]
)
plc_column = plc.strings.convert.convert_integers.hex_to_integers(
input_col.to_pylibcudf(mode="read"),
plc.DataType(plc.TypeId.INT64)
)
cdef data_type c_out_type = data_type(tid)

cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_hex_to_integers(input_column_view,
c_out_type))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)


def is_hex(Column source_strings):
"""
Returns a Column of boolean values with True for `source_strings`
that have hex characters.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_is_hex(
source_view
))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.convert.convert_integers.is_hex(
source_strings.to_pylibcudf(mode="read"),
)
return Column.from_pylibcudf(plc_column)


def itoh(Column input_col):
Expand All @@ -642,11 +592,7 @@ def itoh(Column input_col):
-------
A Column of strings with hexadecimal characters.
"""

cdef column_view input_column_view = input_col.view()
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_integers_to_hex(input_column_view))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.convert.convert_integers.integers_to_hex(
input_col.to_pylibcudf(mode="read"),
)
return Column.from_pylibcudf(plc_column)
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from pylibcudf.exception_handler cimport libcudf_exception_handler
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.types cimport data_type
Expand All @@ -9,23 +10,28 @@ from pylibcudf.libcudf.types cimport data_type
cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \
"cudf::strings" nogil:
cdef unique_ptr[column] to_integers(
column_view input_col,
data_type output_type) except +
column_view input,
data_type output_type) except +libcudf_exception_handler

cdef unique_ptr[column] from_integers(
column_view input_col) except +
column_view integers) except +libcudf_exception_handler

cdef unique_ptr[column] is_integer(
column_view input
) except +libcudf_exception_handler

cdef unique_ptr[column] is_integer(
column_view source_strings
) except +
column_view input,
data_type int_type
) except +libcudf_exception_handler

cdef unique_ptr[column] hex_to_integers(
column_view input_col,
column_view input,
data_type output_type) except +libcudf_exception_handler

cdef unique_ptr[column] is_hex(
column_view source_strings
) except +
column_view input
) except +libcudf_exception_handler

cdef unique_ptr[column] integers_to_hex(
column_view input_col) except +
column_view input) except +libcudf_exception_handler
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

set(cython_sources
convert_booleans.pyx convert_datetime.pyx convert_durations.pyx convert_fixed_point.pyx
convert_floats.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx
convert_floats.pyx convert_integers.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx
)

set(linked_libraries cudf::cudf)
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/convert/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ from . cimport (
convert_durations,
convert_fixed_point,
convert_floats,
convert_integers,
convert_ipv4,
convert_lists,
convert_urls,
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/convert/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
convert_durations,
convert_fixed_point,
convert_floats,
convert_integers,
convert_ipv4,
convert_lists,
convert_urls,
Expand Down
17 changes: 17 additions & 0 deletions python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.types cimport DataType


cpdef Column to_integers(Column input, DataType output_type)

cpdef Column from_integers(Column integers)

cpdef Column is_integer(Column input, DataType int_type=*)

cpdef Column hex_to_integers(Column input, DataType output_type)

cpdef Column is_hex(Column input)

cpdef Column integers_to_hex(Column input)
Loading

0 comments on commit c141ca5

Please sign in to comment.