diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst new file mode 100644 index 00000000000..71d146c0379 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst @@ -0,0 +1,6 @@ +================ +convert_integers +================ + +.. automodule:: pylibcudf.strings.convert.convert_integers + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst index fa05cb7d786..3d07c1271b4 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst @@ -9,6 +9,7 @@ convert convert_durations convert_fixed_point convert_floats + convert_integers convert_ipv4 convert_lists convert_urls diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 93b67bd4c9d..06ee07d8e2b 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -2,28 +2,10 @@ from cudf._lib.column cimport Column -from cudf._lib.scalar import as_device_scalar -from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_integers cimport ( - from_integers as cpp_from_integers, - hex_to_integers as cpp_hex_to_integers, - integers_to_hex as cpp_integers_to_hex, - is_hex as cpp_is_hex, - to_integers as cpp_to_integers, -) -from pylibcudf.libcudf.types cimport data_type, type_id - -from cudf._lib.types cimport underlying_type_t_type_id - import pylibcudf as plc +from pylibcudf.types cimport DataType -import 
cudf +from cudf._lib.scalar import as_device_scalar from cudf._lib.types cimport dtype_to_pylibcudf_type @@ -35,10 +17,10 @@ def floating_to_string(Column input_col): return Column.from_pylibcudf(plc_column) -def string_to_floating(Column input_col, object out_type): +def string_to_floating(Column input_col, DataType out_type): plc_column = plc.strings.convert.convert_floats.to_floats( input_col.to_pylibcudf(mode="read"), - dtype_to_pylibcudf_type(out_type) + out_type ) return Column.from_pylibcudf(plc_column) @@ -72,7 +54,7 @@ def stod(Column input_col): A Column with strings cast to double """ - return string_to_floating(input_col, cudf.dtype("float64")) + return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT64)) def ftos(Column input_col): @@ -104,36 +86,22 @@ def stof(Column input_col): A Column with strings cast to float """ - return string_to_floating(input_col, cudf.dtype("float32")) + return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT32)) def integer_to_string(Column input_col): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_integers( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) - - -def string_to_integer(Column input_col, object out_type): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type] - ) + plc_column = plc.strings.convert.convert_integers.from_integers( + input_col.to_pylibcudf(mode="read"), ) - cdef data_type c_out_type = data_type(tid) - with nogil: - c_result = move( - cpp_to_integers( - input_column_view, - c_out_type)) + return Column.from_pylibcudf(plc_column) - return Column.from_unique_ptr(move(c_result)) + +def string_to_integer(Column input_col, DataType out_type): + plc_column = plc.strings.convert.convert_integers.to_integers( + input_col.to_pylibcudf(mode="read"), + out_type + ) + 
return Column.from_pylibcudf(plc_column) def i8tos(Column input_col): @@ -165,7 +133,7 @@ def stoi8(Column input_col): A Column with strings cast to int8 """ - return string_to_integer(input_col, cudf.dtype("int8")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT8)) def i16tos(Column input_col): @@ -197,7 +165,7 @@ def stoi16(Column input_col): A Column with strings cast to int16 """ - return string_to_integer(input_col, cudf.dtype("int16")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT16)) def itos(Column input_col): @@ -229,7 +197,7 @@ def stoi(Column input_col): A Column with strings cast to int32 """ - return string_to_integer(input_col, cudf.dtype("int32")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT32)) def ltos(Column input_col): @@ -261,7 +229,7 @@ def stol(Column input_col): A Column with strings cast to int64 """ - return string_to_integer(input_col, cudf.dtype("int64")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT64)) def ui8tos(Column input_col): @@ -293,7 +261,7 @@ def stoui8(Column input_col): A Column with strings cast to uint8 """ - return string_to_integer(input_col, cudf.dtype("uint8")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT8)) def ui16tos(Column input_col): @@ -325,7 +293,7 @@ def stoui16(Column input_col): A Column with strings cast to uint16 """ - return string_to_integer(input_col, cudf.dtype("uint16")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT16)) def uitos(Column input_col): @@ -357,7 +325,7 @@ def stoui(Column input_col): A Column with strings cast to uint32 """ - return string_to_integer(input_col, cudf.dtype("uint32")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT32)) def ultos(Column input_col): @@ -389,7 +357,7 @@ def stoul(Column input_col): A Column with strings cast to uint64 """ - return string_to_integer(input_col, cudf.dtype("uint64")) + return string_to_integer(input_col, 
plc.DataType(plc.TypeId.UINT64)) def to_booleans(Column input_col): @@ -477,8 +445,6 @@ def istimestamp(Column input_col, str format): A Column of boolean values identifying strings that matched the format. """ - if input_col.size == 0: - return cudf.core.column.column_empty(0, dtype=cudf.dtype("bool")) plc_column = plc.strings.convert.convert_datetime.is_timestamp( input_col.to_pylibcudf(mode="read"), format @@ -582,7 +548,7 @@ def is_ipv4(Column source_strings): return Column.from_pylibcudf(plc_column) -def htoi(Column input_col, **kwargs): +def htoi(Column input_col): """ Converting input column of type string having hex values to integer of out_type @@ -595,22 +561,11 @@ def htoi(Column input_col, **kwargs): ------- A Column of integers parsed from hexadecimal string values. """ - - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[cudf.dtype("int64")] - ) + plc_column = plc.strings.convert.convert_integers.hex_to_integers( + input_col.to_pylibcudf(mode="read"), + plc.DataType(plc.TypeId.INT64) ) - cdef data_type c_out_type = data_type(tid) - - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_hex_to_integers(input_column_view, - c_out_type)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) def is_hex(Column source_strings): @@ -618,15 +573,10 @@ def is_hex(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have hex characters. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_hex( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_integers.is_hex( + source_strings.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) def itoh(Column input_col): @@ -642,11 +592,7 @@ def itoh(Column input_col): ------- A Column of strings with hexadecimal characters. """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_integers_to_hex(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_integers.integers_to_hex( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd index f12aab0a2e4..69d566b8c49 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type @@ -9,23 +10,28 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_integers( - column_view input_col, - data_type output_type) except + + column_view input, + data_type output_type) except +libcudf_exception_handler cdef unique_ptr[column] from_integers( - column_view input_col) except + + column_view integers) except +libcudf_exception_handler + + cdef unique_ptr[column] is_integer( + column_view input + ) except +libcudf_exception_handler cdef unique_ptr[column] is_integer( - column_view source_strings - ) except + + column_view input, + data_type int_type + ) except +libcudf_exception_handler cdef unique_ptr[column] hex_to_integers( - column_view input_col, - data_type output_type) except + + column_view input, + data_type output_type) except +libcudf_exception_handler cdef unique_ptr[column] is_hex( - column_view source_strings - ) except + + column_view input + ) except +libcudf_exception_handler cdef unique_ptr[column] integers_to_hex( - column_view input_col) except + + column_view input) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 846070870b1..8ba84ba7d50 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -14,7 +14,7 @@ set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx convert_fixed_point.pyx - convert_floats.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx + convert_floats.pyx convert_integers.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx ) set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 799532d72c6..85300936e4d 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -5,6 +5,7 @@ from . cimport ( convert_durations, convert_fixed_point, convert_floats, + convert_integers, convert_ipv4, convert_lists, convert_urls, diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index deb2d8ab74b..aa27a7c8929 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -5,6 +5,7 @@ convert_durations, convert_fixed_point, convert_floats, + convert_integers, convert_ipv4, convert_lists, convert_urls, diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd new file mode 100644 index 00000000000..eff2e080c27 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_integers(Column input, DataType output_type) + +cpdef Column from_integers(Column integers) + +cpdef Column is_integer(Column input, DataType int_type=*) + +cpdef Column hex_to_integers(Column input, DataType output_type) + +cpdef Column is_hex(Column input) + +cpdef Column integers_to_hex(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx new file mode 100644 index 00000000000..5558683a502 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx @@ -0,0 +1,206 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_integers as cpp_convert_integers, +) +from pylibcudf.types cimport DataType + + +cpdef Column to_integers(Column input, DataType output_type): + """ + Returns a new integer numeric column parsing integer values from the + provided strings column. + + For details, see :cpp:func:`cudf::strings::to_integers`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + output_type : DataType + Type of integer numeric column to return. + + Returns + ------- + Column + New column with integers converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.to_integers( + input.view(), + output_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column from_integers(Column integers): + """ + Returns a new strings column converting the integer values from the + provided column into strings. + + For details, see :cpp:func:`cudf::strings::from_integers`. + + Parameters + ---------- + integers : Column + Integer column to convert to strings. + + Returns + ------- + Column + New strings column with integers as strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.from_integers( + integers.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_integer(Column input, DataType int_type=None): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers. + + For details, see :cpp:func:`cudf::strings::is_integer`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + int_type : DataType, optional + Integer type used for checking underflow and overflow.
+ By default, does not check an integer type for underflow + or overflow. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + if int_type is None: + with nogil: + c_result = move( + cpp_convert_integers.is_integer( + input.view(), + ) + ) + else: + with nogil: + c_result = move( + cpp_convert_integers.is_integer( + input.view(), + int_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column hex_to_integers(Column input, DataType output_type): + """ + Returns a new integer numeric column parsing hexadecimal values + from the provided strings column. + + For details, see :cpp:func:`cudf::strings::hex_to_integers`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + output_type : DataType + Type of integer numeric column to return. + + Returns + ------- + Column + New column with integers converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.hex_to_integers( + input.view(), + output_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_hex(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers from hex. + + For details, see :cpp:func:`cudf::strings::is_hex`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.is_hex( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column integers_to_hex(Column input): + """ + Returns a new strings column converting integer columns to hexadecimal + characters. + + For details, see :cpp:func:`cudf::strings::integers_to_hex`.
+ + Parameters + ---------- + input : Column + Integer column to convert to hex. + + Returns + ------- + Column + New strings column with hexadecimal characters. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.integers_to_hex( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py new file mode 100644 index 00000000000..6d1d565af30 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_to_integers(): + typ = pa.int8() + arr = pa.array(["1", "-1", None]) + result = plc.strings.convert.convert_integers.to_integers( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_integers(): + arr = pa.array([1, -1, None]) + result = plc.strings.convert.convert_integers.from_integers( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["1", "-1", None]) + assert_column_eq(result, expected) + + +def test_is_integer(): + arr = pa.array(["1", "-1", "1.2", "A", None]) + plc_column = plc.interop.from_arrow(arr) + result = plc.strings.convert.convert_integers.is_integer(plc_column) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) + + result = plc.strings.convert.convert_integers.is_integer( + plc_column, plc.interop.from_arrow(pa.uint8()) + ) + expected = pa.array([True, False, False, False, None]) + assert_column_eq(result, expected) + + +def test_hex_to_integers(): + typ = pa.int32() + data = ["0xff", "0x2a", None] + result = plc.strings.convert.convert_integers.hex_to_integers( + plc.interop.from_arrow(pa.array(data)), plc.interop.from_arrow(typ) + ) + 
expected = pa.array( + [int(val, 16) if isinstance(val, str) else val for val in data], + type=typ, + ) + assert_column_eq(result, expected) + + +def test_is_hex(): + arr = pa.array(["0xff", "123", "!", None]) + result = plc.strings.convert.convert_integers.is_hex( + plc.interop.from_arrow(arr) + ) + expected = pa.array([True, True, False, None]) + assert_column_eq(result, expected) + + +def test_integers_to_hex(): + data = [255, -42, None] + arr = pa.array(data) + result = plc.strings.convert.convert_integers.integers_to_hex( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["FF", "FFFFFFFFFFFFFFD6", None]) + assert_column_eq(result, expected)