From f5955929b06e2a4609b9fca0e3f949afb9b1dadd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 9 Dec 2024 14:22:04 -0800 Subject: [PATCH] Remove cudf._lib.strings.convert/split in favor of inlining pylibcudf (#17496) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17496 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/strings/CMakeLists.txt | 15 -- python/cudf/cudf/_lib/strings/__init__.py | 15 -- .../cudf/_lib/strings/convert/CMakeLists.txt | 24 -- .../cudf/_lib/strings/convert/__init__.pxd | 0 .../cudf/_lib/strings/convert/__init__.py | 0 .../strings/convert/convert_fixed_point.pyx | 76 ------ .../_lib/strings/convert/convert_floats.pyx | 19 -- .../_lib/strings/convert/convert_integers.pyx | 20 -- .../_lib/strings/convert/convert_lists.pyx | 32 --- .../_lib/strings/convert/convert_urls.pyx | 48 ---- .../cudf/_lib/strings/split/CMakeLists.txt | 22 -- .../cudf/cudf/_lib/strings/split/__init__.pxd | 0 .../cudf/cudf/_lib/strings/split/__init__.py | 0 .../cudf/_lib/strings/split/partition.pyx | 35 --- python/cudf/cudf/_lib/strings/split/split.pyx | 155 ----------- python/cudf/cudf/core/column/decimal.py | 15 +- python/cudf/cudf/core/column/lists.py | 10 +- python/cudf/cudf/core/column/string.py | 246 +++++++++++++++--- python/cudf/cudf/core/tools/datetimes.py | 5 +- python/cudf/cudf/core/tools/numeric.py | 66 ++--- 22 files changed, 262 insertions(+), 543 deletions(-) delete mode 100644 python/cudf/cudf/_lib/strings/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/strings/convert/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/strings/convert/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/strings/convert/__init__.py delete mode 100644 
python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_floats.pyx delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_integers.pyx delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_lists.pyx delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_urls.pyx delete mode 100644 python/cudf/cudf/_lib/strings/split/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/strings/split/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/strings/split/__init__.py delete mode 100644 python/cudf/cudf/_lib/strings/split/partition.pyx delete mode 100644 python/cudf/cudf/_lib/strings/split/split.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index e98cf283bbb..f9ac3a16940 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -46,4 +46,3 @@ target_link_libraries(interop PUBLIC nanoarrow) add_subdirectory(io) add_subdirectory(nvtext) -add_subdirectory(strings) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 4758a933898..52e9b89da7b 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -12,7 +12,6 @@ sort, stream_compaction, string_casting, - strings, strings_udf, ) diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt deleted file mode 100644 index dca9c4cc3fc..00000000000 --- a/python/cudf/cudf/_lib/strings/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= -add_subdirectory(convert) -add_subdirectory(split) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index b795c54c112..341ba6d11c3 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -32,18 +32,3 @@ detokenize, tokenize_with_vocabulary, ) -from cudf._lib.strings.convert.convert_fixed_point import to_decimal -from cudf._lib.strings.convert.convert_floats import is_float -from cudf._lib.strings.convert.convert_integers import is_integer -from cudf._lib.strings.convert.convert_urls import url_decode, url_encode -from cudf._lib.strings.split.partition import partition, rpartition -from cudf._lib.strings.split.split import ( - rsplit, - rsplit_re, - rsplit_record, - rsplit_record_re, - split, - split_re, - split_record, - split_record_re, -) diff --git a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt deleted file mode 100644 index e8a76b476a8..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(cython_sources convert_fixed_point.pyx convert_floats.pyx convert_integers.pyx - convert_lists.pyx convert_urls.pyx -) - -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/strings/convert/__init__.pxd b/python/cudf/cudf/_lib/strings/convert/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/convert/__init__.py b/python/cudf/cudf/_lib/strings/convert/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx deleted file mode 100644 index 96dcd021c3b..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column -from cudf._lib.types cimport dtype_to_pylibcudf_type - -import pylibcudf as plc - - -@acquire_spill_lock() -def from_decimal(Column input_col): - """ - Converts a `Decimal64Column` to a `StringColumn`. - - Parameters - ---------- - input_col : input column of type decimal - - Returns - ------- - A column of strings representing the input decimal values. 
- """ - plc_column = plc.strings.convert.convert_fixed_point.from_fixed_point( - input_col.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def to_decimal(Column input_col, object out_type): - """ - Returns a `Decimal64Column` from the provided `StringColumn` - using the scale in the `out_type`. - - Parameters - ---------- - input_col : input column of type string - out_type : The type and scale of the decimal column expected - - Returns - ------- - A column of decimals parsed from the string values. - """ - plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( - input_col.to_pylibcudf(mode="read"), - dtype_to_pylibcudf_type(out_type), - ) - result = Column.from_pylibcudf(plc_column) - result.dtype.precision = out_type.precision - return result - - -@acquire_spill_lock() -def is_fixed_point(Column input_col, object dtype): - """ - Returns a Column of boolean values with True for `input_col` - that have fixed-point characters. The output row also has a - False value if the corresponding string would cause an integer - overflow. The scale of the `dtype` is used to determine overflow - in the output row. - - Parameters - ---------- - input_col : input column of type string - dtype : The type and scale of a decimal column - - Returns - ------- - A Column of booleans indicating valid decimal conversion. - """ - plc_column = plc.strings.convert.convert_fixed_point.is_fixed_point( - input_col.to_pylibcudf(mode="read"), - dtype_to_pylibcudf_type(dtype), - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx deleted file mode 100644 index 5da6e3f10cc..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def is_float(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have floats. - """ - plc_column = plc.strings.convert.convert_floats.is_float( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx deleted file mode 100644 index 50113347ccb..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -import pylibcudf as plc - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def is_integer(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have integers. - """ - return Column.from_pylibcudf( - plc.strings.convert.convert_integers.is_integer( - source_strings.to_pylibcudf(mode="read") - ) - ) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx deleted file mode 100644 index 3a2cb4bd5c7..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -import pylibcudf as plc - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from cudf._lib.scalar import as_device_scalar - - -@acquire_spill_lock() -def format_list_column(Column source_list, Column separators): - """ - Format a list column of strings into a strings column. - - Parameters - ---------- - input_col : input column of type list with strings child. 
- - separators: strings used for formatting (', ', '[', ']') - - Returns - ------- - Formatted strings column - """ - plc_column = plc.strings.convert.convert_lists.format_list_column( - source_list.to_pylibcudf(mode="read"), - as_device_scalar("None").c_value, - separators.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx deleted file mode 100644 index d5c2f771970..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf as plc - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def url_decode(Column source_strings): - """ - Decode each string in column. No format checking is performed. - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - URL decoded string column - """ - plc_column = plc.strings.convert.convert_urls.url_decode( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def url_encode(Column source_strings): - """ - Encode each string in column. No format checking is performed. - All characters are encoded except for ASCII letters, digits, - and these characters: '.','_','-','~'. Encoding converts to - hex using UTF-8 encoded bytes. 
- - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - URL encoded string column - """ - plc_column = plc.strings.convert.convert_urls.url_encode( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt deleted file mode 100644 index 4ede0a2fac5..00000000000 --- a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= - -set(cython_sources partition.pyx split.pyx) - -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/strings/split/__init__.pxd b/python/cudf/cudf/_lib/strings/split/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/split/__init__.py b/python/cudf/cudf/_lib/strings/split/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx deleted file mode 100644 index 5319addc41c..00000000000 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def partition(Column source_strings, - object py_delimiter): - """ - Returns data by splitting the `source_strings` - column at the first occurrence of the specified `py_delimiter`. - """ - plc_table = plc.strings.split.partition.partition( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def rpartition(Column source_strings, - object py_delimiter): - """ - Returns a Column by splitting the `source_strings` - column at the last occurrence of the specified `py_delimiter`. 
- """ - plc_table = plc.strings.split.partition.rpartition( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx deleted file mode 100644 index 4ec6c7073d8..00000000000 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def split(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from beginning. - """ - plc_table = plc.strings.split.split.split( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - maxsplit, - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def split_record(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from beginning. - """ - plc_column = plc.strings.split.split.split_record( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - maxsplit, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def rsplit(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from the end. 
- """ - plc_table = plc.strings.split.split.rsplit( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - maxsplit, - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def rsplit_record(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from the end. - """ - plc_column = plc.strings.split.split.rsplit_record( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - maxsplit, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def split_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the delimiters identified by `pattern`. - """ - plc_table = plc.strings.split.split.split_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT, - ), - maxsplit, - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def rsplit_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the delimiters identified by `pattern`. - The delimiters are searched starting from the end of each string. 
- """ - plc_table = plc.strings.split.split.rsplit_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT, - ), - maxsplit, - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def split_record_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the delimiters identified by `pattern`. - """ - plc_column = plc.strings.split.split.split_record_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT, - ), - maxsplit, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def rsplit_record_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the delimiters identified by `pattern`. - The delimiters are searched starting from the end of each string. 
- """ - plc_column = plc.strings.split.split.rsplit_record_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT, - ), - maxsplit, - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 2c22724d3d7..9e6a73f1a9c 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -10,13 +10,12 @@ import numpy as np import pyarrow as pa +import pylibcudf as plc + import cudf -from cudf._lib.strings.convert.convert_fixed_point import ( - from_decimal as cpp_from_decimal, -) from cudf.api.types import is_scalar from cudf.core._internals import binaryop, unary -from cudf.core.buffer import as_buffer +from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column.column import ColumnBase from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import ( @@ -89,7 +88,13 @@ def as_decimal_column( def as_string_column(self) -> cudf.core.column.StringColumn: if len(self) > 0: - return cpp_from_decimal(self) + with acquire_spill_lock(): + plc_column = ( + plc.strings.convert.convert_fixed_point.from_fixed_point( + self.to_pylibcudf(mode="read"), + ) + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] else: return cast( cudf.core.column.StringColumn, diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index ea384888388..b95fb0a0d39 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -14,7 +14,6 @@ import cudf import cudf.core.column.column as column -from cudf._lib.strings.convert.convert_lists import format_list_column from cudf._lib.types import size_type_dtype from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.buffer import acquire_spill_lock @@ -272,8 +271,13 
@@ def as_string_column(self) -> cudf.core.column.StringColumn: # Separator strings to match the Python format separators = as_column([", ", "[", "]"]) - # Call libcudf to format the list column - return format_list_column(lc, separators) + with acquire_spill_lock(): + plc_column = plc.strings.convert.convert_lists.format_list_column( + lc.to_pylibcudf(mode="read"), + cudf.Scalar("None").device_value.c_value, + separators.to_pylibcudf(mode="read"), + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] def _transform_leaves(self, func, *args, **kwargs) -> Self: # return a new list column with the same nested structure diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 6b45828568c..4a2483a80e3 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -19,6 +19,7 @@ import cudf.api.types import cudf.core.column.column as column import cudf.core.column.datetime as datetime +from cudf import _lib as libcudf from cudf._lib import string_casting as str_cast, strings as libstrings from cudf._lib.column import Column from cudf._lib.types import size_type_dtype @@ -44,6 +45,7 @@ SeriesOrIndex, ) from cudf.core.buffer import Buffer + from cudf.core.column.numerical import NumericalColumn def str_to_boolean(column: StringColumn): @@ -1336,7 +1338,7 @@ def isinteger(self) -> SeriesOrIndex: 2 False dtype: bool """ - return self._return_or_inplace(libstrings.is_integer(self._column)) + return self._return_or_inplace(self._column.is_integer()) def ishex(self) -> SeriesOrIndex: """ @@ -1468,7 +1470,7 @@ def isfloat(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_float(self._column)) + return self._return_or_inplace(self._column.is_float()) def isdecimal(self) -> SeriesOrIndex: """ @@ -2710,26 +2712,25 @@ def split( if len(str(pat)) <= 1: regex = False + result_table: StringColumn | dict[int, StringColumn] if expand: if 
self._column.null_count == len(self._column): result_table = {0: self._column.copy()} else: if regex is True: - data = libstrings.split_re(self._column, pat, n) + data = self._column.split_re(pat, n) else: - data = libstrings.split( - self._column, cudf.Scalar(pat, "str"), n - ) + data = self._column.split(cudf.Scalar(pat, "str"), n) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: result_table = data else: if regex is True: - result_table = libstrings.split_record_re(self._column, pat, n) + result_table = self._column.split_record_re(pat, n) else: - result_table = libstrings.split_record( - self._column, cudf.Scalar(pat, "str"), n + result_table = self._column.split_record( + cudf.Scalar(pat, "str"), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2883,28 +2884,25 @@ def rsplit( if regex and isinstance(pat, re.Pattern): pat = pat.pattern + result_table: StringColumn | dict[int, StringColumn] if expand: if self._column.null_count == len(self._column): result_table = {0: self._column.copy()} else: if regex is True: - data = libstrings.rsplit_re(self._column, pat, n) + data = self._column.rsplit_re(pat, n) else: - data = libstrings.rsplit( - self._column, cudf.Scalar(pat, "str"), n - ) + data = self._column.rsplit(cudf.Scalar(pat, "str"), n) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: result_table = data else: if regex is True: - result_table = libstrings.rsplit_record_re( - self._column, pat, n - ) + result_table = self._column.rsplit_record_re(pat, n) else: - result_table = libstrings.rsplit_record( - self._column, cudf.Scalar(pat, "str"), n + result_table = self._column.rsplit_record( + cudf.Scalar(pat, "str"), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2989,7 +2987,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.partition(self._column, cudf.Scalar(sep, 
"str")), + self._column.partition(cudf.Scalar(sep, "str")), expand=expand, ) @@ -3054,7 +3052,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.rpartition(self._column, cudf.Scalar(sep, "str")), + self._column.rpartition(cudf.Scalar(sep, "str")), expand=expand, ) @@ -4499,8 +4497,7 @@ def url_decode(self) -> SeriesOrIndex: 1 https://medium.com/rapids-ai dtype: object """ - - return self._return_or_inplace(libstrings.url_decode(self._column)) + return self._return_or_inplace(self._column.url_decode()) def url_encode(self) -> SeriesOrIndex: """ @@ -4531,7 +4528,7 @@ def url_encode(self) -> SeriesOrIndex: 1 https%3A%2F%2Fmedium.com%2Frapids-ai dtype: object """ - return self._return_or_inplace(libstrings.url_encode(self._column)) + return self._return_or_inplace(self._column.url_encode()) def code_points(self) -> SeriesOrIndex: """ @@ -6015,13 +6012,13 @@ def as_numerical_column( out_dtype = cudf.api.types.dtype(dtype) string_col = self if out_dtype.kind in {"i", "u"}: - if not libstrings.is_integer(string_col).all(): + if not string_col.is_integer().all(): raise ValueError( "Could not convert strings to integer " "type due to presence of non-integer values." ) elif out_dtype.kind == "f": - if not libstrings.is_float(string_col).all(): + if not string_col.is_float().all(): raise ValueError( "Could not convert strings to float " "type due to presence of non-floating values." 
@@ -6099,10 +6096,17 @@ def as_timedelta_column( ) -> cudf.core.column.TimeDeltaColumn: return self.strptime(dtype, "%D days %H:%M:%S") # type: ignore[return-value] + @acquire_spill_lock() def as_decimal_column( self, dtype: Dtype - ) -> "cudf.core.column.DecimalBaseColumn": - return libstrings.to_decimal(self, dtype) + ) -> cudf.core.column.DecimalBaseColumn: + plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( + self.to_pylibcudf(mode="read"), + libcudf.types.dtype_to_pylibcudf_type(dtype), + ) + result = Column.from_pylibcudf(plc_column) + result.dtype.precision = dtype.precision # type: ignore[union-attr] + return result # type: ignore[return-value] def as_string_column(self) -> StringColumn: return self @@ -6138,12 +6142,9 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: if self.dtype == to_dtype: return True - elif ( - to_dtype.kind in {"i", "u"} - and not libstrings.is_integer(self).all() - ): + elif to_dtype.kind in {"i", "u"} and not self.is_integer().all(): return False - elif to_dtype.kind == "f" and not libstrings.is_float(self).all(): + elif to_dtype.kind == "f" and not self.is_float().all(): return False else: return True @@ -6333,11 +6334,180 @@ def title(self) -> Self: def is_title(self) -> Self: return self._modify_characters(plc.strings.capitalize.is_title) + @acquire_spill_lock() def replace_multiple(self, pattern: Self, replacements: Self) -> Self: - with acquire_spill_lock(): - plc_result = plc.strings.replace.replace_multiple( - self.to_pylibcudf(mode="read"), - pattern.to_pylibcudf(mode="read"), - replacements.to_pylibcudf(mode="read"), + plc_result = plc.strings.replace.replace_multiple( + self.to_pylibcudf(mode="read"), + pattern.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + ) + return cast(Self, Column.from_pylibcudf(plc_result)) + + @acquire_spill_lock() + def _split_record_re( + self, + pattern: str, + maxsplit: int, + method: Callable[ + [plc.Column, 
plc.strings.regex_program.RegexProgram, int], + plc.Column, + ], + ) -> Self: + plc_column = method( + self.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + pattern, + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, + ) + return cast(Self, Column.from_pylibcudf(plc_column)) + + def split_record_re(self, pattern: str, maxsplit: int) -> Self: + return self._split_record_re( + pattern, maxsplit, plc.strings.split.split.split_record_re + ) + + def rsplit_record_re(self, pattern: str, maxsplit: int) -> Self: + return self._split_record_re( + pattern, maxsplit, plc.strings.split.split.rsplit_record_re + ) + + @acquire_spill_lock() + def _split_re( + self, + pattern: str, + maxsplit: int, + method: Callable[ + [plc.Column, plc.strings.regex_program.RegexProgram, int], + plc.Table, + ], + ) -> dict[int, Self]: + plc_table = method( + self.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + pattern, + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, + ) + return dict( + enumerate( + Column.from_pylibcudf(col) # type: ignore[misc] + for col in plc_table.columns() ) - return cast(Self, Column.from_pylibcudf(plc_result)) + ) + + def split_re(self, pattern: str, maxsplit: int) -> dict[int, Self]: + return self._split_re( + pattern, maxsplit, plc.strings.split.split.split_re + ) + + def rsplit_re(self, pattern: str, maxsplit: int) -> dict[int, Self]: + return self._split_re( + pattern, maxsplit, plc.strings.split.split.rsplit_re + ) + + @acquire_spill_lock() + def _split_record( + self, + delimiter: cudf.Scalar, + maxsplit: int, + method: Callable[[plc.Column, plc.Scalar, int], plc.Column], + ) -> Self: + plc_column = method( + self.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + maxsplit, + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + def split_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + return self._split_record( + delimiter, 
maxsplit, plc.strings.split.split.split_record + ) + + def rsplit_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + return self._split_record( + delimiter, maxsplit, plc.strings.split.split.rsplit_record + ) + + @acquire_spill_lock() + def _split( + self, + delimiter: cudf.Scalar, + maxsplit: int, + method: Callable[[plc.Column, plc.Scalar, int], plc.Table], + ) -> dict[int, Self]: + plc_table = method( + self.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + maxsplit, + ) + return dict( + enumerate( + Column.from_pylibcudf(col) # type: ignore[misc] + for col in plc_table.columns() + ) + ) + + def split(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + return self._split(delimiter, maxsplit, plc.strings.split.split.split) + + def rsplit(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + return self._split(delimiter, maxsplit, plc.strings.split.split.rsplit) + + @acquire_spill_lock() + def _partition( + self, + delimiter: cudf.Scalar, + method: Callable[[plc.Column, plc.Scalar], plc.Table], + ) -> dict[int, Self]: + plc_table = method( + self.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + ) + return dict( + enumerate( + Column.from_pylibcudf(col) # type: ignore[misc] + for col in plc_table.columns() + ) + ) + + def partition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + return self._partition( + delimiter, plc.strings.split.partition.partition + ) + + def rpartition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + return self._partition( + delimiter, plc.strings.split.partition.rpartition + ) + + @acquire_spill_lock() + def url_decode(self) -> Self: + plc_column = plc.strings.convert.convert_urls.url_decode( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + @acquire_spill_lock() + def url_encode(self) -> Self: + plc_column = plc.strings.convert.convert_urls.url_encode( + self.to_pylibcudf(mode="read") + ) + return
type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + @acquire_spill_lock() + def is_integer(self) -> NumericalColumn: + plc_column = plc.strings.convert.convert_integers.is_integer( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + @acquire_spill_lock() + def is_float(self) -> NumericalColumn: + plc_column = plc.strings.convert.convert_floats.is_float( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 80ee078917a..8be336021b1 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -15,9 +15,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.strings.convert.convert_integers import ( - is_integer as cpp_is_integer, -) from cudf.api.types import is_integer, is_scalar from cudf.core import column from cudf.core.buffer import acquire_spill_lock @@ -232,7 +229,7 @@ def to_datetime( ) break elif arg_col.dtype.kind == "O": - if not cpp_is_integer(arg_col).all(): + if not arg_col.is_integer().all(): col = new_series._column.strptime( cudf.dtype("datetime64[ns]"), format=format ) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 91f23490031..40348461f8c 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -2,14 +2,13 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd import cudf from cudf import _lib as libcudf -from cudf._lib import strings as libstrings from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core._internals import unary from cudf.core.column import as_column @@ -18,10 +17,16 @@ from cudf.utils.dtypes import 
can_convert_to_column if TYPE_CHECKING: - from cudf.core.column import ColumnBase + from cudf.core.column.numerical import NumericalColumn + from cudf.core.column.string import StringColumn -def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): +def to_numeric( + arg, + errors: Literal["raise", "coerce", "ignore"] = "raise", + downcast: Literal["integer", "signed", "unsigned", "float", None] = None, + dtype_backend=None, +): """ Convert argument into numerical types. @@ -130,7 +135,9 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): else: try: col = _convert_str_col( - col._get_decategorized_column(), errors, downcast + col._get_decategorized_column(), # type: ignore[attr-defined] + errors, + downcast, ) except ValueError as e: if errors == "ignore": @@ -139,7 +146,7 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): raise e elif is_string_dtype(dtype): try: - col = _convert_str_col(col, errors, downcast) + col = _convert_str_col(col, errors, downcast) # type: ignore[arg-type] except ValueError as e: if errors == "ignore": return arg @@ -186,7 +193,11 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): return col.values -def _convert_str_col(col, errors, _downcast=None): +def _convert_str_col( + col: StringColumn, + errors: Literal["raise", "coerce", "ignore"], + _downcast: Literal["integer", "signed", "unsigned", "float", None] = None, +) -> NumericalColumn: """ Converts a string column to numeric column @@ -212,13 +223,21 @@ def _convert_str_col(col, errors, _downcast=None): if not is_string_dtype(col): raise TypeError("col must be string dtype.") - is_integer = libstrings.is_integer(col) - if is_integer.all(): - return col.astype(dtype=cudf.dtype("i8")) + if col.is_integer().all(): + return col.astype(dtype=cudf.dtype("i8")) # type: ignore[return-value] - col = _proc_inf_empty_strings(col) + # TODO: This can be handled by libcudf in + # future see 
StringColumn.as_numerical_column + converted_col = ( + col.to_lower() + .find_and_replace(as_column([""]), as_column(["NaN"])) + .replace_multiple( + as_column(["+", "inf", "inity"]), # type: ignore[arg-type] + as_column(["", "Inf", ""]), # type: ignore[arg-type] + ) + ) - is_float = libstrings.is_float(col) + is_float = converted_col.is_float() if is_float.all(): if _downcast in {"unsigned", "signed", "integer"}: warnings.warn( @@ -227,27 +246,14 @@ def _convert_str_col(col, errors, _downcast=None): "limited by float32 precision." ) ) - return col.astype(dtype=cudf.dtype("float32")) + return converted_col.astype(dtype=cudf.dtype("float32")) # type: ignore[return-value] else: - return col.astype(dtype=cudf.dtype("float64")) + return converted_col.astype(dtype=cudf.dtype("float64")) # type: ignore[return-value] else: if errors == "coerce": - col = libcudf.string_casting.stod(col) + converted_col = libcudf.string_casting.stod(converted_col) non_numerics = is_float.unary_operator("not") - col[non_numerics] = None - return col + converted_col[non_numerics] = None + return converted_col # type: ignore[return-value] else: raise ValueError("Unable to convert some strings to numerics.") - - -def _proc_inf_empty_strings(col: ColumnBase) -> ColumnBase: - """Handles empty and infinity strings""" - col = col.to_lower() # type: ignore[attr-defined] - col = col.find_and_replace(as_column([""]), as_column(["NaN"])) - # TODO: This can be handled by libcudf in - # future see StringColumn.as_numerical_column - col = col.replace_multiple( # type: ignore[attr-defined] - as_column(["+", "inf", "inity"]), - as_column(["", "Inf", ""]), - ) - return col