From 6eaa65f6dbf396c6035bf987299f5cbb99157597 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 26 Nov 2024 17:00:54 -0800 Subject: [PATCH] Remove some cudf._lib.strings files in favor of inlining pylibcudf (#17394) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17394 --- python/cudf/cudf/_lib/strings/CMakeLists.txt | 30 - python/cudf/cudf/_lib/strings/__init__.py | 56 -- python/cudf/cudf/_lib/strings/attributes.pyx | 43 - python/cudf/cudf/_lib/strings/capitalize.pyx | 34 - python/cudf/cudf/_lib/strings/case.pyx | 34 - python/cudf/cudf/_lib/strings/char_types.pyx | 141 --- python/cudf/cudf/_lib/strings/combine.pyx | 90 -- python/cudf/cudf/_lib/strings/contains.pyx | 60 -- python/cudf/cudf/_lib/strings/extract.pyx | 24 - python/cudf/cudf/_lib/strings/find.pyx | 139 --- .../cudf/cudf/_lib/strings/find_multiple.pyx | 20 - python/cudf/cudf/_lib/strings/findall.pyx | 41 - python/cudf/cudf/_lib/strings/json.pyx | 26 - python/cudf/cudf/_lib/strings/padding.pyx | 73 -- python/cudf/cudf/_lib/strings/repeat.pyx | 38 - python/cudf/cudf/_lib/strings/replace.pyx | 88 -- python/cudf/cudf/_lib/strings/replace_re.pyx | 69 -- python/cudf/cudf/_lib/strings/strip.pyx | 54 - python/cudf/cudf/_lib/strings/substring.pyx | 85 -- python/cudf/cudf/_lib/strings/translate.pyx | 42 - python/cudf/cudf/_lib/strings/wrap.pyx | 24 - python/cudf/cudf/core/column/string.py | 952 ++++++++++-------- python/cudf/cudf/core/tools/numeric.py | 5 +- python/cudf/cudf/tests/test_string.py | 2 +- 24 files changed, 550 insertions(+), 1620 deletions(-) delete mode 100644 python/cudf/cudf/_lib/strings/attributes.pyx delete mode 100644 python/cudf/cudf/_lib/strings/capitalize.pyx delete mode 100644 python/cudf/cudf/_lib/strings/case.pyx delete mode 100644 python/cudf/cudf/_lib/strings/char_types.pyx delete mode 100644 python/cudf/cudf/_lib/strings/combine.pyx delete mode 100644 python/cudf/cudf/_lib/strings/contains.pyx delete mode 100644 python/cudf/cudf/_lib/strings/extract.pyx delete mode 100644 python/cudf/cudf/_lib/strings/find.pyx delete mode 100644 python/cudf/cudf/_lib/strings/find_multiple.pyx delete mode 100644 python/cudf/cudf/_lib/strings/findall.pyx delete mode 100644 python/cudf/cudf/_lib/strings/json.pyx delete mode 100644 python/cudf/cudf/_lib/strings/padding.pyx delete mode 100644 python/cudf/cudf/_lib/strings/repeat.pyx delete mode 100644 python/cudf/cudf/_lib/strings/replace.pyx delete mode 100644 python/cudf/cudf/_lib/strings/replace_re.pyx delete mode 100644 python/cudf/cudf/_lib/strings/strip.pyx delete mode 100644 python/cudf/cudf/_lib/strings/substring.pyx delete mode 100644 python/cudf/cudf/_lib/strings/translate.pyx delete mode 100644 python/cudf/cudf/_lib/strings/wrap.pyx diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt index ceeff71683c..dca9c4cc3fc 100644 --- a/python/cudf/cudf/_lib/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/CMakeLists.txt @@ -11,35 +11,5 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. # ============================================================================= - -set(cython_sources - attributes.pyx - capitalize.pyx - case.pyx - char_types.pyx - combine.pyx - contains.pyx - extract.pyx - find.pyx - find_multiple.pyx - findall.pyx - json.pyx - padding.pyx - repeat.pyx - replace.pyx - replace_re.pyx - strip.pyx - substring.pyx - translate.pyx - wrap.pyx -) - -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf -) - add_subdirectory(convert) add_subdirectory(split) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 4c0ec2d9ac5..b795c54c112 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -32,62 +32,10 @@ detokenize, tokenize_with_vocabulary, ) -from cudf._lib.strings.attributes import ( - code_points, - count_bytes, - count_characters, -) -from cudf._lib.strings.capitalize import capitalize, is_title, title -from cudf._lib.strings.case import swapcase, to_lower, to_upper -from cudf._lib.strings.char_types import ( - filter_alphanum, - is_alnum, - is_alpha, - is_decimal, - is_digit, - is_lower, - is_numeric, - is_space, - is_upper, -) -from cudf._lib.strings.combine import ( - concatenate, - join, - join_lists_with_column, - join_lists_with_scalar, -) -from cudf._lib.strings.contains import contains_re, count_re, like, match_re from cudf._lib.strings.convert.convert_fixed_point import to_decimal from cudf._lib.strings.convert.convert_floats import is_float from cudf._lib.strings.convert.convert_integers import is_integer from cudf._lib.strings.convert.convert_urls import url_decode, url_encode -from cudf._lib.strings.extract import extract -from cudf._lib.strings.find import ( - contains, - contains_multiple, - endswith, - endswith_multiple, - find, - rfind, - startswith, - startswith_multiple, -) -from cudf._lib.strings.find_multiple import find_multiple -from cudf._lib.strings.findall import find_re, findall -from cudf._lib.strings.json import get_json_object -from cudf._lib.strings.padding import center, ljust, pad, rjust, zfill -from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence -from cudf._lib.strings.replace import ( - insert, - replace, - replace_multi, - slice_replace, -) -from cudf._lib.strings.replace_re import ( - replace_multi_re, - replace_re, - replace_with_backrefs, -) from cudf._lib.strings.split.partition import partition, rpartition from cudf._lib.strings.split.split import ( rsplit, @@ -99,7 +47,3 @@ split_record, split_record_re, ) -from cudf._lib.strings.strip import lstrip, rstrip, strip -from cudf._lib.strings.substring import get, slice_from, slice_strings -from cudf._lib.strings.translate import filter_characters, translate -from cudf._lib.strings.wrap import wrap diff --git a/python/cudf/cudf/_lib/strings/attributes.pyx b/python/cudf/cudf/_lib/strings/attributes.pyx deleted file mode 100644 index df81b3942b4..00000000000 --- a/python/cudf/cudf/_lib/strings/attributes.pyx +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def count_characters(Column source_strings): - """ - Returns an integer numeric column containing the - length of each string in characters. - """ - plc_column = plc.strings.attributes.count_characters( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def count_bytes(Column source_strings): - """ - Returns an integer numeric column containing the - number of bytes of each string. - """ - plc_column = plc.strings.attributes.count_bytes( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def code_points(Column source_strings): - """ - Creates a numeric column with code point values (integers) - for each character of each string. - """ - plc_column = plc.strings.attributes.code_points( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/capitalize.pyx b/python/cudf/cudf/_lib/strings/capitalize.pyx deleted file mode 100644 index 42c40e2e753..00000000000 --- a/python/cudf/cudf/_lib/strings/capitalize.pyx +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def capitalize(Column source_strings): - return Column.from_pylibcudf( - plc.strings.capitalize.capitalize( - source_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def title(Column source_strings): - return Column.from_pylibcudf( - plc.strings.capitalize.title( - source_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def is_title(Column source_strings): - return Column.from_pylibcudf( - plc.strings.capitalize.is_title( - source_strings.to_pylibcudf(mode="read") - ) - ) diff --git a/python/cudf/cudf/_lib/strings/case.pyx b/python/cudf/cudf/_lib/strings/case.pyx deleted file mode 100644 index ad4cbb6f088..00000000000 --- a/python/cudf/cudf/_lib/strings/case.pyx +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf.strings import case - - -@acquire_spill_lock() -def to_upper(Column source_strings): - return Column.from_pylibcudf( - case.to_upper( - source_strings.to_pylibcudf(mode='read') - ) - ) - - -@acquire_spill_lock() -def to_lower(Column source_strings): - return Column.from_pylibcudf( - case.to_lower( - source_strings.to_pylibcudf(mode='read') - ) - ) - - -@acquire_spill_lock() -def swapcase(Column source_strings): - return Column.from_pylibcudf( - case.swapcase( - source_strings.to_pylibcudf(mode='read') - ) - ) diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx deleted file mode 100644 index a57ce29eb45..00000000000 --- a/python/cudf/cudf/_lib/strings/char_types.pyx +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from libcpp cimport bool - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf.strings import char_types - - -@acquire_spill_lock() -def filter_alphanum(Column source_strings, object py_repl, bool keep=True): - """ - Returns a Column of strings keeping only alphanumeric character types. - """ - plc_column = char_types.filter_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.ALL_TYPES if keep - else char_types.StringCharacterTypes.ALPHANUM, - py_repl.device_value.c_value, - char_types.StringCharacterTypes.ALPHANUM if keep - else char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_decimal(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only decimal characters -- those that can be used - to extract base10 numbers. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.DECIMAL, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_alnum(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only alphanumeric characters. - - Equivalent to: is_alpha() or is_digit() or is_numeric() or is_decimal() - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.ALPHANUM, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_alpha(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only alphabetic characters. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.ALPHA, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_digit(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only decimal and digit characters. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.DIGIT, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_numeric(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only numeric characters. These include digit and - numeric characters. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.NUMERIC, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_upper(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only upper-case characters. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.UPPER, - char_types.StringCharacterTypes.CASE_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_lower(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only lower-case characters. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.LOWER, - char_types.StringCharacterTypes.CASE_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_space(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contains all characters which are spaces only. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.SPACE, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx deleted file mode 100644 index 0f7b27d85d7..00000000000 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - -import cudf - - -@acquire_spill_lock() -def concatenate(list source_strings, - object sep, - object na_rep): - """ - Returns a Column by concatenating strings column-wise in `source_strings` - with the specified `sep` between each column and - `na`/`None` values are replaced by `na_rep` - """ - plc_column = plc.strings.combine.concatenate( - plc.Table([col.to_pylibcudf(mode="read") for col in source_strings]), - sep.device_value.c_value, - na_rep.device_value.c_value, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def join(Column source_strings, - object sep, - object na_rep): - """ - Returns a Column by concatenating strings row-wise in `source_strings` - with the specified `sep` between each column and - `na`/`None` values are replaced by `na_rep` - """ - plc_column = plc.strings.combine.join_strings( - source_strings.to_pylibcudf(mode="read"), - sep.device_value.c_value, - na_rep.device_value.c_value, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def join_lists_with_scalar( - Column source_strings, - object py_separator, - object py_narep): - """ - Returns a Column by concatenating Lists of strings row-wise - in `source_strings` with the specified `py_separator` - between each string in lists and ``/`None` values - are replaced by `py_narep` - """ - plc_column = plc.strings.combine.join_list_elements( - source_strings.to_pylibcudf(mode="read"), - py_separator.device_value.c_value, - py_narep.device_value.c_value, - cudf._lib.scalar.DeviceScalar("", cudf.dtype("object")).c_value, - plc.strings.combine.SeparatorOnNulls.YES, - plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def join_lists_with_column( - Column source_strings, - Column separator_strings, - object py_source_narep, - object py_separator_narep): - """ - Returns a Column by concatenating Lists of strings row-wise in - `source_strings` with a corresponding separator at the same - position in `separator_strings` and ``/`None` values in - `source_strings` are replaced by `py_source_narep` and - ``/`None` values in `separator_strings` are replaced - by `py_separator_narep` - """ - plc_column = plc.strings.combine.join_list_elements( - source_strings.to_pylibcudf(mode="read"), - separator_strings.to_pylibcudf(mode="read"), - py_separator_narep.device_value.c_value, - py_source_narep.device_value.c_value, - plc.strings.combine.SeparatorOnNulls.YES, - plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx deleted file mode 100644 index 03b4887f200..00000000000 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf.strings import contains -from pylibcudf.strings.regex_program import RegexProgram - - -@acquire_spill_lock() -def contains_re(Column source_strings, object reg_ex, uint32_t flags): - """ - Returns a Column of boolean values with True for `source_strings` - that contain regular expression `reg_ex`. - """ - prog = RegexProgram.create(str(reg_ex), flags) - return Column.from_pylibcudf( - contains.contains_re(source_strings.to_pylibcudf(mode="read"), prog) - ) - - -@acquire_spill_lock() -def count_re(Column source_strings, object reg_ex, uint32_t flags): - """ - Returns a Column with count of occurrences of `reg_ex` in - each string of `source_strings` - """ - prog = RegexProgram.create(str(reg_ex), flags) - return Column.from_pylibcudf( - contains.count_re(source_strings.to_pylibcudf(mode="read"), prog) - ) - - -@acquire_spill_lock() -def match_re(Column source_strings, object reg_ex, uint32_t flags): - """ - Returns a Column with each value True if the string matches `reg_ex` - regular expression with each record of `source_strings` - """ - prog = RegexProgram.create(str(reg_ex), flags) - return Column.from_pylibcudf( - contains.matches_re(source_strings.to_pylibcudf(mode="read"), prog) - ) - - -@acquire_spill_lock() -def like(Column source_strings, object py_pattern, object py_escape): - """ - Returns a Column with each value True if the string matches the - `py_pattern` like expression with each record of `source_strings` - """ - plc_column = contains.like( - source_strings.to_pylibcudf(mode="read"), - py_pattern.device_value.c_value, - py_escape.device_value.c_value, - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx deleted file mode 100644 index 5bf336f4f3c..00000000000 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def extract(Column source_strings, object pattern, uint32_t flags): - """ - Returns data which contains extracted capture groups provided in - `pattern` for all `source_strings`. - The returning data contains one row for each subject string, - and one column for each group. - """ - prog = plc.strings.regex_program.RegexProgram.create(str(pattern), flags) - plc_result = plc.strings.extract.extract( - source_strings.to_pylibcudf(mode="read"), prog - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_result.columns())) diff --git a/python/cudf/cudf/_lib/strings/find.pyx b/python/cudf/cudf/_lib/strings/find.pyx deleted file mode 100644 index 2d284d1aa9d..00000000000 --- a/python/cudf/cudf/_lib/strings/find.pyx +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf as plc - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def contains(Column source_strings, object py_target): - """ - Returns a Column of boolean values with True for `source_strings` - that contain the pattern given in `py_target`. - """ - return Column.from_pylibcudf( - plc.strings.find.contains( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def contains_multiple(Column source_strings, Column target_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain the corresponding string in `target_strings`. - """ - return Column.from_pylibcudf( - plc.strings.find.contains( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def endswith(Column source_strings, object py_target): - """ - Returns a Column of boolean values with True for `source_strings` - that contain strings that end with the pattern given in `py_target`. - """ - - return Column.from_pylibcudf( - plc.strings.find.ends_with( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def endswith_multiple(Column source_strings, Column target_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain strings that end with corresponding location - in `target_strings`. - """ - return Column.from_pylibcudf( - plc.strings.find.ends_with( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def startswith(Column source_strings, object py_target): - """ - Returns a Column of boolean values with True for `source_strings` - that contain strings that start with the pattern given in `py_target`. - """ - return Column.from_pylibcudf( - plc.strings.find.starts_with( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def startswith_multiple(Column source_strings, Column target_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain strings that begin with corresponding location - in `target_strings`. - """ - return Column.from_pylibcudf( - plc.strings.find.starts_with( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def find(Column source_strings, - object py_target, - size_type start, - size_type end): - """ - Returns a Column containing lowest indexes in each string of - `source_strings` that fully contain `py_target` string. - Scan portion of strings in `source_strings` can be - controlled by setting `start` and `end` values. - """ - return Column.from_pylibcudf( - plc.strings.find.find( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value, - start, - end - ) - ) - - -@acquire_spill_lock() -def rfind(Column source_strings, - object py_target, - size_type start, - size_type end): - """ - Returns a Column containing highest indexes in each string of - `source_strings` that fully contain `py_target` string. - Scan portion of strings in `source_strings` can be - controlled by setting `start` and `end` values. - """ - - return Column.from_pylibcudf( - plc.strings.find.rfind( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value, - start, - end - ) - ) diff --git a/python/cudf/cudf/_lib/strings/find_multiple.pyx b/python/cudf/cudf/_lib/strings/find_multiple.pyx deleted file mode 100644 index 39e0013769f..00000000000 --- a/python/cudf/cudf/_lib/strings/find_multiple.pyx +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def find_multiple(Column source_strings, Column target_strings): - """ - Returns a column with character position values where each - of the `target_strings` are found in each string of `source_strings`. - """ - plc_result = plc.strings.find_multiple.find_multiple( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx deleted file mode 100644 index 3e7a504d535..00000000000 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def findall(Column source_strings, object pattern, uint32_t flags): - """ - Returns data with all non-overlapping matches of `pattern` - in each string of `source_strings` as a lists column. - """ - prog = plc.strings.regex_program.RegexProgram.create( - str(pattern), flags - ) - plc_result = plc.strings.findall.findall( - source_strings.to_pylibcudf(mode="read"), - prog, - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def find_re(Column source_strings, object pattern, uint32_t flags): - """ - Returns character positions where the pattern first matches - the elements in source_strings. - """ - prog = plc.strings.regex_program.RegexProgram.create( - str(pattern), flags - ) - plc_result = plc.strings.findall.find_re( - source_strings.to_pylibcudf(mode="read"), - prog, - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/json.pyx b/python/cudf/cudf/_lib/strings/json.pyx deleted file mode 100644 index 374a104635a..00000000000 --- a/python/cudf/cudf/_lib/strings/json.pyx +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -import pylibcudf as plc -from pylibcudf.json cimport GetJsonObjectOptions - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def get_json_object( - Column col, - object py_json_path, - GetJsonObjectOptions options -): - """ - Apply a JSONPath string to all rows in an input column - of json strings. - """ - plc_column = plc.json.get_json_object( - col.to_pylibcudf(mode="read"), - py_json_path.device_value.c_value, - options - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/padding.pyx b/python/cudf/cudf/_lib/strings/padding.pyx deleted file mode 100644 index 015a2ebab8a..00000000000 --- a/python/cudf/cudf/_lib/strings/padding.pyx +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def pad(Column source_strings, - size_type width, - fill_char, - side=plc.strings.side_type.SideType.LEFT): - """ - Returns a Column by padding strings in `source_strings` - up to the given `width`. Direction of padding is to be specified by `side`. - The additional characters being filled can be changed by specifying - `fill_char`. - """ - plc_result = plc.strings.padding.pad( - source_strings.to_pylibcudf(mode="read"), - width, - side, - fill_char, - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def zfill(Column source_strings, - size_type width): - """ - Returns a Column by prepending strings in `source_strings` - with '0' characters up to the given `width`. - """ - plc_result = plc.strings.padding.zfill( - source_strings.to_pylibcudf(mode="read"), - width - ) - return Column.from_pylibcudf(plc_result) - - -def center(Column source_strings, - size_type width, - fill_char): - """ - Returns a Column by filling left and right side of strings - in `source_strings` with additional character, `fill_char` - up to the given `width`. - """ - return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.BOTH) - - -def ljust(Column source_strings, - size_type width, - fill_char): - """ - Returns a Column by filling right side of strings in `source_strings` - with additional character, `fill_char` up to the given `width`. - """ - return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.RIGHT) - - -def rjust(Column source_strings, - size_type width, - fill_char): - """ - Returns a Column by filling left side of strings in `source_strings` - with additional character, `fill_char` up to the given `width`. - """ - return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.LEFT) diff --git a/python/cudf/cudf/_lib/strings/repeat.pyx b/python/cudf/cudf/_lib/strings/repeat.pyx deleted file mode 100644 index 43649d4defe..00000000000 --- a/python/cudf/cudf/_lib/strings/repeat.pyx +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def repeat_scalar(Column source_strings, - size_type repeats): - """ - Returns a Column after repeating - each string in `source_strings` - `repeats` number of times. - """ - plc_result = plc.strings.repeat.repeat_strings( - source_strings.to_pylibcudf(mode="read"), - repeats - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def repeat_sequence(Column source_strings, - Column repeats): - """ - Returns a Column after repeating - each string in `source_strings` - `repeats` number of times. - """ - plc_result = plc.strings.repeat.repeat_strings( - source_strings.to_pylibcudf(mode="read"), - repeats.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx deleted file mode 100644 index a260c4e4f45..00000000000 --- a/python/cudf/cudf/_lib/strings/replace.pyx +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - -import pylibcudf as plc - - -@acquire_spill_lock() -def slice_replace(Column source_strings, - size_type start, - size_type stop, - object py_repl): - """ - Returns a Column by replacing specified section - of each string with `py_repl`. Positions can be - specified with `start` and `stop` params. - """ - - cdef DeviceScalar repl = py_repl.device_value - - return Column.from_pylibcudf(plc.strings.replace.replace_slice( - source_strings.to_pylibcudf(mode="read"), - repl.c_value, - start, - stop - )) - - -@acquire_spill_lock() -def insert(Column source_strings, - size_type start, - object py_repl): - """ - Returns a Column by inserting a specified - string `py_repl` at a specific position in all strings. - """ - - cdef DeviceScalar repl = py_repl.device_value - - return Column.from_pylibcudf(plc.strings.replace.replace_slice( - source_strings.to_pylibcudf(mode="read"), - repl.c_value, - start, - start, - )) - - -@acquire_spill_lock() -def replace(Column source_strings, - object py_target, - object py_repl, - int32_t maxrepl): - """ - Returns a Column after replacing occurrences of - patterns `py_target` with `py_repl` in `source_strings`. - `maxrepl` indicates number of replacements to make from start. - """ - cdef DeviceScalar target = py_target.device_value - cdef DeviceScalar repl = py_repl.device_value - - return Column.from_pylibcudf(plc.strings.replace.replace( - source_strings.to_pylibcudf(mode="read"), - target.c_value, - repl.c_value, - maxrepl - )) - - -@acquire_spill_lock() -def replace_multi(Column source_strings, - Column target_strings, - Column repl_strings): - """ - Returns a Column after replacing occurrences of - patterns `target_strings` with `repl_strings` in `source_strings`. - """ - return Column.from_pylibcudf(plc.strings.replace.replace_multiple( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read"), - repl_strings.to_pylibcudf(mode="read"), - )) diff --git a/python/cudf/cudf/_lib/strings/replace_re.pyx b/python/cudf/cudf/_lib/strings/replace_re.pyx deleted file mode 100644 index 462d5c903e8..00000000000 --- a/python/cudf/cudf/_lib/strings/replace_re.pyx +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from pylibcudf.libcudf.types cimport size_type -import pylibcudf as plc - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def replace_re(Column source_strings, - object pattern, - object py_repl, - size_type n): - """ - Returns a Column after replacing occurrences regular - expressions `pattern` with `py_repl` in `source_strings`. - `n` indicates the number of resplacements to be made from - start. (-1 indicates all) - """ - plc_column = plc.strings.replace_re.replace_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT - ), - py_repl.device_value.c_value, - n - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def replace_with_backrefs( - Column source_strings, - object pattern, - object repl): - """ - Returns a Column after using the `repl` back-ref template to create - new string with the extracted elements found using - `pattern` regular expression in `source_strings`. - """ - plc_column = plc.strings.replace_re.replace_with_backrefs( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT - ), - repl - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def replace_multi_re(Column source_strings, - list patterns, - Column repl_strings): - """ - Returns a Column after replacing occurrences of multiple - regular expressions `patterns` with their corresponding - strings in `repl_strings` in `source_strings`. - """ - plc_column = plc.strings.replace_re.replace_re( - source_strings.to_pylibcudf(mode="read"), - patterns, - repl_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx deleted file mode 100644 index 982c5a600e7..00000000000 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column -import pylibcudf as plc - - -@acquire_spill_lock() -def strip(Column source_strings, - object py_repl): - """ - Returns a Column by removing leading and trailing characters. - The set of characters need be stripped from left and right side - can be specified by `py_repl`. - """ - plc_result = plc.strings.strip.strip( - source_strings.to_pylibcudf(mode="read"), - plc.strings.side_type.SideType.BOTH, - py_repl.device_value.c_value, - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def lstrip(Column source_strings, - object py_repl): - """ - Returns a Column by removing leading and trailing characters. - The set of characters need be stripped from left side can - be specified by `py_repl`. - """ - plc_result = plc.strings.strip.strip( - source_strings.to_pylibcudf(mode="read"), - plc.strings.side_type.SideType.LEFT, - py_repl.device_value.c_value, - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def rstrip(Column source_strings, - object py_repl): - """ - Returns a Column by removing leading and trailing characters. - The set of characters need be stripped from right side can - be specified by `py_repl`. - """ - plc_result = plc.strings.strip.strip( - source_strings.to_pylibcudf(mode="read"), - plc.strings.side_type.SideType.RIGHT, - py_repl.device_value.c_value, - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/substring.pyx b/python/cudf/cudf/_lib/strings/substring.pyx deleted file mode 100644 index db96d99c7b6..00000000000 --- a/python/cudf/cudf/_lib/strings/substring.pyx +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import numpy as np - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from cudf._lib.scalar import as_device_scalar - -from cudf._lib.scalar cimport DeviceScalar - -import pylibcudf as plc - - -@acquire_spill_lock() -def slice_strings(Column source_strings, - object start, - object end, - object step): - """ - Returns a Column by extracting a substring of each string - at given start and end positions. Slicing can also be - performed in steps by skipping `step` number of - characters in a string. - """ - cdef DeviceScalar start_scalar = as_device_scalar(start, np.int32) - cdef DeviceScalar end_scalar = as_device_scalar(end, np.int32) - cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32) - - return Column.from_pylibcudf( - plc.strings.slice.slice_strings( - source_strings.to_pylibcudf(mode="read"), - start_scalar.c_value, - end_scalar.c_value, - step_scalar.c_value - ) - ) - - -@acquire_spill_lock() -def slice_from(Column source_strings, - Column starts, - Column stops): - """ - Returns a Column by extracting a substring of each string - at given starts and stops positions. `starts` and `stops` - here are positions per element in the string-column. - """ - return Column.from_pylibcudf( - plc.strings.slice.slice_strings( - source_strings.to_pylibcudf(mode="read"), - starts.to_pylibcudf(mode="read"), - stops.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def get(Column source_strings, - object index): - """ - Returns a Column which contains only single - character from each input string. The index of - characters required can be controlled by passing `index`. - """ - - if index < 0: - next_index = index - 1 - step = -1 - else: - next_index = index + 1 - step = 1 - cdef DeviceScalar start_scalar = as_device_scalar(index, np.int32) - cdef DeviceScalar end_scalar = as_device_scalar(next_index, np.int32) - cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32) - - return Column.from_pylibcudf( - plc.strings.slice.slice_strings( - source_strings.to_pylibcudf(mode="read"), - start_scalar.c_value, - end_scalar.c_value, - step_scalar.c_value - ) - ) diff --git a/python/cudf/cudf/_lib/strings/translate.pyx b/python/cudf/cudf/_lib/strings/translate.pyx deleted file mode 100644 index 3ef478532c2..00000000000 --- a/python/cudf/cudf/_lib/strings/translate.pyx +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from libcpp cimport bool - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def translate(Column source_strings, - object mapping_table): - """ - Translates individual characters within each string - if present in the mapping_table. - """ - plc_result = plc.strings.translate.translate( - source_strings.to_pylibcudf(mode="read"), - mapping_table, - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def filter_characters(Column source_strings, - object mapping_table, - bool keep, - object py_repl): - """ - Removes or keeps individual characters within each string - using the provided mapping_table. - """ - plc_result = plc.strings.translate.filter_characters( - source_strings.to_pylibcudf(mode="read"), - mapping_table, - plc.strings.translate.FilterType.KEEP - if keep else plc.strings.translate.FilterType.REMOVE, - py_repl.device_value.c_value - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/wrap.pyx b/python/cudf/cudf/_lib/strings/wrap.pyx deleted file mode 100644 index 2b40f01f818..00000000000 --- a/python/cudf/cudf/_lib/strings/wrap.pyx +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def wrap(Column source_strings, - size_type width): - """ - Returns a Column by wrapping long strings - in the Column to be formatted in paragraphs - with length less than a given `width`. - """ - plc_result = plc.strings.wrap.wrap( - source_strings.to_pylibcudf(mode="read"), - width - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 47763063c4c..d45c76d3ddb 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -2,14 +2,16 @@ from __future__ import annotations +import itertools import re import warnings from functools import cached_property -from typing import TYPE_CHECKING, cast, overload +from typing import TYPE_CHECKING, Literal, cast, overload import numpy as np import pandas as pd import pyarrow as pa +from typing_extensions import Self import pylibcudf as plc @@ -20,22 +22,15 @@ from cudf._lib.column import Column from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype +from cudf.core.buffer import acquire_spill_lock from cudf.core.column import column, datetime from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import can_convert_to_column - -def str_to_boolean(column: StringColumn): - """Takes in string column and returns boolean column""" - return ( - libstrings.count_characters(column) > cudf.Scalar(0, dtype="int8") - ).fillna(False) - - if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import Callable, Sequence import cupy import numba.cuda @@ -50,6 +45,16 @@ def str_to_boolean(column: StringColumn): from cudf.core.buffer import Buffer +def str_to_boolean(column: StringColumn): + """Takes in string column and returns boolean column""" + with acquire_spill_lock(): + plc_column = plc.strings.attributes.count_characters( + column.to_pylibcudf(mode="read") + ) + result = Column.from_pylibcudf(plc_column) + return (result > cudf.Scalar(0, dtype="int8")).fillna(False) + + _str_to_numeric_typecast_functions = { cudf.api.types.dtype("int8"): str_cast.stoi8, cudf.api.types.dtype("int16"): str_cast.stoi16, @@ -213,10 +218,12 @@ def len(self) -> SeriesOrIndex: 3 dtype: int32 """ - - return self._return_or_inplace( - libstrings.count_characters(self._column) - ) + with acquire_spill_lock(): + plc_column = plc.strings.attributes.count_characters( + self._column.to_pylibcudf(mode="read") + ) + result = Column.from_pylibcudf(plc_column) + return self._return_or_inplace(result) def byte_count(self) -> SeriesOrIndex: """ @@ -245,9 +252,12 @@ def byte_count(self) -> SeriesOrIndex: 2 11 dtype: int32 """ - return self._return_or_inplace( - libstrings.count_bytes(self._column), - ) + with acquire_spill_lock(): + plc_column = plc.strings.attributes.count_bytes( + self._column.to_pylibcudf(mode="read") + ) + result = Column.from_pylibcudf(plc_column) + return self._return_or_inplace(result) @overload def cat( @@ -347,19 +357,70 @@ def cat(self, others=None, sep=None, na_rep=None): sep = "" if others is None: - data = libstrings.join( - self._column, - cudf.Scalar(sep), - cudf.Scalar(na_rep, "str"), - ) + with acquire_spill_lock(): + plc_column = plc.strings.combine.join_strings( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(sep).device_value.c_value, + cudf.Scalar(na_rep, "str").device_value.c_value, + ) + data = Column.from_pylibcudf(plc_column) else: - other_cols = _get_cols_list(self._parent, others) - all_cols = [self._column] + other_cols - data = libstrings.concatenate( - all_cols, - cudf.Scalar(sep), - cudf.Scalar(na_rep, "str"), + parent_index = ( + self._parent.index + if isinstance(self._parent, cudf.Series) + else self._parent ) + if ( + can_convert_to_column(others) + and len(others) > 0 + and ( + can_convert_to_column( + others.iloc[0] + if isinstance(others, cudf.Series) + else others[0] + ) + ) + ): + other_cols = ( + column.as_column(frame.reindex(parent_index), dtype="str") + if ( + parent_index is not None + and isinstance(frame, cudf.Series) + and not frame.index.equals(parent_index) + ) + else column.as_column(frame, dtype="str") + for frame in others + ) + elif others is not None and not isinstance(others, StringMethods): + if ( + parent_index is not None + and isinstance(others, cudf.Series) + and not others.index.equals(parent_index) + ): + others = others.reindex(parent_index) + + other_cols = [column.as_column(others, dtype="str")] + else: + raise TypeError( + "others must be Series, Index, DataFrame, np.ndarrary " + "or list-like (either containing only strings or " + "containing only objects of type Series/Index/" + "np.ndarray[1-dim])" + ) + with acquire_spill_lock(): + plc_column = plc.strings.combine.concatenate( + plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + [self._column], other_cols + ) + ] + ), + cudf.Scalar(sep).device_value.c_value, + cudf.Scalar(na_rep, "str").device_value.c_value, + ) + data = Column.from_pylibcudf(plc_column) if len(data) == 1 and data.null_count == 1: data = cudf.core.column.as_column("", length=len(data)) @@ -516,9 +577,18 @@ def join( strings_column = self._split_by_character() if is_scalar(sep): - data = libstrings.join_lists_with_scalar( - strings_column, cudf.Scalar(sep), cudf.Scalar(string_na_rep) - ) + with acquire_spill_lock(): + plc_column = plc.strings.combine.join_list_elements( + strings_column.to_pylibcudf(mode="read"), + cudf.Scalar(sep).device_value.c_value, + cudf.Scalar(string_na_rep).device_value.c_value, + cudf._lib.scalar.DeviceScalar( + "", cudf.dtype("object") + ).c_value, + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, + ) + data = Column.from_pylibcudf(plc_column) elif can_convert_to_column(sep): sep_column = column.as_column(sep) if len(sep_column) != len(strings_column): @@ -531,13 +601,16 @@ def join( f"sep_na_rep should be a string scalar, got {sep_na_rep} " f"of type: {type(sep_na_rep)}" ) - - data = libstrings.join_lists_with_column( - strings_column, - sep_column, - cudf.Scalar(string_na_rep), - cudf.Scalar(sep_na_rep), - ) + with acquire_spill_lock(): + plc_column = plc.strings.combine.join_list_elements( + strings_column.to_pylibcudf(mode="read"), + sep_column.to_pylibcudf(mode="read"), + cudf.Scalar(sep_na_rep).device_value.c_value, + cudf.Scalar(string_na_rep).device_value.c_value, + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, + ) + data = Column.from_pylibcudf(plc_column) else: raise TypeError( f"sep should be an str, array-like or Series object, " @@ -627,9 +700,18 @@ def extract( "unsupported value for `flags` parameter" ) - data = libstrings.extract(self._column, pat, flags) + with acquire_spill_lock(): + prog = plc.strings.regex_program.RegexProgram.create(pat, flags) + plc_result = plc.strings.extract.extract( + self._column.to_pylibcudf(mode="read"), prog + ) + data = dict( + enumerate( + Column.from_pylibcudf(col) for col in plc_result.columns() + ) + ) if len(data) == 1 and expand is False: - _, data = data.popitem() + _, data = data.popitem() # type: ignore[assignment] return self._return_or_inplace(data, expand=expand) def contains( @@ -765,26 +847,41 @@ def contains( if is_scalar(pat): if regex: - result_col = libstrings.contains_re(self._column, pat, flags) + with acquire_spill_lock(): + prog = plc.strings.regex_program.RegexProgram.create( + pat, flags + ) + plc_result = plc.strings.contains.contains_re( + self._column.to_pylibcudf(mode="read"), prog + ) + result_col = Column.from_pylibcudf(plc_result) else: if case is False: - input_column = libstrings.to_lower(self._column) - pat = cudf.Scalar(pat.lower(), dtype="str") # type: ignore + input_column = self.lower()._column # type: ignore[union-attr] + plc_pat = cudf.Scalar(pat.lower(), dtype="str") # type: ignore[union-attr] else: input_column = self._column - pat = cudf.Scalar(pat, dtype="str") # type: ignore - result_col = libstrings.contains(input_column, pat) + plc_pat = cudf.Scalar(pat, dtype="str") + with acquire_spill_lock(): + plc_result = plc.strings.find.contains( + input_column.to_pylibcudf(mode="read"), + plc_pat.device_value.c_value, + ) + result_col = Column.from_pylibcudf(plc_result) else: # TODO: we silently ignore the `regex=` flag here if case is False: - input_column = libstrings.to_lower(self._column) - col_pat = libstrings.to_lower( - column.as_column(pat, dtype="str") - ) + input_column = self.lower()._column # type: ignore[union-attr] + col_pat = cudf.Index(pat, dtype="str").str.lower()._column # type: ignore[union-attr] else: input_column = self._column col_pat = column.as_column(pat, dtype="str") - result_col = libstrings.contains_multiple(input_column, col_pat) + with acquire_spill_lock(): + plc_result = plc.strings.find.contains( + input_column.to_pylibcudf(mode="read"), + col_pat.to_pylibcudf(mode="read"), + ) + result_col = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result_col) def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: @@ -850,11 +947,15 @@ def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: "expected esc to contain less than or equal to 1 characters" ) - result_col = libstrings.like( - self._column, cudf.Scalar(pat, "str"), cudf.Scalar(esc, "str") - ) + with acquire_spill_lock(): + plc_result = plc.strings.contains.like( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(pat, "str").device_value.c_value, + cudf.Scalar(esc, "str").device_value.c_value, + ) + result = Column.from_pylibcudf(plc_result) - return self._return_or_inplace(result_col) + return self._return_or_inplace(result) def repeat( self, @@ -901,17 +1002,16 @@ def repeat( 2 ccc dtype: object """ - if can_convert_to_column(repeats): - return self._return_or_inplace( - libstrings.repeat_sequence( - self._column, - column.as_column(repeats, dtype="int"), - ), + with acquire_spill_lock(): + if can_convert_to_column(repeats): + repeats = column.as_column(repeats, dtype="int").to_pylibcudf( + mode="read" + ) + plc_result = plc.strings.repeat.repeat_strings( + self._column.to_pylibcudf(mode="read"), repeats ) - - return self._return_or_inplace( - libstrings.repeat_scalar(self._column, repeats) - ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def replace( self, @@ -997,19 +1097,22 @@ def replace( "`pat` and `repl` are list-like inputs" ) - return self._return_or_inplace( - libstrings.replace_multi_re( - self._column, - list(pat), - column.as_column(repl, dtype="str"), + if regex: + with acquire_spill_lock(): + plc_result = plc.strings.replace_re.replace_re( + self._column.to_pylibcudf(mode="read"), + list(pat), + column.as_column(repl, dtype="str").to_pylibcudf( + mode="read" + ), + ) + result = Column.from_pylibcudf(plc_result) + else: + result = self._column.replace_multiple( + cast(StringColumn, column.as_column(pat, dtype="str")), + cast(StringColumn, column.as_column(repl, dtype="str")), ) - if regex - else libstrings.replace_multi( - self._column, - column.as_column(pat, dtype="str"), - column.as_column(repl, dtype="str"), - ), - ) + return self._return_or_inplace(result) # Pandas treats 0 as all if n == 0: n = -1 @@ -1019,18 +1122,25 @@ def replace( pat = pat.pattern # Pandas forces non-regex replace when pat is a single-character - return self._return_or_inplace( - libstrings.replace_re( - self._column, pat, cudf.Scalar(repl, "str"), n - ) - if regex is True and len(pat) > 1 - else libstrings.replace( - self._column, - cudf.Scalar(pat, "str"), - cudf.Scalar(repl, "str"), - n, - ), - ) + with acquire_spill_lock(): + if regex is True and len(pat) > 1: + plc_result = plc.strings.replace_re.replace_re( + self._column.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + pat, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + cudf.Scalar(repl, "str").device_value.c_value, + n, + ) + else: + plc_result = plc.strings.replace.replace( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(pat).device_value.c_value, + cudf.Scalar(repl).device_value.c_value, + n, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: r""" @@ -1058,14 +1168,20 @@ def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: 1 ZV576 dtype: object """ - # If 'pat' is re.Pattern then get the pattern string from it if isinstance(pat, re.Pattern): pat = pat.pattern - return self._return_or_inplace( - libstrings.replace_with_backrefs(self._column, pat, repl) - ) + with acquire_spill_lock(): + plc_result = plc.strings.replace_re.replace_with_backrefs( + self._column.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + pat, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + repl, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def slice( self, @@ -1136,10 +1252,28 @@ def slice( 2 cm dtype: object """ + param_dtype = np.dtype(np.int32) + with acquire_spill_lock(): + plc_result = plc.strings.slice.slice_strings( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(start, param_dtype).device_value.c_value, + cudf.Scalar(stop, param_dtype).device_value.c_value, + cudf.Scalar(step, param_dtype).device_value.c_value, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) - return self._return_or_inplace( - libstrings.slice_strings(self._column, start, stop, step), - ) + def _all_characters_of_type( + self, + char_type: plc.strings.char_types.StringCharacterTypes, + case_type: plc.strings.char_types.StringCharacterTypes = plc.strings.char_types.StringCharacterTypes.ALL_TYPES, + ) -> SeriesOrIndex: + with acquire_spill_lock(): + plc_column = plc.strings.char_types.all_characters_of_type( + self._column.to_pylibcudf(mode="read"), char_type, case_type + ) + result = Column.from_pylibcudf(plc_column) + return self._return_or_inplace(result) def isinteger(self) -> SeriesOrIndex: """ @@ -1396,7 +1530,9 @@ def isdecimal(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_decimal(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.DECIMAL + ) def isalnum(self) -> SeriesOrIndex: """ @@ -1467,7 +1603,9 @@ def isalnum(self) -> SeriesOrIndex: 2 False dtype: bool """ - return self._return_or_inplace(libstrings.is_alnum(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.ALPHANUM + ) def isalpha(self) -> SeriesOrIndex: """ @@ -1525,7 +1663,9 @@ def isalpha(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_alpha(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.ALPHA + ) def isdigit(self) -> SeriesOrIndex: """ @@ -1589,7 +1729,9 @@ def isdigit(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_digit(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.DIGIT + ) def isnumeric(self) -> SeriesOrIndex: """ @@ -1659,7 +1801,9 @@ def isnumeric(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_numeric(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.NUMERIC + ) def isupper(self) -> SeriesOrIndex: """ @@ -1718,7 +1862,10 @@ def isupper(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_upper(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.UPPER, + plc.strings.char_types.StringCharacterTypes.CASE_TYPES, + ) def islower(self) -> SeriesOrIndex: """ @@ -1777,7 +1924,10 @@ def islower(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_lower(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.LOWER, + plc.strings.char_types.StringCharacterTypes.CASE_TYPES, + ) def isipv4(self) -> SeriesOrIndex: """ @@ -1844,7 +1994,7 @@ def lower(self) -> SeriesOrIndex: 3 swapcase dtype: object """ - return self._return_or_inplace(libstrings.to_lower(self._column)) + return self._return_or_inplace(self._column.to_lower()) def upper(self) -> SeriesOrIndex: """ @@ -1895,7 +2045,7 @@ def upper(self) -> SeriesOrIndex: 3 SWAPCASE dtype: object """ - return self._return_or_inplace(libstrings.to_upper(self._column)) + return self._return_or_inplace(self._column.to_upper()) def capitalize(self) -> SeriesOrIndex: """ @@ -1923,7 +2073,7 @@ def capitalize(self) -> SeriesOrIndex: 1 Goodbye, friend dtype: object """ - return self._return_or_inplace(libstrings.capitalize(self._column)) + return self._return_or_inplace(self._column.capitalize()) def swapcase(self) -> SeriesOrIndex: """ @@ -1970,7 +2120,7 @@ def swapcase(self) -> SeriesOrIndex: 3 sWaPcAsE dtype: object """ - return self._return_or_inplace(libstrings.swapcase(self._column)) + return self._return_or_inplace(self._column.swapcase()) def title(self) -> SeriesOrIndex: """ @@ -2017,7 +2167,7 @@ def title(self) -> SeriesOrIndex: 3 Swapcase dtype: object """ - return self._return_or_inplace(libstrings.title(self._column)) + return self._return_or_inplace(self._column.title()) def istitle(self) -> SeriesOrIndex: """ @@ -2043,7 +2193,7 @@ def istitle(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_title(self._column)) + return self._return_or_inplace(self._column.is_title()) def filter_alphanum( self, repl: str | None = None, keep: bool = True @@ -2078,14 +2228,22 @@ def filter_alphanum( if repl is None: repl = "" - return self._return_or_inplace( - libstrings.filter_alphanum( - self._column, cudf.Scalar(repl, "str"), keep - ), - ) + with acquire_spill_lock(): + plc_column = plc.strings.char_types.filter_characters_of_type( + self._column.to_pylibcudf(mode="read"), + plc.strings.char_types.StringCharacterTypes.ALL_TYPES + if keep + else plc.strings.char_types.StringCharacterTypes.ALPHANUM, + cudf.Scalar(repl, "str").device_value.c_value, + plc.strings.char_types.StringCharacterTypes.ALPHANUM + if keep + else plc.strings.char_types.StringCharacterTypes.ALL_TYPES, + ) + result = Column.from_pylibcudf(plc_column) + return self._return_or_inplace(result) def slice_from( - self, starts: "cudf.Series", stops: "cudf.Series" + self, starts: cudf.Series, stops: cudf.Series ) -> SeriesOrIndex: """ Return substring of each string using positions for each string. @@ -2122,14 +2280,14 @@ def slice_from( 1 re dtype: object """ - - return self._return_or_inplace( - libstrings.slice_from( - self._column, - column.as_column(starts), - column.as_column(stops), - ), - ) + with acquire_spill_lock(): + plc_result = plc.strings.slice.slice_strings( + self._column.to_pylibcudf(mode="read"), + starts._column.to_pylibcudf(mode="read"), + stops._column.to_pylibcudf(mode="read"), + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def slice_replace( self, @@ -2217,11 +2375,15 @@ def slice_replace( if repl is None: repl = "" - return self._return_or_inplace( - libstrings.slice_replace( - self._column, start, stop, cudf.Scalar(repl, "str") - ), - ) + with acquire_spill_lock(): + plc_result = plc.strings.replace.replace_slice( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(repl, "str").device_value.c_value, + start, + stop, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def insert(self, start: int = 0, repl: str | None = None) -> SeriesOrIndex: """ @@ -2266,12 +2428,7 @@ def insert(self, start: int = 0, repl: str | None = None) -> SeriesOrIndex: 1 0123456789_ dtype: object """ - if repl is None: - repl = "" - - return self._return_or_inplace( - libstrings.insert(self._column, start, cudf.Scalar(repl, "str")), - ) + return self.slice_replace(start, start, repl) def get(self, i: int = 0) -> SeriesOrIndex: """ @@ -2314,17 +2471,22 @@ def get(self, i: int = 0) -> SeriesOrIndex: 2 f dtype: object """ - - return self._return_or_inplace(libstrings.get(self._column, i)) + if i < 0: + next_index = i - 1 + step = -1 + else: + next_index = i + 1 + step = 1 + return self.slice(i, next_index, step) def get_json_object( self, - json_path, + json_path: str, *, - allow_single_quotes=False, - strip_quotes_from_single_strings=True, - missing_fields_as_nulls=False, - ): + allow_single_quotes: bool = False, + strip_quotes_from_single_strings: bool = True, + missing_fields_as_nulls: bool = False, + ) -> SeriesOrIndex: r""" Applies a JSONPath string to an input strings column where each row in the column is a valid json string @@ -2394,11 +2556,14 @@ def get_json_object( ), missing_fields_as_nulls=missing_fields_as_nulls, ) - return self._return_or_inplace( - libstrings.get_json_object( - self._column, cudf.Scalar(json_path, "str"), options + with acquire_spill_lock(): + plc_result = plc.json.get_json_object( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(json_path, "str").device_value.c_value, + options, ) - ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def split( self, @@ -2893,7 +3058,10 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: ) def pad( - self, width: int, side: str = "left", fillchar: str = " " + self, + width: int, + side: Literal["left", "both", "right"] = "left", + fillchar: str = " ", ) -> SeriesOrIndex: """ Pad strings in the Series/Index up to width. @@ -2974,10 +3142,15 @@ def pad( raise ValueError( "side has to be either one of {'left', 'right', 'both'}" ) - - return self._return_or_inplace( - libstrings.pad(self._column, width, fillchar, side) - ) + with acquire_spill_lock(): + plc_result = plc.strings.padding.pad( + self._column.to_pylibcudf(mode="read"), + width, + side, + fillchar, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def zfill(self, width: int) -> SeriesOrIndex: """ @@ -3043,7 +3216,12 @@ def zfill(self, width: int) -> SeriesOrIndex: msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - return self._return_or_inplace(libstrings.zfill(self._column, width)) + with acquire_spill_lock(): + plc_result = plc.strings.padding.zfill( + self._column.to_pylibcudf(mode="read"), width + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def center(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ @@ -3100,22 +3278,7 @@ def center(self, width: int, fillchar: str = " ") -> SeriesOrIndex: 3 --d--- dtype: object """ - if not isinstance(fillchar, str): - msg = ( - f"fillchar must be a character, not {type(fillchar).__name__}" - ) - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - return self._return_or_inplace( - libstrings.center(self._column, width, fillchar) - ) + return self.pad(width, "both", fillchar) def ljust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ @@ -3154,22 +3317,7 @@ def ljust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: 3 __ dtype: object """ - if not isinstance(fillchar, str): - msg = ( - f"fillchar must be a character, not {type(fillchar).__name__}" - ) - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - return self._return_or_inplace( - libstrings.ljust(self._column, width, fillchar) - ) + return self.pad(width, "right", fillchar) def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ @@ -3208,22 +3356,21 @@ def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: 3 __ dtype: object """ - if not isinstance(fillchar, str): - msg = ( - f"fillchar must be a character, not {type(fillchar).__name__}" - ) - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) + return self.pad(width, "left", fillchar) - return self._return_or_inplace( - libstrings.rjust(self._column, width, fillchar) - ) + def _strip( + self, side: plc.string.side_type.SideType, to_strip: str | None = None + ) -> SeriesOrIndex: + if to_strip is None: + to_strip = "" + with acquire_spill_lock(): + plc_result = plc.strings.strip.strip( + self._column.to_pylibcudf(mode="read"), + side, + cudf.Scalar(to_strip, "str").device_value.c_value, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def strip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" @@ -3277,12 +3424,7 @@ def strip(self, to_strip: str | None = None) -> SeriesOrIndex: 3 dtype: object """ - if to_strip is None: - to_strip = "" - - return self._return_or_inplace( - libstrings.strip(self._column, cudf.Scalar(to_strip, "str")) - ) + return self._strip(plc.strings.side_type.SideType.BOTH, to_strip) def lstrip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" @@ -3324,12 +3466,7 @@ def lstrip(self, to_strip: str | None = None) -> SeriesOrIndex: 3 dtype: object """ - if to_strip is None: - to_strip = "" - - return self._return_or_inplace( - libstrings.lstrip(self._column, cudf.Scalar(to_strip, "str")) - ) + return self._strip(plc.strings.side_type.SideType.LEFT, to_strip) def rstrip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" @@ -3379,12 +3516,7 @@ def rstrip(self, to_strip: str | None = None) -> SeriesOrIndex: 3 dtype: object """ - if to_strip is None: - to_strip = "" - - return self._return_or_inplace( - libstrings.rstrip(self._column, cudf.Scalar(to_strip, "str")) - ) + return self._strip(plc.strings.side_type.SideType.RIGHT, to_strip) def wrap(self, width: int, **kwargs) -> SeriesOrIndex: r""" @@ -3478,7 +3610,12 @@ def wrap(self, width: int, **kwargs) -> SeriesOrIndex: "`break_on_hyphens`=False" ) - return self._return_or_inplace(libstrings.wrap(self._column, width)) + with acquire_spill_lock(): + plc_result = plc.strings.wrap.wrap( + self._column.to_pylibcudf(mode="read"), width + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: r""" @@ -3546,10 +3683,37 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: raise NotImplementedError( "unsupported value for `flags` parameter" ) + with acquire_spill_lock(): + prog = plc.strings.regex_program.RegexProgram.create(pat, flags) + plc_result = plc.strings.contains.count_re( + self._column.to_pylibcudf(mode="read"), prog + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) - return self._return_or_inplace( - libstrings.count_re(self._column, pat, flags) - ) + def _findall( + self, + method: Callable[ + [plc.Column, plc.strings.regex_program.RegexProgram], plc.Column + ], + pat: str | re.Pattern, + flags: int = 0, + ) -> SeriesOrIndex: + if isinstance(pat, re.Pattern): + flags = pat.flags & ~re.U + pat = pat.pattern + if not _is_supported_regex_flags(flags): + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) + with acquire_spill_lock(): + prog = plc.strings.regex_program.RegexProgram.create(pat, flags) + plc_result = method( + self._column.to_pylibcudf(mode="read"), + prog, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: """ @@ -3616,16 +3780,7 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: The `flags` parameter currently only supports re.DOTALL and re.MULTILINE. """ - if isinstance(pat, re.Pattern): - flags = pat.flags & ~re.U - pat = pat.pattern - if not _is_supported_regex_flags(flags): - raise NotImplementedError( - "unsupported value for `flags` parameter" - ) - - data = libstrings.findall(self._column, pat, flags) - return self._return_or_inplace(data) + return self._findall(plc.strings.findall.findall, pat, flags) def find_re(self, pat: str, flags: int = 0) -> SeriesOrIndex: """ @@ -3656,16 +3811,7 @@ def find_re(self, pat: str, flags: int = 0) -> SeriesOrIndex: 3 2 dtype: int32 """ - if isinstance(pat, re.Pattern): - flags = pat.flags & ~re.U - pat = pat.pattern - if not _is_supported_regex_flags(flags): - raise NotImplementedError( - "Unsupported value for `flags` parameter" - ) - - data = libstrings.find_re(self._column, pat, flags) - return self._return_or_inplace(data) + return self._findall(plc.strings.findall.find_re, pat, flags) def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: """ @@ -3723,8 +3869,15 @@ def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: f"got: {patterns_column.dtype}" ) + with acquire_spill_lock(): + plc_result = plc.strings.find_multiple.find_multiple( + self._column.to_pylibcudf(mode="read"), + patterns_column.to_pylibcudf(mode="read"), + ) + result = Column.from_pylibcudf(plc_result) + return cudf.Series._from_column( - libstrings.find_multiple(self._column, patterns_column), + result, name=self._parent.name, index=self._parent.index if isinstance(self._parent, cudf.Series) @@ -3816,9 +3969,34 @@ def isspace(self) -> SeriesOrIndex: 2 False dtype: bool """ - return self._return_or_inplace(libstrings.is_space(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.SPACE + ) - def endswith(self, pat: str) -> SeriesOrIndex: + def _starts_ends_with( + self, + method: Callable[[plc.Column, plc.Column | plc.Scalar], plc.Column], + pat: str | Sequence, + ) -> SeriesOrIndex: + if pat is None: + raise TypeError( + f"expected a string or a sequence-like object, not " + f"{type(pat).__name__}" + ) + elif is_scalar(pat): + plc_pat = cudf.Scalar(pat, "str").device_value.c_value + else: + plc_pat = column.as_column(pat, dtype="str").to_pylibcudf( + mode="read" + ) + with acquire_spill_lock(): + plc_result = method( + self._column.to_pylibcudf(mode="read"), plc_pat + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) + + def endswith(self, pat: str | Sequence) -> SeriesOrIndex: """ Test if the end of each string element matches a pattern. @@ -3860,21 +4038,7 @@ def endswith(self, pat: str) -> SeriesOrIndex: `na` parameter is not yet supported, as cudf uses native strings instead of Python objects. """ - if pat is None: - raise TypeError( - f"expected a string or a sequence-like object, not " - f"{type(pat).__name__}" - ) - elif is_scalar(pat): - result_col = libstrings.endswith( - self._column, cudf.Scalar(pat, "str") - ) - else: - result_col = libstrings.endswith_multiple( - self._column, column.as_column(pat, dtype="str") - ) - - return self._return_or_inplace(result_col) + return self._starts_ends_with(plc.strings.find.ends_with, pat) def startswith(self, pat: str | Sequence) -> SeriesOrIndex: """ @@ -3923,21 +4087,7 @@ def startswith(self, pat: str | Sequence) -> SeriesOrIndex: 3 dtype: bool """ - if pat is None: - raise TypeError( - f"expected a string or a sequence-like object, not " - f"{type(pat).__name__}" - ) - elif is_scalar(pat): - result_col = libstrings.startswith( - self._column, cudf.Scalar(pat, "str") - ) - else: - result_col = libstrings.startswith_multiple( - self._column, column.as_column(pat, dtype="str") - ) - - return self._return_or_inplace(result_col) + return self._starts_ends_with(plc.strings.find.starts_with, pat) def removesuffix(self, suffix: str) -> SeriesOrIndex: """ @@ -3972,12 +4122,9 @@ def removesuffix(self, suffix: str) -> SeriesOrIndex: """ if suffix is None or len(suffix) == 0: return self._return_or_inplace(self._column) - ends_column = libstrings.endswith( - self._column, cudf.Scalar(suffix, "str") - ) - removed_column = libstrings.slice_strings( - self._column, 0, -len(suffix), None - ) + ends_column = self.endswith(suffix)._column # type: ignore[union-attr] + removed_column = self.slice(0, -len(suffix), None)._column # type: ignore[union-attr] + result = cudf._lib.copying.copy_if_else( removed_column, self._column, ends_column ) @@ -4016,17 +4163,38 @@ def removeprefix(self, prefix: str) -> SeriesOrIndex: """ if prefix is None or len(prefix) == 0: return self._return_or_inplace(self._column) - starts_column = libstrings.startswith( - self._column, cudf.Scalar(prefix, "str") - ) - removed_column = libstrings.slice_strings( - self._column, len(prefix), None, None - ) + starts_column = self.startswith(prefix)._column # type: ignore[union-attr] + removed_column = self.slice(len(prefix), None, None)._column # type: ignore[union-attr] result = cudf._lib.copying.copy_if_else( removed_column, self._column, starts_column ) return self._return_or_inplace(result) + def _find( + self, + method: Callable[[plc.Column, plc.Scalar, int, int], plc.Column], + sub: str, + start: int = 0, + end: int | None = None, + ) -> SeriesOrIndex: + if not isinstance(sub, str): + raise TypeError( + f"expected a string object, not {type(sub).__name__}" + ) + + if end is None: + end = -1 + + with acquire_spill_lock(): + plc_result = method( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(sub, "str").device_value.c_value, + start, + end, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) + def find( self, sub: str, start: int = 0, end: int | None = None ) -> SeriesOrIndex: @@ -4070,19 +4238,7 @@ def find( 3 2 dtype: int32 """ - if not isinstance(sub, str): - raise TypeError( - f"expected a string object, not {type(sub).__name__}" - ) - - if end is None: - end = -1 - - result_col = libstrings.find( - self._column, cudf.Scalar(sub, "str"), start, end - ) - - return self._return_or_inplace(result_col) + return self._find(plc.strings.find.find, sub, start, end) def rfind( self, sub: str, start: int = 0, end: int | None = None @@ -4131,19 +4287,7 @@ def rfind( 2 -1 dtype: int32 """ - if not isinstance(sub, str): - raise TypeError( - f"expected a string object, not {type(sub).__name__}" - ) - - if end is None: - end = -1 - - result_col = libstrings.rfind( - self._column, cudf.Scalar(sub, "str"), start, end - ) - - return self._return_or_inplace(result_col) + return self._find(plc.strings.find.rfind, sub, start, end) def index( self, sub: str, start: int = 0, end: int | None = None @@ -4196,9 +4340,7 @@ def index( if end is None: end = -1 - result_col = libstrings.find( - self._column, cudf.Scalar(sub, "str"), start, end - ) + result_col = self.find(sub, start, end)._column # type: ignore[union-attr] result = self._return_or_inplace(result_col) @@ -4258,9 +4400,7 @@ def rindex( if end is None: end = -1 - result_col = libstrings.rfind( - self._column, cudf.Scalar(sub, "str"), start, end - ) + result_col = self.rfind(sub, start, end)._column # type: ignore[union-attr] result = self._return_or_inplace(result_col) @@ -4323,10 +4463,13 @@ def match( raise NotImplementedError( "unsupported value for `flags` parameter" ) - - return self._return_or_inplace( - libstrings.match_re(self._column, pat, flags) - ) + with acquire_spill_lock(): + prog = plc.strings.regex_program.RegexProgram.create(pat, flags) + plc_result = plc.strings.contains.matches_re( + self._column.to_pylibcudf(mode="read"), prog + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def url_decode(self) -> SeriesOrIndex: """ @@ -4420,9 +4563,12 @@ def code_points(self) -> SeriesOrIndex: 2 99 dtype: int32 """ - return self._return_or_inplace( - libstrings.code_points(self._column), retain_index=False - ) + with acquire_spill_lock(): + plc_column = plc.strings.attributes.code_points( + self._column.to_pylibcudf(mode="read") + ) + result = Column.from_pylibcudf(plc_column) + return self._return_or_inplace(result, retain_index=False) def translate(self, table: dict) -> SeriesOrIndex: """ @@ -4465,9 +4611,12 @@ def translate(self, table: dict) -> SeriesOrIndex: dtype: object """ table = str.maketrans(table) - return self._return_or_inplace( - libstrings.translate(self._column, table) - ) + with acquire_spill_lock(): + plc_result = plc.strings.translate.translate( + self._column.to_pylibcudf(mode="read"), table + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def filter_characters( self, table: dict, keep: bool = True, repl: str | None = None @@ -4516,11 +4665,17 @@ def filter_characters( if repl is None: repl = "" table = str.maketrans(table) - return self._return_or_inplace( - libstrings.filter_characters( - self._column, table, keep, cudf.Scalar(repl, "str") - ), - ) + with acquire_spill_lock(): + plc_result = plc.strings.translate.filter_characters( + self._column.to_pylibcudf(mode="read"), + table, + plc.strings.translate.FilterType.KEEP + if keep + else plc.strings.translate.FilterType.REMOVE, + cudf.Scalar(repl, "str").device_value.c_value, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def normalize_spaces(self) -> SeriesOrIndex: r""" @@ -5614,17 +5769,12 @@ def _massage_string_arg(value, name, allow_col=False): allowed_types.append("Column") - raise ValueError( - f"Expected {_expected_types_format(allowed_types)} " - f"for {name} but got {type(value)}" - ) - - -def _expected_types_format(types): - if len(types) == 1: - return types[0] + if len(allowed_types) == 1: + expected = allowed_types[0] + else: + expected = ", ".join(allowed_types[:-1]) + ", or " + allowed_types[-1] - return ", ".join(types[:-1]) + ", or " + types[-1] + raise ValueError(f"Expected {expected} for {name} but got {type(value)}") class StringColumn(column.ColumnBase): @@ -5844,11 +5994,13 @@ def sum( skipna=skipna, min_count=min_count ) if isinstance(result_col, type(self)): - return libstrings.join( - result_col, - sep=cudf.Scalar(""), - na_rep=cudf.Scalar(None, "str"), - ).element_indexing(0) + with acquire_spill_lock(): + plc_column = plc.strings.combine.join_strings( + result_col.to_pylibcudf(mode="read"), + cudf.Scalar("").device_value.c_value, + cudf.Scalar(None, "str").device_value.c_value, + ) + return Column.from_pylibcudf(plc_column).element_indexing(0) else: return result_col @@ -5897,13 +6049,12 @@ def strptime( ) is_nat = self == "NaT" without_nat = self.apply_boolean_mask(is_nat.unary_operator("not")) - all_same_length = ( - libstrings.count_characters(without_nat).distinct_count( - dropna=True + with acquire_spill_lock(): + plc_column = plc.strings.attributes.count_characters( + without_nat.to_pylibcudf(mode="read") ) - == 1 - ) - if not all_same_length: + char_counts = Column.from_pylibcudf(plc_column) + if char_counts.distinct_count(dropna=True) != 1: # Unfortunately disables OK cases like: # ["2020-01-01", "2020-01-01 00:00:00"] # But currently incorrect for cases like (drops 10): @@ -6104,14 +6255,18 @@ def _binaryop( rhs: cudf.Scalar | StringColumn lhs, rhs = (other, self) if reflect else (self, other) - return cast( - "column.ColumnBase", - libstrings.concatenate( - [lhs, rhs], - sep=cudf.Scalar(""), - na_rep=cudf.Scalar(None, "str"), - ), - ) + with acquire_spill_lock(): + plc_column = plc.strings.combine.concatenate( + plc.Table( + [ + lhs.to_pylibcudf(mode="read"), + rhs.to_pylibcudf(mode="read"), + ] + ), + cudf.Scalar("").device_value.c_value, + cudf.Scalar(None, "str").device_value.c_value, + ) + return Column.from_pylibcudf(plc_column) elif op in { "__eq__", "__ne__", @@ -6151,52 +6306,39 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": return to_view.view(dtype) - -def _get_cols_list(parent_obj, others): - parent_index = ( - parent_obj.index if isinstance(parent_obj, cudf.Series) else parent_obj - ) - - if ( - can_convert_to_column(others) - and len(others) > 0 - and ( - can_convert_to_column( - others.iloc[0] - if isinstance(others, cudf.Series) - else others[0] - ) - ) - ): + def _modify_characters( + self, method: Callable[[plc.Column], plc.Column] + ) -> Self: """ - If others is a list-like object (in our case lists & tuples) - just another Series/Index, great go ahead with concatenation. + Helper function for methods that modify characters e.g. to_lower """ - cols_list = [ - column.as_column(frame.reindex(parent_index), dtype="str") - if ( - parent_index is not None - and isinstance(frame, cudf.Series) - and not frame.index.equals(parent_index) - ) - else column.as_column(frame, dtype="str") - for frame in others - ] + with acquire_spill_lock(): + plc_column = method(self.to_pylibcudf(mode="read")) + return cast(Self, Column.from_pylibcudf(plc_column)) - return cols_list - elif others is not None and not isinstance(others, StringMethods): - if ( - parent_index is not None - and isinstance(others, cudf.Series) - and not others.index.equals(parent_index) - ): - others = others.reindex(parent_index) + def to_lower(self) -> Self: + return self._modify_characters(plc.strings.case.to_lower) - return [column.as_column(others, dtype="str")] - else: - raise TypeError( - "others must be Series, Index, DataFrame, np.ndarrary " - "or list-like (either containing only strings or " - "containing only objects of type Series/Index/" - "np.ndarray[1-dim])" - ) + def to_upper(self) -> Self: + return self._modify_characters(plc.strings.case.to_upper) + + def capitalize(self) -> Self: + return self._modify_characters(plc.strings.capitalize.capitalize) + + def swapcase(self) -> Self: + return self._modify_characters(plc.strings.case.swapcase) + + def title(self) -> Self: + return self._modify_characters(plc.strings.capitalize.title) + + def is_title(self) -> Self: + return self._modify_characters(plc.strings.capitalize.is_title) + + def replace_multiple(self, pattern: Self, replacements: Self) -> Self: + with acquire_spill_lock(): + plc_result = plc.strings.replace.replace_multiple( + self.to_pylibcudf(mode="read"), + pattern.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + ) + return cast(Self, Column.from_pylibcudf(plc_result)) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 9a22045ff78..91f23490031 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -242,12 +242,11 @@ def _convert_str_col(col, errors, _downcast=None): def _proc_inf_empty_strings(col: ColumnBase) -> ColumnBase: """Handles empty and infinity strings""" - col = libstrings.to_lower(col) + col = col.to_lower() # type: ignore[attr-defined] col = col.find_and_replace(as_column([""]), as_column(["NaN"])) # TODO: This can be handled by libcudf in # future see StringColumn.as_numerical_column - col = libstrings.replace_multi( - col, + col = col.replace_multiple( # type: ignore[attr-defined] as_column(["+", "inf", "inity"]), as_column(["", "Inf", ""]), ) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index e25f99d7bee..9700f548a16 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1272,7 +1272,7 @@ def test_string_slice_from(): gs = cudf.Series(["hello world", "holy accéntéd", "batman", None, ""]) d_starts = cudf.Series([2, 3, 0, -1, -1], dtype=np.int32) d_stops = cudf.Series([-1, -1, 0, -1, -1], dtype=np.int32) - got = gs.str.slice_from(starts=d_starts._column, stops=d_stops._column) + got = gs.str.slice_from(starts=d_starts, stops=d_stops) expected = cudf.Series(["llo world", "y accéntéd", "", None, ""]) assert_eq(got, expected)