Remove cudf._lib.strings.convert/split in favor of inlining pylibcudf

rapidsai · rapids-bot · Dec 9, 2024 · Dec 3, 2024 · Dec 4, 2024 · Dec 5, 2024
commit 04cecfc9b7bd024f8e71b7761833bd162a7ef7fb
@@ -52,4 +52,3 @@ target_link_libraries(interop PUBLIC nanoarrow)
 
 add_subdirectory(io)
 add_subdirectory(nvtext)
-add_subdirectory(strings)
@@ -16,7 +16,6 @@
     sort,
     stream_compaction,
     string_casting,
-    strings,
     strings_udf,
     text,
 )

@@ -32,18 +32,3 @@
     detokenize,
     tokenize_with_vocabulary,
 )
-from cudf._lib.strings.convert.convert_fixed_point import to_decimal
-from cudf._lib.strings.convert.convert_floats import is_float
-from cudf._lib.strings.convert.convert_integers import is_integer
-from cudf._lib.strings.convert.convert_urls import url_decode, url_encode
-from cudf._lib.strings.split.partition import partition, rpartition
-from cudf._lib.strings.split.split import (
-    rsplit,
-    rsplit_re,
-    rsplit_record,
-    rsplit_record_re,
-    split,
-    split_re,
-    split_record,
-    split_record_re,
-)
@@ -10,14 +10,13 @@
 import numpy as np
 import pyarrow as pa
 
+import pylibcudf as plc
+
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib.strings.convert.convert_fixed_point import (
-    from_decimal as cpp_from_decimal,
-)
 from cudf.api.types import is_scalar
 from cudf.core._internals import unary
-from cudf.core.buffer import as_buffer
+from cudf.core.buffer import acquire_spill_lock, as_buffer
 from cudf.core.column.column import ColumnBase
 from cudf.core.column.numerical_base import NumericalBaseColumn
 from cudf.core.dtypes import (
@@ -88,7 +87,13 @@ def as_decimal_column(
 
     def as_string_column(self) -> cudf.core.column.StringColumn:
         if len(self) > 0:
-            return cpp_from_decimal(self)
+            with acquire_spill_lock():
+                plc_column = (
+                    plc.strings.convert.convert_fixed_point.from_fixed_point(
+                        self.to_pylibcudf(mode="read"),
+                    )
+                )
+                return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
         else:
             return cast(
                 cudf.core.column.StringColumn,

@@ -14,7 +14,6 @@
 
 import cudf
 import cudf.core.column.column as column
-from cudf._lib.strings.convert.convert_lists import format_list_column
 from cudf._lib.types import size_type_dtype
 from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
 from cudf.core.buffer import acquire_spill_lock
@@ -272,8 +271,13 @@ def as_string_column(self) -> cudf.core.column.StringColumn:
         # Separator strings to match the Python format
         separators = as_column([", ", "[", "]"])
 
-        # Call libcudf to format the list column
-        return format_list_column(lc, separators)
+        with acquire_spill_lock():
+            plc_column = plc.strings.convert.convert_lists.format_list_column(
+                lc.to_pylibcudf(mode="read"),
+                cudf.Scalar("None").device_value.c_value,
+                separators.to_pylibcudf(mode="read"),
+            )
+            return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
 
     def _transform_leaves(self, func, *args, **kwargs) -> Self:
         # return a new list column with the same nested structure

@@ -44,6 +44,7 @@
         SeriesOrIndex,
     )
     from cudf.core.buffer import Buffer
+    from cudf.core.column.numerical import NumericalColumn
 
 
 def str_to_boolean(column: StringColumn):
@@ -1336,7 +1337,7 @@ def isinteger(self) -> SeriesOrIndex:
         2    False
         dtype: bool
         """
-        return self._return_or_inplace(libstrings.is_integer(self._column))
+        return self._return_or_inplace(self._column.is_integer())
 
     def ishex(self) -> SeriesOrIndex:
         """
@@ -1468,7 +1469,7 @@ def isfloat(self) -> SeriesOrIndex:
         3    False
         dtype: bool
         """
-        return self._return_or_inplace(libstrings.is_float(self._column))
+        return self._return_or_inplace(self._column.is_float())
 
     def isdecimal(self) -> SeriesOrIndex:
         """
@@ -2710,26 +2711,25 @@ def split(
         if len(str(pat)) <= 1:
             regex = False
 
+        result_table: StringColumn | dict[int, StringColumn]
         if expand:
             if self._column.null_count == len(self._column):
                 result_table = {0: self._column.copy()}
             else:
                 if regex is True:
-                    data = libstrings.split_re(self._column, pat, n)
+                    data = self._column.split_re(pat, n)
                 else:
-                    data = libstrings.split(
-                        self._column, cudf.Scalar(pat, "str"), n
-                    )
+                    data = self._column.split(cudf.Scalar(pat, "str"), n)
                 if len(data) == 1 and data[0].null_count == len(self._column):
                     result_table = {}
                 else:
                     result_table = data
         else:
             if regex is True:
-                result_table = libstrings.split_record_re(self._column, pat, n)
+                result_table = self._column.split_record_re(pat, n)
             else:
-                result_table = libstrings.split_record(
-                    self._column, cudf.Scalar(pat, "str"), n
+                result_table = self._column.split_record(
+                    cudf.Scalar(pat, "str"), n
                 )
 
         return self._return_or_inplace(result_table, expand=expand)
@@ -2883,28 +2883,25 @@ def rsplit(
         if regex and isinstance(pat, re.Pattern):
             pat = pat.pattern
 
+        result_table: StringColumn | dict[int, StringColumn]
         if expand:
             if self._column.null_count == len(self._column):
                 result_table = {0: self._column.copy()}
             else:
                 if regex is True:
-                    data = libstrings.rsplit_re(self._column, pat, n)
+                    data = self._column.rsplit_re(pat, n)
                 else:
-                    data = libstrings.rsplit(
-                        self._column, cudf.Scalar(pat, "str"), n
-                    )
+                    data = self._column.rsplit(cudf.Scalar(pat, "str"), n)
                 if len(data) == 1 and data[0].null_count == len(self._column):
                     result_table = {}
                 else:
                     result_table = data
         else:
             if regex is True:
-                result_table = libstrings.rsplit_record_re(
-                    self._column, pat, n
-                )
+                result_table = self._column.rsplit_record_re(pat, n)
             else:
-                result_table = libstrings.rsplit_record(
-                    self._column, cudf.Scalar(pat, "str"), n
+                result_table = self._column.rsplit_record(
+                    cudf.Scalar(pat, "str"), n
                 )
 
         return self._return_or_inplace(result_table, expand=expand)
@@ -2989,7 +2986,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex:
             sep = " "
 
         return self._return_or_inplace(
-            libstrings.partition(self._column, cudf.Scalar(sep, "str")),
+            self._column.partition(cudf.Scalar(sep, "str")),
             expand=expand,
         )
 
@@ -3054,7 +3051,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex:
             sep = " "
 
         return self._return_or_inplace(
-            libstrings.rpartition(self._column, cudf.Scalar(sep, "str")),
+            self._column.rpartition(cudf.Scalar(sep, "str")),
             expand=expand,
         )
 
@@ -4499,8 +4496,7 @@ def url_decode(self) -> SeriesOrIndex:
         1    https://medium.com/rapids-ai
         dtype: object
         """
-
-        return self._return_or_inplace(libstrings.url_decode(self._column))
+        return self._return_or_inplace(self._column.url_decode())
 
     def url_encode(self) -> SeriesOrIndex:
         """
@@ -4531,7 +4527,7 @@ def url_encode(self) -> SeriesOrIndex:
         1    https%3A%2F%2Fmedium.com%2Frapids-ai
         dtype: object
         """
-        return self._return_or_inplace(libstrings.url_encode(self._column))
+        return self._return_or_inplace(self._column.url_encode())
 
     def code_points(self) -> SeriesOrIndex:
         """
@@ -6015,13 +6011,13 @@ def as_numerical_column(
         out_dtype = cudf.api.types.dtype(dtype)
         string_col = self
         if out_dtype.kind in {"i", "u"}:
-            if not libstrings.is_integer(string_col).all():
+            if not string_col.is_integer().all():
                 raise ValueError(
                     "Could not convert strings to integer "
                     "type due to presence of non-integer values."
                 )
         elif out_dtype.kind == "f":
-            if not libstrings.is_float(string_col).all():
+            if not string_col.is_float().all():
                 raise ValueError(
                     "Could not convert strings to float "
                     "type due to presence of non-floating values."
@@ -6099,10 +6095,17 @@ def as_timedelta_column(
     ) -> cudf.core.column.TimeDeltaColumn:
         return self.strptime(dtype, "%D days %H:%M:%S")  # type: ignore[return-value]
 
+    @acquire_spill_lock()
     def as_decimal_column(
         self, dtype: Dtype
-    ) -> "cudf.core.column.DecimalBaseColumn":
-        return libstrings.to_decimal(self, dtype)
+    ) -> cudf.core.column.DecimalBaseColumn:
+        plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point(
+            self.to_pylibcudf(mode="read"),
+            libcudf.types.dtype_to_pylibcudf_type(dtype),
+        )
+        result = Column.from_pylibcudf(plc_column)
+        result.dtype.precision = dtype.precision  # type: ignore[union-attr]
+        return result  # type: ignore[return-value]
 
     def as_string_column(self) -> StringColumn:
         return self
@@ -6138,12 +6141,9 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool:
 
         if self.dtype == to_dtype:
             return True
-        elif (
-            to_dtype.kind in {"i", "u"}
-            and not libstrings.is_integer(self).all()
-        ):
+        elif to_dtype.kind in {"i", "u"} and not self.is_integer().all():
             return False
-        elif to_dtype.kind == "f" and not libstrings.is_float(self).all():
+        elif to_dtype.kind == "f" and not self.is_float().all():
             return False
         else:
             return True
@@ -6335,11 +6335,180 @@ def title(self) -> Self:
     def is_title(self) -> Self:
         return self._modify_characters(plc.strings.capitalize.is_title)
 
+    @acquire_spill_lock()
     def replace_multiple(self, pattern: Self, replacements: Self) -> Self:
-        with acquire_spill_lock():
-            plc_result = plc.strings.replace.replace_multiple(
-                self.to_pylibcudf(mode="read"),
-                pattern.to_pylibcudf(mode="read"),
-                replacements.to_pylibcudf(mode="read"),
+        plc_result = plc.strings.replace.replace_multiple(
+            self.to_pylibcudf(mode="read"),
+            pattern.to_pylibcudf(mode="read"),
+            replacements.to_pylibcudf(mode="read"),
+        )
+        return cast(Self, Column.from_pylibcudf(plc_result))
+
+    @acquire_spill_lock()
+    def _split_record_re(
+        self,
+        pattern: str,
+        maxsplit: int,
+        method: Callable[
+            [plc.Column, plc.strings.regex_program.RegexProgram, int],
+            plc.Column,
+        ],
+    ) -> Self:
+        plc_column = method(
+            self.to_pylibcudf(mode="read"),
+            plc.strings.regex_program.RegexProgram.create(
+                pattern,
+                plc.strings.regex_flags.RegexFlags.DEFAULT,
+            ),
+            maxsplit,
+        )
+        return cast(Self, Column.from_pylibcudf(plc_column))
+
+    def split_record_re(self, pattern: str, maxsplit: int) -> Self:
+        return self._split_record_re(
+            pattern, maxsplit, plc.strings.split.split.split_record_re
+        )
+
+    def rsplit_record_re(self, pattern: str, maxsplit: int) -> Self:
+        return self._split_record_re(
+            pattern, maxsplit, plc.strings.split.split.rsplit_record_re
+        )
+
+    @acquire_spill_lock()
+    def _split_re(
+        self,
+        pattern: str,
+        maxsplit: int,
+        method: Callable[
+            [plc.Column, plc.strings.regex_program.RegexProgram, int],
+            plc.Table,
+        ],
+    ) -> dict[int, Self]:
+        plc_table = method(
+            self.to_pylibcudf(mode="read"),
+            plc.strings.regex_program.RegexProgram.create(
+                pattern,
+                plc.strings.regex_flags.RegexFlags.DEFAULT,
+            ),
+            maxsplit,
+        )
+        return dict(
+            enumerate(
+                Column.from_pylibcudf(col)  # type: ignore[misc]
+                for col in plc_table.columns()
             )
-            return cast(Self, Column.from_pylibcudf(plc_result))
+        )
+
+    def split_re(self, pattern: str, maxsplit: int) -> dict[int, Self]:
+        return self._split_re(
+            pattern, maxsplit, plc.strings.split.split.split_re
+        )
+
+    def rsplit_re(self, pattern: str, maxsplit: int) -> dict[int, Self]:
+        return self._split_re(
+            pattern, maxsplit, plc.strings.split.split.rsplit_re
+        )
+
+    @acquire_spill_lock()
+    def _split_record(
+        self,
+        delimiter: cudf.Scalar,
+        maxsplit: int,
+        method: Callable[[plc.Column, plc.Scalar, int], plc.Column],
+    ) -> Self:
+        plc_column = method(
+            self.to_pylibcudf(mode="read"),
+            delimiter.device_value.c_value,
+            maxsplit,
+        )
+        return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
+
+    def split_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self:
+        return self._split_record(
+            delimiter, maxsplit, plc.strings.split.split.split_record
+        )
+
+    def rsplit_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self:
+        return self._split_record(
+            delimiter, maxsplit, plc.strings.split.split.rsplit_record
+        )
+
+    @acquire_spill_lock()
+    def _split(
+        self,
+        delimiter: cudf.Scalar,
+        maxsplit: int,
+        method: Callable[[plc.Column, plc.Scalar, int], plc.Table],
+    ) -> dict[int, Self]:
+        plc_table = method(
+            self.to_pylibcudf(mode="read"),
+            delimiter.device_value.c_value,
+            maxsplit,
+        )
+        return dict(  # keyed by column position within plc_table
+            enumerate(
+                Column.from_pylibcudf(col)  # type: ignore[misc]
+                for col in plc_table.columns()
+            )
+        )
+
+    def split(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]:
+        return self._split(delimiter, maxsplit, plc.strings.split.split.split)
+
+    def rsplit(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]:
+        return self._split(delimiter, maxsplit, plc.strings.split.split.rsplit)
+
+    @acquire_spill_lock()
+    def _partition(
+        self,
+        delimiter: cudf.Scalar,
+        method: Callable[[plc.Column, plc.Scalar], plc.Table],
+    ) -> dict[int, Self]:
+        plc_table = method(
+            self.to_pylibcudf(mode="read"),
+            delimiter.device_value.c_value,
+        )
+        return dict(  # keyed by column position within plc_table
+            enumerate(
+                Column.from_pylibcudf(col)  # type: ignore[misc]
+                for col in plc_table.columns()
+            )
+        )
+
+    def partition(self, delimiter: cudf.Scalar) -> dict[int, Self]:
+        return self._partition(
+            delimiter, plc.strings.split.partition.partition
+        )
+
+    def rpartition(self, delimiter: cudf.Scalar) -> dict[int, Self]:
+        return self._partition(
+            delimiter, plc.strings.split.partition.rpartition
+        )
+
+    @acquire_spill_lock()
+    def url_decode(self) -> Self:
+        plc_column = plc.strings.convert.convert_urls.url_decode(
+            self.to_pylibcudf(mode="read")
+        )
+        return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def url_encode(self) -> Self:
+        plc_column = plc.strings.convert.convert_urls.url_encode(
+            self.to_pylibcudf(mode="read")
+        )
+        return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def is_integer(self) -> NumericalColumn:
+        plc_column = plc.strings.convert.convert_integers.is_integer(
+            self.to_pylibcudf(mode="read")
+        )
+        return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def is_float(self) -> NumericalColumn:
+        plc_column = plc.strings.convert.convert_floats.is_float(
+            self.to_pylibcudf(mode="read")
+        )
+        return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
@@ -15,9 +15,6 @@
 
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib.strings.convert.convert_integers import (
-    is_integer as cpp_is_integer,
-)
 from cudf.api.types import is_integer, is_scalar
 from cudf.core import column
 from cudf.core.buffer import acquire_spill_lock
@@ -232,7 +229,7 @@ def to_datetime(
                         )
                         break
                     elif arg_col.dtype.kind == "O":
-                        if not cpp_is_integer(arg_col).all():
+                        if not arg_col.is_integer().all():
                             col = new_series._column.strptime(
                                 cudf.dtype("datetime64[ns]"), format=format
                             )

@@ -2,14 +2,13 @@
 from __future__ import annotations
 
 import warnings
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 import numpy as np
 import pandas as pd
 
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib import strings as libstrings
 from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype
 from cudf.core._internals import unary
 from cudf.core.column import as_column
@@ -18,10 +17,16 @@
 from cudf.utils.dtypes import can_convert_to_column
 
 if TYPE_CHECKING:
-    from cudf.core.column import ColumnBase
+    from cudf.core.column.numerical import NumericalColumn
+    from cudf.core.column.string import StringColumn
 
 
-def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None):
+def to_numeric(
+    arg,
+    errors: Literal["raise", "coerce", "ignore"] = "raise",
+    downcast: Literal["integer", "signed", "unsigned", "float", None] = None,
+    dtype_backend=None,
+):
     """
     Convert argument into numerical types.
 
@@ -130,7 +135,9 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None):
         else:
             try:
                 col = _convert_str_col(
-                    col._get_decategorized_column(), errors, downcast
+                    col._get_decategorized_column(),  # type: ignore[attr-defined]
+                    errors,
+                    downcast,
                 )
             except ValueError as e:
                 if errors == "ignore":
@@ -139,7 +146,7 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None):
                     raise e
     elif is_string_dtype(dtype):
         try:
-            col = _convert_str_col(col, errors, downcast)
+            col = _convert_str_col(col, errors, downcast)  # type: ignore[arg-type]
         except ValueError as e:
             if errors == "ignore":
                 return arg
@@ -186,7 +193,11 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None):
         return col.values
 
 
-def _convert_str_col(col, errors, _downcast=None):
+def _convert_str_col(
+    col: StringColumn,
+    errors: Literal["raise", "coerce", "ignore"],
+    _downcast: Literal["integer", "signed", "unsigned", "float", None] = None,
+) -> NumericalColumn:
     """
     Converts a string column to numeric column
 
@@ -212,13 +223,19 @@ def _convert_str_col(col, errors, _downcast=None):
     if not is_string_dtype(col):
         raise TypeError("col must be string dtype.")
 
-    is_integer = libstrings.is_integer(col)
-    if is_integer.all():
-        return col.astype(dtype=cudf.dtype("i8"))
+    if col.is_integer().all():
+        return col.astype(dtype=cudf.dtype("i8"))  # type: ignore[return-value]
 
-    col = _proc_inf_empty_strings(col)
+    converted_col = (
+        col.to_lower()
+        .find_and_replace(as_column([""]), as_column(["NaN"]))
+        .replace_multiple(
+            as_column(["+", "inf", "inity"]),  # type: ignore[arg-type]
+            as_column(["", "Inf", ""]),  # type: ignore[arg-type]
+        )
+    )
 
-    is_float = libstrings.is_float(col)
+    is_float = converted_col.is_float()
     if is_float.all():
         if _downcast in {"unsigned", "signed", "integer"}:
             warnings.warn(
@@ -227,27 +244,14 @@ def _convert_str_col(col, errors, _downcast=None):
                     "limited by float32 precision."
                 )
             )
-            return col.astype(dtype=cudf.dtype("float32"))
+            return converted_col.astype(dtype=cudf.dtype("float32"))  # type: ignore[return-value]
         else:
-            return col.astype(dtype=cudf.dtype("float64"))
+            return converted_col.astype(dtype=cudf.dtype("float64"))  # type: ignore[return-value]
     else:
         if errors == "coerce":
-            col = libcudf.string_casting.stod(col)
+            converted_col = libcudf.string_casting.stod(converted_col)
             non_numerics = is_float.unary_operator("not")
-            col[non_numerics] = None
-            return col
+            converted_col[non_numerics] = None
+            return converted_col  # type: ignore[return-value]
         else:
             raise ValueError("Unable to convert some strings to numerics.")
-
-
-def _proc_inf_empty_strings(col: ColumnBase) -> ColumnBase:
-    """Handles empty and infinity strings"""
-    col = col.to_lower()  # type: ignore[attr-defined]
-    col = col.find_and_replace(as_column([""]), as_column(["NaN"]))
-    # TODO: This can be handled by libcudf in
-    # future see StringColumn.as_numerical_column
-    col = col.replace_multiple(  # type: ignore[attr-defined]
-        as_column(["+", "inf", "inity"]),
-        as_column(["", "Inf", ""]),
-    )
-    return col
Original file line number	Diff line number	Diff line change
		@@ -52,4 +52,3 @@ target_link_libraries(interop PUBLIC nanoarrow)

		add_subdirectory(io)
		add_subdirectory(nvtext)
		add_subdirectory(strings)