Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove cudf._lib.string.convert/split in favor of inlining pylibcudf #17496

Merged
merged 6 commits into from
Dec 9, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next commit
Remove cudf._lib.string.convert/split in favor of inlining pylibcudf
mroeschke committed Dec 3, 2024
commit 04cecfc9b7bd024f8e71b7761833bd162a7ef7fb
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -52,4 +52,3 @@ target_link_libraries(interop PUBLIC nanoarrow)

add_subdirectory(io)
add_subdirectory(nvtext)
add_subdirectory(strings)
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/__init__.py
Original file line number Diff line number Diff line change
@@ -16,7 +16,6 @@
sort,
stream_compaction,
string_casting,
strings,
strings_udf,
text,
)
15 changes: 0 additions & 15 deletions python/cudf/cudf/_lib/strings/CMakeLists.txt

This file was deleted.

15 changes: 0 additions & 15 deletions python/cudf/cudf/_lib/strings/__init__.py
Original file line number Diff line number Diff line change
@@ -32,18 +32,3 @@
detokenize,
tokenize_with_vocabulary,
)
from cudf._lib.strings.convert.convert_fixed_point import to_decimal
from cudf._lib.strings.convert.convert_floats import is_float
from cudf._lib.strings.convert.convert_integers import is_integer
from cudf._lib.strings.convert.convert_urls import url_decode, url_encode
from cudf._lib.strings.split.partition import partition, rpartition
from cudf._lib.strings.split.split import (
rsplit,
rsplit_re,
rsplit_record,
rsplit_record_re,
split,
split_re,
split_record,
split_record_re,
)
24 changes: 0 additions & 24 deletions python/cudf/cudf/_lib/strings/convert/CMakeLists.txt

This file was deleted.

Empty file.
Empty file.
76 changes: 0 additions & 76 deletions python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx

This file was deleted.

19 changes: 0 additions & 19 deletions python/cudf/cudf/_lib/strings/convert/convert_floats.pyx

This file was deleted.

20 changes: 0 additions & 20 deletions python/cudf/cudf/_lib/strings/convert/convert_integers.pyx

This file was deleted.

32 changes: 0 additions & 32 deletions python/cudf/cudf/_lib/strings/convert/convert_lists.pyx

This file was deleted.

48 changes: 0 additions & 48 deletions python/cudf/cudf/_lib/strings/convert/convert_urls.pyx

This file was deleted.

22 changes: 0 additions & 22 deletions python/cudf/cudf/_lib/strings/split/CMakeLists.txt

This file was deleted.

Empty file.
Empty file.
35 changes: 0 additions & 35 deletions python/cudf/cudf/_lib/strings/split/partition.pyx

This file was deleted.

155 changes: 0 additions & 155 deletions python/cudf/cudf/_lib/strings/split/split.pyx

This file was deleted.

15 changes: 10 additions & 5 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
@@ -10,14 +10,13 @@
import numpy as np
import pyarrow as pa

import pylibcudf as plc

import cudf
from cudf import _lib as libcudf
from cudf._lib.strings.convert.convert_fixed_point import (
from_decimal as cpp_from_decimal,
)
from cudf.api.types import is_scalar
from cudf.core._internals import unary
from cudf.core.buffer import as_buffer
from cudf.core.buffer import acquire_spill_lock, as_buffer
from cudf.core.column.column import ColumnBase
from cudf.core.column.numerical_base import NumericalBaseColumn
from cudf.core.dtypes import (
@@ -88,7 +87,13 @@ def as_decimal_column(

def as_string_column(self) -> cudf.core.column.StringColumn:
if len(self) > 0:
return cpp_from_decimal(self)
with acquire_spill_lock():
plc_column = (
plc.strings.convert.convert_fixed_point.from_fixed_point(
self.to_pylibcudf(mode="read"),
)
)
return type(self).from_pylibcudf(plc_column) # type: ignore[return-value]
else:
return cast(
cudf.core.column.StringColumn,
10 changes: 7 additions & 3 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
@@ -14,7 +14,6 @@

import cudf
import cudf.core.column.column as column
from cudf._lib.strings.convert.convert_lists import format_list_column
from cudf._lib.types import size_type_dtype
from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
from cudf.core.buffer import acquire_spill_lock
@@ -272,8 +271,13 @@ def as_string_column(self) -> cudf.core.column.StringColumn:
# Separator strings to match the Python format
separators = as_column([", ", "[", "]"])

# Call libcudf to format the list column
return format_list_column(lc, separators)
with acquire_spill_lock():
plc_column = plc.strings.convert.convert_lists.format_list_column(
lc.to_pylibcudf(mode="read"),
cudf.Scalar("None").device_value.c_value,
separators.to_pylibcudf(mode="read"),
)
return type(self).from_pylibcudf(plc_column) # type: ignore[return-value]

def _transform_leaves(self, func, *args, **kwargs) -> Self:
# return a new list column with the same nested structure
245 changes: 207 additions & 38 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
@@ -44,6 +44,7 @@
SeriesOrIndex,
)
from cudf.core.buffer import Buffer
from cudf.core.column.numerical import NumericalColumn


def str_to_boolean(column: StringColumn):
@@ -1336,7 +1337,7 @@ def isinteger(self) -> SeriesOrIndex:
2 False
dtype: bool
"""
return self._return_or_inplace(libstrings.is_integer(self._column))
return self._return_or_inplace(self._column.is_integer())

def ishex(self) -> SeriesOrIndex:
"""
@@ -1468,7 +1469,7 @@ def isfloat(self) -> SeriesOrIndex:
3 False
dtype: bool
"""
return self._return_or_inplace(libstrings.is_float(self._column))
return self._return_or_inplace(self._column.is_float())

def isdecimal(self) -> SeriesOrIndex:
"""
@@ -2710,26 +2711,25 @@ def split(
if len(str(pat)) <= 1:
regex = False

result_table: StringColumn | dict[int, StringColumn]
if expand:
if self._column.null_count == len(self._column):
result_table = {0: self._column.copy()}
else:
if regex is True:
data = libstrings.split_re(self._column, pat, n)
data = self._column.split_re(pat, n)
else:
data = libstrings.split(
self._column, cudf.Scalar(pat, "str"), n
)
data = self._column.split(cudf.Scalar(pat, "str"), n)
if len(data) == 1 and data[0].null_count == len(self._column):
result_table = {}
else:
result_table = data
else:
if regex is True:
result_table = libstrings.split_record_re(self._column, pat, n)
result_table = self._column.split_record_re(pat, n)
else:
result_table = libstrings.split_record(
self._column, cudf.Scalar(pat, "str"), n
result_table = self._column.split_record(
cudf.Scalar(pat, "str"), n
)

return self._return_or_inplace(result_table, expand=expand)
@@ -2883,28 +2883,25 @@ def rsplit(
if regex and isinstance(pat, re.Pattern):
pat = pat.pattern

result_table: StringColumn | dict[int, StringColumn]
if expand:
if self._column.null_count == len(self._column):
result_table = {0: self._column.copy()}
else:
if regex is True:
data = libstrings.rsplit_re(self._column, pat, n)
data = self._column.rsplit_re(pat, n)
else:
data = libstrings.rsplit(
self._column, cudf.Scalar(pat, "str"), n
)
data = self._column.rsplit(cudf.Scalar(pat, "str"), n)
if len(data) == 1 and data[0].null_count == len(self._column):
result_table = {}
else:
result_table = data
else:
if regex is True:
result_table = libstrings.rsplit_record_re(
self._column, pat, n
)
result_table = self._column.rsplit_record_re(pat, n)
else:
result_table = libstrings.rsplit_record(
self._column, cudf.Scalar(pat, "str"), n
result_table = self._column.rsplit_record(
cudf.Scalar(pat, "str"), n
)

return self._return_or_inplace(result_table, expand=expand)
@@ -2989,7 +2986,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex:
sep = " "

return self._return_or_inplace(
libstrings.partition(self._column, cudf.Scalar(sep, "str")),
self._column.partition(cudf.Scalar(sep, "str")),
expand=expand,
)

@@ -3054,7 +3051,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex:
sep = " "

return self._return_or_inplace(
libstrings.rpartition(self._column, cudf.Scalar(sep, "str")),
self._column.rpartition(cudf.Scalar(sep, "str")),
expand=expand,
)

@@ -4499,8 +4496,7 @@ def url_decode(self) -> SeriesOrIndex:
1 https://medium.com/rapids-ai
dtype: object
"""

return self._return_or_inplace(libstrings.url_decode(self._column))
return self._return_or_inplace(self._column.url_decode())

def url_encode(self) -> SeriesOrIndex:
"""
@@ -4531,7 +4527,7 @@ def url_encode(self) -> SeriesOrIndex:
1 https%3A%2F%2Fmedium.com%2Frapids-ai
dtype: object
"""
return self._return_or_inplace(libstrings.url_encode(self._column))
return self._return_or_inplace(self._column.url_encode())

def code_points(self) -> SeriesOrIndex:
"""
@@ -6015,13 +6011,13 @@ def as_numerical_column(
out_dtype = cudf.api.types.dtype(dtype)
string_col = self
if out_dtype.kind in {"i", "u"}:
if not libstrings.is_integer(string_col).all():
if not string_col.is_integer().all():
raise ValueError(
"Could not convert strings to integer "
"type due to presence of non-integer values."
)
elif out_dtype.kind == "f":
if not libstrings.is_float(string_col).all():
if not string_col.is_float().all():
raise ValueError(
"Could not convert strings to float "
"type due to presence of non-floating values."
@@ -6099,10 +6095,17 @@ def as_timedelta_column(
) -> cudf.core.column.TimeDeltaColumn:
return self.strptime(dtype, "%D days %H:%M:%S") # type: ignore[return-value]

@acquire_spill_lock()
def as_decimal_column(
self, dtype: Dtype
) -> "cudf.core.column.DecimalBaseColumn":
return libstrings.to_decimal(self, dtype)
) -> cudf.core.column.DecimalBaseColumn:
plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point(
self.to_pylibcudf(mode="read"),
libcudf.types.dtype_to_pylibcudf_type(dtype),
)
result = Column.from_pylibcudf(plc_column)
result.dtype.precision = dtype.precision # type: ignore[union-attr]
return result # type: ignore[return-value]

def as_string_column(self) -> StringColumn:
return self
@@ -6138,12 +6141,9 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool:

if self.dtype == to_dtype:
return True
elif (
to_dtype.kind in {"i", "u"}
and not libstrings.is_integer(self).all()
):
elif to_dtype.kind in {"i", "u"} and not self.is_integer().all():
return False
elif to_dtype.kind == "f" and not libstrings.is_float(self).all():
elif to_dtype.kind == "f" and not self.is_float().all():
return False
else:
return True
@@ -6335,11 +6335,180 @@ def title(self) -> Self:
def is_title(self) -> Self:
return self._modify_characters(plc.strings.capitalize.is_title)

@acquire_spill_lock()
def replace_multiple(self, pattern: Self, replacements: Self) -> Self:
with acquire_spill_lock():
plc_result = plc.strings.replace.replace_multiple(
self.to_pylibcudf(mode="read"),
pattern.to_pylibcudf(mode="read"),
replacements.to_pylibcudf(mode="read"),
plc_result = plc.strings.replace.replace_multiple(
self.to_pylibcudf(mode="read"),
pattern.to_pylibcudf(mode="read"),
replacements.to_pylibcudf(mode="read"),
)
return cast(Self, Column.from_pylibcudf(plc_result))

@acquire_spill_lock()
def _split_record_re(
    self,
    pattern: str,
    maxsplit: int,
    method: Callable[
        [plc.Column, plc.strings.regex_program.RegexProgram, int],
        plc.Column,
    ],
) -> Self:
    """Run a regex-based record-split pylibcudf routine on this column.

    Parameters
    ----------
    pattern : str
        Regular expression; compiled into a ``RegexProgram`` with
        ``RegexFlags.DEFAULT``.
    maxsplit : int
        Passed through unchanged as the last argument of ``method``.
    method : callable
        pylibcudf function called as ``method(column, program, maxsplit)``.

    Returns
    -------
    Self
        Column wrapping the pylibcudf column returned by ``method``.
    """
    source = self.to_pylibcudf(mode="read")
    regex_program = plc.strings.regex_program.RegexProgram.create(
        pattern,
        plc.strings.regex_flags.RegexFlags.DEFAULT,
    )
    split_result = method(source, regex_program, maxsplit)
    return cast(Self, Column.from_pylibcudf(split_result))

def split_record_re(self, pattern: str, maxsplit: int) -> Self:
    """Regex record-split using ``plc.strings.split.split.split_record_re``.

    ``pattern`` and ``maxsplit`` are forwarded to ``_split_record_re``.
    """
    dispatch = self._split_record_re
    plc_method = plc.strings.split.split.split_record_re
    return dispatch(pattern, maxsplit, plc_method)

def rsplit_record_re(self, pattern: str, maxsplit: int) -> Self:
    """Regex record-split using ``plc.strings.split.split.rsplit_record_re``.

    ``pattern`` and ``maxsplit`` are forwarded to ``_split_record_re``.
    """
    dispatch = self._split_record_re
    plc_method = plc.strings.split.split.rsplit_record_re
    return dispatch(pattern, maxsplit, plc_method)

@acquire_spill_lock()
def _split_re(
self,
pattern: str,
maxsplit: int,
method: Callable[
[plc.Column, plc.strings.regex_program.RegexProgram, int],
plc.Table,
],
) -> dict[int, Self]:
plc_table = method(
self.to_pylibcudf(mode="read"),
plc.strings.regex_program.RegexProgram.create(
pattern,
plc.strings.regex_flags.RegexFlags.DEFAULT,
),
maxsplit,
)
return dict(
enumerate(
Column.from_pylibcudf(col) # type: ignore[misc]
for col in plc_table.columns()
)
return cast(Self, Column.from_pylibcudf(plc_result))
)

def split_re(self, pattern: str, maxsplit: int) -> dict[int, Self]:
    """Regex split using ``plc.strings.split.split.split_re``.

    ``pattern`` and ``maxsplit`` are forwarded to ``_split_re``, whose
    position-keyed dict of columns is returned as-is.
    """
    dispatch = self._split_re
    plc_method = plc.strings.split.split.split_re
    return dispatch(pattern, maxsplit, plc_method)

def rsplit_re(self, pattern: str, maxsplit: int) -> dict[int, Self]:
    """Regex split using ``plc.strings.split.split.rsplit_re``.

    ``pattern`` and ``maxsplit`` are forwarded to ``_split_re``, whose
    position-keyed dict of columns is returned as-is.
    """
    dispatch = self._split_re
    plc_method = plc.strings.split.split.rsplit_re
    return dispatch(pattern, maxsplit, plc_method)

@acquire_spill_lock()
def _split_record(
    self,
    delimiter: cudf.Scalar,
    maxsplit: int,
    method: Callable[[plc.Column, plc.Scalar, int], plc.Column],
) -> Self:
    """Run a delimiter-based record-split pylibcudf routine on this column.

    Parameters
    ----------
    delimiter : cudf.Scalar
        Its ``device_value.c_value`` is handed to ``method``.
    maxsplit : int
        Passed through unchanged as the last argument of ``method``.
    method : callable
        pylibcudf function called as ``method(column, scalar, maxsplit)``.

    Returns
    -------
    Self
        Column built via ``type(self).from_pylibcudf`` from the result.
    """
    source = self.to_pylibcudf(mode="read")
    plc_delimiter = delimiter.device_value.c_value
    split_result = method(source, plc_delimiter, maxsplit)
    return type(self).from_pylibcudf(split_result)  # type: ignore[return-value]

def split_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self:
    """Record-split using ``plc.strings.split.split.split_record``.

    ``delimiter`` and ``maxsplit`` are forwarded to ``_split_record``.
    """
    dispatch = self._split_record
    plc_method = plc.strings.split.split.split_record
    return dispatch(delimiter, maxsplit, plc_method)

def rsplit_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self:
    """Record-split using ``plc.strings.split.split.rsplit_record``.

    ``delimiter`` and ``maxsplit`` are forwarded to ``_split_record``.
    """
    dispatch = self._split_record
    plc_method = plc.strings.split.split.rsplit_record
    return dispatch(delimiter, maxsplit, plc_method)

@acquire_spill_lock()
def _split(
    self,
    delimiter: cudf.Scalar,
    maxsplit: int,
    # The callable yields a table (its ``.columns()`` is iterated below),
    # so the return type is ``plc.Table`` rather than ``plc.Column``.
    method: Callable[[plc.Column, plc.Scalar, int], plc.Table],
) -> dict[int, Self]:
    """Run a delimiter-based split pylibcudf routine on this column.

    Parameters
    ----------
    delimiter : cudf.Scalar
        Its ``device_value.c_value`` is handed to ``method``.
    maxsplit : int
        Passed through unchanged as the last argument of ``method``.
    method : callable
        pylibcudf function called as ``method(column, scalar, maxsplit)``
        and returning a table.

    Returns
    -------
    dict[int, Self]
        The columns of the resulting table keyed by their position.
    """
    plc_table = method(
        self.to_pylibcudf(mode="read"),
        delimiter.device_value.c_value,
        maxsplit,
    )
    return {
        position: Column.from_pylibcudf(col)  # type: ignore[misc]
        for position, col in enumerate(plc_table.columns())
    }

def split(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]:
    """Split using ``plc.strings.split.split.split``.

    ``delimiter`` and ``maxsplit`` are forwarded to ``_split``, whose
    position-keyed dict of columns is returned as-is.
    """
    dispatch = self._split
    plc_method = plc.strings.split.split.split
    return dispatch(delimiter, maxsplit, plc_method)

def rsplit(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]:
    """Split using ``plc.strings.split.split.rsplit``.

    ``delimiter`` and ``maxsplit`` are forwarded to ``_split``, whose
    position-keyed dict of columns is returned as-is.
    """
    dispatch = self._split
    plc_method = plc.strings.split.split.rsplit
    return dispatch(delimiter, maxsplit, plc_method)

@acquire_spill_lock()
def _partition(
    self,
    delimiter: cudf.Scalar,
    # The callable yields a table (its ``.columns()`` is iterated below),
    # so the return type is ``plc.Table`` rather than ``plc.Column``.
    method: Callable[[plc.Column, plc.Scalar], plc.Table],
) -> dict[int, Self]:
    """Run a pylibcudf partition routine on this column.

    Parameters
    ----------
    delimiter : cudf.Scalar
        Its ``device_value.c_value`` is handed to ``method``.
    method : callable
        pylibcudf function called as ``method(column, scalar)`` and
        returning a table.

    Returns
    -------
    dict[int, Self]
        The columns of the resulting table keyed by their position.
    """
    plc_table = method(
        self.to_pylibcudf(mode="read"),
        delimiter.device_value.c_value,
    )
    return {
        position: Column.from_pylibcudf(col)  # type: ignore[misc]
        for position, col in enumerate(plc_table.columns())
    }

def partition(self, delimiter: cudf.Scalar) -> dict[int, Self]:
    """Partition using ``plc.strings.split.partition.partition``.

    ``delimiter`` is forwarded to ``_partition``, whose position-keyed
    dict of columns is returned as-is.
    """
    dispatch = self._partition
    plc_method = plc.strings.split.partition.partition
    return dispatch(delimiter, plc_method)

def rpartition(self, delimiter: cudf.Scalar) -> dict[int, Self]:
    """Partition using ``plc.strings.split.partition.rpartition``.

    ``delimiter`` is forwarded to ``_partition``, whose position-keyed
    dict of columns is returned as-is.
    """
    dispatch = self._partition
    plc_method = plc.strings.split.partition.rpartition
    return dispatch(delimiter, plc_method)

@acquire_spill_lock()
def url_decode(self) -> Self:
    """Return this column passed through pylibcudf's ``url_decode``."""
    decode = plc.strings.convert.convert_urls.url_decode
    decoded = decode(self.to_pylibcudf(mode="read"))
    return type(self).from_pylibcudf(decoded)  # type: ignore[return-value]

@acquire_spill_lock()
def url_encode(self) -> Self:
    """Return this column passed through pylibcudf's ``url_encode``."""
    encode = plc.strings.convert.convert_urls.url_encode
    encoded = encode(self.to_pylibcudf(mode="read"))
    return type(self).from_pylibcudf(encoded)  # type: ignore[return-value]

@acquire_spill_lock()
def is_integer(self) -> NumericalColumn:
    """Return the result of pylibcudf's ``is_integer`` check on this column."""
    check = plc.strings.convert.convert_integers.is_integer
    flags = check(self.to_pylibcudf(mode="read"))
    return type(self).from_pylibcudf(flags)  # type: ignore[return-value]

@acquire_spill_lock()
def is_float(self) -> NumericalColumn:
    """Return the result of pylibcudf's ``is_float`` check on this column."""
    check = plc.strings.convert.convert_floats.is_float
    flags = check(self.to_pylibcudf(mode="read"))
    return type(self).from_pylibcudf(flags)  # type: ignore[return-value]
5 changes: 1 addition & 4 deletions python/cudf/cudf/core/tools/datetimes.py
Original file line number Diff line number Diff line change
@@ -15,9 +15,6 @@

import cudf
from cudf import _lib as libcudf
from cudf._lib.strings.convert.convert_integers import (
is_integer as cpp_is_integer,
)
from cudf.api.types import is_integer, is_scalar
from cudf.core import column
from cudf.core.buffer import acquire_spill_lock
@@ -232,7 +229,7 @@ def to_datetime(
)
break
elif arg_col.dtype.kind == "O":
if not cpp_is_integer(arg_col).all():
if not arg_col.is_integer().all():
col = new_series._column.strptime(
cudf.dtype("datetime64[ns]"), format=format
)
64 changes: 34 additions & 30 deletions python/cudf/cudf/core/tools/numeric.py
Original file line number Diff line number Diff line change
@@ -2,14 +2,13 @@
from __future__ import annotations

import warnings
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Literal

import numpy as np
import pandas as pd

import cudf
from cudf import _lib as libcudf
from cudf._lib import strings as libstrings
from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype
from cudf.core._internals import unary
from cudf.core.column import as_column
@@ -18,10 +17,16 @@
from cudf.utils.dtypes import can_convert_to_column

if TYPE_CHECKING:
from cudf.core.column import ColumnBase
from cudf.core.column.numerical import NumericalColumn
from cudf.core.column.string import StringColumn


def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None):
def to_numeric(
arg,
errors: Literal["raise", "coerce", "ignore"] = "raise",
downcast: Literal["integer", "signed", "unsigned", "float", None] = None,
dtype_backend=None,
):
"""
Convert argument into numerical types.
@@ -130,7 +135,9 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None):
else:
try:
col = _convert_str_col(
col._get_decategorized_column(), errors, downcast
col._get_decategorized_column(), # type: ignore[attr-defined]
errors,
downcast,
)
except ValueError as e:
if errors == "ignore":
@@ -139,7 +146,7 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None):
raise e
elif is_string_dtype(dtype):
try:
col = _convert_str_col(col, errors, downcast)
col = _convert_str_col(col, errors, downcast) # type: ignore[arg-type]
except ValueError as e:
if errors == "ignore":
return arg
@@ -186,7 +193,11 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None):
return col.values


def _convert_str_col(col, errors, _downcast=None):
def _convert_str_col(
col: StringColumn,
errors: Literal["raise", "coerce", "ignore"],
_downcast: Literal["integer", "signed", "unsigned", "float", None] = None,
) -> NumericalColumn:
"""
Converts a string column to numeric column
@@ -212,13 +223,19 @@ def _convert_str_col(col, errors, _downcast=None):
if not is_string_dtype(col):
raise TypeError("col must be string dtype.")

is_integer = libstrings.is_integer(col)
if is_integer.all():
return col.astype(dtype=cudf.dtype("i8"))
if col.is_integer().all():
return col.astype(dtype=cudf.dtype("i8")) # type: ignore[return-value]

col = _proc_inf_empty_strings(col)
converted_col = (
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
col.to_lower()
.find_and_replace(as_column([""]), as_column(["NaN"]))
.replace_multiple(
as_column(["+", "inf", "inity"]), # type: ignore[arg-type]
as_column(["", "Inf", ""]), # type: ignore[arg-type]
)
)

is_float = libstrings.is_float(col)
is_float = converted_col.is_float()
if is_float.all():
if _downcast in {"unsigned", "signed", "integer"}:
warnings.warn(
@@ -227,27 +244,14 @@ def _convert_str_col(col, errors, _downcast=None):
"limited by float32 precision."
)
)
return col.astype(dtype=cudf.dtype("float32"))
return converted_col.astype(dtype=cudf.dtype("float32")) # type: ignore[return-value]
else:
return col.astype(dtype=cudf.dtype("float64"))
return converted_col.astype(dtype=cudf.dtype("float64")) # type: ignore[return-value]
else:
if errors == "coerce":
col = libcudf.string_casting.stod(col)
converted_col = libcudf.string_casting.stod(converted_col)
non_numerics = is_float.unary_operator("not")
col[non_numerics] = None
return col
converted_col[non_numerics] = None
return converted_col # type: ignore[return-value]
else:
raise ValueError("Unable to convert some strings to numerics.")


def _proc_inf_empty_strings(col: ColumnBase) -> ColumnBase:
    """Normalize empty and infinity spellings in a string column.

    The column is lower-cased, empty strings are replaced with ``"NaN"``,
    and the substrings ``"+"``, ``"inf"`` and ``"inity"`` are replaced
    with ``""``, ``"Inf"`` and ``""`` respectively.
    """
    lowered = col.to_lower()  # type: ignore[attr-defined]
    filled = lowered.find_and_replace(as_column([""]), as_column(["NaN"]))
    # TODO: This can be handled by libcudf in
    # future see StringColumn.as_numerical_column
    targets = as_column(["+", "inf", "inity"])
    substitutes = as_column(["", "Inf", ""])
    return filled.replace_multiple(targets, substitutes)  # type: ignore[attr-defined]