Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove cudf._lib.replace in favor of inlining pylibcudf #17428

Merged
merged 4 commits into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ set(cython_sources
orc.pyx
parquet.pyx
reduce.pyx
replace.pyx
round.pyx
scalar.pyx
sort.pyx
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
orc,
parquet,
reduce,
replace,
round,
sort,
stream_compaction,
Expand Down
193 changes: 0 additions & 193 deletions python/cudf/cudf/_lib/replace.pyx

This file was deleted.

10 changes: 4 additions & 6 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -843,9 +843,9 @@ def values(self):
"""
raise NotImplementedError("cudf.Categorical is not yet implemented")

def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase":
def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self:
return (
self.astype(self.categories.dtype).clip(lo, hi).astype(self.dtype)
self.astype(self.categories.dtype).clip(lo, hi).astype(self.dtype) # type: ignore[return-value]
)

def data_array_view(
Expand Down Expand Up @@ -989,10 +989,8 @@ def find_and_replace(
replacement_col = catmap._data["index"].astype(replaced.codes.dtype)

replaced_codes = column.as_column(replaced.codes)
output = libcudf.replace.replace(
replaced_codes, to_replace_col, replacement_col
)
codes = as_unsigned_codes(len(new_cats["cats"]), output)
output = replaced_codes.replace(to_replace_col, replacement_col)
codes = as_unsigned_codes(len(new_cats["cats"]), output) # type: ignore[arg-type]

result = type(self)(
data=self.data, # type: ignore[arg-type]
Expand Down
53 changes: 46 additions & 7 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,14 @@ def find_and_replace(
) -> Self:
raise NotImplementedError

def clip(self, lo: ScalarLike, hi: ScalarLike) -> ColumnBase:
return libcudf.replace.clip(self, lo, hi)
@acquire_spill_lock()
def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self:
plc_column = plc.replace.clamp(
self.to_pylibcudf(mode="read"),
cudf.Scalar(lo, self.dtype).device_value.c_value,
cudf.Scalar(hi, self.dtype).device_value.c_value,
)
return type(self).from_pylibcudf(plc_column) # type: ignore[return-value]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose this is because Column.from_pylibcudf only advertises that it produces ColumnBase rather than Self? All this typing is kind of weird because the C++ column is type-erased.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup exactly. We have this occurrences in quite a few places in cudf Python since we know what column subclass should be returned.


def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool:
if self is other:
Expand Down Expand Up @@ -686,6 +692,18 @@ def _validate_fillna_value(
return cudf.Scalar(fill_value, dtype=self.dtype)
return as_column(fill_value)

@acquire_spill_lock()
def replace(
self, values_to_replace: Self, replacement_values: Self
) -> Self:
return type(self).from_pylibcudf( # type: ignore[return-value]
plc.replace.find_and_replace_all(
self.to_pylibcudf(mode="read"),
values_to_replace.to_pylibcudf(mode="read"),
replacement_values.to_pylibcudf(mode="read"),
)
)

def fillna(
self,
fill_value: ScalarLike | ColumnLike,
Expand All @@ -704,11 +722,32 @@ def fillna(
return self.copy()
else:
fill_value = self._validate_fillna_value(fill_value)
return libcudf.replace.replace_nulls(
input_col=self.nans_to_nulls(),
replacement=fill_value,
method=method,
)._with_type_metadata(self.dtype)

if fill_value is None and method is None:
raise ValueError("Must specify a fill 'value' or 'method'.")

if fill_value and method:
raise ValueError("Cannot specify both 'value' and 'method'.")

input_col = self.nans_to_nulls()

with acquire_spill_lock():
if method:
plc_replace = (
plc.replace.ReplacePolicy.PRECEDING
if method == "ffill"
else plc.replace.ReplacePolicy.FOLLOWING
)
elif is_scalar(fill_value):
plc_replace = cudf.Scalar(fill_value).device_value.c_value
else:
plc_replace = fill_value.to_pylibcudf(mode="read")
plc_column = plc.replace.replace_nulls(
input_col.to_pylibcudf(mode="read"),
plc_replace,
)
result = type(self).from_pylibcudf(plc_column)
return result._with_type_metadata(self.dtype) # type: ignore[return-value]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you know why we need to ignore type check here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because fillna claims to return a Self, but _with_type_metadata only returns a ColumnBase (because sometimes if you pass a different dtype in, the column type can change). Because return types are contravariant, in this scenario, Self is not subclass of ColumnBase.


def isnull(self) -> ColumnBase:
"""Identify missing values in a Column."""
Expand Down
8 changes: 3 additions & 5 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ def find_and_replace(
to_replace: ColumnLike,
replacement: ColumnLike,
all_nan: bool = False,
) -> NumericalColumn:
) -> Self:
"""
Return col with *to_replace* replaced with *value*.
"""
Expand Down Expand Up @@ -547,7 +547,7 @@ def find_and_replace(
)
elif len(replacement_col) == 1 and len(to_replace_col) == 0:
return self.copy()
replaced = self.astype(common_type)
replaced = cast(Self, self.astype(common_type))
df = cudf.DataFrame._from_data(
{
"old": to_replace_col.astype(common_type),
Expand All @@ -563,9 +563,7 @@ def find_and_replace(
)
df = df.dropna(subset=["old"])

return libcudf.replace.replace(
replaced, df._data["old"], df._data["new"]
)
return replaced.replace(df._data["old"], df._data["new"])

def _validate_fillna_value(
self, fill_value: ScalarLike | ColumnLike
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -6038,7 +6038,7 @@ def find_and_replace(
df = df.dropna(subset=["old"])
else:
res = self
return libcudf.replace.replace(res, df._data["old"], df._data["new"])
return res.replace(df._data["old"], df._data["new"])

def normalize_binop_value(self, other) -> column.ColumnBase | cudf.Scalar:
if (
Expand Down
Loading