Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove cudf._lib.replace in favor of inlining pylibcudf #17428

Merged
merged 4 commits into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ set(cython_sources
parquet.pyx
partitioning.pyx
reduce.pyx
replace.pyx
reshape.pyx
rolling.pyx
round.pyx
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
parquet,
partitioning,
reduce,
replace,
reshape,
rolling,
round,
Expand Down
193 changes: 0 additions & 193 deletions python/cudf/cudf/_lib/replace.pyx

This file was deleted.

6 changes: 2 additions & 4 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -989,10 +989,8 @@ def find_and_replace(
replacement_col = catmap._data["index"].astype(replaced.codes.dtype)

replaced_codes = column.as_column(replaced.codes)
output = libcudf.replace.replace(
replaced_codes, to_replace_col, replacement_col
)
codes = as_unsigned_codes(len(new_cats["cats"]), output)
output = replaced_codes.replace(to_replace_col, replacement_col)
codes = as_unsigned_codes(len(new_cats["cats"]), output) # type: ignore[arg-type]

result = type(self)(
data=self.data, # type: ignore[arg-type]
Expand Down
51 changes: 45 additions & 6 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,14 @@ def find_and_replace(
) -> Self:
raise NotImplementedError

@acquire_spill_lock()
def clip(self, lo: ScalarLike, hi: ScalarLike) -> ColumnBase:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def clip(self, lo: ScalarLike, hi: ScalarLike) -> ColumnBase:
def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self:

return libcudf.replace.clip(self, lo, hi)
plc_column = plc.replace.clamp(
self.to_pylibcudf(mode="read"),
cudf.Scalar(lo, self.dtype).device_value.c_value,
cudf.Scalar(hi, self.dtype).device_value.c_value,
)
return type(self).from_pylibcudf(plc_column)

def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool:
if self is other:
Expand Down Expand Up @@ -686,6 +692,18 @@ def _validate_fillna_value(
return cudf.Scalar(fill_value, dtype=self.dtype)
return as_column(fill_value)

@acquire_spill_lock()
def replace(
self, values_to_replace: ColumnBase, replacement_values: ColumnBase
) -> ColumnBase:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
self, values_to_replace: ColumnBase, replacement_values: ColumnBase
) -> ColumnBase:
self, values_to_replace: Self, replacement_values: Self
) -> Self:

The values to replace replace and the replacements must have the same type as Self, and we get a Self back.

return type(self).from_pylibcudf(
plc.replace.find_and_replace_all(
self.to_pylibcudf(mode="read"),
values_to_replace.to_pylibcudf(mode="read"),
replacement_values.to_pylibcudf(mode="read"),
)
)

def fillna(
self,
fill_value: ScalarLike | ColumnLike,
Expand All @@ -704,11 +722,32 @@ def fillna(
return self.copy()
else:
fill_value = self._validate_fillna_value(fill_value)
return libcudf.replace.replace_nulls(
input_col=self.nans_to_nulls(),
replacement=fill_value,
method=method,
)._with_type_metadata(self.dtype)

if fill_value is None and method is None:
raise ValueError("Must specify a fill 'value' or 'method'.")

if fill_value and method:
raise ValueError("Cannot specify both 'value' and 'method'.")

input_col = self.nans_to_nulls()

with acquire_spill_lock():
if method:
plc_replace = (
plc.replace.ReplacePolicy.PRECEDING
if method == "ffill"
else plc.replace.ReplacePolicy.FOLLOWING
)
elif is_scalar(fill_value):
plc_replace = cudf.Scalar(fill_value).device_value.c_value
else:
plc_replace = fill_value.to_pylibcudf(mode="read")
plc_column = plc.replace.replace_nulls(
input_col.to_pylibcudf(mode="read"),
plc_replace,
)
result = type(self).from_pylibcudf(plc_column)
return result._with_type_metadata(self.dtype) # type: ignore[return-value]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you know why we need to ignore type check here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because fillna claims to return a Self, but _with_type_metadata only returns a ColumnBase (because sometimes if you pass a different dtype in, the column type can change). Because return types are contravariant, in this scenario, Self is not subclass of ColumnBase.


def isnull(self) -> ColumnBase:
"""Identify missing values in a Column."""
Expand Down
4 changes: 1 addition & 3 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,9 +563,7 @@ def find_and_replace(
)
df = df.dropna(subset=["old"])

return libcudf.replace.replace(
replaced, df._data["old"], df._data["new"]
)
return replaced.replace(df._data["old"], df._data["new"]) # type: ignore[return-value]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this one can go away if we type replace correctly?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was able to remove this, but I had to add a cast in an astype above (which is not able to always return self so it's typed as ColumnBase)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AH, thanks


def _validate_fillna_value(
self, fill_value: ScalarLike | ColumnLike
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -6044,7 +6044,7 @@ def find_and_replace(
df = df.dropna(subset=["old"])
else:
res = self
return libcudf.replace.replace(res, df._data["old"], df._data["new"])
return res.replace(df._data["old"], df._data["new"]) # type: ignore[return-value]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Likewise here

Suggested change
return res.replace(df._data["old"], df._data["new"]) # type: ignore[return-value]
return res.replace(df._data["old"], df._data["new"])


def normalize_binop_value(self, other) -> column.ColumnBase | cudf.Scalar:
if (
Expand Down
Loading