Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Short circuit some Column methods #16246

Merged
merged 4 commits into from
Jul 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,13 @@ cdef class Column:

def _clear_cache(self):
self._distinct_count = {}
try:
del self.memory_usage
except AttributeError:
# `self.memory_usage` was never called before, So ignore.
pass
attrs = ("memory_usage", "is_monotonic_increasing", "is_monotonic_decreasing")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there any places where we need to insert a _clear_cache call that didn't have one before now that these properties are also cached? I did a quick search and didn't find any, but wanted to confirm that you checked too.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just looked and it doesn't looks like there's anywhere we truly modify a Column inplace where these attributes might change (I suspected __setitem__ but it appears it's "inplace" where the base data is swapped)

for attr in attrs:
try:
delattr(self, attr)
except AttributeError:
# attr was not called yet, so ignore.
pass
self._null_count = None

def set_mask(self, value):
Expand Down
50 changes: 39 additions & 11 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,10 @@ def any(self, skipna: bool = True) -> bool:
return libcudf.reduce.reduce("any", self, dtype=np.bool_)

def dropna(self) -> Self:
return drop_nulls([self])[0]._with_type_metadata(self.dtype)
if self.has_nulls():
return drop_nulls([self])[0]._with_type_metadata(self.dtype)
else:
return self.copy()

def to_arrow(self) -> pa.Array:
"""Convert to PyArrow Array
Expand Down Expand Up @@ -699,6 +702,9 @@ def fillna(

def isnull(self) -> ColumnBase:
"""Identify missing values in a Column."""
if not self.has_nulls(include_nan=self.dtype.kind == "f"):
return as_column(False, length=len(self))

result = libcudf.unary.is_null(self)

if self.dtype.kind == "f":
Expand All @@ -710,6 +716,9 @@ def isnull(self) -> ColumnBase:

def notnull(self) -> ColumnBase:
"""Identify non-missing values in a Column."""
if not self.has_nulls(include_nan=self.dtype.kind == "f"):
return as_column(True, length=len(self))

result = libcudf.unary.is_valid(self)

if self.dtype.kind == "f":
Expand Down Expand Up @@ -922,15 +931,16 @@ def as_mask(self) -> Buffer:

@property
def is_unique(self) -> bool:
# distinct_count might already be cached
return self.distinct_count(dropna=False) == len(self)

@property
@cached_property
def is_monotonic_increasing(self) -> bool:
return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted(
[self], [True], None
)

@property
@cached_property
def is_monotonic_decreasing(self) -> bool:
return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted(
[self], [False], None
Expand All @@ -941,6 +951,10 @@ def sort_values(
ascending: bool = True,
na_position: str = "last",
) -> ColumnBase:
if (not ascending and self.is_monotonic_decreasing) or (
ascending and self.is_monotonic_increasing
):
return self.copy()
return libcudf.sort.sort(
[self], column_order=[ascending], null_precedence=[na_position]
)[0]
Expand Down Expand Up @@ -1090,11 +1104,22 @@ def apply_boolean_mask(self, mask) -> ColumnBase:
)

def argsort(
self, ascending: bool = True, na_position: str = "last"
) -> "cudf.core.column.NumericalColumn":
return libcudf.sort.order_by(
[self], [ascending], na_position, stable=True
)
self,
ascending: bool = True,
na_position: Literal["first", "last"] = "last",
) -> cudf.core.column.NumericalColumn:
if (ascending and self.is_monotonic_increasing) or (
not ascending and self.is_monotonic_decreasing
):
return as_column(range(len(self)))
elif (ascending and self.is_monotonic_decreasing) or (
not ascending and self.is_monotonic_increasing
):
return as_column(range(len(self) - 1, -1, -1))
else:
return libcudf.sort.order_by(
[self], [ascending], na_position, stable=True
)

def __arrow_array__(self, type=None):
raise TypeError(
Expand Down Expand Up @@ -1157,9 +1182,12 @@ def unique(self) -> ColumnBase:
"""
Get unique values in the data
"""
return drop_duplicates([self], keep="first")[0]._with_type_metadata(
self.dtype
)
if self.is_unique:
return self.copy()
else:
return drop_duplicates([self], keep="first")[
0
]._with_type_metadata(self.dtype)

def serialize(self) -> tuple[dict, list]:
# data model:
Expand Down
Loading