Skip to content

Commit

Permalink
Short circuit some Column methods (#16246)
Browse files Browse the repository at this point in the history
Adds some short circuiting, possibly cached checks (e.g. all values unique, no-NAs, monotonicity), to `dropna`, `isnull`, `notnull`, `argsort`, `unique` and `sort_values` allowing these ops to just copy / return a "simplified" result

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #16246
  • Loading branch information
mroeschke authored Jul 17, 2024
1 parent 9db6723 commit 1dd63ea
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 16 deletions.
12 changes: 7 additions & 5 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,13 @@ cdef class Column:

def _clear_cache(self):
self._distinct_count = {}
try:
del self.memory_usage
except AttributeError:
# `self.memory_usage` was never called before, So ignore.
pass
attrs = ("memory_usage", "is_monotonic_increasing", "is_monotonic_decreasing")
for attr in attrs:
try:
delattr(self, attr)
except AttributeError:
# attr was not called yet, so ignore.
pass
self._null_count = None

def set_mask(self, value):
Expand Down
50 changes: 39 additions & 11 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,10 @@ def any(self, skipna: bool = True) -> bool:
return libcudf.reduce.reduce("any", self, dtype=np.bool_)

def dropna(self) -> Self:
return drop_nulls([self])[0]._with_type_metadata(self.dtype)
if self.has_nulls():
return drop_nulls([self])[0]._with_type_metadata(self.dtype)
else:
return self.copy()

def to_arrow(self) -> pa.Array:
"""Convert to PyArrow Array
Expand Down Expand Up @@ -699,6 +702,9 @@ def fillna(

def isnull(self) -> ColumnBase:
"""Identify missing values in a Column."""
if not self.has_nulls(include_nan=self.dtype.kind == "f"):
return as_column(False, length=len(self))

result = libcudf.unary.is_null(self)

if self.dtype.kind == "f":
Expand All @@ -710,6 +716,9 @@ def isnull(self) -> ColumnBase:

def notnull(self) -> ColumnBase:
"""Identify non-missing values in a Column."""
if not self.has_nulls(include_nan=self.dtype.kind == "f"):
return as_column(True, length=len(self))

result = libcudf.unary.is_valid(self)

if self.dtype.kind == "f":
Expand Down Expand Up @@ -922,15 +931,16 @@ def as_mask(self) -> Buffer:

@property
def is_unique(self) -> bool:
# distinct_count might already be cached
return self.distinct_count(dropna=False) == len(self)

@property
@cached_property
def is_monotonic_increasing(self) -> bool:
return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted(
[self], [True], None
)

@property
@cached_property
def is_monotonic_decreasing(self) -> bool:
return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted(
[self], [False], None
Expand All @@ -941,6 +951,10 @@ def sort_values(
ascending: bool = True,
na_position: str = "last",
) -> ColumnBase:
if (not ascending and self.is_monotonic_decreasing) or (
ascending and self.is_monotonic_increasing
):
return self.copy()
return libcudf.sort.sort(
[self], column_order=[ascending], null_precedence=[na_position]
)[0]
Expand Down Expand Up @@ -1090,11 +1104,22 @@ def apply_boolean_mask(self, mask) -> ColumnBase:
)

def argsort(
self, ascending: bool = True, na_position: str = "last"
) -> "cudf.core.column.NumericalColumn":
return libcudf.sort.order_by(
[self], [ascending], na_position, stable=True
)
self,
ascending: bool = True,
na_position: Literal["first", "last"] = "last",
) -> cudf.core.column.NumericalColumn:
if (ascending and self.is_monotonic_increasing) or (
not ascending and self.is_monotonic_decreasing
):
return as_column(range(len(self)))
elif (ascending and self.is_monotonic_decreasing) or (
not ascending and self.is_monotonic_increasing
):
return as_column(range(len(self) - 1, -1, -1))
else:
return libcudf.sort.order_by(
[self], [ascending], na_position, stable=True
)

def __arrow_array__(self, type=None):
raise TypeError(
Expand Down Expand Up @@ -1157,9 +1182,12 @@ def unique(self) -> ColumnBase:
"""
Get unique values in the data
"""
return drop_duplicates([self], keep="first")[0]._with_type_metadata(
self.dtype
)
if self.is_unique:
return self.copy()
else:
return drop_duplicates([self], keep="first")[
0
]._with_type_metadata(self.dtype)

def serialize(self) -> tuple[dict, list]:
# data model:
Expand Down

0 comments on commit 1dd63ea

Please sign in to comment.