diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 7155017b7af..e030147fdd3 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -202,11 +202,13 @@ cdef class Column: def _clear_cache(self): self._distinct_count = {} - try: - del self.memory_usage - except AttributeError: - # `self.memory_usage` was never called before, So ignore. - pass + attrs = ("memory_usage", "is_monotonic_increasing", "is_monotonic_decreasing") + for attr in attrs: + try: + delattr(self, attr) + except AttributeError: + # attr was not called yet, so ignore. + pass self._null_count = None def set_mask(self, value): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index dbdf501e022..9467bbeed15 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -274,7 +274,10 @@ def any(self, skipna: bool = True) -> bool: return libcudf.reduce.reduce("any", self, dtype=np.bool_) def dropna(self) -> Self: - return drop_nulls([self])[0]._with_type_metadata(self.dtype) + if self.has_nulls(): + return drop_nulls([self])[0]._with_type_metadata(self.dtype) + else: + return self.copy() def to_arrow(self) -> pa.Array: """Convert to PyArrow Array @@ -699,6 +702,9 @@ def fillna( def isnull(self) -> ColumnBase: """Identify missing values in a Column.""" + if not self.has_nulls(include_nan=self.dtype.kind == "f"): + return as_column(False, length=len(self)) + result = libcudf.unary.is_null(self) if self.dtype.kind == "f": @@ -710,6 +716,9 @@ def isnull(self) -> ColumnBase: def notnull(self) -> ColumnBase: """Identify non-missing values in a Column.""" + if not self.has_nulls(include_nan=self.dtype.kind == "f"): + return as_column(True, length=len(self)) + result = libcudf.unary.is_valid(self) if self.dtype.kind == "f": @@ -922,15 +931,16 @@ def as_mask(self) -> Buffer: @property def is_unique(self) -> bool: + # distinct_count might already be cached return self.distinct_count(dropna=False) == len(self) - @property + @cached_property def is_monotonic_increasing(self) -> bool: return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( [self], [True], None ) - @property + @cached_property def is_monotonic_decreasing(self) -> bool: return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( [self], [False], None @@ -941,6 +951,10 @@ def sort_values( ascending: bool = True, na_position: str = "last", ) -> ColumnBase: + if (not ascending and self.is_monotonic_decreasing) or ( + ascending and self.is_monotonic_increasing + ): + return self.copy() return libcudf.sort.sort( [self], column_order=[ascending], null_precedence=[na_position] )[0] @@ -1090,11 +1104,22 @@ def apply_boolean_mask(self, mask) -> ColumnBase: ) def argsort( - self, ascending: bool = True, na_position: str = "last" - ) -> "cudf.core.column.NumericalColumn": - return libcudf.sort.order_by( - [self], [ascending], na_position, stable=True - ) + self, + ascending: bool = True, + na_position: Literal["first", "last"] = "last", + ) -> cudf.core.column.NumericalColumn: + if (ascending and self.is_monotonic_increasing) or ( + not ascending and self.is_monotonic_decreasing + ): + return as_column(range(len(self))) + elif (ascending and self.is_monotonic_decreasing) or ( + not ascending and self.is_monotonic_increasing + ): + return as_column(range(len(self) - 1, -1, -1)) + else: + return libcudf.sort.order_by( + [self], [ascending], na_position, stable=True + ) def __arrow_array__(self, type=None): raise TypeError( @@ -1157,9 +1182,12 @@ def unique(self) -> ColumnBase: """ Get unique values in the data """ - return drop_duplicates([self], keep="first")[0]._with_type_metadata( - self.dtype - ) + if self.is_unique: + return self.copy() + else: + return drop_duplicates([self], keep="first")[ + 0 + ]._with_type_metadata(self.dtype) def serialize(self) -> tuple[dict, list]: # data model: