From f21f8a86c36a6233c8a98bcb5755ac213c4c72e6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 10 Jul 2024 15:48:28 -0700 Subject: [PATCH 1/2] Short circuit some Column methods --- python/cudf/cudf/_lib/column.pyx | 12 ++++++----- python/cudf/cudf/core/column/column.py | 29 ++++++++++++++++++++------ 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 7155017b7af..e030147fdd3 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -202,11 +202,13 @@ cdef class Column: def _clear_cache(self): self._distinct_count = {} - try: - del self.memory_usage - except AttributeError: - # `self.memory_usage` was never called before, So ignore. - pass + attrs = ("memory_usage", "is_monotonic_increasing", "is_monotonic_decreasing") + for attr in attrs: + try: + delattr(self, attr) + except AttributeError: + # attr was not called yet, so ignore. + pass self._null_count = None def set_mask(self, value): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index adc783c20c4..2848a71ef4d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -275,7 +275,10 @@ def any(self, skipna: bool = True) -> bool: return libcudf.reduce.reduce("any", self, dtype=np.bool_) def dropna(self) -> Self: - return drop_nulls([self])[0]._with_type_metadata(self.dtype) + if self.has_nulls(): + return drop_nulls([self])[0]._with_type_metadata(self.dtype) + else: + return self.copy() def to_arrow(self) -> pa.Array: """Convert to PyArrow Array @@ -700,6 +703,9 @@ def fillna( def isnull(self) -> ColumnBase: """Identify missing values in a Column.""" + if not self.has_nulls(include_nan=self.dtype.kind == "f"): + return as_column(False, length=len(self)) + result = libcudf.unary.is_null(self) if self.dtype.kind == "f": @@ -711,6 +717,9 @@ def isnull(self) -> ColumnBase: def notnull(self) -> ColumnBase: """Identify non-missing values in a Column.""" + if not self.has_nulls(include_nan=self.dtype.kind == "f"): + return as_column(True, length=len(self)) + result = libcudf.unary.is_valid(self) if self.dtype.kind == "f": @@ -923,15 +932,16 @@ def as_mask(self) -> Buffer: @property def is_unique(self) -> bool: + # distinct_count might already be cached return self.distinct_count(dropna=False) == len(self) - @property + @cached_property def is_monotonic_increasing(self) -> bool: return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( [self], [True], None ) - @property + @cached_property def is_monotonic_decreasing(self) -> bool: return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( [self], [False], None @@ -942,6 +952,10 @@ def sort_values( ascending: bool = True, na_position: str = "last", ) -> ColumnBase: + if (not ascending and self.is_monotonic_decreasing) or ( + ascending and self.is_monotonic_increasing + ): + return self.copy() return libcudf.sort.sort( [self], column_order=[ascending], null_precedence=[na_position] )[0] @@ -1160,9 +1174,12 @@ def unique(self) -> ColumnBase: """ Get unique values in the data """ - return drop_duplicates([self], keep="first")[0]._with_type_metadata( - self.dtype - ) + if self.is_unique: + return self.copy() + else: + return drop_duplicates([self], keep="first")[ + 0 + ]._with_type_metadata(self.dtype) def serialize(self) -> tuple[dict, list]: # data model: From 6883a40e4d6bb4b0c57bd85d75043402f18979f0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 10 Jul 2024 16:12:34 -0700 Subject: [PATCH 2/2] Short argsort --- python/cudf/cudf/core/column/column.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2848a71ef4d..b31503fbb16 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1107,11 +1107,22 @@ def apply_boolean_mask(self, mask) -> ColumnBase: ) def argsort( - self, ascending: bool = True, na_position: str = "last" - ) -> "cudf.core.column.NumericalColumn": - return libcudf.sort.order_by( - [self], [ascending], na_position, stable=True - ) + self, + ascending: bool = True, + na_position: Literal["first", "last"] = "last", + ) -> cudf.core.column.NumericalColumn: + if (ascending and self.is_monotonic_increasing) or ( + not ascending and self.is_monotonic_decreasing + ): + return as_column(range(len(self))) + elif (ascending and self.is_monotonic_decreasing) or ( + not ascending and self.is_monotonic_increasing + ): + return as_column(range(len(self) - 1, -1, -1)) + else: + return libcudf.sort.order_by( + [self], [ascending], na_position, stable=True + ) def __arrow_array__(self, type=None): raise TypeError(