From 1c37c780ced37d6084c90b815b274b598665d60e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jan 2024 11:50:33 -1000 Subject: [PATCH] Some `frame.py` typing, move seldom used methods in `frame.py` (#14766) * `_drop_na_columns` was only use on `IndexedFrame` and not `Frame` so moved the method to `IndexedFrame * `_has_nulls` is equivalent to `isna().any()` * Some typing Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14766 --- python/cudf/cudf/core/frame.py | 97 +++++++++------------- python/cudf/cudf/core/groupby/groupby.py | 8 +- python/cudf/cudf/core/indexed_frame.py | 36 +++++++- python/cudf/cudf/core/udf/groupby_utils.py | 5 +- 4 files changed, 77 insertions(+), 69 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index cb8f8b9cc7b..fc313a62fd0 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -94,10 +94,6 @@ def _dtypes(self): zip(self._data.names, (col.dtype for col in self._data.columns)) ) - @property - def _has_nulls(self): - return any(col.has_nulls() for col in self._data.values()) - @_cudf_nvtx_annotate def serialize(self): # TODO: See if self._data can be serialized outright @@ -135,13 +131,13 @@ def deserialize(cls, header, frames): @classmethod @_cudf_nvtx_annotate - def _from_data(cls, data: MutableMapping): + def _from_data(cls, data: MutableMapping) -> Self: obj = cls.__new__(cls) Frame.__init__(obj, data) return obj @_cudf_nvtx_annotate - def _from_data_like_self(self, data: MutableMapping): + def _from_data_like_self(self, data: MutableMapping) -> Self: return self._from_data(data) @_cudf_nvtx_annotate @@ -179,7 +175,7 @@ def _mimic_inplace( @property @_cudf_nvtx_annotate - def size(self): + def size(self) -> int: """ Return the number of elements in the underlying data. @@ -272,7 +268,7 @@ def memory_usage(self, deep=False): raise NotImplementedError @_cudf_nvtx_annotate - def __len__(self): + def __len__(self) -> int: return self._num_rows @_cudf_nvtx_annotate @@ -291,7 +287,7 @@ def astype(self, dtype, copy: bool = False): ) @_cudf_nvtx_annotate - def equals(self, other): + def equals(self, other) -> bool: """ Test whether two objects contain the same elements. @@ -375,7 +371,7 @@ def _get_columns_by_label(self, labels, *, downcast=False) -> Self: @property @_cudf_nvtx_annotate - def values(self): + def values(self) -> cupy.ndarray: """ Return a CuPy representation of the DataFrame. @@ -391,7 +387,7 @@ def values(self): @property @_cudf_nvtx_annotate - def values_host(self): + def values_host(self) -> np.ndarray: """ Return a NumPy representation of the data. @@ -547,7 +543,7 @@ def to_numpy( ) @_cudf_nvtx_annotate - def where(self, cond, other=None, inplace=False): + def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]: """ Replace values where the condition is False. @@ -618,7 +614,7 @@ def where(self, cond, other=None, inplace=False): raise NotImplementedError @_cudf_nvtx_annotate - def mask(self, cond, other=None, inplace=False): + def mask(self, cond, other=None, inplace: bool = False) -> Optional[Self]: """ Replace values where the condition is True. @@ -728,8 +724,13 @@ def pipe(self, func, *args, **kwargs): @_cudf_nvtx_annotate def fillna( - self, value=None, method=None, axis=None, inplace=False, limit=None - ): + self, + value=None, + method: Optional[Literal["ffill", "bfill", "pad", "backfill"]] = None, + axis=None, + inplace: bool = False, + limit=None, + ) -> Optional[Self]: """Fill null values with ``value`` or specified ``method``. Parameters @@ -848,15 +849,15 @@ def fillna( if isinstance(value, cudf.Series): value = value.reindex(self._data.names) elif isinstance(value, cudf.DataFrame): - if not self.index.equals(value.index): - value = value.reindex(self.index) + if not self.index.equals(value.index): # type: ignore[attr-defined] + value = value.reindex(self.index) # type: ignore[attr-defined] else: value = value elif not isinstance(value, abc.Mapping): value = {name: copy.deepcopy(value) for name in self._data.names} else: value = { - key: value.reindex(self.index) + key: value.reindex(self.index) # type: ignore[attr-defined] if isinstance(value, cudf.Series) else value for key, value in value.items() @@ -898,44 +899,14 @@ def _drop_column(self, name): raise KeyError(f"column '{name}' does not exist") del self._data[name] - @_cudf_nvtx_annotate - def _drop_na_columns(self, how="any", subset=None, thresh=None): - """ - Drop columns containing nulls - """ - out_cols = [] - - if subset is None: - df = self - else: - df = self.take(subset) - - if thresh is None: - if how == "all": - thresh = 1 - else: - thresh = len(df) - - for name, col in df._data.items(): - try: - check_col = col.nans_to_nulls() - except AttributeError: - check_col = col - no_threshold_valid_count = ( - len(col) - check_col.null_count - ) < thresh - if no_threshold_valid_count: - continue - out_cols.append(name) - - return self[out_cols] - @_cudf_nvtx_annotate def _quantile_table( self, - q, - interpolation="LINEAR", - is_sorted=False, + q: float, + interpolation: Literal[ + "LINEAR", "LOWER", "HIGHER", "MIDPOINT", "NEAREST" + ] = "LINEAR", + is_sorted: bool = False, column_order=(), null_precedence=(), ): @@ -963,7 +934,7 @@ def _quantile_table( @classmethod @_cudf_nvtx_annotate - def from_arrow(cls, data): + def from_arrow(cls, data: pa.Table) -> Self: """Convert from PyArrow Table to Frame Parameters @@ -1140,7 +1111,7 @@ def to_arrow(self): ) @_cudf_nvtx_annotate - def _positions_from_column_names(self, column_names): + def _positions_from_column_names(self, column_names) -> list[int]: """Map each column name into their positions in the frame. The order of indices returned corresponds to the column order in this @@ -1542,7 +1513,12 @@ def argsort( ).values @_cudf_nvtx_annotate - def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): + def _get_sorted_inds( + self, + by=None, + ascending=True, + na_position: Literal["first", "last"] = "last", + ) -> ColumnBase: """ Get the indices required to sort self according to the columns specified in by. @@ -1556,13 +1532,14 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): )._columns ] - # If given a scalar need to construct a sequence of length # of columns - if np.isscalar(ascending): - ascending = [ascending] * len(to_sort) + if is_scalar(ascending): + ascending_lst = [ascending] * len(to_sort) + else: + ascending_lst = list(ascending) return libcudf.sort.order_by( to_sort, - ascending, + ascending_lst, na_position, stable=True, ) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6c83bcd9efb..e28ba233c56 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1251,10 +1251,6 @@ def pipe(self, func, *args, **kwargs): def _jit_groupby_apply( self, function, group_names, offsets, group_keys, grouped_values, *args ): - # Nulls are not yet supported - if self.grouping._obj._has_nulls: - raise ValueError("Nulls not yet supported with groupby JIT engine") - chunk_results = jit_groupby_apply( offsets, grouped_values, function, *args ) @@ -1445,9 +1441,7 @@ def mult(df): group_names, offsets, group_keys, grouped_values = self._grouped() if engine == "auto": - if (not grouped_values._has_nulls) and _can_be_jitted( - grouped_values, function, args - ): + if _can_be_jitted(grouped_values, function, args): engine = "jit" else: engine = "cudf" diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 8cd9cc54889..3e564919090 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2439,7 +2439,9 @@ def sort_values( out.columns = self._data.to_pandas_index() return out - def _n_largest_or_smallest(self, largest, n, columns, keep): + def _n_largest_or_smallest( + self, largest: bool, n: int, columns, keep: Literal["first", "last"] + ): # Get column to operate on if isinstance(columns, str): columns = [columns] @@ -3068,6 +3070,38 @@ def dropna( return self._mimic_inplace(result, inplace=inplace) + @_cudf_nvtx_annotate + def _drop_na_columns(self, how="any", subset=None, thresh=None): + """ + Drop columns containing nulls + """ + out_cols = [] + + if subset is None: + df = self + else: + df = self.take(subset) + + if thresh is None: + if how == "all": + thresh = 1 + else: + thresh = len(df) + + for name, col in df._data.items(): + try: + check_col = col.nans_to_nulls() + except AttributeError: + check_col = col + no_threshold_valid_count = ( + len(col) - check_col.null_count + ) < thresh + if no_threshold_valid_count: + continue + out_cols.append(name) + + return self[out_cols] + def _drop_na_rows(self, how="any", subset=None, thresh=None): """ Drop null rows from `self`. diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index c82f8d2cd7b..06d9296ca0f 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. import cupy as cp @@ -209,6 +209,9 @@ def _can_be_jitted(frame, func, args): # Numba requires bytecode to be present to proceed. # See https://github.com/numba/numba/issues/4587 return False + + if any(col.has_nulls() for col in frame._data.values()): + return False np_field_types = np.dtype( list( _supported_dtypes_from_frame(