From ecc27a1140c0c287091f6a1291dfaf7ccd82cb19 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jul 2024 11:55:40 -1000 Subject: [PATCH] Align more DataFrame APIs with pandas (#16310) I have a script that did some signature comparisons between the `pandas.DataFrame` and `cudf.DataFrame` APIs, and it appears some signatures have changed between the pandas 1.x and 2.x releases. The API changes in this PR are mostly adding implementations or adding missing keyword arguments (although they might not be implemented). The APIs affected are: * `__init__` * `__array__` * `__arrow_c_stream__` * `to_dict` * `where` * `add_prefix` * `join` * `apply` * `to_records` * `from_records` * `unstack` * `pct_change` * `sort_values` Marking as breaking as I ensured some added keywords are in the same positions as pandas and therefore might break users who are using purely positional arguments. Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16310 --- python/cudf/cudf/core/dataframe.py | 169 +++++++++++++++++++++++-- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/indexed_frame.py | 13 +- python/cudf/cudf/core/reshape.py | 7 +- python/cudf/cudf/core/series.py | 32 ++++- 5 files changed, 202 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 904bd4ccb2e..7e07078c95b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -594,6 +594,9 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. + copy : bool or None, default None + Copy data from inputs. + Currently not implemented. 
nan_as_null : bool, Default True If ``None``/``True``, converts ``np.nan`` values to ``null`` values. @@ -680,8 +683,11 @@ def __init__( index=None, columns=None, dtype=None, + copy=None, nan_as_null=no_default, ): + if copy is not None: + raise NotImplementedError("copy is not currently implemented.") super().__init__() if nan_as_null is no_default: nan_as_null = not cudf.get_option("mode.pandas_compatible") @@ -1524,6 +1530,25 @@ def __array_function__(self, func, types, args, kwargs): pass return NotImplemented + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the cudf DataFrame as an Arrow C stream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the dataframe should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. Currently not implemented. + + Returns + ------- + PyCapsule + """ + if requested_schema is not None: + raise NotImplementedError("requested_schema is not supported") + return self.to_arrow().__arrow_c_stream__() + # The _get_numeric_data method is necessary for dask compatibility. @_performance_tracking def _get_numeric_data(self): @@ -2235,6 +2260,7 @@ def to_dict( self, orient: str = "dict", into: type[dict] = dict, + index: bool = True, ) -> dict | list[dict]: """ Convert the DataFrame to a dictionary. @@ -2268,6 +2294,13 @@ def to_dict( instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. + index : bool, default True + Whether to include the index item (and index_names item if `orient` + is 'tight') in the returned dictionary. Can only be ``False`` + when `orient` is 'split' or 'tight'. Note that when `orient` is + 'records', this parameter does not take effect (index item always + not included). 
+ Returns ------- dict, list or collections.abc.Mapping @@ -2349,7 +2382,7 @@ def to_dict( raise TypeError(f"unsupported type: {into}") return cons(self.items()) # type: ignore[misc] - return self.to_pandas().to_dict(orient=orient, into=into) + return self.to_pandas().to_dict(orient=orient, into=into, index=index) @_performance_tracking def scatter_by_map( @@ -3004,7 +3037,12 @@ def fillna( ) @_performance_tracking - def where(self, cond, other=None, inplace=False): + def where(self, cond, other=None, inplace=False, axis=None, level=None): + if axis is not None: + raise NotImplementedError("axis is not supported.") + elif level is not None: + raise NotImplementedError("level is not supported.") + from cudf.core._internals.where import ( _check_and_cast_columns_with_other, _make_categorical_like, @@ -3614,7 +3652,9 @@ def rename( return result @_performance_tracking - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) out.columns = [ @@ -4230,6 +4270,7 @@ def join( lsuffix="", rsuffix="", sort=False, + validate: str | None = None, ): """Join columns with other DataFrame on index or on a key column. @@ -4243,6 +4284,16 @@ def join( column names when avoiding conflicts. sort : bool Set to True to ensure sorted ordering. + validate : str, optional + If specified, checks if join is of specified type. + + * "one_to_one" or "1:1": check if join keys are unique in both left + and right datasets. + * "one_to_many" or "1:m": check if join keys are unique in left dataset. + * "many_to_one" or "m:1": check if join keys are unique in right dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + + Currently not supported. 
Returns ------- @@ -4256,6 +4307,10 @@ def join( """ if on is not None: raise NotImplementedError("The on parameter is not yet supported") + elif validate is not None: + raise NotImplementedError( + "The validate parameter is not yet supported" + ) df = self.merge( other, @@ -4404,7 +4459,16 @@ def query(self, expr, local_dict=None): @_performance_tracking def apply( - self, func, axis=1, raw=False, result_type=None, args=(), **kwargs + self, + func, + axis=1, + raw=False, + result_type=None, + args=(), + by_row: Literal[False, "compat"] = "compat", + engine: Literal["python", "numba"] = "python", + engine_kwargs: dict[str, bool] | None = None, + **kwargs, ): """ Apply a function along an axis of the DataFrame. @@ -4432,6 +4496,25 @@ def apply( Not yet supported args: tuple Positional arguments to pass to func in addition to the dataframe. + by_row : False or "compat", default "compat" + Only has an effect when ``func`` is a listlike or dictlike of funcs + and the func isn't a string. + If "compat", will if possible first translate the func into pandas + methods (e.g. ``Series().apply(np.sum)`` will be translated to + ``Series().sum()``). If that doesn't work, will try call to apply again with + ``by_row=True`` and if that fails, will call apply again with + ``by_row=False`` (backward compatible). + If False, the funcs will be passed the whole Series at once. + + Currently not supported. + + engine : {'python', 'numba'}, default 'python' + Unused. Added for compatibility with pandas. + engine_kwargs : dict + Unused. Added for compatibility with pandas. + **kwargs + Additional keyword arguments to pass as keywords arguments to + `func`. 
Examples -------- @@ -4582,13 +4665,17 @@ def apply( """ if axis != 1: - raise ValueError( + raise NotImplementedError( "DataFrame.apply currently only supports row wise ops" ) if raw: - raise ValueError("The `raw` kwarg is not yet supported.") + raise NotImplementedError("The `raw` kwarg is not yet supported.") if result_type is not None: - raise ValueError("The `result_type` kwarg is not yet supported.") + raise NotImplementedError( + "The `result_type` kwarg is not yet supported." + ) + if by_row != "compat": + raise NotImplementedError("by_row is currently not supported.") return self._apply(func, _get_row_kernel, *args, **kwargs) @@ -5489,7 +5576,7 @@ def from_arrow(cls, table): return out @_performance_tracking - def to_arrow(self, preserve_index=None): + def to_arrow(self, preserve_index=None) -> pa.Table: """ Convert to a PyArrow Table. @@ -5579,18 +5666,36 @@ def to_arrow(self, preserve_index=None): return out.replace_schema_metadata(metadata) @_performance_tracking - def to_records(self, index=True): + def to_records(self, index=True, column_dtypes=None, index_dtypes=None): """Convert to a numpy recarray Parameters ---------- index : bool Whether to include the index in the output. + column_dtypes : str, type, dict, default None + If a string or type, the data type to store all columns. If + a dictionary, a mapping of column names and indices (zero-indexed) + to specific data types. Currently not supported. + index_dtypes : str, type, dict, default None + If a string or type, the data type to store all index levels. If + a dictionary, a mapping of index level names and indices + (zero-indexed) to specific data types. + This mapping is applied only if `index=True`. + Currently not supported. Returns ------- numpy recarray """ + if column_dtypes is not None: + raise NotImplementedError( + "column_dtypes is currently not supported." + ) + elif index_dtypes is not None: + raise NotImplementedError( + "index_dtypes is currently not supported." 
+ ) members = [("index", self.index.dtype)] if index else [] members += [(col, self[col].dtype) for col in self._data.names] dtype = np.dtype(members) @@ -5603,7 +5708,16 @@ def to_records(self, index=True): @classmethod @_performance_tracking - def from_records(cls, data, index=None, columns=None, nan_as_null=False): + def from_records( + cls, + data, + index=None, + exclude=None, + columns=None, + coerce_float: bool = False, + nrows: int | None = None, + nan_as_null=False, + ): """ Convert structured or record ndarray to DataFrame. @@ -5613,13 +5727,32 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): index : str, array-like The name of the index column in *data*. If None, the default index is used. + exclude : sequence, default None + Columns or fields to exclude. + Currently not implemented. columns : list of str List of column names to include. + coerce_float : bool, default False + Attempt to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets. + Currently not implemented. + nrows : int, default None + Number of rows to read if data is an iterator. + Currently not implemented. Returns ------- DataFrame """ + if exclude is not None: + raise NotImplementedError("exclude is currently not supported.") + if coerce_float is not False: + raise NotImplementedError( + "coerce_float is currently not supported." 
+ ) + if nrows is not None: + raise NotImplementedError("nrows is currently not supported.") + if data.ndim != 1 and data.ndim != 2: raise ValueError( f"records dimension expected 1 or 2 but found {data.ndim}" @@ -7344,9 +7477,9 @@ def pivot_table( @_performance_tracking @copy_docstring(reshape.unstack) - def unstack(self, level=-1, fill_value=None): + def unstack(self, level=-1, fill_value=None, sort: bool = True): return cudf.core.reshape.unstack( - self, level=level, fill_value=fill_value + self, level=level, fill_value=fill_value, sort=sort ) @_performance_tracking @@ -7392,7 +7525,12 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) def pct_change( - self, periods=1, fill_method=no_default, limit=no_default, freq=None + self, + periods=1, + fill_method=no_default, + limit=no_default, + freq=None, + **kwargs, ): """ Calculates the percent change between sequential elements @@ -7417,6 +7555,9 @@ def pct_change( freq : str, optional Increment to use from time series API. Not yet implemented. + **kwargs + Additional keyword arguments are passed into + `DataFrame.shift`. 
Returns ------- @@ -7462,7 +7603,7 @@ def pct_change( data = self.fillna(method=fill_method, limit=limit) return data.diff(periods=periods) / data.shift( - periods=periods, freq=freq + periods=periods, freq=freq, **kwargs ) def __dataframe__( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index e3a2e840902..c82e073d7b7 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -389,7 +389,7 @@ def values_host(self) -> np.ndarray: return self.to_numpy() @_performance_tracking - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): raise TypeError( "Implicit conversion to a host NumPy array via __array__ is not " "allowed, To explicitly construct a GPU matrix, consider using " diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 576596f6f7d..60cd142db4b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3302,7 +3302,7 @@ def pad(self, value=None, axis=None, inplace=None, limit=None): ) return self.ffill(value=value, axis=axis, inplace=inplace, limit=limit) - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): """ Prefix labels with string `prefix`. @@ -3464,6 +3464,7 @@ def sort_values( kind="quicksort", na_position="last", ignore_index=False, + key=None, ): """Sort by the values along either axis. @@ -3479,6 +3480,14 @@ def sort_values( 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. + key : callable, optional + Apply the key function to the values + before sorting. This is similar to the ``key`` argument in the + builtin ``sorted`` function, with the notable difference that + this ``key`` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. 
+ Currently not supported. Returns ------- @@ -3518,6 +3527,8 @@ def sort_values( ) if axis != 0: raise NotImplementedError("`axis` not currently implemented.") + if key is not None: + raise NotImplementedError("key is not currently supported.") if len(self) == 0: return self diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 1120642947b..b538ae34b6f 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1060,7 +1060,7 @@ def pivot(data, columns=None, index=no_default, values=no_default): return result -def unstack(df, level, fill_value=None): +def unstack(df, level, fill_value=None, sort: bool = True): """ Pivot one or more levels of the (necessarily hierarchical) index labels. @@ -1080,6 +1080,9 @@ def unstack(df, level, fill_value=None): levels of the index to pivot fill_value Non-functional argument provided for compatibility with Pandas. + sort : bool, default True + Sort the level(s) in the resulting MultiIndex columns. + Returns ------- @@ -1156,6 +1159,8 @@ def unstack(df, level, fill_value=None): if fill_value is not None: raise NotImplementedError("fill_value is not supported.") + elif sort is False: + raise NotImplementedError(f"{sort=} is not supported.") if pd.api.types.is_list_like(level): if not level: return df diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index baaa2eb46a1..b1e63806934 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2063,6 +2063,7 @@ def sort_values( kind="quicksort", na_position="last", ignore_index=False, + key=None, ): """Sort by the values along either axis. @@ -2076,6 +2077,14 @@ def sort_values( 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. + key : callable, optional + Apply the key function to the values + before sorting. 
This is similar to the ``key`` argument in the + builtin ``sorted`` function, with the notable difference that + this ``key`` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. + Currently not supported. Returns ------- @@ -2107,6 +2116,7 @@ def sort_values( kind=kind, na_position=na_position, ignore_index=ignore_index, + key=key, ) @_performance_tracking @@ -3429,7 +3439,9 @@ def rename(self, index=None, copy=True): return Series._from_data(out_data, self.index, name=index) @_performance_tracking - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") return Series._from_data( # TODO: Change to deep=False when copy-on-write is default data=self._data.copy(deep=True), @@ -3527,7 +3539,12 @@ def explode(self, ignore_index=False): @_performance_tracking def pct_change( - self, periods=1, fill_method=no_default, limit=no_default, freq=None + self, + periods=1, + fill_method=no_default, + limit=no_default, + freq=None, + **kwargs, ): """ Calculates the percent change between sequential elements @@ -3552,6 +3569,9 @@ def pct_change( freq : str, optional Increment to use from time series API. Not yet implemented. + **kwargs + Additional keyword arguments are passed into + `Series.shift`. 
Returns ------- @@ -3596,11 +3616,15 @@ def pct_change( warnings.simplefilter("ignore") data = self.fillna(method=fill_method, limit=limit) diff = data.diff(periods=periods) - change = diff / data.shift(periods=periods, freq=freq) + change = diff / data.shift(periods=periods, freq=freq, **kwargs) return change @_performance_tracking - def where(self, cond, other=None, inplace=False): + def where(self, cond, other=None, inplace=False, axis=None, level=None): + if axis is not None: + raise NotImplementedError("axis is not supported.") + elif level is not None: + raise NotImplementedError("level is not supported.") result_col = super().where(cond, other, inplace) return self._mimic_inplace( self._from_data_like_self(