From 5774df8f0d6ba3383bb378296cdc8b11b3874531 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jul 2024 13:36:19 -0700 Subject: [PATCH 1/5] Align more DataFrame APIs with pandas --- python/cudf/cudf/core/dataframe.py | 170 +++++++++++++++++++++++-- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/indexed_frame.py | 13 +- python/cudf/cudf/core/reshape.py | 7 +- python/cudf/cudf/core/series.py | 32 ++++- 5 files changed, 203 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b3d938829c9..1d2c756c88c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -695,6 +695,7 @@ def __init__( index=None, columns=None, dtype=None, + copy=None, nan_as_null=no_default, ): super().__init__() @@ -1538,6 +1539,31 @@ def __array_function__(self, func, types, args, kwargs): pass return NotImplemented + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the pandas DataFrame as an Arrow C stream PyCapsule. + + This relies on pyarrow to convert the pandas DataFrame to the Arrow + format (and follows the default behaviour of ``pyarrow.Table.from_pandas`` + in its handling of the index, i.e. store the index as a column except + for RangeIndex). + This conversion is not necessarily zero-copy. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the dataframe should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + """ + if requested_schema is not None: + raise NotImplementedError("requested_schema is not supported") + return self.to_arrow().__arrow_c_stream__() + # The _get_numeric_data method is necessary for dask compatibility. @_performance_tracking def _get_numeric_data(self): @@ -2249,6 +2275,7 @@ def to_dict( self, orient: str = "dict", into: type[dict] = dict, + index: bool = True, ) -> dict | list[dict]: """ Convert the DataFrame to a dictionary. @@ -2282,6 +2309,13 @@ def to_dict( instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. + index : bool, default True + Whether to include the index item (and index_names item if `orient` + is 'tight') in the returned dictionary. Can only be ``False`` + when `orient` is 'split' or 'tight'. Note that when `orient` is + 'records', this parameter does not take effect (index item always + not included). 
+ Returns ------- dict, list or collections.abc.Mapping @@ -2363,7 +2397,7 @@ def to_dict( raise TypeError(f"unsupported type: {into}") return cons(self.items()) # type: ignore[misc] - return self.to_pandas().to_dict(orient=orient, into=into) + return self.to_pandas().to_dict(orient=orient, into=into, index=index) @_performance_tracking def scatter_by_map( @@ -3018,7 +3052,12 @@ def fillna( ) @_performance_tracking - def where(self, cond, other=None, inplace=False): + def where(self, cond, other=None, inplace=False, axis=None, level=None): + if axis is not None: + raise NotImplementedError("axis is not supported.") + elif level is not None: + raise NotImplementedError("level is not supported.") + from cudf.core._internals.where import ( _check_and_cast_columns_with_other, _make_categorical_like, @@ -3628,7 +3667,9 @@ def rename( return result @_performance_tracking - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) out.columns = [ @@ -4244,6 +4285,7 @@ def join( lsuffix="", rsuffix="", sort=False, + validate: str | None = None, ): """Join columns with other DataFrame on index or on a key column. @@ -4257,6 +4299,16 @@ def join( column names when avoiding conflicts. sort : bool Set to True to ensure sorted ordering. + validate : str, optional + If specified, checks if join is of specified type. + + * "one_to_one" or "1:1": check if join keys are unique in both left + and right datasets. + * "one_to_many" or "1:m": check if join keys are unique in left dataset. + * "many_to_one" or "m:1": check if join keys are unique in right dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + + Currently not supported. Returns ------- @@ -4270,6 +4322,10 @@ def join( """ if on is not None: raise NotImplementedError("The on parameter is not yet supported") + elif validate is not None: + raise NotImplementedError( + "The validate parameter is not yet supported" + ) df = self.merge( other, @@ -4420,7 +4476,16 @@ def query(self, expr, local_dict=None): @_performance_tracking def apply( - self, func, axis=1, raw=False, result_type=None, args=(), **kwargs + self, + func, + axis=1, + raw=False, + result_type=None, + args=(), + by_row: Literal[False, "compat"] = "compat", + engine: Literal["python", "numba"] = "python", + engine_kwargs: dict[str, bool] | None = None, + **kwargs, ): """ Apply a function along an axis of the DataFrame. @@ -4448,6 +4513,25 @@ def apply( Not yet supported args: tuple Positional arguments to pass to func in addition to the dataframe. + by_row : False or "compat", default "compat" + Only has an effect when ``func`` is a listlike or dictlike of funcs + and the func isn't a string. + If "compat", will if possible first translate the func into pandas + methods (e.g. ``Series().apply(np.sum)`` will be translated to + ``Series().sum()``). If that doesn't work, will try call to apply again with + ``by_row=True`` and if that fails, will call apply again with + ``by_row=False`` (backward compatible). + If False, the funcs will be passed the whole Series at once. + + Currently not supported. + + engine : {'python', 'numba'}, default 'python' + Unused. Added for compatibility with pandas. + engine_kwargs : dict + Unused. Added for compatibility with pandas. + **kwargs + Additional keyword arguments to pass as keywords arguments to + `func`. 
Examples -------- @@ -4598,13 +4682,17 @@ def apply( """ if axis != 1: - raise ValueError( + raise NotImplementedError( "DataFrame.apply currently only supports row wise ops" ) if raw: - raise ValueError("The `raw` kwarg is not yet supported.") + raise NotImplementedError("The `raw` kwarg is not yet supported.") if result_type is not None: - raise ValueError("The `result_type` kwarg is not yet supported.") + raise NotImplementedError( + "The `result_type` kwarg is not yet supported." + ) + if by_row != "compat": + raise NotImplementedError("by_row is currently not supported.") return self._apply(func, _get_row_kernel, *args, **kwargs) @@ -5504,7 +5592,7 @@ def from_arrow(cls, table): return out @_performance_tracking - def to_arrow(self, preserve_index=None): + def to_arrow(self, preserve_index=None) -> pa.Table: """ Convert to a PyArrow Table. @@ -5594,18 +5682,36 @@ def to_arrow(self, preserve_index=None): return out.replace_schema_metadata(metadata) @_performance_tracking - def to_records(self, index=True): + def to_records(self, index=True, column_dtypes=None, index_dtypes=None): """Convert to a numpy recarray Parameters ---------- index : bool Whether to include the index in the output. + column_dtypes : str, type, dict, default None + If a string or type, the data type to store all columns. If + a dictionary, a mapping of column names and indices (zero-indexed) + to specific data types. Currently not supported. + index_dtypes : str, type, dict, default None + If a string or type, the data type to store all index levels. If + a dictionary, a mapping of index level names and indices + (zero-indexed) to specific data types. + This mapping is applied only if `index=True`. + Currently not supported. Returns ------- numpy recarray """ + if column_dtypes is not None: + raise NotImplementedError( + "column_dtypes is currently not supported." + ) + elif index_dtypes is not None: + raise NotImplementedError( + "column_dtypes is currently not supported." + ) members = [("index", self.index.dtype)] if index else [] members += [(col, self[col].dtype) for col in self._data.names] dtype = np.dtype(members) @@ -5618,7 +5724,16 @@ def to_records(self, index=True): @classmethod @_performance_tracking - def from_records(cls, data, index=None, columns=None, nan_as_null=False): + def from_records( + cls, + data, + index=None, + exclude=None, + columns=None, + coerce_float: bool = False, + nrows: int | None = None, + nan_as_null=False, + ): """ Convert structured or record ndarray to DataFrame. @@ -5628,13 +5743,32 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): index : str, array-like The name of the index column in *data*. If None, the default index is used. + exclude : sequence, default None + Columns or fields to exclude. + Currently not implemented. columns : list of str List of column names to include. + coerce_float : bool, default False + Attempt to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets. + Currently not implemented. + nrows : int, default None + Number of rows to read if data is an iterator. + Currently not implemented. Returns ------- DataFrame """ + if exclude is not None: + raise NotImplementedError("exclude is currently not supported.") + if coerce_float is not False: + raise NotImplementedError( + "coerce_float is currently not supported." 
+ ) + if nrows is not None: + raise NotImplementedError("nrows is currently not supported.") + if data.ndim != 1 and data.ndim != 2: raise ValueError( f"records dimension expected 1 or 2 but found {data.ndim}" @@ -7359,9 +7493,9 @@ def pivot_table( @_performance_tracking @copy_docstring(reshape.unstack) - def unstack(self, level=-1, fill_value=None): + def unstack(self, level=-1, fill_value=None, sort: bool = True): return cudf.core.reshape.unstack( - self, level=level, fill_value=fill_value + self, level=level, fill_value=fill_value, sort=sort ) @_performance_tracking @@ -7407,7 +7541,12 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) def pct_change( - self, periods=1, fill_method=no_default, limit=no_default, freq=None + self, + periods=1, + fill_method=no_default, + limit=no_default, + freq=None, + **kwargs, ): """ Calculates the percent change between sequential elements @@ -7432,6 +7571,9 @@ def pct_change( freq : str, optional Increment to use from time series API. Not yet implemented. + **kwargs + Additional keyword arguments are passed into + `DataFrame.shift`. Returns ------- @@ -7477,7 +7619,7 @@ def pct_change( data = self.fillna(method=fill_method, limit=limit) return data.diff(periods=periods) / data.shift( - periods=periods, freq=freq + periods=periods, freq=freq, **kwargs ) def __dataframe__( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 802751e47ad..448e7507b9e 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -389,7 +389,7 @@ def values_host(self) -> np.ndarray: return self.to_numpy() @_performance_tracking - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): raise TypeError( "Implicit conversion to a host NumPy array via __array__ is not " "allowed, To explicitly construct a GPU matrix, consider using " diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 30b68574960..fe66eb3519b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3302,7 +3302,7 @@ def pad(self, value=None, axis=None, inplace=None, limit=None): ) return self.ffill(value=value, axis=axis, inplace=inplace, limit=limit) - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): """ Prefix labels with string `prefix`. @@ -3464,6 +3464,7 @@ def sort_values( kind="quicksort", na_position="last", ignore_index=False, + key=None, ): """Sort by the values along either axis. @@ -3479,6 +3480,14 @@ def sort_values( 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. + key : callable, optional + Apply the key function to the values + before sorting. This is similar to the `key` argument in the + builtin :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. + Currently not supported. 
Returns ------- @@ -3518,6 +3527,8 @@ def sort_values( ) if axis != 0: raise NotImplementedError("`axis` not currently implemented.") + if key is not None: + raise NotImplementedError("key is not currently supported.") if len(self) == 0: return self diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 1120642947b..b538ae34b6f 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1060,7 +1060,7 @@ def pivot(data, columns=None, index=no_default, values=no_default): return result -def unstack(df, level, fill_value=None): +def unstack(df, level, fill_value=None, sort: bool = True): """ Pivot one or more levels of the (necessarily hierarchical) index labels. @@ -1080,6 +1080,9 @@ def unstack(df, level, fill_value=None): levels of the index to pivot fill_value Non-functional argument provided for compatibility with Pandas. + sort : bool, default True + Sort the level(s) in the resulting MultiIndex columns. + Returns ------- @@ -1156,6 +1159,8 @@ def unstack(df, level, fill_value=None): if fill_value is not None: raise NotImplementedError("fill_value is not supported.") + elif sort is False: + raise NotImplementedError(f"{sort=} is not supported.") if pd.api.types.is_list_like(level): if not level: return df diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e12cc3d52fb..7163ef4ed59 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2063,6 +2063,7 @@ def sort_values( kind="quicksort", na_position="last", ignore_index=False, + key=None, ): """Sort by the values along either axis. @@ -2076,6 +2077,14 @@ def sort_values( 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. + key : callable, optional + Apply the key function to the values + before sorting. This is similar to the `key` argument in the + builtin :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. + Current not supported. Returns ------- @@ -2107,6 +2116,7 @@ def sort_values( kind=kind, na_position=na_position, ignore_index=ignore_index, + key=key, ) @_performance_tracking @@ -3431,7 +3441,9 @@ def rename(self, index=None, copy=True): return Series._from_data(out_data, self.index, name=index) @_performance_tracking - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") return Series._from_data( # TODO: Change to deep=False when copy-on-write is default data=self._data.copy(deep=True), @@ -3529,7 +3541,12 @@ def explode(self, ignore_index=False): @_performance_tracking def pct_change( - self, periods=1, fill_method=no_default, limit=no_default, freq=None + self, + periods=1, + fill_method=no_default, + limit=no_default, + freq=None, + **kwargs, ): """ Calculates the percent change between sequential elements @@ -3554,6 +3571,9 @@ def pct_change( freq : str, optional Increment to use from time series API. Not yet implemented. + **kwargs + Additional keyword arguments are passed into + `Series.shift`. 
         Returns
         -------
@@ -3598,11 +3618,15 @@
             warnings.simplefilter("ignore")
             data = self.fillna(method=fill_method, limit=limit)
         diff = data.diff(periods=periods)
-        change = diff / data.shift(periods=periods, freq=freq)
+        change = diff / data.shift(periods=periods, freq=freq, **kwargs)
         return change
 
     @_performance_tracking
-    def where(self, cond, other=None, inplace=False):
+    def where(self, cond, other=None, inplace=False, axis=None, level=None):
+        if axis is not None:
+            raise NotImplementedError("axis is not supported.")
+        elif level is not None:
+            raise NotImplementedError("level is not supported.")
         result_col = super().where(cond, other, inplace)
         return self._mimic_inplace(
             self._from_data_like_self(

From 57811bf866e7c50ebb4b988fb12620b0e3f9a5a8 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 18 Jul 2024 13:50:55 -0700
Subject: [PATCH 2/5] Clean up __arrow_c_stream__

---
 python/cudf/cudf/core/dataframe.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 1d2c756c88c..0c8357f69db 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1541,20 +1541,14 @@ def __array_function__(self, func, types, args, kwargs):
 
     def __arrow_c_stream__(self, requested_schema=None):
         """
-        Export the pandas DataFrame as an Arrow C stream PyCapsule.
-
-        This relies on pyarrow to convert the pandas DataFrame to the Arrow
-        format (and follows the default behaviour of ``pyarrow.Table.from_pandas``
-        in its handling of the index, i.e. store the index as a column except
-        for RangeIndex).
-        This conversion is not necessarily zero-copy.
+        Export the cudf DataFrame as an Arrow C stream PyCapsule.
 
         Parameters
         ----------
         requested_schema : PyCapsule, default None
             The schema to which the dataframe should be casted, passed as a
             PyCapsule containing a C ArrowSchema representation of the
-            requested schema.
+            requested schema. Currently not implemented.
 
         Returns
         -------

From 32c028fe2c388da5b27d8eb527b935b07f3f056a Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 18 Jul 2024 14:53:37 -0700
Subject: [PATCH 3/5] Make copy raise NotimplementedError

---
 python/cudf/cudf/core/dataframe.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 0c8357f69db..b0daa23e35d 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -609,6 +609,9 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin):
     dtype : dtype, default None
         Data type to force. Only a single dtype is allowed.
         If None, infer.
+    copy : bool or None, default None
+        Copy data from inputs.
+        Currently not implemented.
     nan_as_null : bool, Default True
         If ``None``/``True``, converts ``np.nan`` values to
        ``null`` values.
@@ -698,6 +701,8 @@ def __init__(
         copy=None,
         nan_as_null=no_default,
     ):
+        if copy is not None:
+            raise NotImplementedError("copy is not currently implemented.")
         super().__init__()
         if nan_as_null is no_default:
             nan_as_null = not cudf.get_option("mode.pandas_compatible")

From cf4a63d52a939119016af07eafd69f63629e00e5 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 19 Jul 2024 11:26:14 -0700
Subject: [PATCH 4/5] fix doc ref

---
 python/cudf/cudf/core/indexed_frame.py | 6 +++---
 python/cudf/cudf/core/series.py        | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index fe66eb3519b..548fe1d2d54 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -3482,9 +3482,9 @@ def sort_values(
             If True, index will not be sorted.
         key : callable, optional
             Apply the key function to the values
-            before sorting. This is similar to the `key` argument in the
-            builtin :meth:`sorted` function, with the notable difference that
-            this `key` function should be *vectorized*. It should expect a
+            before sorting. This is similar to the ``key`` argument in the
+            builtin ``sorted`` function, with the notable difference that
+            this ``key`` function should be *vectorized*. It should expect a
             ``Series`` and return a Series with the same shape as the input.
             It will be applied to each column in `by` independently.
             Currently not supported.
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 7163ef4ed59..1e2d7cee09e 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -2079,9 +2079,9 @@ def sort_values(
             If True, index will not be sorted.
         key : callable, optional
             Apply the key function to the values
-            before sorting. This is similar to the `key` argument in the
-            builtin :meth:`sorted` function, with the notable difference that
-            this `key` function should be *vectorized*. It should expect a
+            before sorting. This is similar to the ``key`` argument in the
+            builtin ``sorted`` function, with the notable difference that
+            this ``key`` function should be *vectorized*. It should expect a
             ``Series`` and return a Series with the same shape as the input.
             It will be applied to each column in `by` independently.
             Current not supported.

From 20c293451908c955b2dda9c4342e2eede4aa68f5 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 19 Jul 2024 11:26:31 -0700
Subject: [PATCH 5/5] Grammar

---
 python/cudf/cudf/core/series.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 1e2d7cee09e..1784c579bdf 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -2084,7 +2084,7 @@ def sort_values(
             this ``key`` function should be *vectorized*. It should expect a
             ``Series`` and return a Series with the same shape as the input.
             It will be applied to each column in `by` independently.
-            Current not supported.
+            Currently not supported.
 
         Returns
        -------
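The snippet below is a minimal usage sketch of the keywords introduced in this series, assuming a cudf build that already includes the patches above; the frame contents, variable names, and the pyarrow version note are illustrative assumptions, not part of the patches themselves.

    import cudf

    df = cudf.DataFrame({"a": [3, 1, 2], "b": [0.1, 0.2, 0.3]})

    # to_dict now forwards the pandas `index` keyword; as in pandas,
    # index=False is only accepted for the 'split' and 'tight' orients.
    split_dict = df.to_dict(orient="split", index=False)

    # Keywords accepted purely for pandas signature parity raise
    # NotImplementedError instead of being silently ignored.
    try:
        df.sort_values(by="a", key=lambda s: -s)
    except NotImplementedError:
        pass  # `key` is part of the signature but not yet supported

    # __arrow_c_stream__ exposes the frame through the Arrow PyCapsule
    # interface, so Arrow-aware consumers (e.g. recent pyarrow releases)
    # can ingest the DataFrame without going through pandas.
    import pyarrow as pa

    table = pa.table(df)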