Align more DataFrame APIs with pandas #16310

Merged: 6 commits, Jul 19, 2024
169 changes: 155 additions & 14 deletions python/cudf/cudf/core/dataframe.py
@@ -609,6 +609,9 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin):
dtype : dtype, default None
Data type to force. Only a single dtype is allowed.
If None, infer.
copy : bool or None, default None
Copy data from inputs.
Currently not implemented.
nan_as_null : bool, Default True
If ``None``/``True``, converts ``np.nan`` values to
``null`` values.
@@ -695,8 +698,11 @@ def __init__(
index=None,
columns=None,
dtype=None,
copy=None,
Contributor: Is copy meant to be non-functional for now? Should we raise NotImplementedError?

Contributor Author: Yeah, good catch; this is currently not implemented. I'll make it raise a NotImplementedError.

nan_as_null=no_default,
):
if copy is not None:
raise NotImplementedError("copy is not currently implemented.")
super().__init__()
if nan_as_null is no_default:
nan_as_null = not cudf.get_option("mode.pandas_compatible")
@@ -1538,6 +1544,25 @@ def __array_function__(self, func, types, args, kwargs):
pass
return NotImplemented

def __arrow_c_stream__(self, requested_schema=None):
"""
Export the cudf DataFrame as an Arrow C stream PyCapsule.

Parameters
----------
requested_schema : PyCapsule, default None
The schema to which the dataframe should be cast, passed as a
PyCapsule containing a C ArrowSchema representation of the
requested schema. Currently not implemented.

Returns
-------
PyCapsule
"""
if requested_schema is not None:
raise NotImplementedError("requested_schema is not supported")
return self.to_arrow().__arrow_c_stream__()
Contributor Author: If there's a more native way to do this that doesn't rely on pyarrow, I'm happy to use that.
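
For reference, a minimal consumption sketch (assuming pyarrow >= 14, which recognizes the Arrow PyCapsule interface; the example frame is illustrative):

import cudf
import pyarrow as pa

df = cudf.DataFrame({"a": [1, 2, 3]})
# pa.table() detects __arrow_c_stream__ on the object and imports the
# stream from the capsule; the device-to-host copy happens inside
# to_arrow() before the capsule is created.
tbl = pa.table(df)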


# The _get_numeric_data method is necessary for dask compatibility.
@_performance_tracking
def _get_numeric_data(self):
@@ -2249,6 +2274,7 @@ def to_dict(
self,
orient: str = "dict",
into: type[dict] = dict,
index: bool = True,
) -> dict | list[dict]:
"""
Convert the DataFrame to a dictionary.
@@ -2282,6 +2308,13 @@ def to_dict(
instance of the mapping type you want. If you want a
collections.defaultdict, you must pass it initialized.

index : bool, default True
Whether to include the index item (and index_names item if `orient`
is 'tight') in the returned dictionary. Can only be ``False``
when `orient` is 'split' or 'tight'. Note that when `orient` is
'records', this parameter has no effect (the index item is never
included).

Returns
-------
dict, list or collections.abc.Mapping
@@ -2363,7 +2396,7 @@ def to_dict(
raise TypeError(f"unsupported type: {into}")
return cons(self.items()) # type: ignore[misc]

return self.to_pandas().to_dict(orient=orient, into=into)
return self.to_pandas().to_dict(orient=orient, into=into, index=index)
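
Since the conversion delegates to to_pandas(), the new index argument mirrors pandas semantics. A short sketch (illustrative values):

import cudf

df = cudf.DataFrame({"a": [1, 2]}, index=["x", "y"])
df.to_dict(orient="split", index=True)
# {'index': ['x', 'y'], 'columns': ['a'], 'data': [[1], [2]]}
df.to_dict(orient="split", index=False)
# {'columns': ['a'], 'data': [[1], [2]]}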

@_performance_tracking
def scatter_by_map(
@@ -3018,7 +3051,12 @@ def fillna(
)

@_performance_tracking
def where(self, cond, other=None, inplace=False):
def where(self, cond, other=None, inplace=False, axis=None, level=None):
if axis is not None:
raise NotImplementedError("axis is not supported.")
elif level is not None:
raise NotImplementedError("level is not supported.")

from cudf.core._internals.where import (
_check_and_cast_columns_with_other,
_make_categorical_like,
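
A quick sketch of the new guards in where (illustrative frame; behavior is otherwise unchanged):

import cudf

df = cudf.DataFrame({"a": [1, 2, 3]})
df.where(df > 1, other=0)          # works as before
df.where(df > 1, other=0, axis=1)  # raises NotImplementedError for now
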
@@ -3628,7 +3666,9 @@ def rename(
return result

@_performance_tracking
def add_prefix(self, prefix):
def add_prefix(self, prefix, axis=None):
if axis is not None:
raise NotImplementedError("axis is currently not implemented.")
# TODO: Change to deep=False when copy-on-write is default
out = self.copy(deep=True)
out.columns = [
@@ -4244,6 +4284,7 @@ def join(
lsuffix="",
rsuffix="",
sort=False,
validate: str | None = None,
):
"""Join columns with other DataFrame on index or on a key column.

@@ -4257,6 +4298,16 @@
column names when avoiding conflicts.
sort : bool
Set to True to ensure sorted ordering.
validate : str, optional
If specified, checks if join is of specified type.

* "one_to_one" or "1:1": check if join keys are unique in both left
and right datasets.
* "one_to_many" or "1:m": check if join keys are unique in left dataset.
* "many_to_one" or "m:1": check if join keys are unique in right dataset.
* "many_to_many" or "m:m": allowed, but does not result in checks.

Currently not supported.

Returns
-------
Expand All @@ -4270,6 +4321,10 @@ def join(
"""
if on is not None:
raise NotImplementedError("The on parameter is not yet supported")
elif validate is not None:
raise NotImplementedError(
"The validate parameter is not yet supported"
)

df = self.merge(
other,
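
A sketch of the new validate guard (illustrative frames; the index join itself is unchanged):

import cudf

left = cudf.DataFrame({"a": [1, 2]}, index=["x", "y"])
right = cudf.DataFrame({"b": [3, 4]}, index=["x", "y"])

left.join(right)                  # index join works as before
left.join(right, validate="1:1")  # raises NotImplementedError for now
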
@@ -4420,7 +4475,16 @@ def query(self, expr, local_dict=None):

@_performance_tracking
def apply(
self, func, axis=1, raw=False, result_type=None, args=(), **kwargs
self,
func,
axis=1,
raw=False,
result_type=None,
args=(),
by_row: Literal[False, "compat"] = "compat",
engine: Literal["python", "numba"] = "python",
engine_kwargs: dict[str, bool] | None = None,
**kwargs,
):
"""
Apply a function along an axis of the DataFrame.
@@ -4448,6 +4512,25 @@ def apply(
Not yet supported
args: tuple
Positional arguments to pass to func in addition to the dataframe.
by_row : False or "compat", default "compat"
Only has an effect when ``func`` is a listlike or dictlike of funcs
and the func isn't a string.
If "compat", will if possible first translate the func into pandas
methods (e.g. ``Series().apply(np.sum)`` will be translated to
``Series().sum()``). If that doesn't work, will try call to apply again with
``by_row=True`` and if that fails, will call apply again with
``by_row=False`` (backward compatible).
If False, the funcs will be passed the whole Series at once.

Currently not supported.

engine : {'python', 'numba'}, default 'python'
Unused. Added for compatibility with pandas.
engine_kwargs : dict
Unused. Added for compatibility with pandas.
**kwargs
Additional keyword arguments to pass as keyword arguments to
`func`.

Examples
--------
@@ -4598,13 +4681,17 @@ def apply(
<https://docs.rapids.ai/api/cudf/stable/user_guide/guide-to-udfs.html>
"""
if axis != 1:
raise ValueError(
raise NotImplementedError(
"DataFrame.apply currently only supports row wise ops"
)
if raw:
raise ValueError("The `raw` kwarg is not yet supported.")
raise NotImplementedError("The `raw` kwarg is not yet supported.")
if result_type is not None:
raise ValueError("The `result_type` kwarg is not yet supported.")
raise NotImplementedError(
"The `result_type` kwarg is not yet supported."
)
if by_row != "compat":
raise NotImplementedError("by_row is currently not supported.")

return self._apply(func, _get_row_kernel, *args, **kwargs)
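
One behavioral note from this hunk: unsupported options now raise NotImplementedError rather than ValueError. A sketch (illustrative UDF):

import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})

# Row-wise UDFs are unchanged (compiled via the existing kernel path).
df.apply(lambda row: row["a"] + row["b"], axis=1)

# These now raise NotImplementedError instead of ValueError.
df.apply(lambda row: row["a"], axis=0)
df.apply(lambda row: row["a"], axis=1, by_row=False)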

@@ -5504,7 +5591,7 @@ def from_arrow(cls, table):
return out

@_performance_tracking
def to_arrow(self, preserve_index=None):
def to_arrow(self, preserve_index=None) -> pa.Table:
"""
Convert to a PyArrow Table.

@@ -5594,18 +5681,36 @@ def to_arrow(self, preserve_index=None):
return out.replace_schema_metadata(metadata)

@_performance_tracking
def to_records(self, index=True):
def to_records(self, index=True, column_dtypes=None, index_dtypes=None):
"""Convert to a numpy recarray

Parameters
----------
index : bool
Whether to include the index in the output.
column_dtypes : str, type, dict, default None
If a string or type, the data type to store all columns. If
a dictionary, a mapping of column names and indices (zero-indexed)
to specific data types. Currently not supported.
index_dtypes : str, type, dict, default None
If a string or type, the data type to store all index levels. If
a dictionary, a mapping of index level names and indices
(zero-indexed) to specific data types.
This mapping is applied only if `index=True`.
Currently not supported.

Returns
-------
numpy recarray
"""
if column_dtypes is not None:
raise NotImplementedError(
"column_dtypes is currently not supported."
)
elif index_dtypes is not None:
raise NotImplementedError(
"column_dtypes is currently not supported."
)
members = [("index", self.index.dtype)] if index else []
members += [(col, self[col].dtype) for col in self._data.names]
dtype = np.dtype(members)
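
A sketch of the result shape and the new guards (illustrative data):

import cudf

df = cudf.DataFrame({"a": [1, 2]}, index=[10, 11])

rec = df.to_records(index=True)    # structured array with an "index" field
rec["a"]                           # array([1, 2])
df.to_records(column_dtypes="f8")  # raises NotImplementedError until supported
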
@@ -5618,7 +5723,16 @@

@classmethod
@_performance_tracking
def from_records(cls, data, index=None, columns=None, nan_as_null=False):
def from_records(
cls,
data,
index=None,
exclude=None,
columns=None,
coerce_float: bool = False,
nrows: int | None = None,
nan_as_null=False,
):
"""
Convert structured or record ndarray to DataFrame.

Expand All @@ -5628,13 +5742,32 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):
index : str, array-like
The name of the index column in *data*.
If None, the default index is used.
exclude : sequence, default None
Columns or fields to exclude.
Currently not implemented.
columns : list of str
List of column names to include.
coerce_float : bool, default False
Attempt to convert values of non-string, non-numeric objects (like
decimal.Decimal) to floating point, useful for SQL result sets.
Currently not implemented.
nrows : int, default None
Number of rows to read if data is an iterator.
Currently not implemented.

Returns
-------
DataFrame
"""
if exclude is not None:
raise NotImplementedError("exclude is currently not supported.")
if coerce_float is not False:
raise NotImplementedError(
"coerce_float is currently not supported."
)
if nrows is not None:
raise NotImplementedError("nrows is currently not supported.")

if data.ndim != 1 and data.ndim != 2:
raise ValueError(
f"records dimension expected 1 or 2 but found {data.ndim}"
@@ -7359,9 +7492,9 @@ def pivot_table(

@_performance_tracking
@copy_docstring(reshape.unstack)
def unstack(self, level=-1, fill_value=None):
def unstack(self, level=-1, fill_value=None, sort: bool = True):
return cudf.core.reshape.unstack(
self, level=level, fill_value=fill_value
self, level=level, fill_value=fill_value, sort=sort
)

@_performance_tracking
@@ -7407,7 +7540,12 @@ def explode(self, column, ignore_index=False):
return super()._explode(column, ignore_index)

def pct_change(
self, periods=1, fill_method=no_default, limit=no_default, freq=None
self,
periods=1,
fill_method=no_default,
limit=no_default,
freq=None,
**kwargs,
):
"""
Calculates the percent change between sequential elements
Expand All @@ -7432,6 +7570,9 @@ def pct_change(
freq : str, optional
Increment to use from time series API.
Not yet implemented.
**kwargs
Additional keyword arguments are passed into
`DataFrame.shift`.

Returns
-------
Expand Down Expand Up @@ -7477,7 +7618,7 @@ def pct_change(
data = self.fillna(method=fill_method, limit=limit)

return data.diff(periods=periods) / data.shift(
periods=periods, freq=freq
periods=periods, freq=freq, **kwargs
)
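
The kwargs pass-through means any extra keyword is forwarded to DataFrame.shift. A sketch (fill_value here is an assumption taken from shift's existing signature, not from this diff):

import cudf

df = cudf.DataFrame({"a": [10.0, 11.0, 13.0]})
df.pct_change(periods=1)
# Forwarded to shift(); fills the shifted frame instead of leaving nulls,
# which changes the first row's denominator.
df.pct_change(periods=1, fill_value=0)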

def __dataframe__(
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/frame.py
@@ -389,7 +389,7 @@ def values_host(self) -> np.ndarray:
return self.to_numpy()

@_performance_tracking
def __array__(self, dtype=None):
def __array__(self, dtype=None, copy=None):
raise TypeError(
"Implicit conversion to a host NumPy array via __array__ is not "
"allowed, To explicitly construct a GPU matrix, consider using "
13 changes: 12 additions & 1 deletion python/cudf/cudf/core/indexed_frame.py
@@ -3302,7 +3302,7 @@ def pad(self, value=None, axis=None, inplace=None, limit=None):
)
return self.ffill(value=value, axis=axis, inplace=inplace, limit=limit)

def add_prefix(self, prefix):
def add_prefix(self, prefix, axis=None):
"""
Prefix labels with string `prefix`.

@@ -3464,6 +3466,7 @@ def sort_values(
kind="quicksort",
na_position="last",
ignore_index=False,
key=None,
):
"""Sort by the values along either axis.

@@ -3479,6 +3480,14 @@
'first' puts nulls at the beginning, 'last' puts nulls at the end
ignore_index : bool, default False
If True, index will not be sorted.
key : callable, optional
Apply the key function to the values
before sorting. This is similar to the ``key`` argument in the
builtin ``sorted`` function, with the notable difference that
this ``key`` function should be *vectorized*. It should expect a
``Series`` and return a Series with the same shape as the input.
It will be applied to each column in `by` independently.
Currently not supported.

Returns
-------
@@ -3518,6 +3527,8 @@
)
if axis != 0:
raise NotImplementedError("`axis` not currently implemented.")
if key is not None:
raise NotImplementedError("key is not currently supported.")

if len(self) == 0:
return self
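
A sketch of the new key guard (illustrative frame):

import cudf

df = cudf.DataFrame({"a": [3, 1, 2]})
df.sort_values("a")                    # unchanged
df.sort_values("a", key=lambda s: -s)  # raises NotImplementedError for now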