Skip to content

Commit

Permalink
Deprecate to/from_dask_dataframe APIs in dask-cudf (#15592)
Browse files Browse the repository at this point in the history
The `to/from_dask_dataframe` APIs have been obsolete for a long time. It is always better to use `ddf.to_backend("cudf")` or `ddf.to_backend("pandas")` instead.

These APIs are also "dangerous" to use with dask-expr, because the same API names are still used to convert data to/from "legacy" Dask collections. Note that dask-expr also deprecated `to/from_dask_dataframe` in favor of `to/from_legacy_dataframe`, but the conflicting APIs still exist (for now).

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

URL: #15592
  • Loading branch information
rjzamora authored Apr 30, 2024
1 parent 5287580 commit b9c6d4c
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 15 deletions.
3 changes: 1 addition & 2 deletions docs/dask_cudf/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,11 @@ Creating and storing DataFrames
of DataFrames from a variety of storage formats. For on-disk data that
are not supported directly in Dask-cuDF, we recommend using Dask's
data reading facilities, followed by calling
:func:`.from_dask_dataframe` to obtain a Dask-cuDF object.
:meth:`*.to_backend("cudf")` to obtain a Dask-cuDF object.

.. automodule:: dask_cudf
:members:
from_cudf,
from_dask_dataframe,
from_delayed,
read_csv,
read_json,
Expand Down
30 changes: 26 additions & 4 deletions python/dask_cudf/dask_cudf/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,20 @@ def __repr__(self):

@_dask_cudf_nvtx_annotate
def to_dask_dataframe(self, **kwargs):
"""Create a dask.dataframe object from a dask_cudf object"""
nullable = kwargs.get("nullable", False)
return self.map_partitions(M.to_pandas, nullable=nullable)
"""Create a dask.dataframe object from a dask_cudf object
WARNING: This API is deprecated, and may not work properly
when query-planning is active. Please use `*.to_backend("pandas")`
to convert the underlying data to pandas.
"""

warnings.warn(
"The `to_dask_dataframe` API is now deprecated. "
"Please use `*.to_backend('pandas')` instead.",
FutureWarning,
)

return self.to_backend("pandas", **kwargs)


concat = dd.concat
Expand Down Expand Up @@ -733,6 +744,10 @@ def from_dask_dataframe(df):
Convert a Dask :class:`dask.dataframe.DataFrame` to a Dask-cuDF
one.
WARNING: This API is deprecated, and may not work properly
when query-planning is active. Please use `*.to_backend("cudf")`
to convert the underlying data to cudf.
Parameters
----------
df : dask.dataframe.DataFrame
Expand All @@ -742,7 +757,14 @@ def from_dask_dataframe(df):
-------
dask_cudf.DataFrame : A new Dask collection backed by cuDF objects
"""
return df.map_partitions(cudf.from_pandas)

warnings.warn(
"The `from_dask_dataframe` API is now deprecated. "
"Please use `*.to_backend('cudf')` instead.",
FutureWarning,
)

return df.to_backend("cudf")


for name in (
Expand Down
29 changes: 23 additions & 6 deletions python/dask_cudf/dask_cudf/expr/_collection.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import warnings
from functools import cached_property

from dask_expr import (
Expand All @@ -22,9 +23,25 @@
##


# VarMixin can be removed if cudf#15179 is addressed.
# See: https://github.com/rapidsai/cudf/issues/15179
class VarMixin:
class CudfFrameBase(FrameBase):
def to_dask_dataframe(self, **kwargs):
    """Create a dask.dataframe object from a dask_cudf object.

    WARNING: This API is deprecated, and may not work properly.
    Please use `*.to_backend("pandas")` to convert the
    underlying data to pandas.

    Parameters
    ----------
    **kwargs
        Forwarded unchanged to ``self.to_backend("pandas", ...)``.

    Returns
    -------
    The result of ``self.to_backend("pandas", **kwargs)``.

    Warns
    -----
    FutureWarning
        Always emitted, because this API is deprecated.
    """
    # stacklevel=2 attributes the warning to the code that called this
    # deprecated method, so users see their own file and line number.
    warnings.warn(
        "The `to_dask_dataframe` API is now deprecated. "
        "Please use `*.to_backend('pandas')` instead.",
        FutureWarning,
        stacklevel=2,
    )

    return self.to_backend("pandas", **kwargs)

# var can be removed if cudf#15179 is addressed.
# See: https://github.com/rapidsai/cudf/issues/15179
def var(
self,
axis=0,
Expand All @@ -49,7 +66,7 @@ def var(
)


class DataFrame(VarMixin, DXDataFrame):
class DataFrame(DXDataFrame, CudfFrameBase):
@classmethod
def from_dict(cls, *args, **kwargs):
with config.set({"dataframe.backend": "cudf"}):
Expand Down Expand Up @@ -94,7 +111,7 @@ def read_text(*args, **kwargs):
return from_legacy_dataframe(ddf)


class Series(VarMixin, DXSeries):
class Series(DXSeries, CudfFrameBase):
def groupby(self, by, **kwargs):
from dask_cudf.expr._groupby import SeriesGroupBy

Expand All @@ -113,7 +130,7 @@ def struct(self):
return StructMethods(self)


class Index(DXIndex):
class Index(DXIndex, CudfFrameBase):
pass # Same as pandas (for now)


Expand Down
2 changes: 1 addition & 1 deletion python/dask_cudf/dask_cudf/io/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def test_roundtrip_from_dask_none_index_false(tmpdir):
@pytest.mark.parametrize("write_meta", [True, False])
def test_roundtrip_from_dask_cudf(tmpdir, write_meta):
tmpdir = str(tmpdir)
gddf = dask_cudf.from_dask_dataframe(ddf)
gddf = ddf.to_backend("cudf")
gddf.to_parquet(tmpdir, write_metadata_file=write_meta)

gddf2 = dask_cudf.read_parquet(tmpdir, calculate_divisions=True)
Expand Down
24 changes: 24 additions & 0 deletions python/dask_cudf/dask_cudf/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,30 @@ def test_from_dict_backend_dispatch():
dd.assert_eq(expect, ddf)


def test_to_dask_dataframe_deprecated():
    """Check that ``to_dask_dataframe`` warns and returns pandas-backed data."""
    # Build a two-partition collection whose metadata is a cudf DataFrame.
    source = cudf.DataFrame({"a": range(100)})
    collection = dd.from_pandas(source, npartitions=2)
    assert isinstance(collection._meta, cudf.DataFrame)

    # The deprecated call must emit a FutureWarning and still convert
    # the metadata to a pandas DataFrame.
    with pytest.warns(FutureWarning, match="API is now deprecated"):
        converted = collection.to_dask_dataframe()
        assert isinstance(converted._meta, pd.DataFrame)


def test_from_dask_dataframe_deprecated():
    """Check that ``from_dask_dataframe`` warns and returns cudf-backed data."""
    # Build a two-partition collection whose metadata is a pandas DataFrame.
    source = pd.DataFrame({"a": range(100)})
    collection = dd.from_pandas(source, npartitions=2)
    assert isinstance(collection._meta, pd.DataFrame)

    # The deprecated call must emit a FutureWarning and still convert
    # the metadata to a cudf DataFrame.
    with pytest.warns(FutureWarning, match="API is now deprecated"):
        converted = dask_cudf.from_dask_dataframe(collection)
        assert isinstance(converted._meta, cudf.DataFrame)


def test_to_backend():
np.random.seed(0)
data = {
Expand Down
4 changes: 2 additions & 2 deletions python/dask_cudf/dask_cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,9 +562,9 @@ def test_groupby_reset_index_string_name():
def test_groupby_categorical_key():
# See https://github.com/rapidsai/cudf/issues/4608
df = dask.datasets.timeseries()
gddf = dask_cudf.from_dask_dataframe(df)
gddf = df.to_backend("cudf")
gddf["name"] = gddf["name"].astype("category")
ddf = gddf.to_dask_dataframe()
ddf = gddf.to_backend("pandas")

got = gddf.groupby("name", sort=True).agg(
{"x": ["mean", "max"], "y": ["mean", "count"]}
Expand Down

0 comments on commit b9c6d4c

Please sign in to comment.