From cf0ad3430f678c45fb8e80799e26be6f0a78abbd Mon Sep 17 00:00:00 2001
From: brandon-b-miller
Date: Fri, 26 Mar 2021 11:13:56 -0700
Subject: [PATCH 1/5] add dispatch from dask

---
 python/dask_cudf/dask_cudf/backends.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index bb52ebce262..93df1680bf2 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -6,7 +6,11 @@
 
 from dask.dataframe.categorical import categorical_dtype_dispatch
 from dask.dataframe.core import get_parallel_type, make_meta, meta_nonempty
-from dask.dataframe.methods import concat_dispatch, tolist_dispatch
+from dask.dataframe.methods import (
+    concat_dispatch,
+    is_categorical_dtype_dispatch,
+    tolist_dispatch,
+)
 from dask.dataframe.utils import (
     UNKNOWN_CATEGORIES,
     _nonempty_scalar,
@@ -220,6 +224,14 @@ def tolist_cudf(obj):
     return obj.to_arrow().to_pylist()
 
 
+@is_categorical_dtype_dispatch.register(
+    (cudf.Series, cudf.Index, cudf.CategoricalDtype, Series)
+)
+def is_categorical_dtype_cudf(obj):
+    obj = getattr(obj, "dtype", obj)
+    return cudf.utils.dtypes.is_categorical_dtype(obj)
+
+
 try:
     from dask.dataframe.utils import group_split_dispatch, hash_object_dispatch
 

From e377f891a4515aca5ca9a6f7d9225e69e2dd060c Mon Sep 17 00:00:00 2001
From: brandon-b-miller
Date: Fri, 26 Mar 2021 11:16:13 -0700
Subject: [PATCH 2/5] remove unnecessary code

---
 python/dask_cudf/dask_cudf/backends.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 93df1680bf2..2a43aa06a8f 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -228,7 +228,6 @@ def tolist_cudf(obj):
     (cudf.Series, cudf.Index, cudf.CategoricalDtype, Series)
 )
 def is_categorical_dtype_cudf(obj):
-    obj = getattr(obj, "dtype", obj)
     return cudf.utils.dtypes.is_categorical_dtype(obj)
 
 

From b9cee529e0c2162adc2cfc97e66faf17cefc2471 Mon Sep 17 00:00:00 2001
From: brandon-b-miller
Date: Mon, 29 Mar 2021 16:23:12 -0700
Subject: [PATCH 3/5] add tests

---
 .../dask_cudf/tests/test_dispatch.py         | 16 +++++++++++++
 .../dask_cudf/dask_cudf/tests/test_onehot.py | 23 +++++++++++++++++--
 2 files changed, 37 insertions(+), 2 deletions(-)
 create mode 100644 python/dask_cudf/dask_cudf/tests/test_dispatch.py

diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py
new file mode 100644
index 00000000000..6bf4b956404
--- /dev/null
+++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py
@@ -0,0 +1,16 @@
+import pandas as pd
+
+from dask.dataframe.methods import is_categorical_dtype
+
+import cudf
+
+
+def test_is_categorical_dispatch():
+    assert is_categorical_dtype(pd.CategoricalDtype([1, 2, 3]))
+    assert is_categorical_dtype(cudf.CategoricalDtype([1, 2, 3]))
+
+    assert is_categorical_dtype(cudf.Series([1, 2, 3], dtype="category"))
+    assert is_categorical_dtype(pd.Series([1, 2, 3], dtype="category"))
+
+    assert is_categorical_dtype(pd.Index([1, 2, 3], dtype="category"))
+    assert is_categorical_dtype(cudf.Index([1, 2, 3], dtype="category"))
diff --git a/python/dask_cudf/dask_cudf/tests/test_onehot.py b/python/dask_cudf/dask_cudf/tests/test_onehot.py
index d5fb9e9a110..a9d88b5203c 100644
--- a/python/dask_cudf/dask_cudf/tests/test_onehot.py
+++ b/python/dask_cudf/dask_cudf/tests/test_onehot.py
@@ -3,10 +3,10 @@
 
 from dask import dataframe as dd
 
-import dask_cudf
-
 import cudf
 
+import dask_cudf
+
 
 def test_get_dummies_cat():
     df = pd.DataFrame({"C": [], "A": []})
@@ -101,3 +101,22 @@ def test_get_dummies_large():
         dd.get_dummies(gddf).compute(),
         check_dtype=False,
     )
+
+
+def test_get_dummies_categorical():
+    # https://github.com/rapidsai/cudf/issues/7111
+    gdf = cudf.DataFrame({"A": ["a", "b", "b"], "B": [1, 2, 3]})
+    pdf = gdf.to_pandas()
+
+    gddf = dask_cudf.from_cudf(gdf, npartitions=1)
+    gddf = gddf.categorize(columns=["B"])
+
+    pddf = dd.from_pandas(pdf, npartitions=1)
+    pddf = pddf.categorize(columns=["B"])
+
+    expect = dd.get_dummies(pddf, columns=["B"])
+    got = dd.get_dummies(gddf, columns=["B"])
+
+    dd.assert_eq(
+        expect, got,
+    )

From c629417259bf1e9493b50a22e9cd373e785230b9 Mon Sep 17 00:00:00 2001
From: brandon-b-miller
Date: Tue, 30 Mar 2021 07:05:44 -0700
Subject: [PATCH 4/5] bump and test dask version

---
 ci/gpu/build.sh                          | 4 ++--
 conda/environments/cudf_dev_cuda10.1.yml | 2 +-
 conda/environments/cudf_dev_cuda10.2.yml | 2 +-
 conda/environments/cudf_dev_cuda11.0.yml | 2 +-
 conda/recipes/dask-cudf/meta.yaml        | 4 ++--
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 7614e19cc89..c88dd4c0315 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -83,8 +83,8 @@ gpuci_conda_retry install -y \
                   "ucx-py=${MINOR_VERSION}"
 
 # https://docs.rapids.ai/maintainers/depmgmt/
-# gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env
-# gpuci_conda_retry install -y "your-pkg=1.0.0"
+gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env
+gpuci_conda_retry install -y "dask>=2021.3.1"
 
 
 gpuci_logger "Check compiler versions"
diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml
index 35108ddd8ca..fa0b1126190 100644
--- a/conda/environments/cudf_dev_cuda10.1.yml
+++ b/conda/environments/cudf_dev_cuda10.1.yml
@@ -43,7 +43,7 @@ dependencies:
   - mypy=0.782
   - typing_extensions
   - pre_commit
-  - dask>=2.22.0
+  - dask>=2021.3.1
   - distributed>=2.22.0
   - streamz
   - dlpack
diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml
index 3a24e38a397..52d82c4f4ef 100644
--- a/conda/environments/cudf_dev_cuda10.2.yml
+++ b/conda/environments/cudf_dev_cuda10.2.yml
@@ -43,7 +43,7 @@ dependencies:
   - mypy=0.782
   - typing_extensions
   - pre_commit
-  - dask>=2.22.0
+  - dask>=2021.3.1
   - distributed>=2.22.0
   - streamz
   - dlpack
diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml
index 821c6f5320d..2e64365bdf6 100644
--- a/conda/environments/cudf_dev_cuda11.0.yml
+++ b/conda/environments/cudf_dev_cuda11.0.yml
@@ -43,7 +43,7 @@ dependencies:
   - mypy=0.782
   - typing_extensions
   - pre_commit
-  - dask>=2.22.0
+  - dask>=2021.3.1
   - distributed>=2.22.0
   - streamz
   - dlpack
diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml
index 170075743bd..66bffdfd61e 100644
--- a/conda/recipes/dask-cudf/meta.yaml
+++ b/conda/recipes/dask-cudf/meta.yaml
@@ -23,12 +23,12 @@ requirements:
   host:
     - python
     - cudf {{ version }}
-    - dask >=2.22.0
+    - dask>=2021.3.1
     - distributed >=2.22.0
   run:
     - python
     - cudf {{ version }}
-    - dask >=2.22.0
+    - dask>=2021.3.1
     - distributed >=2.22.0
 
 test:

From d6240ce22995c861cee90dd9dc7ad9afada41855 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Tue, 30 Mar 2021 12:04:19 -0500
Subject: [PATCH 5/5] Update ci/gpu/build.sh

Co-authored-by: Keith Kraus
---
 ci/gpu/build.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index c88dd4c0315..7614e19cc89 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -83,8 +83,8 @@ gpuci_conda_retry install -y \
                   "ucx-py=${MINOR_VERSION}"
 
 # https://docs.rapids.ai/maintainers/depmgmt/
-gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env
-gpuci_conda_retry install -y "dask>=2021.3.1"
+# gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env
+# gpuci_conda_retry install -y "your-pkg=1.0.0"
 
 
 gpuci_logger "Check compiler versions"