From cb046bb329432e410460696a4c8bc604bed0783a Mon Sep 17 00:00:00 2001 From: rjzamora Date: Fri, 19 Apr 2024 12:31:03 -0700 Subject: [PATCH 1/7] fix categorical support for dask-expr - needs upstream fix to clear_known_categories --- python/cudf/cudf/core/series.py | 12 ++++++++++++ python/dask_cudf/dask_cudf/tests/test_accessor.py | 5 ++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 275dc664175..70284d4cbe5 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -86,6 +86,7 @@ to_cudf_compatible_scalar, ) from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import _warn_no_dask_cudf def _format_percentile_names(percentiles): @@ -3658,6 +3659,17 @@ def where(self, cond, other=None, inplace=False): inplace=inplace, ) + @_cudf_nvtx_annotate + @_warn_no_dask_cudf + def __dask_tokenize__(self): + from dask.base import normalize_token + + return [ + type(self), + str(self.dtype), + normalize_token(self.to_pandas()), + ] + def make_binop_func(op): # This function is used to wrap binary operations in Frame with an diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index ebb8e4be187..54094e06e54 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -111,7 +111,7 @@ def test_categorical_accessor_initialization2(data): dsr.cat -@xfail_dask_expr("TODO: Unexplained dask-expr failure") +@xfail_dask_expr("clear_known_categories needs to be generalize") @pytest.mark.parametrize("data", [data_cat_1()]) def test_categorical_basic(data): cat = data.copy() @@ -203,7 +203,6 @@ def test_categorical_compare_unordered(data): dsr < dsr -@xfail_dask_expr("TODO: Unexplained dask-expr failure") @pytest.mark.parametrize("data", [data_cat_3()]) def test_categorical_compare_ordered(data): cat1 = data[0].copy() @@ -274,7 +273,7 @@ def test_categorical_categories(): ) -@xfail_dask_expr("TODO: Unexplained dask-expr failure") +@xfail_dask_expr("Categories are ordered differently in cudf") def test_categorical_as_known(): df = dask_cudf.from_cudf(DataFrame({"col_1": [0, 1, 2, 3]}), npartitions=2) df["col_1"] = df["col_1"].astype("category") From fde16512280403c2e2950849f813cb06c4849dd7 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Wed, 24 Apr 2024 07:20:04 -0700 Subject: [PATCH 2/7] adjust tests --- .../dask_cudf/dask_cudf/tests/test_accessor.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 54094e06e54..584926a647e 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -111,7 +111,8 @@ def test_categorical_accessor_initialization2(data): dsr.cat -@xfail_dask_expr("clear_known_categories needs to be generalize") +# TODO: Remove this once we are pinned to dask>=2024.5.0 +@xfail_dask_expr("Requires: https://github.com/dask/dask/pull/11059") @pytest.mark.parametrize("data", [data_cat_1()]) def test_categorical_basic(data): cat = data.copy() @@ -273,7 +274,6 @@ def test_categorical_categories(): ) -@xfail_dask_expr("Categories are ordered differently in cudf") def test_categorical_as_known(): df = dask_cudf.from_cudf(DataFrame({"col_1": [0, 1, 2, 3]}), npartitions=2) df["col_1"] = df["col_1"].astype("category") @@ -282,7 +282,19 @@ def test_categorical_as_known(): pdf = dd.from_pandas(pd.DataFrame({"col_1": [0, 1, 2, 3]}), npartitions=2) pdf["col_1"] = pdf["col_1"].astype("category") expected = pdf["col_1"].cat.as_known() - dd.assert_eq(expected, actual) + + # Note: Categories may be ordered differently in + # cudf and pandas. Therefore, we need to compare + # the global set of categories (before and after + # calling `compute`), then we need to check that + # the initial order of rows was preserved. + assert set(expected.cat.categories) == set( + actual.cat.categories.values_host + ) + assert set(expected.compute().cat.categories) == set( + actual.compute().cat.categories.values_host + ) + dd.assert_eq(expected, actual.astype(expected.dtype)) def test_str_slice(): From c2bc812821afd9a32a07473b85f8bbb7939a279a Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 25 Apr 2024 07:29:45 -0700 Subject: [PATCH 3/7] roll back Series.__dask_tokenize__ change in favor of simpler fix --- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/core/series.py | 12 ------------ 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 48e80d8162f..b647a48a50a 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6308,7 +6308,7 @@ def __dask_tokenize__(self): return [ type(self), - normalize_token(self._dtypes), + str(self._dtypes), normalize_token(self.index), normalize_token(self.hash_values().values_host), ] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 70284d4cbe5..275dc664175 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -86,7 +86,6 @@ to_cudf_compatible_scalar, ) from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate -from cudf.utils.utils import _warn_no_dask_cudf def _format_percentile_names(percentiles): @@ -3659,17 +3658,6 @@ def where(self, cond, other=None, inplace=False): inplace=inplace, ) - @_cudf_nvtx_annotate - @_warn_no_dask_cudf - def __dask_tokenize__(self): - from dask.base import normalize_token - - return [ - type(self), - str(self.dtype), - normalize_token(self.to_pandas()), - ] - def make_binop_func(op): # This function is used to wrap binary operations in Frame with an From 0f01712dcec27a8ca9526fab3870c0fbdbbf4a66 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 25 Apr 2024 08:11:07 -0700 Subject: [PATCH 4/7] normalize categories just in case the list is too long for repr --- python/cudf/cudf/core/frame.py | 5 +++++ python/cudf/cudf/core/indexed_frame.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 01842b5f0a9..925a5314ce6 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1947,6 +1947,11 @@ def __dask_tokenize__(self): return [ type(self), str(self._dtypes), + *[ + normalize_token(cat.categories) + for cat in self._dtypes.values() + if cat == "category" + ], normalize_token(self.to_pandas()), ] diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index b647a48a50a..bec97bd3290 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6309,6 +6309,11 @@ def __dask_tokenize__(self): return [ type(self), str(self._dtypes), + *[ + normalize_token(cat.categories) + for cat in self._dtypes.values() + if cat == "category" + ], normalize_token(self.index), normalize_token(self.hash_values().values_host), ] From b4e7c6632928ccdd7c8ae584e09955eab7795740 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Thu, 25 Apr 2024 10:19:49 -0500 Subject: [PATCH 5/7] Update python/cudf/cudf/core/frame.py --- python/cudf/cudf/core/frame.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 925a5314ce6..01842b5f0a9 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1947,11 +1947,6 @@ def __dask_tokenize__(self): return [ type(self), str(self._dtypes), - *[ - normalize_token(cat.categories) - for cat in self._dtypes.values() - if cat == "category" - ], normalize_token(self.to_pandas()), ] From 5432d20bd7ee366f8504d8022260b058cd6c7210 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Tue, 30 Apr 2024 08:01:09 -0700 Subject: [PATCH 6/7] use dask version instead of dask-expr version for lt_version --- python/dask_cudf/dask_cudf/io/tests/test_json.py | 4 ++-- python/dask_cudf/dask_cudf/io/tests/test_orc.py | 4 ++-- python/dask_cudf/dask_cudf/io/tests/test_parquet.py | 2 +- python/dask_cudf/dask_cudf/io/tests/test_text.py | 4 ++-- python/dask_cudf/dask_cudf/tests/utils.py | 11 +++++------ 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py index a09dfbff188..f8e5be0a417 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_json.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py @@ -12,8 +12,8 @@ import dask_cudf from dask_cudf.tests.utils import skip_dask_expr -# No dask-expr support for dask_expr<1.0.6 -pytestmark = skip_dask_expr(lt_version="1.0.6") +# No dask-expr support for dask<2024.4.0 +pytestmark = skip_dask_expr(lt_version="2024.4.0") def test_read_json_backend_dispatch(tmp_path): diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py index 7be6c712511..457e5546bd9 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py @@ -14,8 +14,8 @@ import dask_cudf from dask_cudf.tests.utils import skip_dask_expr -# No dask-expr support for dask_expr<1.0.6 -pytestmark = skip_dask_expr(lt_version="1.0.6") +# No dask-expr support for dask<2024.4.0 +pytestmark = skip_dask_expr(lt_version="2024.4.0") cur_dir = os.path.dirname(__file__) sample_orc = os.path.join(cur_dir, "data/orc/sample.orc") diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 68460653119..413b4037ed4 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -536,7 +536,7 @@ def test_check_file_size(tmpdir): dask_cudf.io.read_parquet(fn, check_file_size=1).compute() -@xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="1.0") +@xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="2024.3.0") def test_null_partition(tmpdir): import pyarrow as pa from pyarrow.dataset import HivePartitioning diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py index e3a9d380857..8912b7d5da6 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_text.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py @@ -11,8 +11,8 @@ import dask_cudf from dask_cudf.tests.utils import skip_dask_expr -# No dask-expr support for dask_expr<1.0.6 -pytestmark = skip_dask_expr(lt_version="1.0.6") +# No dask-expr support for dask<2024.4.0 +pytestmark = skip_dask_expr(lt_version="2024.4.0") cur_dir = os.path.dirname(__file__) text_file = os.path.join(cur_dir, "data/text/sample.pgn") diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index 1ca1758736b..c7dedbb6b4a 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -5,6 +5,7 @@ import pytest from packaging.version import Version +import dask import dask.dataframe as dd import cudf @@ -12,11 +13,9 @@ from dask_cudf.expr import QUERY_PLANNING_ON if QUERY_PLANNING_ON: - import dask_expr - - DASK_EXPR_VERSION = Version(dask_expr.__version__) + DASK_VERSION = Version(dask.__version__) else: - DASK_EXPR_VERSION = None + DASK_VERSION = None def _make_random_frame(nelem, npartitions=2, include_na=False): @@ -37,7 +36,7 @@ def _make_random_frame(nelem, npartitions=2, include_na=False): def skip_dask_expr(reason=_default_reason, lt_version=None): if lt_version is not None: - skip = QUERY_PLANNING_ON and DASK_EXPR_VERSION < Version(lt_version) + skip = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version) else: skip = QUERY_PLANNING_ON return pytest.mark.skipif(skip, reason=reason) @@ -45,7 +44,7 @@ def skip_dask_expr(reason=_default_reason, lt_version=None): def xfail_dask_expr(reason=_default_reason, lt_version=None): if lt_version is not None: - xfail = QUERY_PLANNING_ON and DASK_EXPR_VERSION < Version(lt_version) + xfail = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version) else: xfail = QUERY_PLANNING_ON return pytest.mark.xfail(xfail, reason=reason) From 164cc2dd410f3d4d959af06bf1edb008ea7f465f Mon Sep 17 00:00:00 2001 From: rjzamora Date: Tue, 30 Apr 2024 08:02:59 -0700 Subject: [PATCH 7/7] update test --- python/dask_cudf/dask_cudf/tests/test_accessor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 584926a647e..ae17b89832a 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -111,8 +111,7 @@ def test_categorical_accessor_initialization2(data): dsr.cat -# TODO: Remove this once we are pinned to dask>=2024.5.0 -@xfail_dask_expr("Requires: https://github.com/dask/dask/pull/11059") +@xfail_dask_expr(lt_version="2024.5.0") @pytest.mark.parametrize("data", [data_cat_1()]) def test_categorical_basic(data): cat = data.copy()