From 67d427deb3cf18d1139b76aecc1e6a3e9d5253f3 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Wed, 1 May 2024 16:03:28 -0500 Subject: [PATCH] Fix categorical-accessor support and testing in dask-cudf (#15591) Related to https://github.com/rapidsai/cudf/issues/15027 Adds a minor tokenization fix, and adjusts testing for categorical-accessor support. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Charles Blackmon-Luca (https://github.com/charlesbluca) - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15591 --- python/cudf/cudf/core/indexed_frame.py | 7 ++++++- .../dask_cudf/dask_cudf/io/tests/test_json.py | 4 ++-- .../dask_cudf/dask_cudf/io/tests/test_orc.py | 4 ++-- .../dask_cudf/io/tests/test_parquet.py | 2 +- .../dask_cudf/dask_cudf/io/tests/test_text.py | 4 ++-- .../dask_cudf/dask_cudf/tests/test_accessor.py | 18 ++++++++++++++---- python/dask_cudf/dask_cudf/tests/utils.py | 11 +++++------ 7 files changed, 32 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 48e80d8162f..bec97bd3290 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6308,7 +6308,12 @@ def __dask_tokenize__(self): return [ type(self), - normalize_token(self._dtypes), + str(self._dtypes), + *[ + normalize_token(cat.categories) + for cat in self._dtypes.values() + if cat == "category" + ], normalize_token(self.index), normalize_token(self.hash_values().values_host), ] diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py index a09dfbff188..f8e5be0a417 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_json.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py @@ -12,8 +12,8 @@ import dask_cudf from dask_cudf.tests.utils import skip_dask_expr -# No dask-expr support for dask_expr<1.0.6 -pytestmark = skip_dask_expr(lt_version="1.0.6") +# No dask-expr support for dask<2024.4.0 +pytestmark = skip_dask_expr(lt_version="2024.4.0") def test_read_json_backend_dispatch(tmp_path): diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py index 7be6c712511..457e5546bd9 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py @@ -14,8 +14,8 @@ import dask_cudf from dask_cudf.tests.utils import skip_dask_expr -# No dask-expr support for dask_expr<1.0.6 -pytestmark = skip_dask_expr(lt_version="1.0.6") +# No dask-expr support for dask<2024.4.0 +pytestmark = skip_dask_expr(lt_version="2024.4.0") cur_dir = os.path.dirname(__file__) sample_orc = os.path.join(cur_dir, "data/orc/sample.orc") diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 8ca27df8fec..6f4737db5be 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -536,7 +536,7 @@ def test_check_file_size(tmpdir): dask_cudf.io.read_parquet(fn, check_file_size=1).compute() -@xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="1.0") +@xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="2024.3.0") def test_null_partition(tmpdir): import pyarrow as pa from pyarrow.dataset import HivePartitioning diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py index e3a9d380857..8912b7d5da6 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_text.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py @@ -11,8 +11,8 @@ import dask_cudf from dask_cudf.tests.utils import skip_dask_expr -# No dask-expr support for dask_expr<1.0.6 -pytestmark = skip_dask_expr(lt_version="1.0.6") +# No dask-expr support for dask<2024.4.0 +pytestmark = skip_dask_expr(lt_version="2024.4.0") cur_dir = os.path.dirname(__file__) text_file = os.path.join(cur_dir, "data/text/sample.pgn") diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index ebb8e4be187..ae17b89832a 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -111,7 +111,7 @@ def test_categorical_accessor_initialization2(data): dsr.cat -@xfail_dask_expr("TODO: Unexplained dask-expr failure") +@xfail_dask_expr(lt_version="2024.5.0") @pytest.mark.parametrize("data", [data_cat_1()]) def test_categorical_basic(data): cat = data.copy() @@ -203,7 +203,6 @@ def test_categorical_compare_unordered(data): dsr < dsr -@xfail_dask_expr("TODO: Unexplained dask-expr failure") @pytest.mark.parametrize("data", [data_cat_3()]) def test_categorical_compare_ordered(data): cat1 = data[0].copy() @@ -274,7 +273,6 @@ def test_categorical_categories(): ) -@xfail_dask_expr("TODO: Unexplained dask-expr failure") def test_categorical_as_known(): df = dask_cudf.from_cudf(DataFrame({"col_1": [0, 1, 2, 3]}), npartitions=2) df["col_1"] = df["col_1"].astype("category") @@ -283,7 +281,19 @@ def test_categorical_as_known(): pdf = dd.from_pandas(pd.DataFrame({"col_1": [0, 1, 2, 3]}), npartitions=2) pdf["col_1"] = pdf["col_1"].astype("category") expected = pdf["col_1"].cat.as_known() - dd.assert_eq(expected, actual) + + # Note: Categories may be ordered differently in + # cudf and pandas. Therefore, we need to compare + # the global set of categories (before and after + # calling `compute`), then we need to check that + # the initial order of rows was preserved. + assert set(expected.cat.categories) == set( + actual.cat.categories.values_host + ) + assert set(expected.compute().cat.categories) == set( + actual.compute().cat.categories.values_host + ) + dd.assert_eq(expected, actual.astype(expected.dtype)) def test_str_slice(): diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index 1ca1758736b..c7dedbb6b4a 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -5,6 +5,7 @@ import pytest from packaging.version import Version +import dask import dask.dataframe as dd import cudf @@ -12,11 +13,9 @@ from dask_cudf.expr import QUERY_PLANNING_ON if QUERY_PLANNING_ON: - import dask_expr - - DASK_EXPR_VERSION = Version(dask_expr.__version__) + DASK_VERSION = Version(dask.__version__) else: - DASK_EXPR_VERSION = None + DASK_VERSION = None def _make_random_frame(nelem, npartitions=2, include_na=False): @@ -37,7 +36,7 @@ def _make_random_frame(nelem, npartitions=2, include_na=False): def skip_dask_expr(reason=_default_reason, lt_version=None): if lt_version is not None: - skip = QUERY_PLANNING_ON and DASK_EXPR_VERSION < Version(lt_version) + skip = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version) else: skip = QUERY_PLANNING_ON return pytest.mark.skipif(skip, reason=reason) @@ -45,7 +44,7 @@ def skip_dask_expr(reason=_default_reason, lt_version=None): def xfail_dask_expr(reason=_default_reason, lt_version=None): if lt_version is not None: - xfail = QUERY_PLANNING_ON and DASK_EXPR_VERSION < Version(lt_version) + xfail = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version) else: xfail = QUERY_PLANNING_ON return pytest.mark.xfail(xfail, reason=reason)