From 61171f4456d4a9765286a4e5b24a8d0149eb5115 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Mon, 20 May 2024 10:42:21 -0700 Subject: [PATCH 1/6] add warning and skip problematic test for now --- python/dask_cudf/dask_cudf/expr/_collection.py | 17 +++++++++++++++++ python/dask_cudf/dask_cudf/tests/test_sort.py | 6 +++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index d50dfb24256..8153b4de48c 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -15,6 +15,7 @@ from dask import config from dask.dataframe.core import is_dataframe_like +from dask.dataframe.dispatch import is_categorical_dtype import cudf @@ -81,6 +82,22 @@ def from_dict(cls, *args, **kwargs): with config.set({"dataframe.backend": "cudf"}): return DXDataFrame.from_dict(*args, **kwargs) + def sort_values( + self, + by, + **kwargs, + ): + for col in by if isinstance(by, list) else [by]: + if is_categorical_dtype(self.dtypes.get(col, None)): + warnings.warn( + "Dask-cudf has limited support for sorting on categorical " + "columns when query-planning is enabled. Please consider " + "using the legacy API if you run into any issues." + f"\n{_LEGACY_WORKAROUND}", + FutureWarning, + ) + return super().sort_values(by, **kwargs) + def groupby( self, by, diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 400600a1598..4df008ef00c 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -10,7 +10,7 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr +from dask_cudf.tests.utils import skip_dask_expr, xfail_dask_expr @pytest.mark.parametrize("ascending", [True, False]) @@ -22,8 +22,8 @@ "c", pytest.param( "d", - marks=xfail_dask_expr( - "Dask-expr fails to sort by categorical column." + marks=skip_dask_expr( + "Possible segfault when sorting by categorical column.", ), ), ["a", "b"], From 4d1476e097457ff2a5c1ea49578651a1f208ef69 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Mon, 20 May 2024 10:58:26 -0700 Subject: [PATCH 2/6] switch to error, and add test coverage --- python/dask_cudf/dask_cudf/expr/_collection.py | 18 +++++++++--------- python/dask_cudf/dask_cudf/tests/test_sort.py | 15 +++++++++++++-- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index 8153b4de48c..0deb5e399fe 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -87,15 +87,15 @@ def sort_values( by, **kwargs, ): - for col in by if isinstance(by, list) else [by]: - if is_categorical_dtype(self.dtypes.get(col, None)): - warnings.warn( - "Dask-cudf has limited support for sorting on categorical " - "columns when query-planning is enabled. Please consider " - "using the legacy API if you run into any issues." - f"\n{_LEGACY_WORKAROUND}", - FutureWarning, - ) + # Check if first column is categorical and raise if it is + check_by = by[0] if isinstance(by, list) else by + if is_categorical_dtype(self.dtypes.get(check_by, None)): + raise NotImplementedError( + "Dask-cudf does not support sorting on categorical " + "columns when query-planning is enabled. Please use " + "the legacy API for now." + f"\n{_LEGACY_WORKAROUND}", + ) return super().sort_values(by, **kwargs) def groupby( diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 4df008ef00c..3ea11437c38 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -10,7 +10,7 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr, xfail_dask_expr +from dask_cudf.tests.utils import xfail_dask_expr @pytest.mark.parametrize("ascending", [True, False]) @@ -22,7 +22,7 @@ "c", pytest.param( "d", - marks=skip_dask_expr( + marks=xfail_dask_expr( "Possible segfault when sorting by categorical column.", ), ), @@ -47,6 +47,17 @@ def test_sort_values(nelem, nparts, by, ascending): dd.assert_eq(got, expect, check_index=False) +@pytest.mark.parametrize("by", ["b", ["a", "b"]]) +def test_sort_values_categorical_raises(by): + df = cudf.DataFrame() + df["a"] = np.ascontiguousarray(np.arange(10)[::-1]) + df["b"] = df["a"].astype("category") + ddf = dd.from_pandas(df, npartitions=10) + + with pytest.raises(NotImplementedError, match="sorting on categorical"): + ddf.sort_values(by=by) + + @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("by", ["a", "b", ["a", "b"]]) def test_sort_values_single_partition(by, ascending): From d6e523018a17f5de5436105c9ef46da03ec6b3ec Mon Sep 17 00:00:00 2001 From: rjzamora Date: Mon, 20 May 2024 12:17:39 -0700 Subject: [PATCH 3/6] update comment to be more clear and to link a related issue --- python/dask_cudf/dask_cudf/expr/_collection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index 0deb5e399fe..9214a801934 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -87,7 +87,8 @@ def sort_values( by, **kwargs, ): - # Check if first column is categorical and raise if it is + # Raise if the first column is categorical, otherwise the + # upstream divisions logic may produce errors check_by = by[0] if isinstance(by, list) else by if is_categorical_dtype(self.dtypes.get(check_by, None)): raise NotImplementedError( From 36be035fce81c09cd1727efc8fb40ef54bd8e075 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 20 May 2024 14:21:52 -0500 Subject: [PATCH 4/6] Update python/dask_cudf/dask_cudf/expr/_collection.py Co-authored-by: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> --- python/dask_cudf/dask_cudf/expr/_collection.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index 0deb5e399fe..926b7cfaf0e 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -87,7 +87,9 @@ def sort_values( by, **kwargs, ): - # Check if first column is categorical and raise if it is + # Raise if the first column is categorical, otherwise the + # upstream divisions logic may produce errors + # (See: https://github.com/rapidsai/cudf/issues/11795) check_by = by[0] if isinstance(by, list) else by if is_categorical_dtype(self.dtypes.get(check_by, None)): raise NotImplementedError( From 5b4e2571bf5089ccdab58917a367a5cd84d2078a Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 20 May 2024 15:34:46 -0500 Subject: [PATCH 5/6] Update python/dask_cudf/dask_cudf/tests/test_sort.py Co-authored-by: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> --- python/dask_cudf/dask_cudf/tests/test_sort.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 3ea11437c38..132ce8622df 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -47,7 +47,7 @@ def test_sort_values(nelem, nparts, by, ascending): dd.assert_eq(got, expect, check_index=False) -@pytest.mark.parametrize("by", ["b", ["a", "b"]]) +@pytest.mark.parametrize("by", ["b", ["b", "a"]]) def test_sort_values_categorical_raises(by): df = cudf.DataFrame() df["a"] = np.ascontiguousarray(np.arange(10)[::-1]) From 5107167b661c08f4be26291bf3a64bfe810cce2f Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 20 May 2024 22:41:57 +0000 Subject: [PATCH 6/6] fix pytest --- python/dask_cudf/dask_cudf/tests/test_sort.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 132ce8622df..9d9fe297248 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -10,7 +10,7 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr +from dask_cudf.tests.utils import QUERY_PLANNING_ON, xfail_dask_expr @pytest.mark.parametrize("ascending", [True, False]) @@ -54,8 +54,11 @@ def test_sort_values_categorical_raises(by): df["b"] = df["a"].astype("category") ddf = dd.from_pandas(df, npartitions=10) - with pytest.raises(NotImplementedError, match="sorting on categorical"): - ddf.sort_values(by=by) + if QUERY_PLANNING_ON: + with pytest.raises( + NotImplementedError, match="sorting on categorical" + ): + ddf.sort_values(by=by) @pytest.mark.parametrize("ascending", [True, False])