From 61fae84659eef9e6ef6f5197da1acfae77f7333e Mon Sep 17 00:00:00 2001 From: rjzamora Date: Mon, 22 Jan 2024 11:57:58 -0800 Subject: [PATCH 1/5] update __dask_tokenize__ --- python/cudf/cudf/core/frame.py | 6 ++++-- python/cudf/cudf/core/indexed_frame.py | 8 +++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index fc313a62fd0..2680dccfd34 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2853,10 +2853,12 @@ def _repeat( @_cudf_nvtx_annotate @_warn_no_dask_cudf def __dask_tokenize__(self): + from dask.base import normalize_token + return [ type(self), - self._dtypes, - self.to_pandas(), + normalize_token(self._dtypes), + normalize_token(self.to_pandas()), ] diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 3e564919090..6320d1fb5fb 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -5198,11 +5198,13 @@ def convert_dtypes( @_warn_no_dask_cudf def __dask_tokenize__(self): + from dask.base import normalize_token + return [ type(self), - self._dtypes, - self.index, - self.hash_values().values_host, + normalize_token(self._dtypes), + normalize_token(self.index), + normalize_token(self.hash_values().values_host), ] From 92a36d5858b3527e79904fc978883a669f10dbcc Mon Sep 17 00:00:00 2001 From: rjzamora Date: Mon, 22 Jan 2024 12:03:01 -0800 Subject: [PATCH 2/5] add test coverage --- python/dask_cudf/dask_cudf/tests/test_dispatch.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index c64e25fd437..f37e82d4c4e 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -1,5 +1,7 @@ # Copyright (c) 2021-2023, NVIDIA CORPORATION. +from datetime import datetime + import numpy as np import pandas as pd import pytest @@ -82,6 +84,16 @@ def test_deterministic_tokenize(index): assert tokenize(df2) == tokenize(df2) +def test_deterministic_tokenize_multiindex(): + dt = datetime.strptime("1995-03-15", "%Y-%m-%d") + index = cudf.MultiIndex( + levels=[[1, 2], [dt]], + codes=[[0, 1], [0, 0]], + ) + df = cudf.DataFrame(index=index) + assert tokenize(df) == tokenize(df) + + @pytest.mark.parametrize("preserve_index", [True, False]) def test_pyarrow_schema_dispatch(preserve_index): from dask.dataframe.dispatch import ( From c3d69f96b55c4cae6e1d4e329a81c50d0375d74b Mon Sep 17 00:00:00 2001 From: rjzamora Date: Tue, 23 Jan 2024 05:17:25 -0800 Subject: [PATCH 3/5] fix date --- python/dask_cudf/dask_cudf/tests/test_dispatch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index f37e82d4c4e..76703206726 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. from datetime import datetime From c01002be6b66ddae923290e715e0387c34fc790a Mon Sep 17 00:00:00 2001 From: rjzamora Date: Tue, 30 Jan 2024 14:43:45 -0800 Subject: [PATCH 4/5] move dask tests that depend on cudf code --- python/cudf/cudf/tests/test_dask.py | 70 +++++++++++++++++-- .../dask_cudf/tests/test_dispatch.py | 53 -------------- 2 files changed, 63 insertions(+), 60 deletions(-) diff --git a/python/cudf/cudf/tests/test_dask.py b/python/cudf/cudf/tests/test_dask.py index 3af21b4a7ff..e7eccb5cd36 100644 --- a/python/cudf/cudf/tests/test_dask.py +++ b/python/cudf/cudf/tests/test_dask.py @@ -1,17 +1,19 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. + +from datetime import datetime import pytest import cudf -is_dataframe_like = pytest.importorskip( - "dask.dataframe.utils" -).is_dataframe_like -is_index_like = pytest.importorskip("dask.dataframe.utils").is_index_like -is_series_like = pytest.importorskip("dask.dataframe.utils").is_series_like - def test_is_dataframe_like(): + is_dataframe_like = pytest.importorskip( + "dask.dataframe.utils" + ).is_dataframe_like + is_index_like = pytest.importorskip("dask.dataframe.utils").is_index_like + is_series_like = pytest.importorskip("dask.dataframe.utils").is_series_like + df = cudf.DataFrame({"x": [1, 2, 3]}) assert is_dataframe_like(df) assert is_series_like(df.x) @@ -19,3 +21,57 @@ def test_is_dataframe_like(): assert not is_dataframe_like(df.x) assert not is_series_like(df) assert not is_index_like(df) + + +@pytest.mark.parametrize("index", [None, [1, 2] * 5]) +def test_deterministic_tokenize(index): + tokenize = pytest.importorskip("dask.base").tokenize + + # Checks that `dask.base.normalize_token` correctly + # dispatches to the logic defined in `backends.py` + # (making `tokenize()` deterministic). + df = cudf.DataFrame( + {"A": range(10), "B": ["dog", "cat"] * 5, "C": range(10, 0, -1)}, + index=index, + ) + + # Matching data should produce the same token + assert tokenize(df) == tokenize(df) + assert tokenize(df.A) == tokenize(df.A) + assert tokenize(df.index) == tokenize(df.index) + assert tokenize(df) == tokenize(df.copy(deep=True)) + assert tokenize(df.A) == tokenize(df.A.copy(deep=True)) + assert tokenize(df.index) == tokenize(df.index.copy(deep=True)) + + # Modifying a column element should change the token + original_token = tokenize(df) + original_token_a = tokenize(df.A) + df.A.iloc[2] = 10 + assert original_token != tokenize(df) + assert original_token_a != tokenize(df.A) + + # Modifying an index element should change the token + original_token = tokenize(df) + original_token_index = tokenize(df.index) + new_index = df.index.values + new_index[2] = 10 + df.index = new_index + assert original_token != tokenize(df) + assert original_token_index != tokenize(df.index) + + # Check MultiIndex case + df2 = df.set_index(["B", "C"], drop=False) + assert tokenize(df) != tokenize(df2) + assert tokenize(df2) == tokenize(df2) + + +def test_deterministic_tokenize_multiindex(): + tokenize = pytest.importorskip("dask.base").tokenize + + dt = datetime.strptime("1995-03-15", "%Y-%m-%d") + index = cudf.MultiIndex( + levels=[[1, 2], [dt]], + codes=[[0, 1], [0, 0]], + ) + df = cudf.DataFrame(index=index) + assert tokenize(df) == tokenize(df) diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index 76703206726..34d7e0f1b3c 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -1,12 +1,9 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from datetime import datetime - import numpy as np import pandas as pd import pytest -from dask.base import tokenize from dask.dataframe import assert_eq from dask.dataframe.methods import is_categorical_dtype @@ -44,56 +41,6 @@ def test_pyarrow_conversion_dispatch(preserve_index): assert not isinstance(df2.index, cudf.RangeIndex) -@pytest.mark.parametrize("index", [None, [1, 2] * 5]) -def test_deterministic_tokenize(index): - # Checks that `dask.base.normalize_token` correctly - # dispatches to the logic defined in `backends.py` - # (making `tokenize()` deterministic). - df = cudf.DataFrame( - {"A": range(10), "B": ["dog", "cat"] * 5, "C": range(10, 0, -1)}, - index=index, - ) - - # Matching data should produce the same token - assert tokenize(df) == tokenize(df) - assert tokenize(df.A) == tokenize(df.A) - assert tokenize(df.index) == tokenize(df.index) - assert tokenize(df) == tokenize(df.copy(deep=True)) - assert tokenize(df.A) == tokenize(df.A.copy(deep=True)) - assert tokenize(df.index) == tokenize(df.index.copy(deep=True)) - - # Modifying a column element should change the token - original_token = tokenize(df) - original_token_a = tokenize(df.A) - df.A.iloc[2] = 10 - assert original_token != tokenize(df) - assert original_token_a != tokenize(df.A) - - # Modifying an index element should change the token - original_token = tokenize(df) - original_token_index = tokenize(df.index) - new_index = df.index.values - new_index[2] = 10 - df.index = new_index - assert original_token != tokenize(df) - assert original_token_index != tokenize(df.index) - - # Check MultiIndex case - df2 = df.set_index(["B", "C"], drop=False) - assert tokenize(df) != tokenize(df2) - assert tokenize(df2) == tokenize(df2) - - -def test_deterministic_tokenize_multiindex(): - dt = datetime.strptime("1995-03-15", "%Y-%m-%d") - index = cudf.MultiIndex( - levels=[[1, 2], [dt]], - codes=[[0, 1], [0, 0]], - ) - df = cudf.DataFrame(index=index) - assert tokenize(df) == tokenize(df) - - @pytest.mark.parametrize("preserve_index", [True, False]) def test_pyarrow_schema_dispatch(preserve_index): from dask.dataframe.dispatch import ( From 90defebd6ec21320a3176f17e04f54159ed05d5a Mon Sep 17 00:00:00 2001 From: rjzamora Date: Tue, 30 Jan 2024 18:28:56 -0800 Subject: [PATCH 5/5] revert change --- python/cudf/cudf/tests/test_dask.py | 70 ++----------------- .../dask_cudf/tests/test_dispatch.py | 53 ++++++++++++++ 2 files changed, 60 insertions(+), 63 deletions(-) diff --git a/python/cudf/cudf/tests/test_dask.py b/python/cudf/cudf/tests/test_dask.py index e7eccb5cd36..3af21b4a7ff 100644 --- a/python/cudf/cudf/tests/test_dask.py +++ b/python/cudf/cudf/tests/test_dask.py @@ -1,19 +1,17 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -from datetime import datetime +# Copyright (c) 2019, NVIDIA CORPORATION. import pytest import cudf +is_dataframe_like = pytest.importorskip( + "dask.dataframe.utils" +).is_dataframe_like +is_index_like = pytest.importorskip("dask.dataframe.utils").is_index_like +is_series_like = pytest.importorskip("dask.dataframe.utils").is_series_like -def test_is_dataframe_like(): - is_dataframe_like = pytest.importorskip( - "dask.dataframe.utils" - ).is_dataframe_like - is_index_like = pytest.importorskip("dask.dataframe.utils").is_index_like - is_series_like = pytest.importorskip("dask.dataframe.utils").is_series_like +def test_is_dataframe_like(): df = cudf.DataFrame({"x": [1, 2, 3]}) assert is_dataframe_like(df) assert is_series_like(df.x) @@ -21,57 +19,3 @@ def test_is_dataframe_like(): assert not is_dataframe_like(df.x) assert not is_series_like(df) assert not is_index_like(df) - - -@pytest.mark.parametrize("index", [None, [1, 2] * 5]) -def test_deterministic_tokenize(index): - tokenize = pytest.importorskip("dask.base").tokenize - - # Checks that `dask.base.normalize_token` correctly - # dispatches to the logic defined in `backends.py` - # (making `tokenize()` deterministic). - df = cudf.DataFrame( - {"A": range(10), "B": ["dog", "cat"] * 5, "C": range(10, 0, -1)}, - index=index, - ) - - # Matching data should produce the same token - assert tokenize(df) == tokenize(df) - assert tokenize(df.A) == tokenize(df.A) - assert tokenize(df.index) == tokenize(df.index) - assert tokenize(df) == tokenize(df.copy(deep=True)) - assert tokenize(df.A) == tokenize(df.A.copy(deep=True)) - assert tokenize(df.index) == tokenize(df.index.copy(deep=True)) - - # Modifying a column element should change the token - original_token = tokenize(df) - original_token_a = tokenize(df.A) - df.A.iloc[2] = 10 - assert original_token != tokenize(df) - assert original_token_a != tokenize(df.A) - - # Modifying an index element should change the token - original_token = tokenize(df) - original_token_index = tokenize(df.index) - new_index = df.index.values - new_index[2] = 10 - df.index = new_index - assert original_token != tokenize(df) - assert original_token_index != tokenize(df.index) - - # Check MultiIndex case - df2 = df.set_index(["B", "C"], drop=False) - assert tokenize(df) != tokenize(df2) - assert tokenize(df2) == tokenize(df2) - - -def test_deterministic_tokenize_multiindex(): - tokenize = pytest.importorskip("dask.base").tokenize - - dt = datetime.strptime("1995-03-15", "%Y-%m-%d") - index = cudf.MultiIndex( - levels=[[1, 2], [dt]], - codes=[[0, 1], [0, 0]], - ) - df = cudf.DataFrame(index=index) - assert tokenize(df) == tokenize(df) diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index 34d7e0f1b3c..76703206726 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -1,9 +1,12 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. +from datetime import datetime + import numpy as np import pandas as pd import pytest +from dask.base import tokenize from dask.dataframe import assert_eq from dask.dataframe.methods import is_categorical_dtype @@ -41,6 +44,56 @@ def test_pyarrow_conversion_dispatch(preserve_index): assert not isinstance(df2.index, cudf.RangeIndex) +@pytest.mark.parametrize("index", [None, [1, 2] * 5]) +def test_deterministic_tokenize(index): + # Checks that `dask.base.normalize_token` correctly + # dispatches to the logic defined in `backends.py` + # (making `tokenize()` deterministic). + df = cudf.DataFrame( + {"A": range(10), "B": ["dog", "cat"] * 5, "C": range(10, 0, -1)}, + index=index, + ) + + # Matching data should produce the same token + assert tokenize(df) == tokenize(df) + assert tokenize(df.A) == tokenize(df.A) + assert tokenize(df.index) == tokenize(df.index) + assert tokenize(df) == tokenize(df.copy(deep=True)) + assert tokenize(df.A) == tokenize(df.A.copy(deep=True)) + assert tokenize(df.index) == tokenize(df.index.copy(deep=True)) + + # Modifying a column element should change the token + original_token = tokenize(df) + original_token_a = tokenize(df.A) + df.A.iloc[2] = 10 + assert original_token != tokenize(df) + assert original_token_a != tokenize(df.A) + + # Modifying an index element should change the token + original_token = tokenize(df) + original_token_index = tokenize(df.index) + new_index = df.index.values + new_index[2] = 10 + df.index = new_index + assert original_token != tokenize(df) + assert original_token_index != tokenize(df.index) + + # Check MultiIndex case + df2 = df.set_index(["B", "C"], drop=False) + assert tokenize(df) != tokenize(df2) + assert tokenize(df2) == tokenize(df2) + + +def test_deterministic_tokenize_multiindex(): + dt = datetime.strptime("1995-03-15", "%Y-%m-%d") + index = cudf.MultiIndex( + levels=[[1, 2], [dt]], + codes=[[0, 1], [0, 0]], + ) + df = cudf.DataFrame(index=index) + assert tokenize(df) == tokenize(df) + + @pytest.mark.parametrize("preserve_index", [True, False]) def test_pyarrow_schema_dispatch(preserve_index): from dask.dataframe.dispatch import (