From ee105fe1fbd2139304bb9e3d3a359d4268984312 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 1 Apr 2021 19:10:10 -0500 Subject: [PATCH 1/3] Revert dask versioning of concat dispatch (#7823) Dependent on : https://github.com/dask/dask/pull/7500 With the changes in above dask PR we would not have to version the concat dispatch in `dask-cudf`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/7823 --- python/dask_cudf/dask_cudf/backends.py | 61 ++++++++------------------ 1 file changed, 19 insertions(+), 42 deletions(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 66b06acc858..0570654fde3 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -1,13 +1,10 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. -from distutils.version import LooseVersion - import cupy as cp import numpy as np import pandas as pd import pyarrow as pa -import dask from dask.dataframe.categorical import categorical_dtype_dispatch from dask.dataframe.core import get_parallel_type, make_meta, meta_nonempty from dask.dataframe.methods import ( @@ -31,7 +28,6 @@ get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) get_parallel_type.register(cudf.Series, lambda _: Series) get_parallel_type.register(cudf.Index, lambda _: Index) -DASK_VERSION = LooseVersion(dask.__version__) @meta_nonempty.register(cudf.Index) @@ -205,45 +201,26 @@ def make_meta_object(x, index=None): raise TypeError(f"Don't know how to create metadata from {x}") -if DASK_VERSION > "2021.03.1": - - @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index)) - def concat_cudf( - dfs, - axis=0, - join="outer", - uniform=False, - filter_warning=True, - sort=None, - ignore_index=False, - **kwargs, - ): - assert join == "outer" - - ignore_order = kwargs.get("ignore_order", False) - if ignore_order: - raise NotImplementedError( - "ignore_order parameter is not yet supported in dask-cudf" - ) - - return cudf.concat(dfs, axis=axis, ignore_index=ignore_index) - - -else: - - @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index)) - def concat_cudf( - dfs, - axis=0, - join="outer", - uniform=False, - filter_warning=True, - sort=None, - ignore_index=False, - ): - assert join == "outer" +@concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index)) +def concat_cudf( + dfs, + axis=0, + join="outer", + uniform=False, + filter_warning=True, + sort=None, + ignore_index=False, + **kwargs, +): + assert join == "outer" + + ignore_order = kwargs.get("ignore_order", False) + if ignore_order: + raise NotImplementedError( + "ignore_order parameter is not yet supported in dask-cudf" + ) - return cudf.concat(dfs, axis=axis, ignore_index=ignore_index) + return cudf.concat(dfs, axis=axis, ignore_index=ignore_index) @categorical_dtype_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index)) From 11358ce259e63452983be0d97748dcb4a1534049 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Thu, 1 Apr 2021 21:00:19 -0400 Subject: [PATCH 2/3] Constrain dask and distributed versions to 2021.3.1 (#7825) Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - Keith Kraus (https://github.com/kkraus14) - Mike Wendt (https://github.com/mike-wendt) URL: https://github.com/rapidsai/cudf/pull/7825 --- conda/environments/cudf_dev_cuda10.1.yml | 4 ++-- conda/environments/cudf_dev_cuda10.2.yml | 4 ++-- conda/environments/cudf_dev_cuda11.0.yml | 4 ++-- conda/environments/cudf_dev_cuda11.1.yml | 4 ++-- conda/environments/cudf_dev_cuda11.2.yml | 4 ++-- conda/recipes/custreamz/meta.yaml | 2 +- conda/recipes/dask-cudf/meta.yaml | 8 ++++---- python/custreamz/dev_requirements.txt | 4 ++-- python/dask_cudf/dev_requirements.txt | 6 +++--- python/dask_cudf/setup.py | 8 ++++---- 10 files changed, 24 insertions(+), 24 deletions(-) diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml index 2ec8af368f2..1e7fcbdc8ff 100644 --- a/conda/environments/cudf_dev_cuda10.1.yml +++ b/conda/environments/cudf_dev_cuda10.1.yml @@ -42,8 +42,8 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask>=2021.3.1 - - distributed>=2.22.0 + - dask==2021.3.1 + - distributed>=2.22.0,<=2021.3.1 - streamz - dlpack - arrow-cpp=1.0.1 diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml index 64dcc458196..105a89bf902 100644 --- a/conda/environments/cudf_dev_cuda10.2.yml +++ b/conda/environments/cudf_dev_cuda10.2.yml @@ -42,8 +42,8 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask>=2021.3.1 - - distributed>=2.22.0 + - dask==2021.3.1 + - distributed>=2.22.0,<=2021.3.1 - streamz - dlpack - arrow-cpp=1.0.1 diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index f464738eaa8..bf0f0d236b1 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -42,8 +42,8 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask>=2021.3.1 - - distributed>=2.22.0 + - dask==2021.3.1 + - distributed>=2.22.0,<=2021.3.1 - streamz - dlpack - arrow-cpp=1.0.1 diff --git a/conda/environments/cudf_dev_cuda11.1.yml b/conda/environments/cudf_dev_cuda11.1.yml index fae94974502..a1169539b4a 100644 --- a/conda/environments/cudf_dev_cuda11.1.yml +++ b/conda/environments/cudf_dev_cuda11.1.yml @@ -42,8 +42,8 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask>=2021.3.1 - - distributed>=2.22.0 + - dask==2021.3.1 + - distributed>=2.22.0,<=2021.3.1 - streamz - dlpack - arrow-cpp=1.0.1 diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index fa0a69ad1ea..0952b4ea338 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -42,8 +42,8 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask>=2021.3.1 - - distributed>=2.22.0 + - dask==2021.3.1 + - distributed>=2.22.0,<=2021.3.1 - streamz - dlpack - arrow-cpp=1.0.1 diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index ffda6d0c3c6..4b763813001 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -29,7 +29,7 @@ requirements: - streamz - cudf {{ version }} - dask >=2.22.0 - - distributed >=2.22.0 + - distributed >=2.22.0,<=2021.3.1 - python-confluent-kafka - cudf_kafka {{ version }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 66bffdfd61e..38bcfd469b9 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -23,13 +23,13 @@ requirements: host: - python - cudf {{ version }} - - dask>=2021.3.1 - - distributed >=2.22.0 + - dask==2021.3.1 + - distributed >=2.22.0,<=2021.3.1 run: - python - cudf {{ version }} - - dask>=2021.3.1 - - distributed >=2.22.0 + - dask==2021.3.1 + - distributed >=2.22.0,<=2021.3.1 test: requires: diff --git a/python/custreamz/dev_requirements.txt b/python/custreamz/dev_requirements.txt index 7e300376f4e..61b1b8cd7a1 100644 --- a/python/custreamz/dev_requirements.txt +++ b/python/custreamz/dev_requirements.txt @@ -3,8 +3,8 @@ flake8==3.8.3 black==19.10b0 isort==5.0.7 -dask>=2021.3.1 -distributed>=2.22.0 +dask==2021.3.1 +distributed>=2.22.0,<=2021.3.1 streamz python-confluent-kafka pytest diff --git a/python/dask_cudf/dev_requirements.txt b/python/dask_cudf/dev_requirements.txt index 39da0da3012..b19501845d1 100644 --- a/python/dask_cudf/dev_requirements.txt +++ b/python/dask_cudf/dev_requirements.txt @@ -1,7 +1,7 @@ # Copyright (c) 2021, NVIDIA CORPORATION. -dask>=2021.3.1 -distributed>=2.22.0 +dask==2021.3.1 +distributed>=2.22.0,<=2021.3.1 fsspec>=0.6.0 numba>=0.49.0,!=0.51.0 numpy @@ -11,4 +11,4 @@ setuptools wheel flake8==3.8.3 black==19.10b0 -isort==5.0.7 \ No newline at end of file +isort==5.0.7 diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index 1318701b845..caf9bb6bbb0 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -10,8 +10,8 @@ install_requires = [ "cudf", - "dask>=2021.3.1", - "distributed>=2.22.0", + "dask==2021.3.1", + "distributed>=2.22.0,<=2021.3.1", "fsspec>=0.6.0", "numpy", "pandas>=1.0,<1.3.0dev0", @@ -23,8 +23,8 @@ "pandas>=1.0,<1.3.0dev0", "pytest", "numba>=0.49.0,!=0.51.0", - "dask>=2021.3.1", - "distributed>=2.22.0", + "dask==2021.3.1", + "distributed>=2.22.0,<=2021.3.1", ] } From dee9238dffc73c25d7109cab8735e607ba9e474b Mon Sep 17 00:00:00 2001 From: ChrisJar Date: Thu, 1 Apr 2021 21:56:51 -0500 Subject: [PATCH 3/3] Enable join on decimal columns (#7764) This enables joins on decimal columns with the same precision and scale. Closes #7497 Depends on #7788 Authors: - https://github.com/ChrisJar Approvers: - Keith Kraus (https://github.com/kkraus14) - Ashwin Srinath (https://github.com/shwina) - https://github.com/brandon-b-miller - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/7764 --- python/cudf/cudf/_lib/replace.pyx | 2 +- python/cudf/cudf/core/column/decimal.py | 14 +- python/cudf/cudf/core/join/_join_helpers.py | 8 ++ python/cudf/cudf/tests/test_joining.py | 135 +++++++++++++++++++- 4 files changed, 155 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/replace.pyx b/python/cudf/cudf/_lib/replace.pyx index 2732142dd15..cdedd3ac022 100644 --- a/python/cudf/cudf/_lib/replace.pyx +++ b/python/cudf/cudf/_lib/replace.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 4fee5060fe4..d9e4610832d 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -1,7 +1,7 @@ # Copyright (c) 2021, NVIDIA CORPORATION. from decimal import Decimal -from typing import cast +from typing import cast, Any import cupy as cp import numpy as np @@ -170,6 +170,18 @@ def sum_of_squares( "sum_of_squares", skipna=skipna, dtype=dtype, min_count=min_count ) + def fillna( + self, value: Any = None, method: str = None, dtype: Dtype = None + ): + """Fill null values with ``value``. + + Returns a copy with null filled. + """ + result = libcudf.replace.replace_nulls( + input_col=self, replacement=value, method=method, dtype=dtype + ) + return self._copy_type_metadata(result) + def _binop_scale(l_dtype, r_dtype, op): # This should at some point be hooked up to libcudf's diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 3807f408369..5e15ddfc359 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -97,6 +97,14 @@ def _match_join_keys( if pd.api.types.is_dtype_equal(ltype, rtype): return lcol, rcol + if isinstance(ltype, cudf.Decimal64Dtype) or isinstance( + rtype, cudf.Decimal64Dtype + ): + raise TypeError( + "Decimal columns can only be merged with decimal columns " + "of the same precision and scale" + ) + if (np.issubdtype(ltype, np.number)) and (np.issubdtype(rtype, np.number)): common_type = ( max(ltype, rtype) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 9164bfe98d1..2dae2bf1e97 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -6,7 +6,7 @@ import cudf from cudf.core._compat import PANDAS_GE_120 -from cudf.core.dtypes import CategoricalDtype +from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype from cudf.tests.utils import ( INTEGER_TYPES, NUMERIC_TYPES, @@ -89,7 +89,7 @@ def assert_join_results_equal(expect, got, how, **kwargs): ): # can't sort_values() on a df without columns return assert_eq(expect, got, **kwargs) - return assert_eq( + assert_eq( expect.sort_values(expect.columns.to_list()).reset_index( drop=True ), @@ -1152,6 +1152,137 @@ def test_typecast_on_join_overflow_unsafe(dtypes): merged = lhs.merge(rhs, on="a", how="left") # noqa: F841 +@pytest.mark.parametrize( + "dtype", + [Decimal64Dtype(5, 2), Decimal64Dtype(7, 5), Decimal64Dtype(12, 7)], +) +def test_decimal_typecast_inner(dtype): + other_data = ["a", "b", "c", "d", "e"] + + join_data_l = cudf.Series(["1.6", "9.5", "7.2", "8.7", "2.3"]).astype( + dtype + ) + join_data_r = cudf.Series(["1.6", "9.5", "7.2", "4.5", "2.3"]).astype( + dtype + ) + + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) + + exp_join_data = ["1.6", "9.5", "7.2", "2.3"] + exp_other_data = ["a", "b", "c", "e"] + + exp_join_col = cudf.Series(exp_join_data).astype(dtype) + + expected = cudf.DataFrame( + { + "join_col": exp_join_col, + "B_x": exp_other_data, + "B_y": exp_other_data, + } + ) + + got = gdf_l.merge(gdf_r, on="join_col", how="inner") + + assert_join_results_equal(expected, got, how="inner") + assert_eq(dtype, got["join_col"].dtype) + + +@pytest.mark.parametrize( + "dtype", + [Decimal64Dtype(7, 3), Decimal64Dtype(9, 5), Decimal64Dtype(14, 10)], +) +def test_decimal_typecast_left(dtype): + other_data = ["a", "b", "c", "d"] + + join_data_l = cudf.Series(["95.05", "384.26", "74.22", "1456.94"]).astype( + dtype + ) + join_data_r = cudf.Series( + ["95.05", "62.4056", "74.22", "1456.9472"] + ).astype(dtype) + + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) + + exp_join_data = ["95.05", "74.22", "384.26", "1456.94"] + exp_other_data_x = ["a", "c", "b", "d"] + exp_other_data_y = ["a", "c", None, None] + + exp_join_col = cudf.Series(exp_join_data).astype(dtype) + + expected = cudf.DataFrame( + { + "join_col": exp_join_col, + "B_x": exp_other_data_x, + "B_y": exp_other_data_y, + } + ) + + got = gdf_l.merge(gdf_r, on="join_col", how="left") + + assert_join_results_equal(expected, got, how="left") + assert_eq(dtype, got["join_col"].dtype) + + +@pytest.mark.parametrize( + "dtype", + [Decimal64Dtype(7, 3), Decimal64Dtype(10, 5), Decimal64Dtype(18, 9)], +) +def test_decimal_typecast_outer(dtype): + other_data = ["a", "b", "c"] + join_data_l = cudf.Series(["741.248", "1029.528", "3627.292"]).astype( + dtype + ) + join_data_r = cudf.Series(["9284.103", "1029.528", "948.637"]).astype( + dtype + ) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) + exp_join_data = ["9284.103", "948.637", "1029.528", "741.248", "3627.292"] + exp_other_data_x = [None, None, "b", "a", "c"] + exp_other_data_y = ["a", "c", "b", None, None] + exp_join_col = cudf.Series(exp_join_data).astype(dtype) + expected = cudf.DataFrame( + { + "join_col": exp_join_col, + "B_x": exp_other_data_x, + "B_y": exp_other_data_y, + } + ) + got = gdf_l.merge(gdf_r, on="join_col", how="outer") + + assert_join_results_equal(expected, got, how="outer") + assert_eq(dtype, got["join_col"].dtype) + + +@pytest.mark.parametrize( + "dtype_l", [Decimal64Dtype(7, 3), Decimal64Dtype(9, 5)], +) +@pytest.mark.parametrize( + "dtype_r", [Decimal64Dtype(8, 3), Decimal64Dtype(11, 6)], +) +def test_mixed_decimal_typecast(dtype_l, dtype_r): + other_data = ["a", "b", "c", "d"] + + join_data_l = cudf.Series(["95.05", "34.6", "74.22", "14.94"]).astype( + dtype_r + ) + join_data_r = cudf.Series(["95.05", "62.4056", "74.22", "1.42"]).astype( + dtype_l + ) + + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) + + with pytest.raises( + TypeError, + match="Decimal columns can only be merged with decimal columns " + "of the same precision and scale", + ): + gdf_l.merge(gdf_r, on="join_col", how="inner") + + @pytest.mark.parametrize( "dtype_l", ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"],