Skip to content

Commit

Permalink
Merge branch 'branch-0.19' of github.com:rapidsai/cudf into jitify2
Browse files Browse the repository at this point in the history
  • Loading branch information
cwharris committed Apr 2, 2021
2 parents 998c86e + dee9238 commit e71dbc1
Show file tree
Hide file tree
Showing 15 changed files with 198 additions and 70 deletions.
4 changes: 2 additions & 2 deletions conda/environments/cudf_dev_cuda10.1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ dependencies:
- mypy=0.782
- typing_extensions
- pre_commit
- dask>=2021.3.1
- distributed>=2.22.0
- dask==2021.3.1
- distributed>=2.22.0,<=2021.3.1
- streamz
- dlpack
- arrow-cpp=1.0.1
Expand Down
4 changes: 2 additions & 2 deletions conda/environments/cudf_dev_cuda10.2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ dependencies:
- mypy=0.782
- typing_extensions
- pre_commit
- dask>=2021.3.1
- distributed>=2.22.0
- dask==2021.3.1
- distributed>=2.22.0,<=2021.3.1
- streamz
- dlpack
- arrow-cpp=1.0.1
Expand Down
4 changes: 2 additions & 2 deletions conda/environments/cudf_dev_cuda11.0.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ dependencies:
- mypy=0.782
- typing_extensions
- pre_commit
- dask>=2021.3.1
- distributed>=2.22.0
- dask==2021.3.1
- distributed>=2.22.0,<=2021.3.1
- streamz
- dlpack
- arrow-cpp=1.0.1
Expand Down
4 changes: 2 additions & 2 deletions conda/environments/cudf_dev_cuda11.1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ dependencies:
- mypy=0.782
- typing_extensions
- pre_commit
- dask>=2021.3.1
- distributed>=2.22.0
- dask==2021.3.1
- distributed>=2.22.0,<=2021.3.1
- streamz
- dlpack
- arrow-cpp=1.0.1
Expand Down
4 changes: 2 additions & 2 deletions conda/environments/cudf_dev_cuda11.2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ dependencies:
- mypy=0.782
- typing_extensions
- pre_commit
- dask>=2021.3.1
- distributed>=2.22.0
- dask==2021.3.1
- distributed>=2.22.0,<=2021.3.1
- streamz
- dlpack
- arrow-cpp=1.0.1
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/custreamz/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ requirements:
- streamz
- cudf {{ version }}
- dask >=2.22.0
- distributed >=2.22.0
- distributed >=2.22.0,<=2021.3.1
- python-confluent-kafka
- cudf_kafka {{ version }}

Expand Down
8 changes: 4 additions & 4 deletions conda/recipes/dask-cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ requirements:
host:
- python
- cudf {{ version }}
- dask>=2021.3.1
- distributed >=2.22.0
- dask==2021.3.1
- distributed >=2.22.0,<=2021.3.1
run:
- python
- cudf {{ version }}
- dask>=2021.3.1
- distributed >=2.22.0
- dask==2021.3.1
- distributed >=2.22.0,<=2021.3.1

test:
requires:
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/replace.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
Expand Down
14 changes: 13 additions & 1 deletion python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from decimal import Decimal
from typing import cast
from typing import cast, Any

import cupy as cp
import numpy as np
Expand Down Expand Up @@ -170,6 +170,18 @@ def sum_of_squares(
"sum_of_squares", skipna=skipna, dtype=dtype, min_count=min_count
)

def fillna(
    self, value: Any = None, method: str = None, dtype: Dtype = None
):
    """Fill null values with ``value``.

    Returns a copy with nulls filled; the original column is left
    unmodified.

    Parameters
    ----------
    value : Any, optional
        Replacement for null entries, forwarded to libcudf.
    method : str, optional
        Fill method forwarded to libcudf.
        NOTE(review): default is None, so this should be annotated
        ``Optional[str]`` — needs ``Optional`` imported; confirm the
        accepted values against ``libcudf.replace.replace_nulls``.
    dtype : Dtype, optional
        Dtype forwarded to libcudf for the replacement, if any.
    """
    # Delegate the actual null replacement to the libcudf binding.
    result = libcudf.replace.replace_nulls(
        input_col=self, replacement=value, method=method, dtype=dtype
    )
    # Re-attach this column's type metadata (presumably the decimal
    # precision/scale, which the low-level call does not carry over —
    # confirm against _copy_type_metadata).
    return self._copy_type_metadata(result)


def _binop_scale(l_dtype, r_dtype, op):
# This should at some point be hooked up to libcudf's
Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/core/join/_join_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,14 @@ def _match_join_keys(
if pd.api.types.is_dtype_equal(ltype, rtype):
return lcol, rcol

if isinstance(ltype, cudf.Decimal64Dtype) or isinstance(
rtype, cudf.Decimal64Dtype
):
raise TypeError(
"Decimal columns can only be merged with decimal columns "
"of the same precision and scale"
)

if (np.issubdtype(ltype, np.number)) and (np.issubdtype(rtype, np.number)):
common_type = (
max(ltype, rtype)
Expand Down
135 changes: 133 additions & 2 deletions python/cudf/cudf/tests/test_joining.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import cudf
from cudf.core._compat import PANDAS_GE_120
from cudf.core.dtypes import CategoricalDtype
from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype
from cudf.tests.utils import (
INTEGER_TYPES,
NUMERIC_TYPES,
Expand Down Expand Up @@ -89,7 +89,7 @@ def assert_join_results_equal(expect, got, how, **kwargs):
): # can't sort_values() on a df without columns
return assert_eq(expect, got, **kwargs)

return assert_eq(
assert_eq(
expect.sort_values(expect.columns.to_list()).reset_index(
drop=True
),
Expand Down Expand Up @@ -1152,6 +1152,137 @@ def test_typecast_on_join_overflow_unsafe(dtypes):
merged = lhs.merge(rhs, on="a", how="left") # noqa: F841


@pytest.mark.parametrize(
    "dtype",
    [Decimal64Dtype(5, 2), Decimal64Dtype(7, 5), Decimal64Dtype(12, 7)],
)
def test_decimal_typecast_inner(dtype):
    """Inner join on Decimal64 keys of identical dtype preserves the dtype."""
    labels = ["a", "b", "c", "d", "e"]

    left_keys = cudf.Series(["1.6", "9.5", "7.2", "8.7", "2.3"]).astype(dtype)
    right_keys = cudf.Series(["1.6", "9.5", "7.2", "4.5", "2.3"]).astype(
        dtype
    )

    gdf_l = cudf.DataFrame({"join_col": left_keys, "B": labels})
    gdf_r = cudf.DataFrame({"join_col": right_keys, "B": labels})

    # "8.7" (left only) and "4.5" (right only) have no match and drop out.
    matched_keys = ["1.6", "9.5", "7.2", "2.3"]
    matched_labels = ["a", "b", "c", "e"]

    expected = cudf.DataFrame(
        {
            "join_col": cudf.Series(matched_keys).astype(dtype),
            "B_x": matched_labels,
            "B_y": matched_labels,
        }
    )

    got = gdf_l.merge(gdf_r, on="join_col", how="inner")

    assert_join_results_equal(expected, got, how="inner")
    assert_eq(dtype, got["join_col"].dtype)


@pytest.mark.parametrize(
    "dtype",
    [Decimal64Dtype(7, 3), Decimal64Dtype(9, 5), Decimal64Dtype(14, 10)],
)
def test_decimal_typecast_left(dtype):
    """Left join on Decimal64 keys keeps all left rows and the key dtype."""
    labels = ["a", "b", "c", "d"]

    left_keys = cudf.Series(["95.05", "384.26", "74.22", "1456.94"]).astype(
        dtype
    )
    right_keys = cudf.Series(
        ["95.05", "62.4056", "74.22", "1456.9472"]
    ).astype(dtype)

    gdf_l = cudf.DataFrame({"join_col": left_keys, "B": labels})
    gdf_r = cudf.DataFrame({"join_col": right_keys, "B": labels})

    # Only "95.05" and "74.22" match; unmatched left rows get null B_y.
    expected = cudf.DataFrame(
        {
            "join_col": cudf.Series(
                ["95.05", "74.22", "384.26", "1456.94"]
            ).astype(dtype),
            "B_x": ["a", "c", "b", "d"],
            "B_y": ["a", "c", None, None],
        }
    )

    got = gdf_l.merge(gdf_r, on="join_col", how="left")

    assert_join_results_equal(expected, got, how="left")
    assert_eq(dtype, got["join_col"].dtype)


@pytest.mark.parametrize(
    "dtype",
    [Decimal64Dtype(7, 3), Decimal64Dtype(10, 5), Decimal64Dtype(18, 9)],
)
def test_decimal_typecast_outer(dtype):
    """Outer join on Decimal64 keys keeps rows from both sides and the dtype."""
    labels = ["a", "b", "c"]

    left_keys = cudf.Series(["741.248", "1029.528", "3627.292"]).astype(dtype)
    right_keys = cudf.Series(["9284.103", "1029.528", "948.637"]).astype(
        dtype
    )

    gdf_l = cudf.DataFrame({"join_col": left_keys, "B": labels})
    gdf_r = cudf.DataFrame({"join_col": right_keys, "B": labels})

    # Only "1029.528" matches; each side's unmatched keys survive with
    # nulls in the other side's column.
    expected = cudf.DataFrame(
        {
            "join_col": cudf.Series(
                ["9284.103", "948.637", "1029.528", "741.248", "3627.292"]
            ).astype(dtype),
            "B_x": [None, None, "b", "a", "c"],
            "B_y": ["a", "c", "b", None, None],
        }
    )

    got = gdf_l.merge(gdf_r, on="join_col", how="outer")

    assert_join_results_equal(expected, got, how="outer")
    assert_eq(dtype, got["join_col"].dtype)


@pytest.mark.parametrize(
    "dtype_l", [Decimal64Dtype(7, 3), Decimal64Dtype(9, 5)],
)
@pytest.mark.parametrize(
    "dtype_r", [Decimal64Dtype(8, 3), Decimal64Dtype(11, 6)],
)
def test_mixed_decimal_typecast(dtype_l, dtype_r):
    """Merging decimal columns of differing precision/scale raises TypeError.

    Every parametrized (dtype_l, dtype_r) pair differs in precision and/or
    scale, so the merge must refuse to implicitly typecast.
    """
    other_data = ["a", "b", "c", "d"]

    # Fix: the original test cast the left data with dtype_r and the right
    # data with dtype_l. The outcome was identical (every pair mismatches),
    # but the assignments now match the variable names.
    join_data_l = cudf.Series(["95.05", "34.6", "74.22", "14.94"]).astype(
        dtype_l
    )
    join_data_r = cudf.Series(["95.05", "62.4056", "74.22", "1.42"]).astype(
        dtype_r
    )

    gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data})
    gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data})

    with pytest.raises(
        TypeError,
        match="Decimal columns can only be merged with decimal columns "
        "of the same precision and scale",
    ):
        gdf_l.merge(gdf_r, on="join_col", how="inner")


@pytest.mark.parametrize(
"dtype_l",
["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"],
Expand Down
4 changes: 2 additions & 2 deletions python/custreamz/dev_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
flake8==3.8.3
black==19.10b0
isort==5.0.7
dask>=2021.3.1
distributed>=2.22.0
dask==2021.3.1
distributed>=2.22.0,<=2021.3.1
streamz
python-confluent-kafka
pytest
Expand Down
61 changes: 19 additions & 42 deletions python/dask_cudf/dask_cudf/backends.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

from distutils.version import LooseVersion

import cupy as cp
import numpy as np
import pandas as pd
import pyarrow as pa

import dask
from dask.dataframe.categorical import categorical_dtype_dispatch
from dask.dataframe.core import get_parallel_type, make_meta, meta_nonempty
from dask.dataframe.methods import (
Expand All @@ -31,7 +28,6 @@
get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame)
get_parallel_type.register(cudf.Series, lambda _: Series)
get_parallel_type.register(cudf.Index, lambda _: Index)
DASK_VERSION = LooseVersion(dask.__version__)


@meta_nonempty.register(cudf.Index)
Expand Down Expand Up @@ -205,45 +201,26 @@ def make_meta_object(x, index=None):
raise TypeError(f"Don't know how to create metadata from {x}")


if DASK_VERSION > "2021.03.1":

@concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index))
def concat_cudf(
dfs,
axis=0,
join="outer",
uniform=False,
filter_warning=True,
sort=None,
ignore_index=False,
**kwargs,
):
assert join == "outer"

ignore_order = kwargs.get("ignore_order", False)
if ignore_order:
raise NotImplementedError(
"ignore_order parameter is not yet supported in dask-cudf"
)

return cudf.concat(dfs, axis=axis, ignore_index=ignore_index)


else:

@concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index))
def concat_cudf(
dfs,
axis=0,
join="outer",
uniform=False,
filter_warning=True,
sort=None,
ignore_index=False,
):
assert join == "outer"
@concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index))
def concat_cudf(
    dfs,
    axis=0,
    join="outer",
    uniform=False,
    filter_warning=True,
    sort=None,
    ignore_index=False,
    **kwargs,
):
    """Dask concat dispatch for cudf objects; only outer join is supported."""
    assert join == "outer"

    # ``ignore_order`` may arrive via **kwargs from newer dask versions but
    # has no cudf implementation yet — fail loudly rather than ignore it.
    if kwargs.get("ignore_order", False):
        raise NotImplementedError(
            "ignore_order parameter is not yet supported in dask-cudf"
        )

    return cudf.concat(dfs, axis=axis, ignore_index=ignore_index)


@categorical_dtype_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index))
Expand Down
Loading

0 comments on commit e71dbc1

Please sign in to comment.