Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Enable join on decimal columns #7764

Merged
merged 9 commits into from
Apr 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/replace.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
Expand Down
14 changes: 13 additions & 1 deletion python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from decimal import Decimal
from typing import cast
from typing import cast, Any

import cupy as cp
import numpy as np
Expand Down Expand Up @@ -170,6 +170,18 @@ def sum_of_squares(
"sum_of_squares", skipna=skipna, dtype=dtype, min_count=min_count
)

def fillna(
self, value: Any = None, method: str = None, dtype: Dtype = None
):
"""Fill null values with ``value``.

Returns a copy with null filled.
"""
result = libcudf.replace.replace_nulls(
input_col=self, replacement=value, method=method, dtype=dtype
)
return self._copy_type_metadata(result)


def _binop_scale(l_dtype, r_dtype, op):
# This should at some point be hooked up to libcudf's
Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/core/join/_join_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,14 @@ def _match_join_keys(
if pd.api.types.is_dtype_equal(ltype, rtype):
return lcol, rcol

if isinstance(ltype, cudf.Decimal64Dtype) or isinstance(
kkraus14 marked this conversation as resolved.
Show resolved Hide resolved
rtype, cudf.Decimal64Dtype
):
raise TypeError(
"Decimal columns can only be merged with decimal columns "
"of the same precision and scale"
)

if (np.issubdtype(ltype, np.number)) and (np.issubdtype(rtype, np.number)):
common_type = (
max(ltype, rtype)
Expand Down
135 changes: 133 additions & 2 deletions python/cudf/cudf/tests/test_joining.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import cudf
from cudf.core._compat import PANDAS_GE_120
from cudf.core.dtypes import CategoricalDtype
from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype
from cudf.tests.utils import (
INTEGER_TYPES,
NUMERIC_TYPES,
Expand Down Expand Up @@ -89,7 +89,7 @@ def assert_join_results_equal(expect, got, how, **kwargs):
): # can't sort_values() on a df without columns
return assert_eq(expect, got, **kwargs)

return assert_eq(
assert_eq(
expect.sort_values(expect.columns.to_list()).reset_index(
drop=True
),
Expand Down Expand Up @@ -1152,6 +1152,137 @@ def test_typecast_on_join_overflow_unsafe(dtypes):
merged = lhs.merge(rhs, on="a", how="left") # noqa: F841


ChrisJar marked this conversation as resolved.
Show resolved Hide resolved
@pytest.mark.parametrize(
"dtype",
[Decimal64Dtype(5, 2), Decimal64Dtype(7, 5), Decimal64Dtype(12, 7)],
)
def test_decimal_typecast_inner(dtype):
other_data = ["a", "b", "c", "d", "e"]

join_data_l = cudf.Series(["1.6", "9.5", "7.2", "8.7", "2.3"]).astype(
dtype
)
join_data_r = cudf.Series(["1.6", "9.5", "7.2", "4.5", "2.3"]).astype(
dtype
)

gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data})
gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data})

exp_join_data = ["1.6", "9.5", "7.2", "2.3"]
exp_other_data = ["a", "b", "c", "e"]

exp_join_col = cudf.Series(exp_join_data).astype(dtype)

expected = cudf.DataFrame(
{
"join_col": exp_join_col,
"B_x": exp_other_data,
"B_y": exp_other_data,
}
)

got = gdf_l.merge(gdf_r, on="join_col", how="inner")

assert_join_results_equal(expected, got, how="inner")
assert_eq(dtype, got["join_col"].dtype)


@pytest.mark.parametrize(
"dtype",
[Decimal64Dtype(7, 3), Decimal64Dtype(9, 5), Decimal64Dtype(14, 10)],
)
def test_decimal_typecast_left(dtype):
other_data = ["a", "b", "c", "d"]

join_data_l = cudf.Series(["95.05", "384.26", "74.22", "1456.94"]).astype(
dtype
)
join_data_r = cudf.Series(
["95.05", "62.4056", "74.22", "1456.9472"]
).astype(dtype)

gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data})
gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data})

exp_join_data = ["95.05", "74.22", "384.26", "1456.94"]
exp_other_data_x = ["a", "c", "b", "d"]
exp_other_data_y = ["a", "c", None, None]

exp_join_col = cudf.Series(exp_join_data).astype(dtype)

expected = cudf.DataFrame(
{
"join_col": exp_join_col,
"B_x": exp_other_data_x,
"B_y": exp_other_data_y,
}
)

got = gdf_l.merge(gdf_r, on="join_col", how="left")

assert_join_results_equal(expected, got, how="left")
assert_eq(dtype, got["join_col"].dtype)


@pytest.mark.parametrize(
"dtype",
[Decimal64Dtype(7, 3), Decimal64Dtype(10, 5), Decimal64Dtype(18, 9)],
)
def test_decimal_typecast_outer(dtype):
other_data = ["a", "b", "c"]
join_data_l = cudf.Series(["741.248", "1029.528", "3627.292"]).astype(
dtype
)
join_data_r = cudf.Series(["9284.103", "1029.528", "948.637"]).astype(
dtype
)
gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data})
gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data})
exp_join_data = ["9284.103", "948.637", "1029.528", "741.248", "3627.292"]
exp_other_data_x = [None, None, "b", "a", "c"]
exp_other_data_y = ["a", "c", "b", None, None]
exp_join_col = cudf.Series(exp_join_data).astype(dtype)
expected = cudf.DataFrame(
{
"join_col": exp_join_col,
"B_x": exp_other_data_x,
"B_y": exp_other_data_y,
}
)
got = gdf_l.merge(gdf_r, on="join_col", how="outer")

assert_join_results_equal(expected, got, how="outer")
assert_eq(dtype, got["join_col"].dtype)


@pytest.mark.parametrize(
"dtype_l", [Decimal64Dtype(7, 3), Decimal64Dtype(9, 5)],
)
@pytest.mark.parametrize(
"dtype_r", [Decimal64Dtype(8, 3), Decimal64Dtype(11, 6)],
)
def test_mixed_decimal_typecast(dtype_l, dtype_r):
other_data = ["a", "b", "c", "d"]

join_data_l = cudf.Series(["95.05", "34.6", "74.22", "14.94"]).astype(
dtype_r
)
join_data_r = cudf.Series(["95.05", "62.4056", "74.22", "1.42"]).astype(
dtype_l
)

gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data})
gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data})

with pytest.raises(
TypeError,
match="Decimal columns can only be merged with decimal columns "
"of the same precision and scale",
):
gdf_l.merge(gdf_r, on="join_col", how="inner")


@pytest.mark.parametrize(
"dtype_l",
["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"],
Expand Down