Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Enable join on decimal columns #7764

Merged
merged 9 commits into from
Apr 2, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions python/cudf/cudf/_lib/replace.pyx
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf.utils.dtypes import is_scalar
from cudf.utils.dtypes import is_scalar, is_decimal_dtype

from cudf._lib.column cimport Column
from cudf._lib.scalar import as_device_scalar
Expand Down Expand Up @@ -137,14 +137,18 @@ def replace_nulls(
raise ValueError("Cannot specify both 'value' and 'method'.")

if method:
return replace_nulls_fill(input_col, method)
result = replace_nulls_fill(input_col, method)
elif is_scalar(replacement):
return replace_nulls_scalar(
result = replace_nulls_scalar(
input_col,
as_device_scalar(replacement, dtype=dtype)
)
else:
return replace_nulls_column(input_col, replacement)
result = replace_nulls_column(input_col, replacement)

if is_decimal_dtype(result.dtype) and is_decimal_dtype(input_col.dtype):
result.dtype.precision = input_col.dtype.precision
kkraus14 marked this conversation as resolved.
Show resolved Hide resolved
return result


def clamp(Column input_col, DeviceScalar lo, DeviceScalar lo_replace,
Expand Down
7 changes: 7 additions & 0 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1412,6 +1412,8 @@ def _copy_type_metadata(self: T, other: ColumnBase) -> ColumnBase:
of `other` and the categories of `self`.
* when both `self` and `other` are StructColumns, rename the fields
of `other` to the field names of `self`.
* when `self` and `other` are DecimalColumns, copy the precision
over from `self` to `other`.
* when `self` and `other` are nested columns of the same type,
recursively apply this function on the children of `self` and
the children of `other`.
Expand All @@ -1435,6 +1437,11 @@ def _copy_type_metadata(self: T, other: ColumnBase) -> ColumnBase:
):
other = other._rename_fields(self.dtype.fields.keys())

if isinstance(self.dtype, cudf.Decimal64Dtype) and isinstance(
kkraus14 marked this conversation as resolved.
Show resolved Hide resolved
other.dtype, cudf.Decimal64Dtype
):
other.dtype.precision = self.dtype.precision

if type(self) is type(other):
if self.base_children and other.base_children:
base_children = tuple(
Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/core/join/_join_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,14 @@ def _match_join_keys(
if pd.api.types.is_dtype_equal(ltype, rtype):
return lcol, rcol

if isinstance(ltype, cudf.Decimal64Dtype) or isinstance(
kkraus14 marked this conversation as resolved.
Show resolved Hide resolved
rtype, cudf.Decimal64Dtype
):
raise TypeError(
"Decimal columns can only be merged with decimal columns "
"of the same precision and scale"
)

if (np.issubdtype(ltype, np.number)) and (np.issubdtype(rtype, np.number)):
common_type = (
max(ltype, rtype)
Expand Down
106 changes: 105 additions & 1 deletion python/cudf/cudf/tests/test_joining.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import cudf
from cudf.core._compat import PANDAS_GE_120
from cudf.core.dtypes import CategoricalDtype
from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype
from cudf.tests.utils import (
INTEGER_TYPES,
NUMERIC_TYPES,
Expand Down Expand Up @@ -1152,6 +1152,110 @@ def test_typecast_on_join_overflow_unsafe(dtypes):
merged = lhs.merge(rhs, on="a", how="left") # noqa: F841


ChrisJar marked this conversation as resolved.
Show resolved Hide resolved
@pytest.mark.parametrize(
    "dtype",
    [Decimal64Dtype(5, 2), Decimal64Dtype(7, 5), Decimal64Dtype(12, 7)],
)
def test_decimal_typecast_inner(dtype):
    """Inner-merge two frames on a decimal key column of the same dtype.

    The expected frame keeps only the key values that occur on both
    sides, and the merged key column must still carry ``dtype``.
    """
    labels = ["a", "b", "c", "d", "e"]
    left_keys = ["1.6", "9.5", "7.2", "8.7", "2.3"]
    right_keys = ["1.6", "9.5", "7.2", "4.5", "2.3"]

    left = cudf.DataFrame(
        {"join_col": cudf.Series(left_keys).astype(dtype), "B": labels}
    )
    right = cudf.DataFrame(
        {"join_col": cudf.Series(right_keys).astype(dtype), "B": labels}
    )

    # The fourth key differs between the two sides ("8.7" vs "4.5"), so
    # the row labelled "d" is absent from the expected inner-join result.
    matched_keys = ["1.6", "9.5", "7.2", "2.3"]
    matched_labels = ["a", "b", "c", "e"]
    expected = cudf.DataFrame(
        {
            "join_col": cudf.Series(matched_keys).astype(dtype),
            "B_x": matched_labels,
            "B_y": matched_labels,
        }
    )

    result = left.merge(right, on="join_col", how="inner")

    assert_join_results_equal(result, expected, how="inner")
    assert_eq(result["join_col"].dtype, dtype)


@pytest.mark.parametrize(
    "dtype",
    [Decimal64Dtype(7, 3), Decimal64Dtype(9, 5), Decimal64Dtype(14, 10)],
)
def test_decimal_typecast_left(dtype):
    """Left-merge two frames on a decimal key column of the same dtype.

    Every left row is expected in the result; left rows without a
    matching right key get a null ``B_y``. The merged key column must
    still carry ``dtype``.
    """
    labels = ["a", "b", "c", "d"]
    left_keys = ["95.05", "384.26", "74.22", "1456.94"]
    right_keys = ["95.05", "62.4056", "74.22", "1456.9472"]

    left = cudf.DataFrame(
        {"join_col": cudf.Series(left_keys).astype(dtype), "B": labels}
    )
    right = cudf.DataFrame(
        {"join_col": cudf.Series(right_keys).astype(dtype), "B": labels}
    )

    # Expected layout: the two matched rows ("a", "c") first, then the
    # unmatched left rows ("b", "d") with no right-hand label.
    expected = cudf.DataFrame(
        {
            "join_col": cudf.Series(
                ["95.05", "74.22", "384.26", "1456.94"]
            ).astype(dtype),
            "B_x": ["a", "c", "b", "d"],
            "B_y": ["a", "c", None, None],
        }
    )

    result = left.merge(right, on="join_col", how="left")

    assert_join_results_equal(result, expected, how="left")
    assert_eq(result["join_col"].dtype, dtype)


@pytest.mark.parametrize(
    "dtype",
    [Decimal64Dtype(7, 3), Decimal64Dtype(10, 5), Decimal64Dtype(18, 9)],
)
def test_decimal_typecast_outer(dtype):
    """Outer-merge two frames on a decimal key column of the same dtype.

    The expected frame holds every key from either side, with a null
    label on whichever side lacks that key. The merged key column must
    still carry ``dtype``.
    """
    labels = ["a", "b", "c"]
    left_keys = ["741.248", "1029.528", "3627.292"]
    right_keys = ["9284.103", "1029.528", "948.637"]

    left = cudf.DataFrame(
        {"join_col": cudf.Series(left_keys).astype(dtype), "B": labels}
    )
    right = cudf.DataFrame(
        {"join_col": cudf.Series(right_keys).astype(dtype), "B": labels}
    )

    # Expected layout: right-only keys, then the single shared key
    # ("1029.528"), then left-only keys.
    all_keys = ["9284.103", "948.637", "1029.528", "741.248", "3627.292"]
    expected = cudf.DataFrame(
        {
            "join_col": cudf.Series(all_keys).astype(dtype),
            "B_x": [None, None, "b", "a", "c"],
            "B_y": ["a", "c", "b", None, None],
        }
    )

    result = left.merge(right, on="join_col", how="outer")

    assert_join_results_equal(result, expected, how="outer")
    assert_eq(result["join_col"].dtype, dtype)


@pytest.mark.parametrize(
"dtype_l",
["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"],
Expand Down