From 2e584834ed803ba7c519597b0960664f478efcb3 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Tue, 30 Mar 2021 13:33:14 -0700 Subject: [PATCH 1/6] Enable join on decimal columns --- python/cudf/cudf/core/column/column.py | 5 ++ python/cudf/cudf/core/join/_join_helpers.py | 8 +++ python/cudf/cudf/core/join/join.py | 23 +++++++ python/cudf/cudf/tests/test_joining.py | 75 ++++++++++++++++++++- 4 files changed, 110 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e59b395ec0f..eded703071e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1435,6 +1435,11 @@ def _copy_type_metadata(self: T, other: ColumnBase) -> ColumnBase: ): other = other._rename_fields(self.dtype.fields.keys()) + if isinstance(self.dtype, cudf.Decimal64Dtype) and isinstance( + other.dtype, cudf.Decimal64Dtype + ): + other.dtype.precision = self.dtype.precision + if type(self) is type(other): if self.base_children and other.base_children: base_children = tuple( diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 3807f408369..5e15ddfc359 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -97,6 +97,14 @@ def _match_join_keys( if pd.api.types.is_dtype_equal(ltype, rtype): return lcol, rcol + if isinstance(ltype, cudf.Decimal64Dtype) or isinstance( + rtype, cudf.Decimal64Dtype + ): + raise TypeError( + "Decimal columns can only be merged with decimal columns " + "of the same precision and scale" + ) + if (np.issubdtype(ltype, np.number)) and (np.issubdtype(rtype, np.number)): common_type = ( max(ltype, rtype) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 1a4826d0570..0f565c680df 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -464,6 +464,29 @@ def _restore_categorical_keys( ) return out_lhs, out_rhs + def _patch_decimal_precision( + self, lhs: Frame, rhs: Frame + ) -> Tuple[Frame, Frame]: + out_lhs = lhs.copy(deep=False) + out_rhs = rhs.copy(deep=False) + for left_key, right_key in zip(*self._keys): + if isinstance( + left_key.get(self.lhs).dtype, cudf.Decimal64Dtype + ) and isinstance( + right_key.get(self.rhs).dtype, cudf.Decimal64Dtype + ): + left_key.set( + out_lhs, + left_key.get(out_lhs).astype("category"), + validate=False, + ) + right_key.set( + out_rhs, + right_key.get(out_rhs).astype("category"), + validate=False, + ) + return out_lhs, out_rhs + class MergeSemi(Merge): def __init__(self, *args, **kwargs): diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 9164bfe98d1..fe5aa480c16 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -6,7 +6,7 @@ import cudf from cudf.core._compat import PANDAS_GE_120 -from cudf.core.dtypes import CategoricalDtype +from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype from cudf.tests.utils import ( INTEGER_TYPES, NUMERIC_TYPES, @@ -1152,6 +1152,79 @@ def test_typecast_on_join_overflow_unsafe(dtypes): merged = lhs.merge(rhs, on="a", how="left") # noqa: F841 +@pytest.mark.parametrize( + "dtype", + [Decimal64Dtype(5, 2), Decimal64Dtype(7, 5), Decimal64Dtype(12, 7)], +) +def test_decimal_typecast_inner(dtype): + other_data = ["a", "b", "c", "d", "e"] + + join_data_l = cudf.Series(["1.6", "9.5", "7.2", "8.7", "2.3"]).astype( + dtype + ) + join_data_r = cudf.Series(["1.6", "9.5", "7.2", "4.5", "2.3"]).astype( + dtype + ) + + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) + + exp_join_data = ["1.6", "9.5", "7.2", "2.3"] + exp_other_data = ["a", "b", "c", "e"] + + exp_join_col = cudf.Series(exp_join_data).astype(dtype) + + expected = cudf.DataFrame( + { + "join_col": exp_join_col, + "B_x": exp_other_data, + "B_y": exp_other_data, + } + ) + + got = gdf_l.merge(gdf_r, on="join_col", how="inner") + + assert_eq(got, expected) + assert_eq(got["join_col"].dtype, dtype) + + +@pytest.mark.parametrize( + "dtype", + [Decimal64Dtype(7, 3), Decimal64Dtype(9, 5), Decimal64Dtype(14, 10)], +) +def test_decimal_typecast_left(dtype): + other_data = ["a", "b", "c", "d"] + + join_data_l = cudf.Series(["95.05", "384.26", "74.22", "1456.94"]).astype( + dtype + ) + join_data_r = cudf.Series( + ["95.05", "62.4056", "74.22", "1456.9472"] + ).astype(dtype) + + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) + + exp_join_data = ["95.05", "74.22", "384.26", "1456.94"] + exp_other_data_x = ["a", "c", "b", "d"] + exp_other_data_y = ["a", "c", None, None] + + exp_join_col = cudf.Series(exp_join_data).astype(dtype) + + expected = cudf.DataFrame( + { + "join_col": exp_join_col, + "B_x": exp_other_data_x, + "B_y": exp_other_data_y, + } + ) + + got = gdf_l.merge(gdf_r, on="join_col", how="left") + + assert_eq(got, expected) + assert_eq(got["join_col"].dtype, dtype) + + @pytest.mark.parametrize( "dtype_l", ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], From dcdbf6048a8cb34d1385df6dbfce2efc52f6f65a Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Tue, 30 Mar 2021 18:29:32 -0700 Subject: [PATCH 2/6] Clean unused code --- python/cudf/cudf/core/column/column.py | 2 ++ python/cudf/cudf/core/join/join.py | 23 ----------------------- 2 files changed, 2 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index eded703071e..f6319e84d69 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1412,6 +1412,8 @@ def _copy_type_metadata(self: T, other: ColumnBase) -> ColumnBase: of `other` and the categories of `self`. * when both `self` and `other` are StructColumns, rename the fields of `other` to the field names of `self`. + * when `self` and `other` are DecimalColumns, copy the precision + over from `self` to `other`. * when `self` and `other` are nested columns of the same type, recursively apply this function on the children of `self` to the and the children of `other`. diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 0f565c680df..1a4826d0570 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -464,29 +464,6 @@ def _restore_categorical_keys( ) return out_lhs, out_rhs - def _patch_decimal_precision( - self, lhs: Frame, rhs: Frame - ) -> Tuple[Frame, Frame]: - out_lhs = lhs.copy(deep=False) - out_rhs = rhs.copy(deep=False) - for left_key, right_key in zip(*self._keys): - if isinstance( - left_key.get(self.lhs).dtype, cudf.Decimal64Dtype - ) and isinstance( - right_key.get(self.rhs).dtype, cudf.Decimal64Dtype - ): - left_key.set( - out_lhs, - left_key.get(out_lhs).astype("category"), - validate=False, - ) - right_key.set( - out_rhs, - right_key.get(out_rhs).astype("category"), - validate=False, - ) - return out_lhs, out_rhs - class MergeSemi(Merge): def __init__(self, *args, **kwargs): From 56c2483a57931d975bed7c9d45a3bfd0ff05d36f Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 31 Mar 2021 09:06:46 -0700 Subject: [PATCH 3/6] Add tests for outer join --- python/cudf/cudf/_lib/replace.pyx | 12 ++++++--- python/cudf/cudf/tests/test_joining.py | 35 ++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/replace.pyx b/python/cudf/cudf/_lib/replace.pyx index 2732142dd15..55e30e9f9ba 100644 --- a/python/cudf/cudf/_lib/replace.pyx +++ b/python/cudf/cudf/_lib/replace.pyx @@ -3,7 +3,7 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cudf.utils.dtypes import is_scalar +from cudf.utils.dtypes import is_scalar, is_decimal_dtype from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar @@ -137,14 +137,18 @@ def replace_nulls( raise ValueError("Cannot specify both 'value' and 'method'.") if method: - return replace_nulls_fill(input_col, method) + result = replace_nulls_fill(input_col, method) elif is_scalar(replacement): - return replace_nulls_scalar( + result = replace_nulls_scalar( input_col, as_device_scalar(replacement, dtype=dtype) ) else: - return replace_nulls_column(input_col, replacement) + result = replace_nulls_column(input_col, replacement) + + if is_decimal_dtype(result.dtype) and is_decimal_dtype(input_col.dtype): + result.dtype.precision = input_col.dtype.precision + return result def clamp(Column input_col, DeviceScalar lo, DeviceScalar lo_replace, diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index fe5aa480c16..0d1e6ff43e2 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -1184,7 +1184,7 @@ def test_decimal_typecast_inner(dtype): got = gdf_l.merge(gdf_r, on="join_col", how="inner") - assert_eq(got, expected) + assert_join_results_equal(got, expected, how="inner") assert_eq(got["join_col"].dtype, dtype) @@ -1221,7 +1221,38 @@ def test_decimal_typecast_left(dtype): got = gdf_l.merge(gdf_r, on="join_col", how="left") - assert_eq(got, expected) + assert_join_results_equal(got, expected, how="left") + assert_eq(got["join_col"].dtype, dtype) + + +@pytest.mark.parametrize( + "dtype", + [Decimal64Dtype(7, 3), Decimal64Dtype(10, 5), Decimal64Dtype(18, 9)], +) +def test_decimal_typecast_outer(dtype): + other_data = ["a", "b", "c"] + join_data_l = cudf.Series(["741.248", "1029.528", "3627.292"]).astype( + dtype + ) + join_data_r = cudf.Series(["9284.103", "1029.528", "948.637"]).astype( + dtype + ) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) + exp_join_data = ["9284.103", "948.637", "1029.528", "741.248", "3627.292"] + exp_other_data_x = [None, None, "b", "a", "c"] + exp_other_data_y = ["a", "c", "b", None, None] + exp_join_col = cudf.Series(exp_join_data).astype(dtype) + expected = cudf.DataFrame( + { + "join_col": exp_join_col, + "B_x": exp_other_data_x, + "B_y": exp_other_data_y, + } + ) + got = gdf_l.merge(gdf_r, on="join_col", how="outer") + + assert_join_results_equal(got, expected, how="outer") assert_eq(got["join_col"].dtype, dtype) From 3db44caecfabf2fe8a06968d7037f01ac5046f20 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 31 Mar 2021 09:11:51 -0700 Subject: [PATCH 4/6] Update copyright --- python/cudf/cudf/_lib/replace.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/replace.pyx b/python/cudf/cudf/_lib/replace.pyx index 55e30e9f9ba..ef850532f55 100644 --- a/python/cudf/cudf/_lib/replace.pyx +++ b/python/cudf/cudf/_lib/replace.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move From e2bbd21e640856d3d5b4090532c78900906923c5 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Thu, 1 Apr 2021 15:05:08 -0700 Subject: [PATCH 5/6] Update to use _copy_type_metadata --- python/cudf/cudf/_lib/replace.pyx | 12 +++---- python/cudf/cudf/core/column/decimal.py | 14 +++++++- python/cudf/cudf/tests/test_joining.py | 44 +++++++++++++++++++++---- 3 files changed, 54 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/_lib/replace.pyx b/python/cudf/cudf/_lib/replace.pyx index ef850532f55..cdedd3ac022 100644 --- a/python/cudf/cudf/_lib/replace.pyx +++ b/python/cudf/cudf/_lib/replace.pyx @@ -3,7 +3,7 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cudf.utils.dtypes import is_scalar, is_decimal_dtype +from cudf.utils.dtypes import is_scalar from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar @@ -137,18 +137,14 @@ def replace_nulls( raise ValueError("Cannot specify both 'value' and 'method'.") if method: - result = replace_nulls_fill(input_col, method) + return replace_nulls_fill(input_col, method) elif is_scalar(replacement): - result = replace_nulls_scalar( + return replace_nulls_scalar( input_col, as_device_scalar(replacement, dtype=dtype) ) else: - result = replace_nulls_column(input_col, replacement) - - if is_decimal_dtype(result.dtype) and is_decimal_dtype(input_col.dtype): - result.dtype.precision = input_col.dtype.precision - return result + return replace_nulls_column(input_col, replacement) def clamp(Column input_col, DeviceScalar lo, DeviceScalar lo_replace, diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 4fee5060fe4..d9e4610832d 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -1,7 +1,7 @@ # Copyright (c) 2021, NVIDIA CORPORATION. from decimal import Decimal -from typing import cast +from typing import cast, Any import cupy as cp import numpy as np @@ -170,6 +170,18 @@ def sum_of_squares( "sum_of_squares", skipna=skipna, dtype=dtype, min_count=min_count ) + def fillna( + self, value: Any = None, method: str = None, dtype: Dtype = None + ): + """Fill null values with ``value``. + + Returns a copy with null filled. + """ + result = libcudf.replace.replace_nulls( + input_col=self, replacement=value, method=method, dtype=dtype + ) + return self._copy_type_metadata(result) + def _binop_scale(l_dtype, r_dtype, op): # This should at some point be hooked up to libcudf's diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 0d1e6ff43e2..e40d063e9b0 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd import pytest +import re import cudf from cudf.core._compat import PANDAS_GE_120 @@ -89,7 +90,7 @@ def assert_join_results_equal(expect, got, how, **kwargs): ): # can't sort_values() on a df without columns return assert_eq(expect, got, **kwargs) - return assert_eq( + assert_eq( expect.sort_values(expect.columns.to_list()).reset_index( drop=True ), @@ -1184,8 +1185,8 @@ def test_decimal_typecast_inner(dtype): got = gdf_l.merge(gdf_r, on="join_col", how="inner") - assert_join_results_equal(got, expected, how="inner") - assert_eq(got["join_col"].dtype, dtype) + assert_join_results_equal(expected, got, how="inner") + assert_eq(dtype, got["join_col"].dtype) @pytest.mark.parametrize( @@ -1221,8 +1222,8 @@ def test_decimal_typecast_left(dtype): got = gdf_l.merge(gdf_r, on="join_col", how="left") - assert_join_results_equal(got, expected, how="left") - assert_eq(got["join_col"].dtype, dtype) + assert_join_results_equal(expected, got, how="left") + assert_eq(dtype, got["join_col"].dtype) @pytest.mark.parametrize( @@ -1252,8 +1253,37 @@ def test_decimal_typecast_outer(dtype): ) got = gdf_l.merge(gdf_r, on="join_col", how="outer") - assert_join_results_equal(got, expected, how="outer") - assert_eq(got["join_col"].dtype, dtype) + assert_join_results_equal(expected, got, how="outer") + assert_eq(dtype, got["join_col"].dtype) + + +@pytest.mark.parametrize( + "dtype_l", [Decimal64Dtype(7, 3), Decimal64Dtype(9, 5)], +) +@pytest.mark.parametrize( + "dtype_r", [Decimal64Dtype(8, 3), Decimal64Dtype(11, 6)], +) +def test_mixed_decimal_typecast(dtype_l, dtype_r): + other_data = ["a", "b", "c", "d"] + + join_data_l = cudf.Series(["95.05", "34.6", "74.22", "14.94"]).astype( + dtype_r + ) + join_data_r = cudf.Series(["95.05", "62.4056", "74.22", "1.42"]).astype( + dtype_l + ) + + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) + + with pytest.raises( + TypeError, + match=re.escape( + "Decimal columns can only be merged with decimal columns " + "of the same precision and scale" + ), + ): + gdf_l.merge(gdf_r, on="join_col", how="inner") @pytest.mark.parametrize( From bbd4bd2f8d30138476933ec3caa83eaba1a3a17b Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Thu, 1 Apr 2021 15:45:30 -0700 Subject: [PATCH 6/6] Cleanup mixed decimal type test --- python/cudf/cudf/tests/test_joining.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index e40d063e9b0..2dae2bf1e97 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -3,7 +3,6 @@ import numpy as np import pandas as pd import pytest -import re import cudf from cudf.core._compat import PANDAS_GE_120 @@ -1278,10 +1277,8 @@ def test_mixed_decimal_typecast(dtype_l, dtype_r): with pytest.raises( TypeError, - match=re.escape( - "Decimal columns can only be merged with decimal columns " - "of the same precision and scale" - ), + match="Decimal columns can only be merged with decimal columns " + "of the same precision and scale", ): gdf_l.merge(gdf_r, on="join_col", how="inner")