From 9fa86792853145c7a86c812c82e878f00ea1ddc3 Mon Sep 17 00:00:00 2001 From: ChrisJar Date: Thu, 1 Apr 2021 21:56:51 -0500 Subject: [PATCH] Enable join on decimal columns (#7764) This enables joins on decimal columns with the same precision and scale. Closes #7497 Depends on #7788 Authors: - https://github.com/ChrisJar Approvers: - Keith Kraus (https://github.com/kkraus14) - Ashwin Srinath (https://github.com/shwina) - https://github.com/brandon-b-miller - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/7764 --- python/cudf/cudf/_lib/replace.pyx | 2 +- python/cudf/cudf/core/column/decimal.py | 14 +- python/cudf/cudf/core/join/_join_helpers.py | 8 ++ python/cudf/cudf/tests/test_joining.py | 135 +++++++++++++++++++- 4 files changed, 155 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/replace.pyx b/python/cudf/cudf/_lib/replace.pyx index 2732142dd15..cdedd3ac022 100644 --- a/python/cudf/cudf/_lib/replace.pyx +++ b/python/cudf/cudf/_lib/replace.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 4fee5060fe4..d9e4610832d 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -1,7 +1,7 @@ # Copyright (c) 2021, NVIDIA CORPORATION. from decimal import Decimal -from typing import cast +from typing import cast, Any import cupy as cp import numpy as np @@ -170,6 +170,18 @@ def sum_of_squares( "sum_of_squares", skipna=skipna, dtype=dtype, min_count=min_count ) + def fillna( + self, value: Any = None, method: str = None, dtype: Dtype = None + ): + """Fill null values with ``value``. + + Returns a copy with null filled. + """ + result = libcudf.replace.replace_nulls( + input_col=self, replacement=value, method=method, dtype=dtype + ) + return self._copy_type_metadata(result) + def _binop_scale(l_dtype, r_dtype, op): # This should at some point be hooked up to libcudf's diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 3807f408369..5e15ddfc359 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -97,6 +97,14 @@ def _match_join_keys( if pd.api.types.is_dtype_equal(ltype, rtype): return lcol, rcol + if isinstance(ltype, cudf.Decimal64Dtype) or isinstance( + rtype, cudf.Decimal64Dtype + ): + raise TypeError( + "Decimal columns can only be merged with decimal columns " + "of the same precision and scale" + ) + if (np.issubdtype(ltype, np.number)) and (np.issubdtype(rtype, np.number)): common_type = ( max(ltype, rtype) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 9164bfe98d1..2dae2bf1e97 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -6,7 +6,7 @@ import cudf from cudf.core._compat import PANDAS_GE_120 -from cudf.core.dtypes import CategoricalDtype +from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype from cudf.tests.utils import ( INTEGER_TYPES, NUMERIC_TYPES, @@ -89,7 +89,7 @@ def assert_join_results_equal(expect, got, how, **kwargs): ): # can't sort_values() on a df without columns return assert_eq(expect, got, **kwargs) - return assert_eq( + assert_eq( expect.sort_values(expect.columns.to_list()).reset_index( drop=True ), @@ -1152,6 +1152,137 @@ def test_typecast_on_join_overflow_unsafe(dtypes): merged = lhs.merge(rhs, on="a", how="left") # noqa: F841 +@pytest.mark.parametrize( + "dtype", + [Decimal64Dtype(5, 2), Decimal64Dtype(7, 5), Decimal64Dtype(12, 7)], +) +def test_decimal_typecast_inner(dtype): + other_data = ["a", "b", "c", "d", "e"] + + join_data_l = cudf.Series(["1.6", "9.5", "7.2", "8.7", "2.3"]).astype( + dtype + ) + join_data_r = cudf.Series(["1.6", "9.5", "7.2", "4.5", "2.3"]).astype( + dtype + ) + + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) + + exp_join_data = ["1.6", "9.5", "7.2", "2.3"] + exp_other_data = ["a", "b", "c", "e"] + + exp_join_col = cudf.Series(exp_join_data).astype(dtype) + + expected = cudf.DataFrame( + { + "join_col": exp_join_col, + "B_x": exp_other_data, + "B_y": exp_other_data, + } + ) + + got = gdf_l.merge(gdf_r, on="join_col", how="inner") + + assert_join_results_equal(expected, got, how="inner") + assert_eq(dtype, got["join_col"].dtype) + + +@pytest.mark.parametrize( + "dtype", + [Decimal64Dtype(7, 3), Decimal64Dtype(9, 5), Decimal64Dtype(14, 10)], +) +def test_decimal_typecast_left(dtype): + other_data = ["a", "b", "c", "d"] + + join_data_l = cudf.Series(["95.05", "384.26", "74.22", "1456.94"]).astype( + dtype + ) + join_data_r = cudf.Series( + ["95.05", "62.4056", "74.22", "1456.9472"] + ).astype(dtype) + + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) + + exp_join_data = ["95.05", "74.22", "384.26", "1456.94"] + exp_other_data_x = ["a", "c", "b", "d"] + exp_other_data_y = ["a", "c", None, None] + + exp_join_col = cudf.Series(exp_join_data).astype(dtype) + + expected = cudf.DataFrame( + { + "join_col": exp_join_col, + "B_x": exp_other_data_x, + "B_y": exp_other_data_y, + } + ) + + got = gdf_l.merge(gdf_r, on="join_col", how="left") + + assert_join_results_equal(expected, got, how="left") + assert_eq(dtype, got["join_col"].dtype) + + +@pytest.mark.parametrize( + "dtype", + [Decimal64Dtype(7, 3), Decimal64Dtype(10, 5), Decimal64Dtype(18, 9)], +) +def test_decimal_typecast_outer(dtype): + other_data = ["a", "b", "c"] + join_data_l = cudf.Series(["741.248", "1029.528", "3627.292"]).astype( + dtype + ) + join_data_r = cudf.Series(["9284.103", "1029.528", "948.637"]).astype( + dtype + ) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) + exp_join_data = ["9284.103", "948.637", "1029.528", "741.248", "3627.292"] + exp_other_data_x = [None, None, "b", "a", "c"] + exp_other_data_y = ["a", "c", "b", None, None] + exp_join_col = cudf.Series(exp_join_data).astype(dtype) + expected = cudf.DataFrame( + { + "join_col": exp_join_col, + "B_x": exp_other_data_x, + "B_y": exp_other_data_y, + } + ) + got = gdf_l.merge(gdf_r, on="join_col", how="outer") + + assert_join_results_equal(expected, got, how="outer") + assert_eq(dtype, got["join_col"].dtype) + + +@pytest.mark.parametrize( + "dtype_l", [Decimal64Dtype(7, 3), Decimal64Dtype(9, 5)], +) +@pytest.mark.parametrize( + "dtype_r", [Decimal64Dtype(8, 3), Decimal64Dtype(11, 6)], +) +def test_mixed_decimal_typecast(dtype_l, dtype_r): + other_data = ["a", "b", "c", "d"] + + join_data_l = cudf.Series(["95.05", "34.6", "74.22", "14.94"]).astype( + dtype_r + ) + join_data_r = cudf.Series(["95.05", "62.4056", "74.22", "1.42"]).astype( + dtype_l + ) + + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) + + with pytest.raises( + TypeError, + match="Decimal columns can only be merged with decimal columns " + "of the same precision and scale", + ): + gdf_l.merge(gdf_r, on="join_col", how="inner") + + @pytest.mark.parametrize( "dtype_l", ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"],