From 0571d18041a3f60ff2c4d8076a60988dfaf240ef Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Tue, 18 May 2021 15:08:44 -0700 Subject: [PATCH 1/2] Enable implicit casting when concatenating columns of decimal and numeric types --- python/cudf/cudf/core/frame.py | 20 +-- python/cudf/cudf/core/series.py | 26 +++- python/cudf/cudf/tests/test_concat.py | 194 ++++++++++++++++++++++++++ python/cudf/cudf/utils/dtypes.py | 17 ++- 4 files changed, 242 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 25552009444..870bef484d4 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -32,6 +32,7 @@ is_decimal_dtype, is_scalar, min_scalar_type, + _find_common_type_decimal, ) T = TypeVar("T", bound="Frame") @@ -3971,14 +3972,17 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): elif all( isinstance(col, cudf.core.column.DecimalColumn) for col in cols ): - # Find the largest scale and the largest difference between - # precision and scale of the columns to be concatenated - s = max([col.dtype.scale for col in cols]) - lhs = max([col.dtype.precision - col.dtype.scale for col in cols]) - # Combine to get the necessary precision and clip at the maximum - # precision - p = min(cudf.Decimal64Dtype.MAX_PRECISION, s + lhs) - dtypes[idx] = cudf.Decimal64Dtype(p, s) + dtypes[idx] = _find_common_type_decimal( + [col.dtype for col in cols] + ) + elif all( + isinstance(col, cudf.core.column.DecimalColumn) + or is_numerical_dtype(col.dtype) + for col in cols + ): + dtypes[idx] = _find_common_type_decimal( + [col.dtype for col in cols if is_decimal_dtype(col.dtype)] + ) # Otherwise raise an error if columns have different dtypes elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols): raise ValueError("All columns must be the same type") diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 7b1e6454394..0559e2ca7df 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -47,6 +47,7 @@ from cudf.utils.dtypes import ( _decimal_normalize_types, can_convert_to_column, + is_numerical_dtype, is_decimal_dtype, is_list_dtype, is_list_like, @@ -54,6 +55,7 @@ is_scalar, min_scalar_type, numeric_normalize_types, + _find_common_type_decimal, ) from cudf.utils.utils import ( get_appropriate_dispatched_func, @@ -2402,10 +2404,30 @@ def _concat(cls, objs, axis=0, index=True): ) if dtype_mismatch: - if isinstance(objs[0]._column, cudf.core.column.DecimalColumn): + if all( + [ + isinstance(obj._column, cudf.core.column.DecimalColumn) + for obj in objs + ] + ): objs = _decimal_normalize_types(*objs) - else: + elif all([is_numerical_dtype(obj.dtype) for obj in objs]): objs = numeric_normalize_types(*objs) + elif all( + [ + isinstance(obj._column, cudf.core.column.DecimalColumn) + or is_numerical_dtype(obj.dtype) + for obj in objs + ] + ): + decimal_type = _find_common_type_decimal( + [ + obj.dtype + for obj in objs + if is_decimal_dtype(obj.dtype) + ] + ) + objs = [obj.astype(decimal_type) for obj in objs] col = ColumnBase._concat([o._column for o in objs]) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 31dc6012905..b37b772e612 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd import pytest +from decimal import Decimal import cudf as gd from cudf.tests.utils import assert_eq, assert_exceptions_equal @@ -1262,3 +1263,196 @@ def test_concat_decimal_series(ltype, rtype): expected = pd.concat([ps1, ps2]) assert_eq(expected, got) + + +@pytest.mark.parametrize( + "df1, df2, df3, expected", + [ + ( + gd.DataFrame( + {"val": [Decimal("42.5"), Decimal("8.7")]}, + dtype=Decimal64Dtype(5, 2), + ), + gd.DataFrame( + {"val": [Decimal("9.23"), Decimal("-67.49")]}, + dtype=Decimal64Dtype(6, 4), + ), + gd.DataFrame({"val": [8, -5]}, dtype="int32"), + gd.DataFrame( + { + "val": [ + Decimal("42.5"), + Decimal("8.7"), + Decimal("9.23"), + Decimal("-67.49"), + Decimal("8"), + Decimal("-5"), + ] + }, + dtype=Decimal64Dtype(7, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.DataFrame( + {"val": [Decimal("95.2"), Decimal("23.4")]}, + dtype=Decimal64Dtype(5, 2), + ), + gd.DataFrame({"val": [54, 509]}, dtype="uint16"), + gd.DataFrame({"val": [24, -48]}, dtype="int32"), + gd.DataFrame( + { + "val": [ + Decimal("95.2"), + Decimal("23.4"), + Decimal("54"), + Decimal("509"), + Decimal("24"), + Decimal("-48"), + ] + }, + dtype=Decimal64Dtype(5, 2), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.DataFrame( + {"val": [Decimal("36.56"), Decimal("-59.24")]}, + dtype=Decimal64Dtype(9, 4), + ), + gd.DataFrame({"val": [403.21, 45.13]}, dtype="float32"), + gd.DataFrame({"val": [52.262, -49.25]}, dtype="float64"), + gd.DataFrame( + { + "val": [ + Decimal("36.56"), + Decimal("-59.24"), + Decimal("403.21"), + Decimal("45.13"), + Decimal("52.262"), + Decimal("-49.25"), + ] + }, + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.DataFrame( + {"val": [Decimal("9563.24"), Decimal("236.633")]}, + dtype=Decimal64Dtype(9, 4), + ), + gd.DataFrame({"val": [5393, -95832]}, dtype="int64"), + gd.DataFrame({"val": [-29.234, -31.945]}, dtype="float64"), + gd.DataFrame( + { + "val": [ + Decimal("9563.24"), + Decimal("236.633"), + Decimal("5393"), + Decimal("-95832"), + Decimal("-29.234"), + Decimal("-31.945"), + ] + }, + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ], +) +def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): + df = gd.concat([df1, df2, df3]) + assert_eq(df, expected) + assert_eq(df.val.dtype, expected.val.dtype) + + +@pytest.mark.parametrize( + "s1, s2, s3, expected", + [ + ( + gd.Series( + [Decimal("32.8"), Decimal("-87.7")], dtype=Decimal64Dtype(6, 2) + ), + gd.Series( + [Decimal("101.243"), Decimal("-92.449")], + dtype=Decimal64Dtype(9, 6), + ), + gd.Series([94, -22], dtype="int32"), + gd.Series( + [ + Decimal("32.8"), + Decimal("-87.7"), + Decimal("101.243"), + Decimal("-92.449"), + Decimal("94"), + Decimal("-22"), + ], + dtype=Decimal64Dtype(10, 6), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("7.2"), Decimal("122.1")], dtype=Decimal64Dtype(5, 2) + ), + gd.Series([33, 984], dtype="uint32"), + gd.Series([593, -702], dtype="int32"), + gd.Series( + [ + Decimal("7.2"), + Decimal("122.1"), + Decimal("33"), + Decimal("984"), + Decimal("593"), + Decimal("-702"), + ], + dtype=Decimal64Dtype(5, 2), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("982.94"), Decimal("-493.626")], + dtype=Decimal64Dtype(9, 4), + ), + gd.Series([847.98, 254.442], dtype="float32"), + gd.Series([5299.262, -2049.25], dtype="float64"), + gd.Series( + [ + Decimal("982.94"), + Decimal("-493.626"), + Decimal("847.98"), + Decimal("254.442"), + Decimal("5299.262"), + Decimal("-2049.25"), + ], + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("492.204"), Decimal("-72824.455")], + dtype=Decimal64Dtype(9, 4), + ), + gd.Series([8438, -27462], dtype="int64"), + gd.Series([-40.292, 49202.953], dtype="float64"), + gd.Series( + [ + Decimal("492.204"), + Decimal("-72824.455"), + Decimal("8438"), + Decimal("-27462"), + Decimal("-40.292"), + Decimal("49202.953"), + ], + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ], +) +def test_concat_decimal_numeric_series(s1, s2, s3, expected): + s = gd.concat([s1, s2, s3]) + assert_eq(s, expected) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 16c35bab4b1..959cb1af70c 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -291,14 +291,21 @@ def is_decimal_dtype(obj): def _decimal_normalize_types(*args): - s = max([a.dtype.scale for a in args]) - lhs = max([a.dtype.precision - a.dtype.scale for a in args]) - p = min(cudf.Decimal64Dtype.MAX_PRECISION, s + lhs) - dtype = cudf.Decimal64Dtype(p, s) - + dtype = _find_common_type_decimal([a.dtype for a in args]) return [a.astype(dtype) for a in args] +def _find_common_type_decimal(dtypes): + # Find the largest scale and the largest difference between + # precision and scale of the columns to be concatenated + s = max([dtype.scale for dtype in dtypes]) + lhs = max([dtype.precision - dtype.scale for dtype in dtypes]) + # Combine to get the necessary precision and clip at the maximum + # precision + p = min(cudf.Decimal64Dtype.MAX_PRECISION, s + lhs) + return cudf.Decimal64Dtype(p, s) + + def cudf_dtype_from_pydata_dtype(dtype): """ Given a numpy or pandas dtype, converts it into the equivalent cuDF Python dtype. From 633fb894dec5ec691ed4fb69ef058162d4b7b0fd Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Thu, 20 May 2021 13:43:26 -0700 Subject: [PATCH 2/2] Refactor to use find_common_type --- python/cudf/cudf/core/frame.py | 23 +++------ python/cudf/cudf/core/series.py | 31 ++---------- python/cudf/cudf/tests/test_concat.py | 71 +++++++++++++++++++++++++++ python/cudf/cudf/utils/dtypes.py | 17 ++++--- 4 files changed, 89 insertions(+), 53 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 020820e50b2..cda4e8cbd4c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -32,7 +32,7 @@ is_numerical_dtype, is_scalar, min_scalar_type, - _find_common_type_decimal, + find_common_type, ) T = TypeVar("T", bound="Frame") @@ -4030,8 +4030,11 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): # default to the first non-null dtype dtypes[idx] = cols[0].dtype # If all the non-null dtypes are int/float, find a common dtype - if all(is_numerical_dtype(col.dtype) for col in cols): - dtypes[idx] = np.find_common_type([col.dtype for col in cols], []) + if all( + is_numerical_dtype(col.dtype) or is_decimal_dtype(col.dtype) + for col in cols + ): + dtypes[idx] = find_common_type([col.dtype for col in cols]) # If all categorical dtypes, combine the categories elif all( isinstance(col, cudf.core.column.CategoricalColumn) for col in cols @@ -4046,20 +4049,6 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): # Set the column dtype to the codes' dtype. The categories # will be re-assigned at the end dtypes[idx] = min_scalar_type(len(categories[idx])) - elif all( - isinstance(col, cudf.core.column.DecimalColumn) for col in cols - ): - dtypes[idx] = _find_common_type_decimal( - [col.dtype for col in cols] - ) - elif all( - isinstance(col, cudf.core.column.DecimalColumn) - or is_numerical_dtype(col.dtype) - for col in cols - ): - dtypes[idx] = _find_common_type_decimal( - [col.dtype for col in cols if is_decimal_dtype(col.dtype)] - ) # Otherwise raise an error if columns have different dtypes elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols): raise ValueError("All columns must be the same type") diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 904ceb2aed1..a894baf8235 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -45,17 +45,14 @@ from cudf.utils import cudautils, docutils, ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( - _decimal_normalize_types, can_convert_to_column, - is_numerical_dtype, is_decimal_dtype, is_list_dtype, is_list_like, is_mixed_with_object_dtype, is_scalar, min_scalar_type, - numeric_normalize_types, - _find_common_type_decimal, + find_common_type, ) from cudf.utils.utils import ( get_appropriate_dispatched_func, @@ -2404,30 +2401,8 @@ def _concat(cls, objs, axis=0, index=True): ) if dtype_mismatch: - if all( - [ - isinstance(obj._column, cudf.core.column.DecimalColumn) - for obj in objs - ] - ): - objs = _decimal_normalize_types(*objs) - elif all([is_numerical_dtype(obj.dtype) for obj in objs]): - objs = numeric_normalize_types(*objs) - elif all( - [ - isinstance(obj._column, cudf.core.column.DecimalColumn) - or is_numerical_dtype(obj.dtype) - for obj in objs - ] - ): - decimal_type = _find_common_type_decimal( - [ - obj.dtype - for obj in objs - if is_decimal_dtype(obj.dtype) - ] - ) - objs = [obj.astype(decimal_type) for obj in objs] + common_dtype = find_common_type([obj.dtype for obj in objs]) + objs = [obj.astype(common_dtype) for obj in objs] col = _concat_columns([o._column for o in objs]) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index b37b772e612..5c4c121db4d 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -1456,3 +1456,74 @@ def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): def test_concat_decimal_numeric_series(s1, s2, s3, expected): s = gd.concat([s1, s2, s3]) assert_eq(s, expected) + + +@pytest.mark.parametrize( + "s1, s2, expected", + [ + ( + gd.Series( + [Decimal("955.22"), Decimal("8.2")], dtype=Decimal64Dtype(5, 2) + ), + gd.Series(["2007-06-12", "2006-03-14"], dtype="datetime64"), + gd.Series( + [ + "955.22", + "8.20", + "2007-06-12 00:00:00", + "2006-03-14 00:00:00", + ], + index=[0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("-52.44"), Decimal("365.22")], + dtype=Decimal64Dtype(5, 2), + ), + gd.Series( + np.arange( + "2005-02-01T12", "2005-02-01T15", dtype="datetime64[h]" + ), + dtype="datetime64[s]", + ), + gd.Series( + [ + "-52.44", + "365.22", + "2005-02-01 12:00:00", + "2005-02-01 13:00:00", + "2005-02-01 14:00:00", + ], + index=[0, 1, 0, 1, 2], + ), + ), + ( + gd.Series( + [Decimal("753.0"), Decimal("94.22")], + dtype=Decimal64Dtype(5, 2), + ), + gd.Series([np.timedelta64(111, "s"), np.timedelta64(509, "s")]), + gd.Series( + ["753.00", "94.22", "0 days 00:01:51", "0 days 00:08:29"], + index=[0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("753.0"), Decimal("94.22")], + dtype=Decimal64Dtype(5, 2), + ), + gd.Series( + [np.timedelta64(940252, "s"), np.timedelta64(758385, "s")] + ), + gd.Series( + ["753.00", "94.22", "10 days 21:10:52", "8 days 18:39:45"], + index=[0, 1, 0, 1], + ), + ), + ], +) +def test_concat_decimal_non_numeric(s1, s2, expected): + s = gd.concat([s1, s2]) + assert_eq(s, expected) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 959cb1af70c..0b59116f8e6 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -290,11 +290,6 @@ def is_decimal_dtype(obj): ) -def _decimal_normalize_types(*args): - dtype = _find_common_type_decimal([a.dtype for a in args]) - return [a.astype(dtype) for a in args] - - def _find_common_type_decimal(dtypes): # Find the largest scale and the largest difference between # precision and scale of the columns to be concatenated @@ -697,9 +692,15 @@ def find_common_type(dtypes): dtypes = set(dtypes) if any(is_decimal_dtype(dtype) for dtype in dtypes): - raise NotImplementedError( - "DecimalDtype is not yet supported in find_common_type" - ) + if all( + is_decimal_dtype(dtype) or is_numerical_dtype(dtype) + for dtype in dtypes + ): + return _find_common_type_decimal( + [dtype for dtype in dtypes if is_decimal_dtype(dtype)] + ) + else: + return np.dtype("O") # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately