From 8428e89b352686860e9ac3816921e29ed5a72df9 Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Mon, 2 Aug 2021 11:05:13 -0400 Subject: [PATCH 1/4] convert_alias and tests --- python/cudf/cudf/core/column/column.py | 21 ++++++++++++++++++++- python/cudf/cudf/tests/test_column.py | 24 ++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a5e49b026f3..131aba35050 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -883,7 +883,12 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: dtype = pandas_dtypes_to_cudf_dtypes.get(dtype, dtype) if _is_non_decimal_numeric_dtype(dtype): - return self.as_numerical_column(dtype, **kwargs) + try: + return self.as_numerical_column(dtype, **kwargs) + except TypeError: + return self.as_numerical_column( + self.convert_alias(dtype), **kwargs + ) elif is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) elif pandas_dtype(dtype).type in { @@ -963,6 +968,20 @@ def as_numerical_column( ) -> "cudf.core.column.NumericalColumn": raise NotImplementedError + def convert_alias(self, dtype: str): + aliases = { + "UInt8": "uint8", + "UInt16": "uint16", + "UInt32": "uint32", + "UInt64": "uint64", + "Int8": "int8", + "Int16": "int16", + "Int32": "int32", + "Int64": "int64", + "boolean": "bool", + } + return aliases[dtype] + def as_datetime_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.DatetimeColumn": diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index f3387b3d27d..11b2e4bc9f9 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -481,3 +481,27 @@ def test_concatenate_large_column_strings(): match="total size of output is too large for a cudf column", ): cudf.concat([s_1, s_2]) + + +@pytest.mark.parametrize( + "alias,expect_dtype", + [ + ("UInt8", "uint8"), + ("UInt16", "uint16"), + ("UInt32", "uint32"), + ("UInt64", "uint64"), + ("Int8", "int8"), + ("Int16", "int16"), + ("Int32", "int32"), + ("Int64", "int64"), + ("boolean", "bool"), + ], +) +@pytest.mark.parametrize( + "data", [[1, 2, 0]], +) +def test_astype_with_aliases(alias, expect_dtype, data): + pd_data = pd.Series(data) + gd_data = cudf.Series.from_pandas(pd_data) + + assert_eq(pd_data.astype(expect_dtype), gd_data.astype(alias)) From b65d32d42d56377add277e1ba647d57d5a4b533a Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Mon, 2 Aug 2021 11:23:41 -0400 Subject: [PATCH 2/4] remove argument type --- python/cudf/cudf/core/column/column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 131aba35050..24a4b17ca3c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -968,7 +968,7 @@ def as_numerical_column( ) -> "cudf.core.column.NumericalColumn": raise NotImplementedError - def convert_alias(self, dtype: str): + def convert_alias(self, dtype): aliases = { "UInt8": "uint8", "UInt16": "uint16", From 38e98934146f052eb5d5e60f5ecb2acc3ff70565 Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Mon, 2 Aug 2021 15:34:01 -0400 Subject: [PATCH 3/4] moved aliases to dtypes.py --- python/cudf/cudf/core/column/column.py | 28 +++++++------------------- python/cudf/cudf/tests/test_column.py | 2 ++ python/cudf/cudf/utils/dtypes.py | 14 +++++++++++++ 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 24a4b17ca3c..02231f26f61 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -71,6 +71,7 @@ min_unsigned_type, np_to_pa_dtype, pandas_dtypes_to_cudf_dtypes, + pandas_dtypes_alias_to_cudf_alias, ) from cudf.utils.utils import mask_dtype @@ -881,14 +882,13 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: if is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) - dtype = pandas_dtypes_to_cudf_dtypes.get(dtype, dtype) + dtype = ( + pandas_dtypes_alias_to_cudf_alias.get(dtype, dtype) + if isinstance(dtype, str) + else pandas_dtypes_to_cudf_dtypes.get(dtype, dtype) + ) if _is_non_decimal_numeric_dtype(dtype): - try: - return self.as_numerical_column(dtype, **kwargs) - except TypeError: - return self.as_numerical_column( - self.convert_alias(dtype), **kwargs - ) + return self.as_numerical_column(dtype, **kwargs) elif is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) elif pandas_dtype(dtype).type in { @@ -968,20 +968,6 @@ def as_numerical_column( ) -> "cudf.core.column.NumericalColumn": raise NotImplementedError - def convert_alias(self, dtype): - aliases = { - "UInt8": "uint8", - "UInt16": "uint16", - "UInt32": "uint32", - "UInt64": "uint64", - "Int8": "int8", - "Int16": "int16", - "Int32": "int32", - "Int64": "int64", - "boolean": "bool", - } - return aliases[dtype] - def as_datetime_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.DatetimeColumn": diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 11b2e4bc9f9..761b2f32f18 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -495,6 +495,8 @@ def test_concatenate_large_column_strings(): ("Int32", "int32"), ("Int64", "int64"), ("boolean", "bool"), + ("Float32", "float32"), + ("Float64", "float64"), ], ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index e1ae87e5089..46bd1b449c4 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -92,11 +92,25 @@ pd.StringDtype(): np.dtype("object"), } +pandas_dtypes_alias_to_cudf_alias = { + "UInt8": "uint8", + "UInt16": "uint16", + "UInt32": "uint32", + "UInt64": "uint64", + "Int8": "int8", + "Int16": "int16", + "Int32": "int32", + "Int64": "int64", + "boolean": "bool", +} + if PANDAS_GE_120: cudf_dtypes_to_pandas_dtypes[np.dtype("float32")] = pd.Float32Dtype() cudf_dtypes_to_pandas_dtypes[np.dtype("float64")] = pd.Float64Dtype() pandas_dtypes_to_cudf_dtypes[pd.Float32Dtype()] = np.dtype("float32") pandas_dtypes_to_cudf_dtypes[pd.Float64Dtype()] = np.dtype("float64") + pandas_dtypes_alias_to_cudf_alias["Float32"] = "float32" + pandas_dtypes_alias_to_cudf_alias["Float64"] = "float64" SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"} UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"} From 5e1e412e2a81913fd9b360413d034f5a241be90b Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Mon, 2 Aug 2021 15:57:07 -0400 Subject: [PATCH 4/4] sort imports alphabetically --- python/cudf/cudf/core/column/column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 02231f26f61..2b0f5f05774 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -70,8 +70,8 @@ get_time_unit, min_unsigned_type, np_to_pa_dtype, - pandas_dtypes_to_cudf_dtypes, pandas_dtypes_alias_to_cudf_alias, + pandas_dtypes_to_cudf_dtypes, ) from cudf.utils.utils import mask_dtype