From fd5d5963711ff9e7a295fc99ab1507ec300d395e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 16 Jan 2024 14:03:48 -0800 Subject: [PATCH 1/2] Remove **kwargs from astype --- python/cudf/cudf/core/column/column.py | 10 ++++------ python/cudf/cudf/core/dataframe.py | 10 ++++++++-- python/cudf/cudf/core/frame.py | 13 +++++-------- python/cudf/cudf/core/indexed_frame.py | 11 ++++++++--- python/cudf/cudf/core/series.py | 10 ++++++++-- python/cudf/cudf/tests/test_dataframe.py | 17 +++++++++-------- 6 files changed, 42 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index c13ec33c51c..0b21dc72444 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -958,9 +958,7 @@ def distinct_count(self, dropna: bool = True) -> int: def can_cast_safely(self, to_dtype: Dtype) -> bool: raise NotImplementedError() - def astype( - self, dtype: Dtype, copy: bool = False, format: str | None = None - ) -> ColumnBase: + def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: if copy: col = self.copy() else: @@ -1000,7 +998,7 @@ def astype( f"Casting to {dtype} is not supported, use " "`.astype('str')` instead." ) - return col.as_string_column(dtype, format=format) + return col.as_string_column(dtype) elif isinstance(dtype, (ListDtype, StructDtype)): if not col.dtype == dtype: raise NotImplementedError( @@ -1012,9 +1010,9 @@ def astype( elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): return col.as_decimal_column(dtype) elif np.issubdtype(cast(Any, dtype), np.datetime64): - return col.as_datetime_column(dtype, format=format) + return col.as_datetime_column(dtype) elif np.issubdtype(cast(Any, dtype), np.timedelta64): - return col.as_timedelta_column(dtype, format=format) + return col.as_timedelta_column(dtype) else: return col.as_numerical_column(dtype) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f9cf180ff44..2f18c194fde 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -19,6 +19,7 @@ Callable, Dict, List, + Literal, MutableMapping, Optional, Set, @@ -1774,7 +1775,12 @@ def _concat( return out - def astype(self, dtype, copy=False, errors="raise", **kwargs): + def astype( + self, + dtype, + copy: bool = False, + errors: Literal["raise", "ignore"] = "raise", + ): if is_dict_like(dtype): if len(set(dtype.keys()) - set(self._data.names)) > 0: raise KeyError( @@ -1783,7 +1789,7 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): ) else: dtype = {cc: dtype for cc in self._data.names} - return super().astype(dtype, copy, errors, **kwargs) + return super().astype(dtype, copy, errors) def _clean_renderable_dataframe(self, output): """ diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5f7a86e86d8..d5781fad966 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -286,14 +286,11 @@ def __len__(self): return self._num_rows @_cudf_nvtx_annotate - def astype(self, dtype, copy=False, **kwargs): - result_data = {} - for col_name, col in self._data.items(): - dt = dtype.get(col_name, col.dtype) - if not is_dtype_equal(dt, col.dtype): - result_data[col_name] = col.astype(dt, copy=copy, **kwargs) - else: - result_data[col_name] = col.copy() if copy else col + def astype(self, dtype, copy: bool = False): + result_data = { + col_name: col.astype(dtype.get(col_name, col.dtype), copy=copy) + for col_name, col in self._data.items() + } return ColumnAccessor._create_unsafe( data=result_data, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 2a35ac0f959..8896205c626 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -14,6 +14,7 @@ Callable, Dict, List, + Literal, MutableMapping, Optional, Tuple, @@ -3753,7 +3754,12 @@ def _append( return cudf.concat(to_concat, ignore_index=ignore_index, sort=sort) - def astype(self, dtype, copy=False, errors="raise", **kwargs): + def astype( + self, + dtype, + copy: bool = False, + errors: Literal["raise", "ignore"] = "raise", + ): """Cast the object to the given dtype. Parameters @@ -3774,7 +3780,6 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original object. - **kwargs : extra arguments to pass on to the constructor Returns ------- @@ -3865,7 +3870,7 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): raise ValueError("invalid error value specified") try: - data = super().astype(dtype, copy, **kwargs) + data = super().astype(dtype, copy) except Exception as e: if errors == "raise": raise e diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index bc1eaef86db..55100343306 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -12,6 +12,7 @@ from typing import ( Any, Dict, + Literal, MutableMapping, Optional, Sequence, @@ -2141,7 +2142,12 @@ def nullmask(self): return cudf.Series(self._column.nullmask) @_cudf_nvtx_annotate - def astype(self, dtype, copy=False, errors="raise", **kwargs): + def astype( + self, + dtype, + copy: bool = False, + errors: Literal["raise", "ignore"] = "raise", + ): if is_dict_like(dtype): if len(dtype) > 1 or self.name not in dtype: raise KeyError( @@ -2150,7 +2156,7 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): ) else: dtype = {self.name: dtype} - return super().astype(dtype, copy, errors, **kwargs) + return super().astype(dtype, copy, errors) @_cudf_nvtx_annotate def sort_index(self, axis=0, *args, **kwargs): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 30530e9d2a3..115635269ae 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import array as arr import contextlib @@ -5114,15 +5114,16 @@ def test_df_astype_to_categorical_ordered(ordered): @pytest.mark.parametrize( - "dtype,args", - [(dtype, {}) for dtype in ALL_TYPES] - + [("category", {"ordered": True}), ("category", {"ordered": False})], + "dtype", + [dtype for dtype in ALL_TYPES] + + [ + cudf.CategoricalDtype(ordered=True), + cudf.CategoricalDtype(ordered=False), + ], ) -def test_empty_df_astype(dtype, args): +def test_empty_df_astype(dtype): df = cudf.DataFrame() - kwargs = {} - kwargs.update(args) - assert_eq(df, df.astype(dtype=dtype, **kwargs)) + assert_eq(df, df.astype(dtype=dtype)) @pytest.mark.parametrize( From 6dd5d5222df635990add4d5da3abd8241b149631 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jan 2024 10:12:41 -0800 Subject: [PATCH 2/2] Add pandas astype comp --- python/cudf/cudf/tests/test_dataframe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 115635269ae..d75db7dfaae 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5123,7 +5123,9 @@ def test_df_astype_to_categorical_ordered(ordered): ) def test_empty_df_astype(dtype): df = cudf.DataFrame() - assert_eq(df, df.astype(dtype=dtype)) + result = df.astype(dtype=dtype) + assert_eq(df, result) + assert_eq(df.to_pandas().astype(dtype=dtype), result) @pytest.mark.parametrize(