From 64049a3cf78590964b86b031432a95f9a5009dfb Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Thu, 16 Nov 2023 19:13:51 +0100 Subject: [PATCH] Apply review suggestions Signed-off-by: Dmitry Chigarev --- .../dataframe/pandas/dataframe/dataframe.py | 4 +- .../core/dataframe/pandas/metadata/dtypes.py | 68 ++++++++++--------- 2 files changed, 38 insertions(+), 34 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index c4889ff3d2a..40a761b4e47 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -324,9 +324,7 @@ def _maybe_update_proxies(self, dtypes, new_parent=None): if isinstance(dtypes, ModinDtypes): dtypes = dtypes.maybe_specify_new_frame_ref(new_parent) if isinstance(dtypes, pandas.Series): - for key, value in dtypes.items(): - if isinstance(value, LazyProxyCategoricalDtype): - dtypes[key] = value._update_proxy(new_parent, column_name=key) + LazyProxyCategoricalDtype.update_dtypes(dtypes, new_parent) return dtypes def set_dtypes_cache(self, dtypes): diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py index f9fb1f057ec..a9ad91b8823 100644 --- a/modin/core/dataframe/pandas/metadata/dtypes.py +++ b/modin/core/dataframe/pandas/metadata/dtypes.py @@ -32,7 +32,7 @@ class DtypesDescriptor: Parameters ---------- - known_dtypes : dict[IndexLabel, np.dtype], optional + known_dtypes : dict[IndexLabel, np.dtype] or pandas.Series, optional Columns that we know dtypes for. cols_with_unknown_dtypes : list[IndexLabel], optional Column names that have unknown dtypes. If specified together with `remaining_dtype`, must describe all @@ -59,7 +59,7 @@ class DtypesDescriptor: def __init__( self, - known_dtypes: Optional[dict[IndexLabel, np.dtype]] = None, + known_dtypes: Optional[Union[dict[IndexLabel, np.dtype], pandas.Series]] = None, cols_with_unknown_dtypes: Optional[list[IndexLabel]] = None, remaining_dtype: Optional[np.dtype] = None, parent_df: Optional["PandasDataframe"] = None, @@ -68,7 +68,7 @@ def __init__( _schema_is_known: Optional[bool] = None, ): if not know_all_names and remaining_dtype is not None: - raise RuntimeError( + raise ValueError( "It's not allowed to pass 'remaining_dtype' and 'know_all_names=False' at the same time." ) # columns with known dtypes @@ -90,21 +90,19 @@ def __init__( [] if cols_with_unknown_dtypes is None else cols_with_unknown_dtypes ) # whether 'known_dtypes' describe all columns in the dataframe - if _schema_is_known is None: - self._schema_is_known: bool = ( - len(cols_with_unknown_dtypes) == 0 - if ( - # if 'cols_with_unknown_dtypes' was explicitly specified as an empty list and - # we don't have any 'remaining_dtype', then we assume that 'known_dtypes' are complete - cols_with_unknown_dtypes is not None - and know_all_names - and remaining_dtype is None - and len(self._known_dtypes) > 0 - ) - else False - ) - else: - self._schema_is_known: bool = _schema_is_known + self._schema_is_known: Optional[bool] = _schema_is_known + if self._schema_is_known is None: + self._schema_is_known = False + if ( + # if 'cols_with_unknown_dtypes' was explicitly specified as an empty list and + # we don't have any 'remaining_dtype', then we assume that 'known_dtypes' are complete + cols_with_unknown_dtypes is not None + and know_all_names + and remaining_dtype is None + and len(self._known_dtypes) > 0 + ): + self._schema_is_known = len(cols_with_unknown_dtypes) == 0 + self._know_all_names: bool = know_all_names # a common dtype for columns that are not present in 'known_dtypes' nor in 'cols_with_unknown_dtypes' self._remaining_dtype: Optional[np.dtype] = remaining_dtype @@ -115,12 +113,12 @@ def __init__( self.columns_order else: if remaining_dtype is not None: - raise RuntimeError( + raise ValueError( "Passing 'columns_order' and 'remaining_dtype' is ambiguous. You have to manually " + "complete 'known_dtypes' using the information from 'columns_order' and 'remaining_dtype'." ) elif not self._know_all_names: - raise RuntimeError( + raise ValueError( "Passing 'columns_order' and 'know_all_names=False' is ambiguous. You have to manually " + "complete 'cols_with_unknown_dtypes' using the information from 'columns_order' " + "and pass 'know_all_names=True'." @@ -128,7 +126,7 @@ def __init__( elif len(columns_order) != ( len(self._cols_with_unknown_dtypes) + len(self._known_dtypes) ): - raise RuntimeError( + raise ValueError( "The length of 'columns_order' doesn't match to 'known_dtypes' and 'cols_with_unknown_dtypes'" ) self._columns_order: Optional[dict[int, IndexLabel]] = columns_order @@ -142,11 +140,7 @@ def update_parent(self, new_parent: "PandasDataframe"): new_parent : PandasDataframe """ self._parent_df = new_parent - for key, value in self._known_dtypes.items(): - if isinstance(value, LazyProxyCategoricalDtype): - self._known_dtypes[key] = value._update_proxy( - new_parent, column_name=key - ) + LazyProxyCategoricalDtype.update_dtypes(self._known_dtypes, new_parent) # try to compute '._columns_order' using 'new_parent' self.columns_order @@ -595,11 +589,7 @@ def maybe_specify_new_frame_ref( """ new_self = self.copy() if new_self.is_materialized: - for key, value in new_self._value.items(): - if isinstance(value, LazyProxyCategoricalDtype): - new_self._value[key] = value._update_proxy( - new_parent, column_name=key - ) + LazyProxyCategoricalDtype.update_dtypes(new_self._value, new_parent) return new_self if isinstance(self._value, DtypesDescriptor): new_self._value.update_parent(new_parent) @@ -826,6 +816,22 @@ def __init__(self, categories=None, ordered=False): ) super().__init__(categories, ordered) + @staticmethod + def update_dtypes(dtypes, new_parent): + """ + Update a parent for categorical proxies in a dtype object. + + Parameters + ---------- + dtypes : dict-like + A dict-like object describing dtypes. The method will walk through every dtype + an update parents for categorical proxies inplace. + new_parent : object + """ + for key, value in dtypes.items(): + if isinstance(value, LazyProxyCategoricalDtype): + dtypes[key] = value._update_proxy(new_parent, column_name=key) + def _update_proxy(self, parent, column_name): """ Create a new proxy, if either parent or column name are different.