Skip to content

Commit

Permalink
Apply review suggestions
Browse files Browse the repository at this point in the history
Signed-off-by: Dmitry Chigarev <[email protected]>
  • Loading branch information
dchigarev committed Nov 16, 2023
1 parent e89d7b6 commit 64049a3
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 34 deletions.
4 changes: 1 addition & 3 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,9 +324,7 @@ def _maybe_update_proxies(self, dtypes, new_parent=None):
if isinstance(dtypes, ModinDtypes):
dtypes = dtypes.maybe_specify_new_frame_ref(new_parent)
if isinstance(dtypes, pandas.Series):
for key, value in dtypes.items():
if isinstance(value, LazyProxyCategoricalDtype):
dtypes[key] = value._update_proxy(new_parent, column_name=key)
LazyProxyCategoricalDtype.update_dtypes(dtypes, new_parent)
return dtypes

def set_dtypes_cache(self, dtypes):
Expand Down
68 changes: 37 additions & 31 deletions modin/core/dataframe/pandas/metadata/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class DtypesDescriptor:
Parameters
----------
known_dtypes : dict[IndexLabel, np.dtype], optional
known_dtypes : dict[IndexLabel, np.dtype] or pandas.Series, optional
Columns that we know dtypes for.
cols_with_unknown_dtypes : list[IndexLabel], optional
Column names that have unknown dtypes. If specified together with `remaining_dtype`, must describe all
Expand All @@ -59,7 +59,7 @@ class DtypesDescriptor:

def __init__(
self,
known_dtypes: Optional[dict[IndexLabel, np.dtype]] = None,
known_dtypes: Optional[Union[dict[IndexLabel, np.dtype], pandas.Series]] = None,
cols_with_unknown_dtypes: Optional[list[IndexLabel]] = None,
remaining_dtype: Optional[np.dtype] = None,
parent_df: Optional["PandasDataframe"] = None,
Expand All @@ -68,7 +68,7 @@ def __init__(
_schema_is_known: Optional[bool] = None,
):
if not know_all_names and remaining_dtype is not None:
raise RuntimeError(
raise ValueError(
"It's not allowed to pass 'remaining_dtype' and 'know_all_names=False' at the same time."
)
# columns with known dtypes
Expand All @@ -90,21 +90,19 @@ def __init__(
[] if cols_with_unknown_dtypes is None else cols_with_unknown_dtypes
)
# whether 'known_dtypes' describe all columns in the dataframe
if _schema_is_known is None:
self._schema_is_known: bool = (
len(cols_with_unknown_dtypes) == 0
if (
# if 'cols_with_unknown_dtypes' was explicitly specified as an empty list and
# we don't have any 'remaining_dtype', then we assume that 'known_dtypes' are complete
cols_with_unknown_dtypes is not None
and know_all_names
and remaining_dtype is None
and len(self._known_dtypes) > 0
)
else False
)
else:
self._schema_is_known: bool = _schema_is_known
self._schema_is_known: Optional[bool] = _schema_is_known
if self._schema_is_known is None:
self._schema_is_known = False
if (
# if 'cols_with_unknown_dtypes' was explicitly specified as an empty list and
# we don't have any 'remaining_dtype', then we assume that 'known_dtypes' are complete
cols_with_unknown_dtypes is not None
and know_all_names
and remaining_dtype is None
and len(self._known_dtypes) > 0
):
self._schema_is_known = len(cols_with_unknown_dtypes) == 0

self._know_all_names: bool = know_all_names
# a common dtype for columns that are not present in 'known_dtypes' nor in 'cols_with_unknown_dtypes'
self._remaining_dtype: Optional[np.dtype] = remaining_dtype
Expand All @@ -115,20 +113,20 @@ def __init__(
self.columns_order
else:
if remaining_dtype is not None:
raise RuntimeError(
raise ValueError(
"Passing 'columns_order' and 'remaining_dtype' is ambiguous. You have to manually "
+ "complete 'known_dtypes' using the information from 'columns_order' and 'remaining_dtype'."
)
elif not self._know_all_names:
raise RuntimeError(
raise ValueError(
"Passing 'columns_order' and 'know_all_names=False' is ambiguous. You have to manually "
+ "complete 'cols_with_unknown_dtypes' using the information from 'columns_order' "
+ "and pass 'know_all_names=True'."
)
elif len(columns_order) != (
len(self._cols_with_unknown_dtypes) + len(self._known_dtypes)
):
raise RuntimeError(
raise ValueError(
"The length of 'columns_order' doesn't match to 'known_dtypes' and 'cols_with_unknown_dtypes'"
)
self._columns_order: Optional[dict[int, IndexLabel]] = columns_order
Expand All @@ -142,11 +140,7 @@ def update_parent(self, new_parent: "PandasDataframe"):
new_parent : PandasDataframe
"""
self._parent_df = new_parent
for key, value in self._known_dtypes.items():
if isinstance(value, LazyProxyCategoricalDtype):
self._known_dtypes[key] = value._update_proxy(
new_parent, column_name=key
)
LazyProxyCategoricalDtype.update_dtypes(self._known_dtypes, new_parent)
# try to compute '._columns_order' using 'new_parent'
self.columns_order

Expand Down Expand Up @@ -595,11 +589,7 @@ def maybe_specify_new_frame_ref(
"""
new_self = self.copy()
if new_self.is_materialized:
for key, value in new_self._value.items():
if isinstance(value, LazyProxyCategoricalDtype):
new_self._value[key] = value._update_proxy(
new_parent, column_name=key
)
LazyProxyCategoricalDtype.update_dtypes(new_self._value, new_parent)
return new_self
if isinstance(self._value, DtypesDescriptor):
new_self._value.update_parent(new_parent)
Expand Down Expand Up @@ -826,6 +816,22 @@ def __init__(self, categories=None, ordered=False):
)
super().__init__(categories, ordered)

@staticmethod
def update_dtypes(dtypes, new_parent):
"""
Update a parent for categorical proxies in a dtype object.
Parameters
----------
dtypes : dict-like
A dict-like object describing dtypes. The method will walk through every dtype
an update parents for categorical proxies inplace.
new_parent : object
"""
for key, value in dtypes.items():
if isinstance(value, LazyProxyCategoricalDtype):
dtypes[key] = value._update_proxy(new_parent, column_name=key)

def _update_proxy(self, parent, column_name):
"""
Create a new proxy, if either parent or column name are different.
Expand Down

0 comments on commit 64049a3

Please sign in to comment.