Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REFACTOR-#5310: Remove some hasattr('columns') checks. #5311

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 20 additions & 7 deletions modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,22 @@ class BasePandasDataset(ClassLogger):
# but lives in "pandas" namespace.
_pandas_class = pandas.core.generic.NDFrame

@pandas.util.cache_readonly
def _is_dataframe(self) -> bool:
mvashishtha marked this conversation as resolved.
Show resolved Hide resolved
mvashishtha marked this conversation as resolved.
Show resolved Hide resolved
"""
Tell whether this is a dataframe.

Ideally, other methods of BasePandasDataset shouldn't care whether this
is a dataframe or a series, but sometimes we need to know. This method
is better than hasattr(self, "columns"), which for series will call
self.__getattr__("columns"), which requires materializing the index.

Returns
-------
bool : Whether this is a dataframe.
"""
return issubclass(self._pandas_class, pandas.DataFrame)

def _add_sibling(self, sibling):
"""
Add a DataFrame or Series object to the list of siblings.
Expand Down Expand Up @@ -162,12 +178,10 @@ def _build_repr_df(self, num_rows, num_cols):
A pandas dataset with `num_rows` or fewer rows and `num_cols` or fewer columns.
"""
# Fast track for empty dataframe.
if len(self.index) == 0 or (
hasattr(self, "columns") and len(self.columns) == 0
):
if len(self.index) == 0 or (self._is_dataframe and len(self.columns) == 0):
return pandas.DataFrame(
index=self.index,
columns=self.columns if hasattr(self, "columns") else None,
columns=self.columns if self._is_dataframe else None,
)
if len(self.index) <= num_rows:
row_indexer = slice(None)
Expand All @@ -188,7 +202,7 @@ def _build_repr_df(self, num_rows, num_cols):
if num_rows_for_tail is not None
else []
)
if hasattr(self, "columns"):
if self._is_dataframe:
if len(self.columns) <= num_cols:
col_indexer = slice(None)
else:
Expand Down Expand Up @@ -3632,8 +3646,7 @@ def __getitem__(self, key):
# This lets us reuse code in pandas to error check
indexer = None
if isinstance(key, slice) or (
isinstance(key, str)
and (not hasattr(self, "columns") or key not in self.columns)
isinstance(key, str) and (not self._is_dataframe or key not in self.columns)
):
indexer = convert_to_index_sliceable(
pandas.DataFrame(index=self.index), key
Expand Down
2 changes: 1 addition & 1 deletion modin/pandas/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -964,7 +964,7 @@ def _compute_index_grouped(self, numerical=False):
# `dropna` param is the only one that matters for the group indices result
dropna = self._kwargs.get("dropna", True)

if hasattr(self._by, "columns") and is_multi_by:
if isinstance(self._by, BaseQueryCompiler) and is_multi_by:
by = list(self._by.columns)

if is_multi_by:
Expand Down
2 changes: 1 addition & 1 deletion modin/pandas/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -718,7 +718,7 @@ def __getitem__(self, key):
):
result.index = result.index.droplevel(list(range(len(row_loc_as_list))))
if (
hasattr(result, "columns")
isinstance(result, DataFrame)
and not isinstance(col_loc_as_list, slice)
and not levels_already_dropped
and result._query_compiler.has_multiindex(axis=1)
Expand Down
2 changes: 1 addition & 1 deletion modin/pandas/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ def broadcast_item(
index_values = obj.index[row_lookup]
if not index_values.equals(item.index):
axes_to_reindex["index"] = index_values
if need_columns_reindex and hasattr(item, "columns"):
if need_columns_reindex and isinstance(item, (pandas.DataFrame, DataFrame)):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this looks super-weird, should we really support both pandas and Modin versions of DataFrame here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We just need DataFrame. Done.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It turns out we need both. Tests fail if we don't include both.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if this should be a bug that we need both... cc @devin-petersohn

column_values = obj.columns[col_lookup]
if not column_values.equals(item.columns):
axes_to_reindex["columns"] = column_values
Expand Down