From eac3c77baf456c7bd7e1e5fde81790a4ed3ebb27 Mon Sep 17 00:00:00 2001 From: Arun Jose <40291569+arunjose696@users.noreply.github.com> Date: Fri, 14 Jun 2024 10:34:06 +0200 Subject: [PATCH] REFACTOR-modin-project#7313: Add similar methods as in 7294 for operating on columns (#7314) Signed-off-by: arunjose696 --- modin/core/dataframe/algebra/binary.py | 17 +++----- .../storage_formats/base/query_compiler.py | 43 +++++++++++++++++++ .../storage_formats/pandas/aggregations.py | 2 +- modin/core/storage_formats/pandas/merge.py | 4 +- .../storage_formats/pandas/query_compiler.py | 4 +- .../storage_formats/pandas/test_internals.py | 16 +++---- 6 files changed, 63 insertions(+), 23 deletions(-) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index 2afb7733baf..b5e701d2d4b 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -205,13 +205,10 @@ def maybe_build_dtypes_series( Finds a union of columns and finds dtypes for all these columns. """ if not trigger_computations: - if not first._modin_frame.has_columns_cache: + if not first.frame_has_columns_cache: return None - if ( - isinstance(second, type(first)) - and not second._modin_frame.has_columns_cache - ): + if isinstance(second, type(first)) and not second.frame_has_columns_cache: return None columns_first = set(first.columns) @@ -384,8 +381,8 @@ def caller( if isinstance(other, type(query_compiler)): if broadcast: if ( - query_compiler._modin_frame.has_materialized_columns - and other._modin_frame.has_materialized_columns + query_compiler.frame_has_materialized_columns + and other.frame_has_materialized_columns ): if ( len(query_compiler.columns) == 1 @@ -408,8 +405,8 @@ def caller( ) else: if ( - query_compiler._modin_frame.has_materialized_columns - and other._modin_frame.has_materialized_columns + query_compiler.frame_has_materialized_columns + and other.frame_has_materialized_columns ): if ( len(query_compiler.columns) == 1 @@ -440,7 +437,7 @@ def caller( ) else: if ( - query_compiler._modin_frame.has_materialized_columns + query_compiler.frame_has_materialized_columns and len(query_compiler._modin_frame.columns) == 1 and is_scalar(other) ): diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index f239f1b46ae..80e89a577a2 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -4532,6 +4532,28 @@ def frame_has_materialized_dtypes(self) -> bool: """ return self._modin_frame.has_materialized_dtypes + @property + def frame_has_materialized_columns(self) -> bool: + """ + Check if the undelying dataframe has materialized columns. + + Returns + ------- + bool + """ + return self._modin_frame.has_materialized_columns + + @property + def frame_has_materialized_index(self) -> bool: + """ + Check if the undelying dataframe has materialized index. + + Returns + ------- + bool + """ + return self._modin_frame.has_materialized_index + def set_frame_dtypes_cache(self, dtypes): """ Set dtypes cache for the underlying dataframe frame. @@ -4552,6 +4574,16 @@ def set_frame_index_cache(self, index): """ self._modin_frame.set_index_cache(index) + def set_frame_columns_cache(self, index): + """ + Set columns cache for underlying dataframe. + + Parameters + ---------- + index : sequence, callable or None + """ + self._modin_frame.set_columns_cache(index) + @property def frame_has_index_cache(self): """ @@ -4563,6 +4595,17 @@ def frame_has_index_cache(self): """ return self._modin_frame.has_index_cache + @property + def frame_has_columns_cache(self): + """ + Check if the columns cache exists for underlying dataframe. + + Returns + ------- + bool + """ + return self._modin_frame.has_columns_cache + @property def frame_has_dtypes_cache(self) -> bool: """ diff --git a/modin/core/storage_formats/pandas/aggregations.py b/modin/core/storage_formats/pandas/aggregations.py index e8905e857bc..b0367d007ef 100644 --- a/modin/core/storage_formats/pandas/aggregations.py +++ b/modin/core/storage_formats/pandas/aggregations.py @@ -62,7 +62,7 @@ def corr_method( method=method, min_periods=min_periods, numeric_only=numeric_only ) - if not numeric_only and qc._modin_frame.has_materialized_columns: + if not numeric_only and qc.frame_has_materialized_columns: new_index, new_columns = ( qc._modin_frame.copy_columns_cache(), qc._modin_frame.copy_columns_cache(), diff --git a/modin/core/storage_formats/pandas/merge.py b/modin/core/storage_formats/pandas/merge.py index 37a9c325bd0..62583bc5ddb 100644 --- a/modin/core/storage_formats/pandas/merge.py +++ b/modin/core/storage_formats/pandas/merge.py @@ -216,7 +216,7 @@ def map_func( # it's fine too, we can also decide that by columns, which tend to be already # materialized quite often compared to the indexes. keep_index = False - if left._modin_frame.has_materialized_index: + if left.frame_has_materialized_index: keep_index = should_keep_index(left, right) else: # Have to trigger columns materialization. Hope they're already available at this point. @@ -286,7 +286,7 @@ def _compute_result_metadata( new_columns = None new_dtypes = None - if not left._modin_frame.has_materialized_columns: + if not left.frame_has_materialized_columns: return new_columns, new_dtypes if left_on is None and right_on is None: diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index d29901b8fdb..7c4f7e79f55 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -623,7 +623,7 @@ def _reset(df, *axis_lengths, partition_idx): # pragma: no cover new_columns = None if kwargs["drop"]: dtypes = self._modin_frame.copy_dtypes_cache() - if self._modin_frame.has_columns_cache: + if self.frame_has_columns_cache: new_columns = self._modin_frame.copy_columns_cache( copy_lengths=True ) @@ -642,7 +642,7 @@ def _reset(df, *axis_lengths, partition_idx): # pragma: no cover dtypes = None if ( # can precompute new columns if we know columns and index names - self._modin_frame.has_materialized_columns + self.frame_has_materialized_columns and index_dtypes is not None ): empty_index = ( diff --git a/modin/tests/core/storage_formats/pandas/test_internals.py b/modin/tests/core/storage_formats/pandas/test_internals.py index 99846655df1..e893e48582b 100644 --- a/modin/tests/core/storage_formats/pandas/test_internals.py +++ b/modin/tests/core/storage_formats/pandas/test_internals.py @@ -1171,13 +1171,13 @@ def test_concat_dont_materialize_opposite_axis(axis): def assert_no_cache(df, axis): if axis: - assert not df._query_compiler._modin_frame.has_materialized_columns + assert not df._query_compiler.frame_has_materialized_columns else: - assert not df._query_compiler._modin_frame.has_materialized_index + assert not df._query_compiler.frame_has_materialized_index def remove_cache(df, axis): if axis: - df._query_compiler._modin_frame.set_columns_cache(None) + df._query_compiler.set_frame_columns_cache(None) else: df._query_compiler.set_frame_index_cache(None) assert_no_cache(df, axis) @@ -2038,7 +2038,7 @@ def test_concat_axis_1( or remaining_dtype is not None ) # setting columns cache to 'None', in order to prevent completing 'dtypes' with the materialized columns - md_df._query_compiler._modin_frame.set_columns_cache(None) + md_df._query_compiler.set_frame_columns_cache(None) md_df._query_compiler.set_frame_dtypes_cache( ModinDtypes( DtypesDescriptor( @@ -2401,10 +2401,10 @@ def test_preserve_dtypes_reset_index(self, drop, has_materialized_index): # case 1: 'df' has complete dtype by default df = pd.DataFrame({"a": [1, 2, 3]}) if has_materialized_index: - assert df._query_compiler._modin_frame.has_materialized_index + assert df._query_compiler.frame_has_materialized_index else: df._query_compiler.set_frame_index_cache(None) - assert not df._query_compiler._modin_frame.has_materialized_index + assert not df._query_compiler.frame_has_materialized_index assert df._query_compiler.frame_has_materialized_dtypes res = df.reset_index(drop=drop) @@ -2444,10 +2444,10 @@ def test_preserve_dtypes_reset_index(self, drop, has_materialized_index): ) ) if has_materialized_index: - assert df._query_compiler._modin_frame.has_materialized_index + assert df._query_compiler.frame_has_materialized_index else: df._query_compiler.set_frame_index_cache(None) - assert not df._query_compiler._modin_frame.has_materialized_index + assert not df._query_compiler.frame_has_materialized_index res = df.reset_index(drop=drop) if drop: