From 080eb2f7f4d32f6c08a3a8029bf108898fddb416 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sun, 5 Dec 2021 01:02:20 +0300 Subject: [PATCH 1/4] REFACTOR-#3780: remove '_row_lengths', '_column_widths' funcs in 'PandasOnDaskDataframe' Signed-off-by: Anatoly Myachev --- .../pandas_on_dask/dataframe/dataframe.py | 39 ------------------- 1 file changed, 39 deletions(-) diff --git a/modin/core/execution/dask/implementations/pandas_on_dask/dataframe/dataframe.py b/modin/core/execution/dask/implementations/pandas_on_dask/dataframe/dataframe.py index 1db4fd986f4..78c40853ec1 100644 --- a/modin/core/execution/dask/implementations/pandas_on_dask/dataframe/dataframe.py +++ b/modin/core/execution/dask/implementations/pandas_on_dask/dataframe/dataframe.py @@ -16,8 +16,6 @@ from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe from ..partitioning.partition_manager import PandasOnDaskDataframePartitionManager -from distributed.client import default_client - class PandasOnDaskDataframe(PandasDataframe): """ @@ -42,40 +40,3 @@ class PandasOnDaskDataframe(PandasDataframe): """ _partition_mgr_cls = PandasOnDaskDataframePartitionManager - - @property - def _row_lengths(self): - """ - Compute the row partitions lengths if they are not cached. - - Returns - ------- - list - A list of row partitions lengths. - """ - client = default_client() - if self._row_lengths_cache is None: - self._row_lengths_cache = client.gather( - [obj.apply(lambda df: len(df)).future for obj in self._partitions.T[0]] - ) - return self._row_lengths_cache - - @property - def _column_widths(self): - """ - Compute the column partitions widths if they are not cached. - - Returns - ------- - list - A list of column partitions widths. - """ - client = default_client() - if self._column_widths_cache is None: - self._column_widths_cache = client.gather( - [ - obj.apply(lambda df: len(df.columns)).future - for obj in self._partitions[0] - ] - ) - return self._column_widths_cache From d6dfe3f971ccc88ddb70ade40b50975c13ef41a9 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 22 Feb 2022 02:50:09 +0300 Subject: [PATCH 2/4] add cache check Signed-off-by: Anatoly Myachev --- .../dataframe/pandas/dataframe/dataframe.py | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index a340aafe25e..96e2ed3a0a3 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -212,9 +212,14 @@ def _row_lengths(self): """ if self._row_lengths_cache is None: if len(self._partitions.T) > 0: - self._row_lengths_cache = [ - obj.length() for obj in self._partitions.T[0] - ] + partitions = self._partitions.T[0] + if all(map(lambda t: t._length_cache is not None, partitions)): + self._row_lengths_cache = [obj.length() for obj in partitions] + else: + lengths = [obj.apply(lambda df: len(df)) for obj in partitions] + self._row_lengths_cache = ( + self._partition_mgr_cls.get_objects_from_partitions(lengths) + ) else: self._row_lengths_cache = [] return self._row_lengths_cache @@ -231,7 +236,16 @@ def _column_widths(self): """ if self._column_widths_cache is None: if len(self._partitions) > 0: - self._column_widths_cache = [obj.width() for obj in self._partitions[0]] + partitions = self._partitions[0] + if all(map(lambda t: t._width_cache is not None, partitions)): + self._column_widths_cache = [obj.width() for obj in partitions] + else: + widths = [ + obj.apply(lambda df: len(df.columns)) for obj in partitions + ] + self._column_widths_cache = ( + self._partition_mgr_cls.get_objects_from_partitions(widths) + ) else: self._column_widths_cache = [] return self._column_widths_cache From bec8bdbce75bb996d82c0153540ce1984d13a3a8 Mon Sep 17 00:00:00 2001 From: Myachev Date: Thu, 1 Sep 2022 13:48:16 +0200 Subject: [PATCH 3/4] Revert "add cache check" This reverts commit d6dfe3f971ccc88ddb70ade40b50975c13ef41a9. --- .../dataframe/pandas/dataframe/dataframe.py | 22 ++++--------------- 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 96e2ed3a0a3..a340aafe25e 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -212,14 +212,9 @@ def _row_lengths(self): """ if self._row_lengths_cache is None: if len(self._partitions.T) > 0: - partitions = self._partitions.T[0] - if all(map(lambda t: t._length_cache is not None, partitions)): - self._row_lengths_cache = [obj.length() for obj in partitions] - else: - lengths = [obj.apply(lambda df: len(df)) for obj in partitions] - self._row_lengths_cache = ( - self._partition_mgr_cls.get_objects_from_partitions(lengths) - ) + self._row_lengths_cache = [ + obj.length() for obj in self._partitions.T[0] + ] else: self._row_lengths_cache = [] return self._row_lengths_cache @@ -236,16 +231,7 @@ def _column_widths(self): """ if self._column_widths_cache is None: if len(self._partitions) > 0: - partitions = self._partitions[0] - if all(map(lambda t: t._width_cache is not None, partitions)): - self._column_widths_cache = [obj.width() for obj in partitions] - else: - widths = [ - obj.apply(lambda df: len(df.columns)) for obj in partitions - ] - self._column_widths_cache = ( - self._partition_mgr_cls.get_objects_from_partitions(widths) - ) + self._column_widths_cache = [obj.width() for obj in self._partitions[0]] else: self._column_widths_cache = [] return self._column_widths_cache From b3045eec9fd658a08f84dd4438c64ce49e7c0895 Mon Sep 17 00:00:00 2001 From: Myachev Date: Thu, 1 Sep 2022 13:51:46 +0200 Subject: [PATCH 4/4] add release note Signed-off-by: Myachev --- docs/release_notes/release_notes-0.16.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/release_notes/release_notes-0.16.0.rst b/docs/release_notes/release_notes-0.16.0.rst index 1050ed3c2f7..9e5a87af549 100644 --- a/docs/release_notes/release_notes-0.16.0.rst +++ b/docs/release_notes/release_notes-0.16.0.rst @@ -79,6 +79,7 @@ Key Features and Updates * REFACTOR-#4832: unify `split_result_of_axis_func_pandas` (#4831) * REFACTOR-#4796: Introduce constant for __reduced__ column name (#4799) * REFACTOR-#4000: Remove code duplication for `PandasOnRayDataframePartitionManager` (#4895) + * REFACTOR-#3780: Remove code duplication for `PandasOnDaskDataframe` (#3781) * REFACTOR-#4530: Unify access to physical data for any partition type (#4829) * Pandas API implementations and improvements * FEAT-#4670: Implement convert_dtypes by mapping across partitions (#4671)