From 21b515be60574f11238df516465f242037b0c5fc Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Fri, 24 Jul 2020 06:57:30 -0700 Subject: [PATCH] FEAT-#1598: Update iterator implementation to `iloc` (#1599) Co-authored-by: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Signed-off-by: Devin Petersohn --- modin/pandas/dataframe.py | 29 ++++++++---------------- modin/pandas/iterator.py | 23 ++++++++++--------- modin/pandas/series.py | 11 +++------- modin/pandas/test/test_dataframe.py | 34 ++++++++++++++++++++--------- 4 files changed, 49 insertions(+), 48 deletions(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index dad2511a5c1..29c9daa9614 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1180,14 +1180,11 @@ def iterrows(self): Returns: A generator that iterates over the rows of the frame. """ - index_iter = iter(self.index) - def iterrow_builder(df): - df.columns = self.columns - df.index = [next(index_iter)] - return df.iterrows() + def iterrow_builder(s): + return s.name, s - partition_iterator = PartitionIterator(self._query_compiler, 0, iterrow_builder) + partition_iterator = PartitionIterator(self, 0, iterrow_builder) for v in partition_iterator: yield v @@ -1202,14 +1199,11 @@ def items(self): Returns: A generator that iterates over the columns of the frame. """ - col_iter = iter(self.columns) - def items_builder(df): - df.columns = [next(col_iter)] - df.index = self.index - return df.items() + def items_builder(s): + return s.name, s - partition_iterator = PartitionIterator(self._query_compiler, 1, items_builder) + partition_iterator = PartitionIterator(self, 1, items_builder) for v in partition_iterator: yield v @@ -1240,16 +1234,11 @@ def itertuples(self, index=True, name="Pandas"): Returns: A tuple representing row data. See args for varying tuples. 
""" - index_iter = iter(self.index) - def itertuples_builder(df): - df.columns = self.columns - df.index = [next(index_iter)] - return df.itertuples(index=index, name=name) + def itertuples_builder(s): + return next(s._to_pandas().to_frame().T.itertuples(index=index, name=name)) - partition_iterator = PartitionIterator( - self._query_compiler, 0, itertuples_builder - ) + partition_iterator = PartitionIterator(self, 0, itertuples_builder) for v in partition_iterator: yield v diff --git a/modin/pandas/iterator.py b/modin/pandas/iterator.py index e525f5d33c6..21bd6b7643c 100644 --- a/modin/pandas/iterator.py +++ b/modin/pandas/iterator.py @@ -15,21 +15,27 @@ class PartitionIterator(Iterator): - def __init__(self, query_compiler, axis, func): + def __init__(self, df, axis, func): """PartitionIterator class to define a generator on partitioned data Args: - query_compiler: Data manager for the dataframe + df: The dataframe to iterate over axis: axis to iterate over func: The function to get inner iterables from each partition """ - self.query_compiler = query_compiler + self.df = df self.axis = axis self.index_iter = ( - iter(self.query_compiler.columns) + zip( + iter(slice(None) for _ in range(len(self.df.columns))), + range(len(self.df.columns)), + ) if axis - else iter(range(len(self.query_compiler.index))) + else zip( + range(len(self.df.index)), + iter(slice(None) for _ in range(len(self.df.index))), + ) ) self.func = func @@ -41,8 +47,5 @@ def __next__(self): def next(self): key = next(self.index_iter) - if self.axis: - df = self.query_compiler.getitem_column_array([key]).to_pandas() - else: - df = self.query_compiler.getitem_row_array([key]).to_pandas() - return next(self.func(df)) + df = self.df.iloc[key] + return self.func(df) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 6ebea5e07a3..5c2ef11d840 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -934,15 +934,10 @@ def item(self): return self[0] def items(self): - index_iter 
= iter(self.index) + def item_builder(s): + return s.name, s.squeeze() - def item_builder(df): - s = df.iloc[:, 0] - s.index = [next(index_iter)] - s.name = self.name - return s.items() - - partition_iterator = PartitionIterator(self._query_compiler, 0, item_builder) + partition_iterator = PartitionIterator(self.to_frame(), 0, item_builder) for v in partition_iterator: yield v diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 6daed7af609..a18f3557cdd 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -5381,8 +5381,10 @@ def test_iterrows(self, data): df_equals(pandas_series, modin_series) assert pandas_index == modin_index + @pytest.mark.parametrize("name", [None, "NotPandas", "Pandas"]) + @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_itertuples(self, data): + def test_itertuples(self, name, index, data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) @@ -5392,16 +5394,28 @@ def test_itertuples(self, data): for modin_row, pandas_row in zip(modin_it_default, pandas_it_default): np.testing.assert_equal(modin_row, pandas_row) - # test all combinations of custom params - indices = [True, False] - names = [None, "NotPandas", "Pandas"] + modin_it_custom = modin_df.itertuples(index=index, name=name) + pandas_it_custom = pandas_df.itertuples(index=index, name=name) + for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom): + np.testing.assert_equal(modin_row, pandas_row) - for index in indices: - for name in names: - modin_it_custom = modin_df.itertuples(index=index, name=name) - pandas_it_custom = pandas_df.itertuples(index=index, name=name) - for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom): - np.testing.assert_equal(modin_row, pandas_row) + mi_index_modin = pd.MultiIndex.from_tuples( + [(i // 4, i // 2, i) for i in range(len(modin_df.columns))] + ) + 
mi_index_pandas = pandas.MultiIndex.from_tuples( + [(i // 4, i // 2, i) for i in range(len(pandas_df.columns))] + ) + modin_df.columns = mi_index_modin + pandas_df.columns = mi_index_pandas + modin_it_default = modin_df.itertuples() + pandas_it_default = pandas_df.itertuples() + for modin_row, pandas_row in zip(modin_it_default, pandas_it_default): + np.testing.assert_equal(modin_row, pandas_row) + + modin_it_custom = modin_df.itertuples(index=index, name=name) + pandas_it_custom = pandas_df.itertuples(index=index, name=name) + for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom): + np.testing.assert_equal(modin_row, pandas_row) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___iter__(self, data):