diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py
index 627366f6ed18..2fab90db479c 100644
--- a/sdks/python/apache_beam/dataframe/frames.py
+++ b/sdks/python/apache_beam/dataframe/frames.py
@@ -143,6 +143,21 @@ def wrapper(self, *args, **kwargs):
 
 
 class DeferredDataFrameOrSeries(frame_base.DeferredFrame):
+  def _render_indexes(self):
+    """Render the index name(s) of this frame for use in ``__repr__``.
+
+    Returns:
+      ``index=<name>`` when the index has a single level, otherwise
+      ``indexes=[<name>, ...]`` with one entry per level. A level without a
+      name is shown as ``<unnamed>``.
+    """
+    if self.index.nlevels == 1:
+      return 'index=' + (
+          '<unnamed>' if self.index.name is None else repr(self.index.name))
+    else:
+      return 'indexes=[' + ', '.join(
+          '<unnamed>' if ix is None else repr(ix)
+          for ix in self.index.names) + ']'
 
   __array__ = frame_base.wont_implement_method(
       pd.Series, '__array__', reason="non-deferred-result")
@@ -1036,6 +1051,12 @@ def _set_index(self, value):
 @populate_not_implemented(pd.Series)
 @frame_base.DeferredFrame._register_for(pd.Series)
 class DeferredSeries(DeferredDataFrameOrSeries):
+  def __repr__(self):
+    """Return a short description with the name, dtype and index name(s)."""
+    return (
+        f'DeferredSeries(name={self.name!r}, dtype={self.dtype}, '
+        f'{self._render_indexes()})')
+
   @property  # type: ignore
   @frame_base.with_docs_from(pd.Series)
   def name(self):
@@ -1942,6 +1963,12 @@ def repeat(self, repeats, axis):
 @populate_not_implemented(pd.DataFrame)
 @frame_base.DeferredFrame._register_for(pd.DataFrame)
 class DeferredDataFrame(DeferredDataFrameOrSeries):
+  def __repr__(self):
+    """Return a short description listing the columns and index name(s)."""
+    return (
+        f'DeferredDataFrame(columns={list(self.columns)}, '
+        f'{self._render_indexes()})')
+
   @property  # type: ignore
   @frame_base.with_docs_from(pd.DataFrame)
   def columns(self):
diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py
index 2f782bab8832..634d24043de5 100644
--- a/sdks/python/apache_beam/dataframe/frames_test.py
+++ b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -2059,5 +2059,117 @@ def test_docs_defined(self, beam_type, pd_type):
           f'operations:\n{docstring_missing}')
 
+
+class ReprTest(unittest.TestCase):
+  """Tests for the ``__repr__`` of deferred DataFrames and Series."""
+  def test_basic_dataframe(self):
+    df = frame_base.DeferredFrame.wrap(
+        expressions.ConstantExpression(GROUPBY_DF))
+    self.assertEqual(
+        repr(df),
+        (
+            "DeferredDataFrame(columns=['group', 'foo', 'bar', 'baz', 'bool', "
+            "'str'], index=<unnamed>)"))
+
+  def test_dataframe_with_named_index(self):
+    df = frame_base.DeferredFrame.wrap(
+        expressions.ConstantExpression(GROUPBY_DF.set_index('group')))
+    self.assertEqual(
+        repr(df),
+        (
+            "DeferredDataFrame(columns=['foo', 'bar', 'baz', 'bool', 'str'], "
+            "index='group')"))
+
+  def test_dataframe_with_partial_named_index(self):
+    df = frame_base.DeferredFrame.wrap(
+        expressions.ConstantExpression(
+            GROUPBY_DF.set_index([GROUPBY_DF.index, 'group'])))
+    self.assertEqual(
+        repr(df),
+        (
+            "DeferredDataFrame(columns=['foo', 'bar', 'baz', 'bool', 'str'], "
+            "indexes=[<unnamed>, 'group'])"))
+
+  def test_dataframe_with_named_multi_index(self):
+    df = frame_base.DeferredFrame.wrap(
+        expressions.ConstantExpression(GROUPBY_DF.set_index(['str', 'group'])))
+    self.assertEqual(
+        repr(df),
+        (
+            "DeferredDataFrame(columns=['foo', 'bar', 'baz', 'bool'], "
+            "indexes=['str', 'group'])"))
+
+  def test_dataframe_with_multiple_column_levels(self):
+    df = pd.DataFrame({
+        'foofoofoo': ['one', 'one', 'one', 'two', 'two', 'two'],
+        'barbar': ['A', 'B', 'C', 'A', 'B', 'C'],
+        'bazzy': [1, 2, 3, 4, 5, 6],
+        'zoop': ['x', 'y', 'z', 'q', 'w', 't']
+    })
+
+    df = df.pivot(index='foofoofoo', columns='barbar')
+    df = frame_base.DeferredFrame.wrap(expressions.ConstantExpression(df))
+    self.assertEqual(
+        repr(df),
+        (
+            "DeferredDataFrame(columns=[('bazzy', 'A'), ('bazzy', 'B'), "
+            "('bazzy', 'C'), ('zoop', 'A'), ('zoop', 'B'), ('zoop', 'C')], "
+            "index='foofoofoo')"))
+
+  def test_dataframe_with_multiple_column_and_multiple_index_levels(self):
+    df = pd.DataFrame({
+        'foofoofoo': ['one', 'one', 'one', 'two', 'two', 'two'],
+        'barbar': ['A', 'B', 'C', 'A', 'B', 'C'],
+        'bazzy': [1, 2, 3, 4, 5, 6],
+        'zoop': ['x', 'y', 'z', 'q', 'w', 't']
+    })
+
+    df = df.pivot(index='foofoofoo', columns='barbar')
+    df.index = [['a', 'b'], df.index]
+
+    # pandas repr displays this:
+    #             bazzy       zoop
+    # barbar          A  B  C    A  B  C
+    #   foofoofoo
+    # a one           1  2  3    x  y  z
+    # b two           4  5  6    q  w  t
+    df = frame_base.DeferredFrame.wrap(expressions.ConstantExpression(df))
+    self.assertEqual(
+        repr(df),
+        (
+            "DeferredDataFrame(columns=[('bazzy', 'A'), ('bazzy', 'B'), "
+            "('bazzy', 'C'), ('zoop', 'A'), ('zoop', 'B'), ('zoop', 'C')], "
+            "indexes=[<unnamed>, 'foofoofoo'])"))
+
+  def test_basic_series(self):
+    df = frame_base.DeferredFrame.wrap(
+        expressions.ConstantExpression(GROUPBY_DF['bool']))
+    self.assertEqual(
+        repr(df), "DeferredSeries(name='bool', dtype=bool, index=<unnamed>)")
+
+  def test_series_with_named_index(self):
+    df = frame_base.DeferredFrame.wrap(
+        expressions.ConstantExpression(GROUPBY_DF.set_index('group')['str']))
+    self.assertEqual(
+        repr(df), "DeferredSeries(name='str', dtype=object, index='group')")
+
+  def test_series_with_partial_named_index(self):
+    df = frame_base.DeferredFrame.wrap(
+        expressions.ConstantExpression(
+            GROUPBY_DF.set_index([GROUPBY_DF.index, 'group'])['bar']))
+    self.assertEqual(
+        repr(df),
+        (
+            "DeferredSeries(name='bar', dtype=float64, "
+            "indexes=[<unnamed>, 'group'])"))
+
+  def test_series_with_named_multi_index(self):
+    df = frame_base.DeferredFrame.wrap(
+        expressions.ConstantExpression(
+            GROUPBY_DF.set_index(['str', 'group'])['baz']))
+    self.assertEqual(
+        repr(df),
+        "DeferredSeries(name='baz', dtype=float64, indexes=['str', 'group'])")
+
 
 if __name__ == '__main__':
   unittest.main()