From c369687cb12c4205c58399dccd86d26dbd6699db Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 30 Nov 2017 08:19:38 -0500 Subject: [PATCH] API/BUG: .apply will correctly infer output shape when axis=1 closes #16353 closes #17348 closes #17437 closes #18573 closes #17970 closes #17892 closes #17602 --- doc/source/whatsnew/v0.22.0.txt | 73 ++++++++++++++++++++- pandas/core/frame.py | 49 +++++++++++---- pandas/tests/frame/test_apply.py | 105 +++++++++++++++++++++++++++++++ 3 files changed, 213 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 304ccd1f9350b6..cd097ea94631ae 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -89,8 +89,80 @@ Backwards incompatible API changes - +.. _whatsnew_0220.api_breaking.apply: +Apply Changes +~~~~~~~~~~~~~ +:func:`DataFrame.apply` was inconsistent when applying an arbitrary user-defined-function that returned a list-like with ``axis=1``. Several bugs and inconsistencies +are resolved (:issue:`16353`, :issue:`17437`, :issue:`17970`, :issue:`17348`, :issue:`17892`, :issue:`18573`, :issue:`17602`) + +.. ipython:: python + + df = pd.DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C']) + df + +Previous Behavior: if the length of the returned list-like happened to match the number of columns, this would return a ``DataFrame``; otherwise it would return a ``Series`` of lists. + +.. code-block:: python + + In [3]: df.apply(lambda x: [1, 2, 3], axis=1) + Out[3]: + A B C + 0 1 2 3 + 1 1 2 3 + 2 1 2 3 + 3 1 2 3 + 4 1 2 3 + 5 1 2 3 + + In [4]: df.apply(lambda x: [1, 2], axis=1) + Out[4]: + 0 [1, 2] + 1 [1, 2] + 2 [1, 2] + 3 [1, 2] + 4 [1, 2] + 5 [1, 2] + dtype: object + + +New Behavior: the result is consistent; a returned list-like is always expanded into the columns of a ``DataFrame``. + +.. ipython:: python + + df.apply(lambda x: [1, 2, 3], axis=1) + df.apply(lambda x: [1, 2], axis=1) + +A returned dict-like is likewise expanded into columns, rather than being wrapped in an object-dtype ``Series`` as previously. + +.. ipython:: python + + df = pd.DataFrame([[1,2], [1,2]], columns=['a','b']) + + +Previous Behavior + +.. 
code-block:: python + + In [3]: df.apply(lambda x: {'s':x['a'] + x['b']}, 1) + Out[3]: + 0 {'s': 3} + 1 {'s': 3} + dtype: object + + +New Behaviour + +.. ipython:: python + + df.apply(lambda x: {'s':x['a'] + x['b']}, 1) + +To achieve the original effect, you can operate on a ``Series`` + +.. ipython:: python + + (df['a'] + df['b']).apply(lambda x: {'s': x}) .. _whatsnew_0220.api: @@ -224,7 +296,6 @@ Reshaping - Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`) -- - Numeric diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ff42e39d9dbdd5..3e628bd0a59014 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2139,7 +2139,7 @@ def __getitem__(self, key): try: if key in self.columns and not is_mi_columns: return self._getitem_column(key) - except: + except Exception: pass # see if we can slice the rows @@ -2582,7 +2582,7 @@ def _ensure_valid_index(self, value): if not len(self.index) and is_list_like(value): try: value = Series(value) - except: + except Exception: raise ValueError('Cannot set a frame with no defined index ' 'and a value that cannot be converted to a ' 'Series') @@ -4922,8 +4922,7 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True): # skip if we are mixed datelike and trying reduce across axes # GH6125 - if (reduce and axis == 1 and self._is_mixed_type and - self._is_datelike_mixed_type): + if reduce and axis == 1: reduce = False # try to reduce first (by default) @@ -4996,16 +4995,40 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True): raise if len(results) > 0 and is_sequence(results[0]): - if not isinstance(results[0], Series): - index = res_columns - else: - index = None + # map to rows + if axis == 0: + result = self._constructor(data=results) + + if not isinstance(results[0], Series): + try: + result.index = res_columns + except ValueError: + pass - result = self._constructor(data=results, index=index) - result.columns 
= res_index + try: + result.columns = res_index + except ValueError: + pass - if axis == 1: + # map to columns + else: + + result = self._constructor(data=results) result = result.T + + # try to assign the result indices; + # this may fail, if so we have + # received an invalid return shape + try: + result.index = res_index + except ValueError: + pass + + try: + result.columns = res_columns + except ValueError: + pass + result = result._convert(datetime=True, timedelta=True, copy=False) else: @@ -5741,7 +5764,7 @@ def f(x): if result.ndim == self.ndim: result = result.iloc[0] return result - except: + except Exception: pass if filter_type is None or filter_type == 'numeric': @@ -6256,7 +6279,7 @@ def convert(v): values = np.array([convert(v) for v in values]) else: values = convert(values) - except: + except Exception: values = convert(values) else: diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index ab2e810d776347..55f04bf189f0cc 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -470,6 +470,111 @@ def test_apply_non_numpy_dtype(self): assert_frame_equal(result, df) +class TestInferOutputShape(object): + # the user has supplied an opaque UDF where + # they are transforming the input that requires + # us to infer the output + + def test_infer_row_shape(self): + # gh-17437 + # if row shape is changing, infer it + df = pd.DataFrame(np.random.rand(10, 2)) + result = df.apply(np.fft.fft, axis=0) + assert result.shape == (10, 2) + + result = df.apply(np.fft.rfft, axis=0) + assert result.shape == (6, 2) + + def test_with_dictlike_columns(self): + # gh 17602 + + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.apply(lambda x: {'s': x['a'] + x['b']}, 1) + expected = DataFrame({'s': df['a'].values + df['b'].values}) + assert_frame_equal(result, expected) + + df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'), + pd.Timestamp('2017-05-02 00:00:00')] + result = df.apply(lambda x: {'s': 
x['a'] + x['b']}, 1) + assert_frame_equal(result, expected) + + # compose a series + result = (df['a'] + df['b']).apply(lambda x: {'s': x}) + expected = Series([{'s': 3}, {'s': 3}]) + assert_series_equal(result, expected) + + def test_with_listlike_columns(self): + # gh-17348 + df = DataFrame({'a': Series(np.random.randn(4)), + 'b': ['a', 'list', 'of', 'words'], + 'ts': date_range('2016-10-01', periods=4, freq='H')}) + + result = df[['a', 'b']].apply(tuple, axis=1) + expected = df[['a', 'b']] + assert_frame_equal(result, expected) + + result = df[['a', 'ts']].apply(tuple, axis=1) + expected = df[['a', 'ts']] + assert_frame_equal(result, expected) + + def test_infer_output_shape_columns(self): + # gh-18573 + + df = DataFrame({'number': [1., 2.], + 'string': ['foo', 'bar'], + 'datetime': [pd.Timestamp('2017-11-29 03:30:00'), + pd.Timestamp('2017-11-29 03:45:00')]}) + result = df.apply(lambda row: (row.number, row.string), axis=1) + expected = df[['number', 'string']].copy() + expected.columns = [0, 1] + assert_frame_equal(result, expected) + + def test_infer_output_shape_listlike_columns(self): + # gh-16353 + + df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C']) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = DataFrame({'A': 1, 'B': 2, 'C': 3}, + index=range(6)).reindex(columns=df.columns) + assert_frame_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = DataFrame({0: 1, 1: 2}, + index=range(6)).reindex(columns=[0, 1]) + assert_frame_equal(result, expected) + + # gh-17970 + df = DataFrame({"a": [1, 2, 3]}) + + result = df.apply(lambda row: np.ones(1), axis=1) + expected = DataFrame({'a': 1.0}, + index=range(3)) + assert_frame_equal(result, expected) + + result = df.apply(lambda row: np.ones(2), axis=1) + expected = DataFrame({0: 1., 1: 1.}, + index=range(3)).reindex(columns=[0, 1]) + assert_frame_equal(result, expected) + + # gh-17892 + df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'), + 
pd.Timestamp('2010-02-04'), + pd.Timestamp('2010-02-05'), + pd.Timestamp('2010-02-06')], + 'b': [9, 5, 4, 3], + 'c': [5, 3, 4, 2], + 'd': [1, 2, 3, 4]}) + + def fun(x): + return (1, 2) + + result = df.apply(fun, axis=1) + expected = DataFrame({0: 1, 1: 2}, + index=range(4)).reindex(columns=[0, 1]) + assert_frame_equal(result, expected) + + def zip_frames(*frames): """ take a list of frames, zip the columns together for each