diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 41b6519eb740f..a56212328f5c3 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -850,7 +850,8 @@ Bug Fixes
 
 - Bug in ``SparseSeries.reindex`` on single level with list of length 1 (:issue:`15447`)
 
-- Bug in groupby operations with timedelta64 when passing ``numeric_only=False`` (:issue:`5724`)
+- Bug in groupby operations with timedelta64 when passing ``numeric_only=False`` (:issue:`5724`)
+- Bug in ``groupby.apply()`` coercing ``object`` dtypes to numeric types, when not all values were numeric (:issue:`14423`, :issue:`15421`, :issue:`15670`)
 
 - Bug in ``DataFrame.to_html`` with ``index=False`` and ``max_rows`` raising in ``IndexError`` (:issue:`14998`)
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index a10be078a8f96..7a017ffae284c 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -10,6 +10,7 @@
     zip, range, lzip,
     callable, map
 )
+
 from pandas import compat
 from pandas.compat.numpy import function as nv
 from pandas.compat.numpy import _np_version_under1p8
@@ -3424,6 +3425,7 @@ def _decide_output_index(self, output, labels):
 
     def _wrap_applied_output(self, keys, values, not_indexed_same=False):
         from pandas.core.index import _all_indexes_same
+        from pandas.tools.util import to_numeric
 
         if len(keys) == 0:
             return DataFrame(index=keys)
@@ -3566,7 +3568,8 @@ def first_non_None_value(values):
                 # as we are stacking can easily have object dtypes here
                 so = self._selected_obj
                 if (so.ndim == 2 and so.dtypes.apply(is_datetimelike).any()):
-                    result = result._convert(numeric=True)
+                    result = result.apply(
+                        lambda x: to_numeric(x, errors='ignore'))
                     date_cols = self._selected_obj.select_dtypes(
                         include=['datetime', 'timedelta']).columns
                     date_cols = date_cols.intersection(result.columns)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index d7fa3beda0abf..c25974c94bfd1 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -4314,6 +4314,54 @@ def test_cummin_cummax(self):
         expected = pd.Series([1, 2, 1], name='b')
         tm.assert_series_equal(result, expected)
 
+    def test_apply_numeric_coercion_when_datetime(self):
+        # In the past, group-by/apply operations have been over-eager
+        # in converting dtypes to numeric, in the presence of datetime
+        # columns.  Various GH issues were filed, the reproductions
+        # for which are here.
+
+        # GH 15670
+        df = pd.DataFrame({'Number': [1, 2],
+                           'Date': ["2017-03-02"] * 2,
+                           'Str': ["foo", "inf"]})
+        expected = df.groupby(['Number']).apply(lambda x: x.iloc[0])
+        df.Date = pd.to_datetime(df.Date)
+        result = df.groupby(['Number']).apply(lambda x: x.iloc[0])
+        tm.assert_series_equal(result['Str'], expected['Str'])
+
+        # GH 15421
+        df = pd.DataFrame({'A': [10, 20, 30],
+                           'B': ['foo', '3', '4'],
+                           'T': [pd.Timestamp("12:31:22")] * 3})
+
+        def get_B(g):
+            return g.iloc[0][['B']]
+        result = df.groupby('A').apply(get_B)['B']
+        expected = df.B
+        expected.index = df.A
+        tm.assert_series_equal(result, expected)
+
+        # GH 14423
+        def predictions(tool):
+            out = pd.Series(index=['p1', 'p2', 'useTime'], dtype=object)
+            if 'step1' in list(tool.State):
+                out['p1'] = str(tool[tool.State == 'step1'].Machine.values[0])
+            if 'step2' in list(tool.State):
+                out['p2'] = str(tool[tool.State == 'step2'].Machine.values[0])
+                out['useTime'] = str(
+                    tool[tool.State == 'step2'].oTime.values[0])
+            return out
+        df1 = pd.DataFrame({'Key': ['B', 'B', 'A', 'A'],
+                            'State': ['step1', 'step2', 'step1', 'step2'],
+                            'oTime': ['', '2016-09-19 05:24:33',
+                                      '', '2016-09-19 23:59:04'],
+                            'Machine': ['23', '36L', '36R', '36R']})
+        df2 = df1.copy()
+        df2.oTime = pd.to_datetime(df2.oTime)
+        expected = df1.groupby('Key').apply(predictions).p1
+        result = df2.groupby('Key').apply(predictions).p1
+        tm.assert_series_equal(expected, result)
+
 
 def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
     tups = lmap(tuple, df[keys].values)
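
Note (not part of the patch): a minimal, standalone sketch of the behaviour the new GH 15670 test exercises, assuming pandas with this change applied; the frame and column names simply mirror the test above.

    import pandas as pd

    # An object column whose values ("foo", "inf") are not all numeric,
    # next to a datetime column -- the combination that used to trigger
    # over-eager numeric coercion in groupby.apply.
    df = pd.DataFrame({'Number': [1, 2],
                       'Date': pd.to_datetime(["2017-03-02"] * 2),
                       'Str': ["foo", "inf"]})

    result = df.groupby(['Number']).apply(lambda g: g.iloc[0])

    # With the fix, 'Str' keeps its strings (and object dtype); before,
    # the presence of the datetime column could coerce "inf" to a float.
    print(result['Str'].tolist())  # ['foo', 'inf']
    print(result['Str'].dtype)     # object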