Skip to content

Commit

Permalink
BUG: Group-by numeric type-coercion with datetime
Browse files Browse the repository at this point in the history
closes #14423
closes #15421
closes #15670

During a group-by/apply on a DataFrame that contains one or more
datetime-like columns, pandas would incorrectly coerce all other
columns to numeric. For example, a string column would be coerced to
numeric, producing NaNs.

Author: Greg Williams <[email protected]>

Closes #15680 from gwpdt/bugfix14423 and squashes the following commits:

e1ed104 [Greg Williams] TST: Rename and expand test_numeric_coercion
0a15674 [Greg Williams] CLN: move import, add whatsnew entry
c8844e0 [Greg Williams] CLN: PEP8 (whitespace fixes)
46d12c2 [Greg Williams] BUG: Group-by numeric type-coercion with datetime
  • Loading branch information
gwpdt authored and jreback committed Mar 16, 2017
1 parent e7956c4 commit 37e5f78
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 2 deletions.
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -850,7 +850,8 @@ Bug Fixes
- Bug in ``SparseSeries.reindex`` on single level with list of length 1 (:issue:`15447`)


- Bug in groupby operations with timedelta64 when passing ``numeric_only=False`` (:issue:`5724`)
- Bug in groupby operations with timedelta64 when passing ``numeric_only=False`` (:issue:`5724`)
- Bug in ``groupby.apply()`` coercing ``object`` dtypes to numeric types, when not all values were numeric (:issue:`14423`, :issue:`15421`, :issue:`15670`)


- Bug in ``DataFrame.to_html`` with ``index=False`` and ``max_rows`` raising an ``IndexError`` (:issue:`14998`)
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
zip, range, lzip,
callable, map
)

from pandas import compat
from pandas.compat.numpy import function as nv
from pandas.compat.numpy import _np_version_under1p8
Expand Down Expand Up @@ -3424,6 +3425,7 @@ def _decide_output_index(self, output, labels):

def _wrap_applied_output(self, keys, values, not_indexed_same=False):
from pandas.core.index import _all_indexes_same
from pandas.tools.util import to_numeric

if len(keys) == 0:
return DataFrame(index=keys)
Expand Down Expand Up @@ -3566,7 +3568,8 @@ def first_non_None_value(values):
# as we are stacking can easily have object dtypes here
so = self._selected_obj
if (so.ndim == 2 and so.dtypes.apply(is_datetimelike).any()):
result = result._convert(numeric=True)
result = result.apply(
lambda x: to_numeric(x, errors='ignore'))
date_cols = self._selected_obj.select_dtypes(
include=['datetime', 'timedelta']).columns
date_cols = date_cols.intersection(result.columns)
Expand Down
48 changes: 48 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -4314,6 +4314,54 @@ def test_cummin_cummax(self):
expected = pd.Series([1, 2, 1], name='b')
tm.assert_series_equal(result, expected)

def test_apply_numeric_coercion_when_datetime(self):
# Regression test: groupby/apply on a frame that has a datetime column
# must leave string (object) columns untouched.
#
# In the past, group-by/apply operations have been over-eager
# in converting dtypes to numeric, in the presence of datetime
# columns. Various GH issues were filed, the reproductions
# for which are here.

# GH 15670
# 'Str' holds "inf", presumably chosen because it is a string that a
# numeric parser would accept as a float.
df = pd.DataFrame({'Number': [1, 2],
'Date': ["2017-03-02"] * 2,
'Str': ["foo", "inf"]})
# Baseline: 'Date' is still plain strings, so the frame has no
# datetime column when `expected` is computed.
expected = df.groupby(['Number']).apply(lambda x: x.iloc[0])
# Convert 'Date' to datetime and repeat the same apply; the 'Str'
# column of the result must match the baseline.
df.Date = pd.to_datetime(df.Date)
result = df.groupby(['Number']).apply(lambda x: x.iloc[0])
tm.assert_series_equal(result['Str'], expected['Str'])

# GH 15421
# 'B' mixes a non-numeric string with digit-only strings; 'T' is the
# datetime column.
df = pd.DataFrame({'A': [10, 20, 30],
'B': ['foo', '3', '4'],
'T': [pd.Timestamp("12:31:22")] * 3})

# Select only the 'B' entry from the first row of each group.
def get_B(g):
return g.iloc[0][['B']]
result = df.groupby('A').apply(get_B)['B']
# Expected: the original 'B' values, labelled by the values of 'A'.
# NOTE(review): the index is assigned on the object returned by
# `df.B`, which may also affect `df` itself; `df` is rebuilt before
# any later use in this test, so nothing below depends on it.
expected = df.B
expected.index = df.A
tm.assert_series_equal(result, expected)

# GH 14423
# Builds an object-dtype Series labelled 'p1', 'p2', 'useTime' from one
# group, filling entries with stringified 'Machine' / 'oTime' values
# taken from the rows whose 'State' is 'step1' / 'step2'.
# NOTE(review): indentation is not preserved in this view, so confirm
# against the repository whether the 'useTime' assignment belongs to
# the 'step2' branch.
def predictions(tool):
out = pd.Series(index=['p1', 'p2', 'useTime'], dtype=object)
if 'step1' in list(tool.State):
out['p1'] = str(tool[tool.State == 'step1'].Machine.values[0])
if 'step2' in list(tool.State):
out['p2'] = str(tool[tool.State == 'step2'].Machine.values[0])
out['useTime'] = str(
tool[tool.State == 'step2'].oTime.values[0])
return out
# df1 keeps 'oTime' as strings; df2 is a copy with 'oTime' converted
# to datetime. 'Machine' mixes a digit-only string with non-numeric
# strings.
df1 = pd.DataFrame({'Key': ['B', 'B', 'A', 'A'],
'State': ['step1', 'step2', 'step1', 'step2'],
'oTime': ['', '2016-09-19 05:24:33',
'', '2016-09-19 23:59:04'],
'Machine': ['23', '36L', '36R', '36R']})
df2 = df1.copy()
df2.oTime = pd.to_datetime(df2.oTime)
# Only 'p1' is compared: it is built from 'Machine' strings, so it
# must be identical with and without the datetime column.
expected = df1.groupby('Key').apply(predictions).p1
result = df2.groupby('Key').apply(predictions).p1
tm.assert_series_equal(expected, result)


def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
tups = lmap(tuple, df[keys].values)
Expand Down

0 comments on commit 37e5f78

Please sign in to comment.