Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: agg() function on groupby dataframe changes dtype of datetime64[ns] column to float64 #12992

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
.vagrant
.noseids
.ipynb_checkpoints
.tags

# Compiled source #
###################
Expand Down
6 changes: 6 additions & 0 deletions doc/source/whatsnew/v0.19.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -910,5 +910,11 @@ Bug Fixes
- Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`)
- Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`)
- Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`)

- Bug in ``agg()`` function on groupby dataframe changes dtype of ``datetime64[ns]`` column to ``float64`` (:issue:`12821`)

- Bug in operations on ``NaT`` returning ``float`` instead of ``datetime64[ns]`` (:issue:`12941`)

- Bug in ``pd.read_csv`` in Python 2.x with non-UTF8 encoded, multi-character separated data (:issue:`3404`)

- Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`)
13 changes: 9 additions & 4 deletions pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
is_integer, is_complex, is_float_dtype,
is_complex_dtype, is_integer_dtype,
is_bool_dtype, is_object_dtype,
is_numeric_dtype,
is_datetime64_dtype, is_timedelta64_dtype,
is_datetime_or_timedelta_dtype,
is_int_or_datetime_dtype, is_any_int_dtype)
Expand Down Expand Up @@ -638,11 +639,15 @@ def _maybe_null_out(result, axis, mask):
if axis is not None and getattr(result, 'ndim', False):
null_mask = (mask.shape[axis] - mask.sum(axis)) == 0
if np.any(null_mask):
if np.iscomplexobj(result):
result = result.astype('c16')
if is_numeric_dtype(result):
if np.iscomplexobj(result):
result = result.astype('c16')
else:
result = result.astype('f8')
result[null_mask] = np.nan
else:
result = result.astype('f8')
result[null_mask] = np.nan
# GH12941, use None to auto cast null
result[null_mask] = None
elif result is not tslib.NaT:
null_mask = mask.size - mask.sum()
if null_mask == 0:
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/frame/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,3 +341,27 @@ def test_first_last_valid(self):
empty = DataFrame()
self.assertIsNone(empty.last_valid_index())
self.assertIsNone(empty.first_valid_index())

def test_operation_on_NaT(self):
# Both NaT and Timestamp are in DataFrame.
df = pd.DataFrame({'foo': [pd.NaT, pd.NaT,
pd.Timestamp('2012-05-01')]})

res = df.min()
exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"])
tm.assert_series_equal(res, exp)

res = df.max()
exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"])
tm.assert_series_equal(res, exp)

# GH12941, only NaTs are in DataFrame.
df = pd.DataFrame({'foo': [pd.NaT, pd.NaT]})

res = df.min()
exp = pd.Series([pd.NaT], index=["foo"])
tm.assert_series_equal(res, exp)

res = df.max()
exp = pd.Series([pd.NaT], index=["foo"])
tm.assert_series_equal(res, exp)
26 changes: 26 additions & 0 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -720,6 +720,32 @@ def test_agg_period_index(self):
grouped = df.groupby(df.index.month)
list(grouped)

def test_agg_dict_parameter_cast_result_dtypes(self):
# GH 12821

df = DataFrame(
{'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'],
'time': date_range('1/1/2011', periods=8, freq='H')})
df.loc[[0, 1, 2, 5], 'time'] = None

# test for `first` function
exp = df.loc[[0, 3, 4, 6]].set_index('class')
grouped = df.groupby('class')
assert_frame_equal(grouped.first(), exp)
assert_frame_equal(grouped.agg('first'), exp)
assert_frame_equal(grouped.agg({'time': 'first'}), exp)
assert_series_equal(grouped.time.first(), exp['time'])
assert_series_equal(grouped.time.agg('first'), exp['time'])

# test for `last` function
exp = df.loc[[0, 3, 4, 7]].set_index('class')
grouped = df.groupby('class')
assert_frame_equal(grouped.last(), exp)
assert_frame_equal(grouped.agg('last'), exp)
assert_frame_equal(grouped.agg({'time': 'last'}), exp)
assert_series_equal(grouped.time.last(), exp['time'])
assert_series_equal(grouped.time.agg('last'), exp['time'])

def test_agg_must_agg(self):
grouped = self.df.groupby('A')['C']
self.assertRaises(Exception, grouped.agg, lambda x: x.describe())
Expand Down
3 changes: 2 additions & 1 deletion pandas/types/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,8 @@ def trans(x): # noqa
return new_result

# a datetimelike
elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i']:
# GH12821, iNaT is casted to float
elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i', 'f']:
try:
result = result.astype(dtype)
except:
Expand Down